def predict(self, new_data, batch_size, pool_size):
    """ predict for new data """
    img_shape = (batch_size, 1, self.image_shape[2], self.image_shape[3])
    conv_out = conv.conv2d(input=new_data, filters=self.W,
                           filter_shape=self.filter_shape, image_shape=img_shape)
    pool_list = []
    if self.non_linear == "tanh":
        conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle("x", 0, "x", "x"))
        # pad_len = int(self.max_window_len/2)
        # right_pad_len = int(self.filter_shape[2]/2)
        # index_shift = pad_len-right_pad_len
        index_shift = int(self.filter_shape[2] / 2)
        for i in xrange(batch_size):
            # partition sentence via pool size
            e1pos = pool_size[i, 0] + index_shift
            e2pos = pool_size[i, 1] + index_shift
            # if T.gt(e1pos, 0):
            #     p1 = conv_out_tanh[i, :, :e1pos, :]
            # else:
            #     p1 = conv_out_tanh[i, :, 0, :]
            p1 = conv_out_tanh[i, :, :e1pos, :]
            p2 = conv_out_tanh[i, :, e1pos:e2pos, :]
            p3 = conv_out_tanh[i, :, e2pos:, :]
            p1_pool_out = T.max(p1, axis=1)
            p2_pool_out = T.max(p2, axis=1)
            p3_pool_out = T.max(p3, axis=1)
            temp = T.concatenate([p1_pool_out, p2_pool_out, p3_pool_out], axis=1)
            pool_list.append(temp.dimshuffle("x", 0, 1))
    else:
        pass
    output = T.concatenate(pool_list, axis=0)
    return output
def pool_function(input, axis):
    # Note: num_in_sum, num_in_max, max_strength and floatX are taken from the
    # enclosing scope (this is a nested helper of make_ghh_pool_conv2d).
    input_shape = tuple(input.shape)
    num_feature_maps_out = input_shape[axis - 1]
    pool_size = input_shape[axis]
    pool_shape = (input_shape[:axis] + (num_in_sum, num_in_max) +
                  input_shape[axis + 1:])
    # print("make_ghh_pool_conv2d: pool_shape is {}".format(pool_shape))
    input_reshaped = input.reshape(pool_shape)
    # raise NotImplementedError('TODO: use a soft max instead of T.max')
    # res_after_max = T.max(input_reshaped,axis=axis+1)
    # Soft max with strength of max_strength
    res_after_max = np.cast[floatX](1.0) / np.cast[floatX](max_strength) \
        * T.log(T.mean(T.exp(max_strength *
                             (input_reshaped - T.max(input_reshaped, axis=axis + 1, keepdims=True))),
                       axis=axis + 1)) \
        + T.max(input_reshaped, axis=axis + 1)
    # Get deltas
    delta = np.cast[floatX](1.0) - np.cast[floatX](2.0) * \
        (T.arange(num_in_sum, dtype=floatX) % np.cast[floatX](2))
    target_dimshuffle = ('x',) * axis + (0,) + ('x',) * \
        (len(input_shape) - 1 - axis)
    # print("make_ghh_pool_conv2d: target_dimshuffle is {}".format(target_dimshuffle))
    delta = delta.flatten().dimshuffle(*target_dimshuffle)
    res_after_sum = T.sum(res_after_max * delta, axis=axis)
    return res_after_sum
def test_optimization_max(self):
    data = numpy.asarray(numpy.random.rand(2, 3), dtype=config.floatX)
    n = tensor.matrix()

    f = function([n], tensor.max(n, 0), mode=self.mode)
    topo = f.maker.env.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, CAReduce)
    f(data)

    f = function([n], tensor.max(-n, 0), mode=self.mode)
    topo = f.maker.env.toposort()
    assert len(topo) == 2
    assert isinstance(topo[0].op, Elemwise)
    assert isinstance(topo[0].op.scalar_op, scalar.Neg)
    assert isinstance(topo[1].op, CAReduce)
    f(data)

    f = function([n], -tensor.max(n, 0), mode=self.mode)
    topo = f.maker.env.toposort()
    assert len(topo) == 2
    assert isinstance(topo[0].op, CAReduce)
    assert isinstance(topo[1].op, Elemwise)
    assert isinstance(topo[1].op.scalar_op, scalar.Neg)
    f(data)

    f = function([n], -tensor.max(-n, 0), mode=self.mode)
    topo = f.maker.env.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, CAReduce)  # min
    f(data)
def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
    """ compile utility function used by contains_nan and contains_inf """
    global f_gpumin, f_gpumax, f_gpuabsmax
    if not cuda.cuda_available:
        return
    guard_input = cuda.fvector("nan_guard")
    cuda_compile_failed = False
    if (nan_is_error or inf_is_error) and f_gpumin is None:
        try:
            f_gpumin = theano.function([guard_input], T.min(guard_input),
                                       mode="FAST_RUN")
        except RuntimeError:
            # This can happen if cuda is available, but the
            # device is in exclusive mode and used by another
            # process.
            cuda_compile_failed = True
    if inf_is_error and not cuda_compile_failed and f_gpumax is None:
        try:
            f_gpumax = theano.function([guard_input], T.max(guard_input),
                                       mode="FAST_RUN")
        except RuntimeError:
            # This can happen if cuda is available, but the
            # device is in exclusive mode and used by another
            # process.
            cuda_compile_failed = True
    if big_is_error and not cuda_compile_failed and f_gpuabsmax is None:
        try:
            f_gpuabsmax = theano.function([guard_input],
                                          T.max(T.abs_(guard_input)),
                                          mode="FAST_RUN")
        except RuntimeError:
            # This can happen if cuda is available, but the
            # device is in exclusive mode and used by another
            # process.
            cuda_compile_failed = True
def _test_layer_stats(self, layer_output):
    """
    DESCRIPTION:
        This method is called every batch when the examples from the test or
        valid set are passed through; the final result is the mean of the
        results from all the batches of the test or valid set in one epoch.
    PARAM:
        layer_output: the output from the layer
    RETURN:
        A list of tuples of [('name_a', var_a), ('name_b', var_b)] where each
        var is a scalar
    """
    w_len = T.sqrt((self.W ** 2).sum(axis=0))
    max_length = T.max(w_len)
    mean_length = T.mean(w_len)
    min_length = T.min(w_len)
    return [('max_col_length', max_length),
            ('mean_col_length', mean_length),
            ('min_col_length', min_length),
            ('output_max', T.max(layer_output)),
            ('output_mean', T.mean(layer_output)),
            ('output_min', T.min(layer_output)),
            ('max_W', T.max(self.W)),
            ('mean_W', T.mean(self.W)),
            ('min_W', T.min(self.W)),
            ('max_b', T.max(self.b)),
            ('mean_b', T.mean(self.b)),
            ('min_b', T.min(self.b))]
def filterbank_matrices(self, center_y, center_x, delta, sigma):
    """Create a Fy and a Fx

    Parameters
    ----------
    center_y : T.vector (shape: batch_size)
    center_x : T.vector (shape: batch_size)
        Y and X center coordinates for the attention window
    delta : T.vector (shape: batch_size)
    sigma : T.vector (shape: batch_size)

    Returns
    -------
    FY, FX
    """
    tol = 1e-4
    # construct x and y coordinates for the grid points
    obj_x = center_x.dimshuffle(0, 'x') + \
        (delta.dimshuffle(0, 'x') * self.obj_x)
    obj_y = center_y.dimshuffle(0, 'x') + \
        (delta.dimshuffle(0, 'x') * self.obj_y)

    # construct unnormalized attention weights for each grid point
    FX = T.exp(-(self.img_x - obj_x.dimshuffle(0, 1, 'x')) ** 2. /
               (2. * sigma.dimshuffle(0, 'x', 'x') ** 2.))
    FY = T.exp(-(self.img_y - obj_y.dimshuffle([0, 1, 'x'])) ** 2. /
               (2. * sigma.dimshuffle(0, 'x', 'x') ** 2.))

    # normalize the attention weights
    # FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)
    # FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)
    FX = FX / (T.max(FX.sum(axis=-1)) + tol)
    FY = FY / (T.max(FY.sum(axis=-1)) + tol)

    return FY, FX
def define_network(self, layers_info=None): """ Builds Theano graph of the network. """ self.hidden_layers = [None]*self.n_hidden.size self.params = [] for i, h in enumerate(self.n_hidden): if i == 0: self.hidden_layers[i] = LBNHiddenLayer(self.rng, self.trng, self.x, self.n_in, h, self.det_activation[i], self.stoch_n_hidden, self.stoch_activation, det_activation_name=self.det_activation_names[i], stoch_activation_names=self.stoch_activation_names, m=self.m, det_W=None if layers_info is None else np.array( layers_info['hidden_layers'][i]['LBNlayer']['detLayer']\ ['W']), det_b=None if layers_info is None else np.array(layers_info['hidden_layers'][i]\ ['LBNlayer']['detLayer']['b']), stoch_mlp_info=None if layers_info is None else layers_info['hidden_layers'][i]['LBNlayer']['stochLayer']) else: self.hidden_layers[i] = LBNHiddenLayer(self.rng, self.trng, self.hidden_layers[i-1].output, self.n_hidden[i-1], h, self.det_activation[i], self.stoch_n_hidden, self.stoch_activation, det_activation_name=self.det_activation_names[i], stoch_activation_names=self.stoch_activation_names, det_W=None if layers_info is None else np.array(layers_info['hidden_layers'][i]['LBNlayer']\ ['detLayer']['W']), det_b=None if layers_info is None else np.array(layers_info['hidden_layers'][i]['LBNlayer']\ ['detLayer']['b']), stoch_mlp_info=None if layers_info is None else layers_info['hidden_layers'][i]['LBNlayer']['stochLayer']) self.params.append(self.hidden_layers[i].params) self.output_layer = OutputLayer(self.rng, self.hidden_layers[-1].output, self.n_hidden[-1], self.n_out, self.det_activation[-1], self.det_activation_names[-1], V_values=None if layers_info is None else np.array( layers_info['output_layer']['W'])) self.params.append(self.output_layer.params) self.output = self.output_layer.output exp_value = -0.5*T.sum((self.output - self.y.dimshuffle('x',0,1))**2, axis=2) max_exp_value = theano.ifelse.ifelse(T.lt(T.max(exp_value), -1*T.min(exp_value)), T.max(exp_value), T.min(exp_value)) self.log_likelihood = T.sum(T.log(T.sum(T.exp(exp_value - max_exp_value), axis=0)) + max_exp_value)-\ self.y.shape[0]*(T.log(self.m)+self.y.shape[1]/2.*T.log(2*np.pi)) self.predict = theano.function(inputs=[self.x, self.m], outputs=self.output)
def maxout(z=None):
    # Note: hidden_layers_sizes and i are taken from the enclosing scope.
    # g = theano.shared(numpy.zeros((hidden_layers_sizes[i],)), name='g', borrow=True)
    g = T.max(z[0:5])
    g = T.stack(g, T.max(z[5:10]))
    for index in xrange(hidden_layers_sizes[i] - 10):
        g = T.concatenate([g, [T.max(z[5 * (index + 2):5 * (index + 3)])]])
    return g
def __theano__softmax(self, inp, dim=None, predict=False, issequence=False):

    if dim is None:
        assert issequence, "Data dimensionality could not be parsed."
        dim = 2

    # FFD for dimensions 1 and 2
    if dim == 1 or dim == 2:
        # Using the numerically stable implementation (along the channel axis):
        ex = T.exp(inp - T.max(inp, axis=1, keepdims=True))
        y = ex / T.sum(ex, axis=1, keepdims=True)

        # One hot encoding for prediction
        if predict:
            y = T.argmax(y, axis=1)

    elif dim == 3:
        # Stable implementation again, this time along axis = 2 (channel axis)
        ex = T.exp(inp - T.max(inp, axis=2, keepdims=True))
        y = ex / T.sum(ex, axis=2, keepdims=True)

        # One hot encoding for prediction
        if predict:
            y = T.argmax(y, axis=2)

    else:
        raise NotImplementedError("Softmax is implemented in 2D, 3D and 1D.")

    return y
def test_max(self):
    # If we call max directly, we will return a CAReduce object,
    # which does not have R_op implemented!
    # self.check_mat_rop_lop(tensor.max(self.mx, axis=[0,1])[0],
    #                        ())
    self.check_mat_rop_lop(tensor.max(self.mx, axis=0),
                           (self.mat_in_shape[1],))
    self.check_mat_rop_lop(tensor.max(self.mx, axis=1),
                           (self.mat_in_shape[0],))
def forward_init(self): obs_ = self.obs_.reshape([self.obs_.shape[0]*self.obs_.shape[1], self.obs_.shape[-1]]) h = eval(self.activ)(tensor.dot(obs_, self.params['W']) + self.params['b'][None,None,:]) self.pi = [] for oi in xrange(self.n_out): pi = tensor.dot(h, self.params['U%d'%oi]) + self.params['c%d'%oi][None,:] pi = tensor.exp(pi - tensor.max(pi,-1,keepdims=True)) self.pi.append(pi / (pi.sum(-1, keepdims=True))) prev = tensor.matrix('prev', dtype='float32') #obs = tensor.matrix('obs', dtype='float32') obs_ = self.obs_.reshape([self.obs_.shape[0]*self.obs_.shape[1], self.obs_.shape[-1]]) obs_ = obs_[0] self.h_init = lambda x: numpy.float32(0.) h = eval(self.activ)(tensor.dot(obs_, self.params['W']) + self.params['b'][None,:]) pi = [] for oi in xrange(self.n_out): pi_ = tensor.dot(h, self.params['U%d'%oi]) + self.params['c%d'%oi][None,:] pi_ = tensor.exp(pi_ - tensor.max(pi_,-1,keepdims=True)) pi.append(pi_ / (pi_.sum(-1, keepdims=True))) self.forward = theano.function([self.obs, prev], [h] + pi, name='forward', on_unused_input='ignore')
def decoder(localt, stm1, cstm1, hmat, Wbeta, Ubeta, vbeta, Wzide, Wzfde, Wzcde, Wzode, Ede, Wxide, Wside, bide, Wxfde, Wsfde, bfde, Wxcde, Wscde, bcde, Wxode, Wsode, bode, L0, Ls, Lz): xt = theano.dot(localt, Ede) # get z from hmat (sentlen * nen), stm1 beta = \ theano.dot( act( theano.dot(hmat,Ubeta) + theano.dot(stm1,Wbeta) ) , vbeta ) alpha = T.exp(beta-T.max(beta)) / T.sum(T.exp(beta-T.max(beta)) ) zt = theano.dot(alpha, hmat) # it = sigma(theano.dot(xt,Wxide) + theano.dot(stm1,Wside) + theano.dot(zt,Wzide) + bide ) ft = sigma(theano.dot(xt,Wxfde) + theano.dot(stm1,Wsfde) + theano.dot(zt,Wzfde) + bfde ) cst = ft * cstm1 + it*act(theano.dot(xt,Wxcde)+theano.dot(stm1,Wscde)+ theano.dot(zt,Wzcde) +bcde ) ot = sigma(theano.dot(xt,Wxode) + theano.dot(stm1,Wsode) + theano.dot(zt,Wzode) +bode ) st = ot * act(cst) # winst = getwins() stfory = st * winst # yt0 = T.dot( (xt + T.dot(stfory, Ls) + T.dot(zt, Lz) ) , L0) #yt0 = theano.dot(st,Wsyde) yt0max = T.max(yt0) #yt0maxvec = T.maximum(yt0, yt0max) yt = T.exp(yt0-yt0max) / T.sum(T.exp(yt0-yt0max)) logyt = yt0-yt0max-T.log(T.sum(T.exp(yt0-yt0max))) #yt = T.exp(yt0-yt0maxvec) / T.sum(T.exp(yt0-yt0maxvec)) #logyt = yt0-yt0maxvec-T.log(T.sum(T.exp(yt0-yt0maxvec))) # yt = T.concatenate([addzero,tempyt],axis=0) return st, cst, yt, logyt
def update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ),
        log_p_curr.shape[0]
    ), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
def plotUpdate(self, updates):
    '''
    >>>get update info of each layer
    >>>type updates: dict
    >>>para updates: update dictionary
    '''
    maxdict = T.zeros(shape=(self.deep * 2 + 1,))
    mindict = T.zeros(shape=(self.deep * 2 + 1,))
    meandict = T.zeros(shape=(self.deep * 2 + 1,))

    for i in xrange(self.deep):
        updw = updates[self.layers[i].w] - self.layers[i].w
        maxdict = T.set_subtensor(maxdict[2 * i], T.max(updw))
        mindict = T.set_subtensor(mindict[2 * i], T.min(updw))
        meandict = T.set_subtensor(meandict[2 * i], T.mean(updw))
        updb = updates[self.layers[i].b] - self.layers[i].b
        maxdict = T.set_subtensor(maxdict[2 * i + 1], T.max(updb))
        mindict = T.set_subtensor(mindict[2 * i + 1], T.min(updb))
        meandict = T.set_subtensor(meandict[2 * i + 1], T.mean(updb))

    updw = updates[self.classifier.w] - self.classifier.w
    maxdict = T.set_subtensor(maxdict[self.deep * 2], T.max(updw))
    mindict = T.set_subtensor(mindict[self.deep * 2], T.min(updw))
    meandict = T.set_subtensor(meandict[self.deep * 2], T.mean(updw))

    return [maxdict, mindict, meandict]
def _activation(self, Y, L, M, W):
    """Returns the activation for a given input.

    Derived from the generative model formulation of hierarchical Poisson
    mixtures, the formula for the activation in the network reads as follows:

    I_c = \sum_d \log(W_{cd})y_d + \log(M_{lc})         for labeled data
          \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc})  for unlabeled data
    s_c = softmax(I_c)
    """
    # first: complete inference to find label
    # Input integration:
    I = T.tensordot(Y, T.log(W), axes=[1, 1])
    # recurrent term:
    vM = M[L]
    L_index = T.eq(L, -1).nonzero()
    vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0))
    # numeric trick to prevent overflow in the exp-function
    max_exponent = 86. - T.ceil(T.log(I.shape[1].astype('float32')))
    scale = T.switch(
        T.gt(T.max(I, axis=1, keepdims=True), max_exponent),
        T.max(I, axis=1, keepdims=True) - max_exponent,
        0.)
    # numeric approximation to prevent underflow in the exp-function:
    # map too low values of I to a fixed minimum value
    min_exponent = -87. + T.ceil(T.log(I.shape[1].astype('float32')))
    I = T.switch(
        T.lt(I - scale, min_exponent),
        scale + min_exponent,
        I)
    # activation: recurrent softmax with overflow protection
    s = vM * T.exp(I - scale) / T.sum(vM * T.exp(I - scale), axis=1, keepdims=True)
    return s
def norm(x, ord):
    x = as_tensor_variable(x)
    ndim = x.ndim
    if ndim == 0:
        raise ValueError("'axis' entry is out of bounds.")
    elif ndim == 1:
        if ord is None:
            return tensor.sum(x ** 2) ** 0.5
        elif ord == 'inf':
            return tensor.max(abs(x))
        elif ord == '-inf':
            return tensor.min(abs(x))
        elif ord == 0:
            return x[x.nonzero()].shape[0]
        else:
            try:
                z = tensor.sum(abs(x ** ord)) ** (1. / ord)
            except TypeError:
                raise ValueError("Invalid norm order for vectors.")
            return z
    elif ndim == 2:
        if ord is None or ord == 'fro':
            return tensor.sum(abs(x ** 2)) ** (0.5)
        elif ord == 'inf':
            return tensor.max(tensor.sum(abs(x), 1))
        elif ord == '-inf':
            return tensor.min(tensor.sum(abs(x), 1))
        elif ord == 1:
            return tensor.max(tensor.sum(abs(x), 0))
        elif ord == -1:
            return tensor.min(tensor.sum(abs(x), 0))
        else:
            raise ValueError("Invalid norm order for matrices.")
    elif ndim > 2:
        raise NotImplementedError("We don't support norm with ndim > 2")
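# A small sanity check of norm() against hand-computed values. This is an
# illustrative sketch only; it assumes the module-level names used by the
# snippet above are bound as imported below.
import numpy as np
import theano
import theano.tensor as tensor
from theano.tensor import as_tensor_variable

_v = np.array([3., -4.], dtype=theano.config.floatX)
_x = tensor.vector('x')
assert np.isclose(theano.function([_x], norm(_x, None))(_v), 5.0)    # Euclidean norm
assert np.isclose(theano.function([_x], norm(_x, 'inf'))(_v), 4.0)   # max absolute value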
def init_opt(self): is_recurrent = int(self.policy.recurrent) obs_var = self.env.observation_space.new_tensor_variable( 'obs', extra_dims=1 + is_recurrent, ) action_var = self.env.action_space.new_tensor_variable( 'action', extra_dims=1 + is_recurrent, ) advantage_var = ext.new_tensor( 'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX ) dist = self.policy.distribution old_dist_info_vars = { k: ext.new_tensor( 'old_%s' % k, ndim=2 + is_recurrent, dtype=theano.config.floatX ) for k in dist.dist_info_keys } old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] if is_recurrent: valid_var = TT.matrix('valid') else: valid_var = None dist_info_vars = self.policy.dist_info_sym(obs_var, action_var) logli = dist.log_likelihood_sym(action_var, dist_info_vars) kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) # formulate as a minimization problem # The gradient of the surrogate objective is the policy gradient if is_recurrent: surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var) mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) max_kl = TT.max(kl * valid_var) else: surr_obj = - TT.mean(logli * advantage_var) mean_kl = TT.mean(kl) max_kl = TT.max(kl) input_list = [obs_var, action_var, advantage_var] if is_recurrent: input_list.append(valid_var) self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list) f_kl = ext.compile_function( inputs=input_list + old_dist_info_vars_list, outputs=[mean_kl, max_kl], ) self.opt_info = dict( f_kl=f_kl, )
def pos_mf_iteration(g1, h1, v, pos_counter):
    h2 = self.h_hat(g1, v)
    s2_1 = self.s1_hat(g1, v)
    s2_0 = self.s0_hat(g1, v)
    g2 = self.g_hat(h2, s2_1, s2_0)
    # stopping criterion
    stop = T.maximum(T.max(g2 - g1), T.max(h2 - h1))
    return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps)
def rmax(x):
    # mask of the per-row maxima
    xmax = T.ge(x, T.max(x, axis=1).reshape((x.shape[0], 1)))
    # zero out the maxima and find the second-largest entry per row
    shift = (T.ones_like(x) - xmax) * x
    max2 = T.max(shift, axis=1).reshape((x.shape[0], 1))
    # keep only the margin of the maximum over the runner-up
    out = T.nnet.relu(x - max2)
    return out
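# Quick illustrative check of rmax on a concrete matrix (assumes the usual
# "import theano.tensor as T" binding used by the snippet above).
import numpy as np
import theano
import theano.tensor as T

_x = T.matrix('x')
_f = theano.function([_x], rmax(_x))
_inp = np.array([[1., 3., 2.],
                 [0., -1., 5.]], dtype=theano.config.floatX)
# each row keeps only the margin of its maximum over the runner-up
assert np.allclose(_f(_inp), [[0., 1., 0.],
                              [0., 0., 5.]])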
def __call__(self, x):
    if x.ndim == 2:
        x = T.max([x[:, n::self.n_pool] for n in range(self.n_pool)], axis=0)
    elif x.ndim == 4:
        x = T.max([x[:, n::self.n_pool, :, :] for n in range(self.n_pool)], axis=0)
    elif x.ndim == 3:
        x = T.max([x[:, :, n::self.n_pool] for n in range(self.n_pool)], axis=0)
    return x
def __call__(self, x):
    if x.ndim == 2:
        x = T.max([x[:, n::self.n_pool] for n in range(self.n_pool)], axis=0)
    elif x.ndim == 4:
        x = T.max([x[:, n::self.n_pool, :, :] for n in range(self.n_pool)], axis=0)
    else:
        raise NotImplementedError
    return x
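# The same channel-grouped maxout pooling written without the class wrapper, for
# a concrete 2D case (illustrative sketch; a pool size of 2 is assumed here).
import numpy as np
import theano
import theano.tensor as T

_n_pool = 2
_x = T.matrix('x')
_pooled = T.max([_x[:, n::_n_pool] for n in range(_n_pool)], axis=0)
_f = theano.function([_x], _pooled)
_inp = np.array([[1., 2., 3., 4.]], dtype=theano.config.floatX)
# features are grouped as (0, 1) and (2, 3); each group is reduced to its max
assert np.allclose(_f(_inp), [[2., 4.]])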
def Max_pooling(inp):
    """ Finding the max across rows (axis 0); inp is a vector or a 2D matrix """
    if inp.ndim == 1:
        return T.max(inp)
    else:
        return T.max(inp, axis=0)
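# Illustrative check (assumes "import theano.tensor as T"): for a matrix input,
# Max_pooling reduces across rows, i.e. returns the column-wise maxima.
import numpy as np
import theano
import theano.tensor as T

_m = T.matrix('m')
_f = theano.function([_m], Max_pooling(_m))
_inp = np.array([[1., 4.],
                 [3., 2.]], dtype=theano.config.floatX)
assert np.allclose(_f(_inp), [3., 4.])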
def __call__(self, x):
    if x.ndim == 2:
        x = T.max([x[:, n::self.n_pool] for n in range(self.n_pool)], axis=0)
    elif x.ndim == 4:
        x = T.max([x[:, n::self.n_pool, :, :] for n in range(self.n_pool)], axis=0)
    elif x.ndim == 3:
        print "assuming standard rnn 3tensor"
        x = T.max([x[:, :, n::self.n_pool] for n in range(self.n_pool)], axis=0)
    return x
def pos_mf_iteration(g1, h1, v, pos_counter):
    h2 = self.h_hat(g1, v)
    s2_1 = self.s1_hat(g1, v)
    s2_0 = self.s0_hat(g1, v)
    g2 = self.g_hat(h2, s2_1, s2_0)
    # stopping criterion
    dl_dghat = T.max(abs(self.dlbound_dg(g2, h2, s2_1, s2_0, v)))
    dl_dhhat = T.max(abs(self.dlbound_dh(g2, h2, s2_1, s2_0, v)))
    stop = T.maximum(dl_dghat, dl_dhhat)
    return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps)
def _step_test(self, x_t, xi_t, xf_t, xo_t, xc_t, mask_tm1, pred1_tm1, pred2_tm1, pred3_tm1, pred4_tm1, h_tm1, c_tm1, ctx_tm1, u_i, u_f, u_o, u_c, x_encoder, attention_encoder, x_img, B_W, B_U, B_Wimg, B_Wctx): outer1 = pred1_tm1[:, :, np.newaxis] * pred2_tm1[:, np.newaxis, :] outer1 = outer1.reshape((outer1.shape[0],-1)) outer2 = pred3_tm1[:, :, np.newaxis] * pred4_tm1[:, np.newaxis, :] outer2 = outer2.reshape((outer2.shape[0],-1)) pred = outer1[:, :, np.newaxis] * outer2[:, np.newaxis, :] pred = pred.reshape((pred.shape[0],-1)) x_t = self.W_embedding[T.argmax(pred, axis = 1)] * B_W[4] h_mask_tm1 = mask_tm1 * h_tm1 c_mask_tm1 = mask_tm1 * c_tm1 attention_x = T.dot(x_t, self.W_x2a) attention_total = attention_x[:,None,:] + attention_encoder if self.prev_context: attention_prev = T.dot(ctx_tm1,self.W_ctx2a) attention_total += attention_prev[:,None,:] attention_activation = T.dot( T.tanh(attention_total), self.V) # attention -> scores attention_alpha = T.nnet.softmax(attention_activation[:,:,0]) # scores -> weights ctx_t = (x_encoder * attention_alpha[:,:,None]).sum(axis = 1) # weighted average of context vectors xi_t = T.dot(x_t * B_W[0], self.W_i) + self.b_i + T.dot(x_img * B_Wimg[0], self.Wimg_i) + T.dot(ctx_t * B_Wctx[0], self.Wctx_i) xf_t = T.dot(x_t * B_W[1], self.W_f) + self.b_f + T.dot(x_img * B_Wimg[1], self.Wimg_f) + T.dot(ctx_t * B_Wctx[1], self.Wctx_f) xc_t = T.dot(x_t * B_W[2], self.W_c) + self.b_c + T.dot(x_img * B_Wimg[2], self.Wimg_c) + T.dot(ctx_t * B_Wctx[2], self.Wctx_c) xo_t = T.dot(x_t * B_W[3], self.W_o) + self.b_o + T.dot(x_img * B_Wimg[3], self.Wimg_o) + T.dot(ctx_t * B_Wctx[3], self.Wctx_o) i_t = self.inner_activation(xi_t + T.dot(h_mask_tm1 * B_U[0], u_i)) f_t = self.inner_activation(xf_t + T.dot(h_mask_tm1 * B_U[1], u_f)) c_t = f_t * c_mask_tm1 + i_t * self.activation(xc_t + T.dot(h_mask_tm1 * B_U[2], u_c)) o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1 * B_U[3], u_o)) h_t = o_t * self.activation(c_t) pred1_t = T.dot(h_t, self.U_p1) + self.b_p1 pred1_t = T.nnet.softmax(pred1_t.reshape((-1, pred1_t.shape[-1]))).reshape(pred1_t.shape) pred2_t = T.dot(h_t, self.U_p2) + self.b_p2 pred2_t = T.nnet.softmax(pred2_t.reshape((-1, pred2_t.shape[-1]))).reshape(pred2_t.shape) pred3_t = T.dot(h_t, self.U_p3) + self.b_p3 pred3_t = T.nnet.softmax(pred3_t.reshape((-1, pred3_t.shape[-1]))).reshape(pred3_t.shape) pred4_t = T.dot(h_t, self.U_p4) + self.b_p4 pred4_t = T.nnet.softmax(pred4_t.reshape((-1, pred4_t.shape[-1]))).reshape(pred4_t.shape) pred1_t = T.ge(pred1_t, T.max(pred1_t, axis = 1).reshape((pred1_t.shape[0],1)))*1.0 pred2_t = T.ge(pred2_t, T.max(pred2_t, axis = 1).reshape((pred2_t.shape[0],1)))*1.0 pred3_t = T.ge(pred3_t, T.max(pred3_t, axis = 1).reshape((pred3_t.shape[0],1)))*1.0 pred4_t = T.ge(pred4_t, T.max(pred4_t, axis = 1).reshape((pred4_t.shape[0],1)))*1.0 return pred1_t, pred2_t, pred3_t, pred4_t, h_t, c_t, ctx_t
def logsoftmax(x, axis=None):
    '''
    Applies log-softmax to x over the given axis
    (i.e. log(exp / sum(exp)), computed in a numerically stable way).
    '''
    if isinstance(axis, int):
        m = T.max(x, axis=axis, keepdims=True)
    else:
        m = T.max(x)
    exp_x = T.exp(x - m)
    Z = T.sum(exp_x, axis=axis, keepdims=True)
    return x - m - T.log(Z)
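# Minimal usage sketch (assumes "import theano.tensor as T"): exponentiating
# the log-softmax output along the chosen axis should give rows summing to one.
import numpy as np
import theano
import theano.tensor as T

_x = T.matrix('x')
_f = theano.function([_x], T.exp(logsoftmax(_x, axis=1)))
_probs = _f(np.random.randn(3, 5).astype(theano.config.floatX))
assert np.allclose(_probs.sum(axis=1), 1.0)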
def update_t(t, LLForward, alphas, scorematrix, queryseq, blank, T, L2):
    start = tensor.max([0, L2 - 2 * (T - t)])
    end = tensor.min([2 * t + 2, L2])
    s = tensor.arange(start, end)
    results, _ = theano.scan(fn=update_s,
                             sequences=[s],
                             non_sequences=[scorematrix, queryseq, blank, t],
                             outputs_info=[alphas],
                             name='scan_along_s')
    alphas = results[-1]
    c = tensor.sum(alphas[start:end, t])
    c = tensor.max([1e-15, c])
    alphas = tensor.set_subtensor(alphas[start:end, t], alphas[start:end, t] / c)
    LLForward += tensor.log(c)
    return LLForward, alphas
def Convolution(self, x, mask):
    xe = self.approx_embedder(x)
    _mask = self.tmp[mask]
    _res1, _ = theano.scan(self.ConvLayer1, sequences=[xe])
    _res2, _ = theano.scan(self.ConvLayer2, sequences=[xe[:-1], xe[1:]])
    _res3, _ = theano.scan(self.ConvLayer3, sequences=[xe[:-2], xe[1:-1], xe[2:]])
    hidden1 = T.tanh(T.max(_res1 * _mask, axis=0)).dimshuffle('x', 0, 1)
    hidden2 = T.tanh(T.max(_res2 * _mask[:-1], axis=0)).dimshuffle('x', 0, 1)
    hidden3 = T.tanh(T.max(_res3 * _mask[:-2], axis=0)).dimshuffle('x', 0, 1)
    return T.mean(T.concatenate([hidden1, hidden2, hidden3], axis=0), axis=0)
def get_monitoring_channels(self, V):
    vb, hb, weights = self.get_params()
    norms = theano_norms(weights)
    return {'W_min': tensor.min(weights),
            'W_max': tensor.max(weights),
            'W_norm_mean': tensor.mean(norms),
            'bias_hid_min': tensor.min(hb),
            'bias_hid_mean': tensor.mean(hb),
            'bias_hid_max': tensor.max(hb),
            'bias_vis_min': tensor.min(vb),
            'bias_vis_mean': tensor.mean(vb),
            'bias_vis_max': tensor.max(vb),
            }
def __init__(self, sample_fn, free_energy_fn, v_sample0, n_runs, log_int=500):
    """
    Initializes the AIS object.

    Parameters
    ----------
    sample_fn: compiled theano function, sample_fn(beta, v_sample)
        returns new model samples, at inverse temperature `beta`.
        Internally, we do this by performing block gibbs sampling using
        Eq.(15-17) (implemented in rbm_ais_gibbs_for_v) starting from
        configuration v_sample.
    free_energy_fn: theano function, free_energy_fn(beta, v_sample)
        Computes the free-energy of configuration v_sample at the
        interpolating distribution p_a^(1-beta) p_b^(beta).
    v_sample0: numpy.ndarray
        initial samples from model A.
    n_runs: int
        number of AIS runs (i.e. minibatch size)
    log_int: int
        log standard deviation of log ais weights every `log_int` temperatures.
    """
    self.sample_fn = sample_fn
    self.free_energy_fn = free_energy_fn
    self.v_sample0 = v_sample0
    self.n_runs = n_runs
    self.log_int = log_int

    # initialize log importance weights
    self.log_ais_w = numpy.zeros(n_runs, dtype=config.floatX)

    # utility function for safely computing log-mean of the ais weights
    ais_w = tensor.vector()
    dlogz = (tensor.log(tensor.mean(tensor.exp(ais_w - tensor.max(ais_w))))
             + tensor.max(ais_w))
    self.log_mean = theano.function([ais_w], dlogz, allow_input_downcast=False)
def compile_update_svdd(nnet, inputs, targets): """ create a Deep SVDD loss for network given in argument """ floatX = Cfg.floatX ndim = nnet.data._X_train.ndim C = Cfg.C C_rec = Cfg.C_rec nu = Cfg.nu # initialize R if nnet.R_init > 0: nnet.Rvar = shared(floatX(nnet.R_init), name="R") else: nnet.Rvar = shared(floatX(1), name="R") # initialization with R=1 # Final Layer of the network final_layer = nnet.all_layers[-1] # SVDD Loss feature_layer = nnet.feature_layer rep = lasagne.layers.get_output(feature_layer, inputs=inputs, deterministic=False) # initialize c (0.5 in every feature representation dimension) rep_dim = feature_layer.num_units # maximum likehood volume = T.cast(floatX(-0.5) * T.sum(((rep)**2), axis=1, dtype='floatX'), dtype='floatX') log_pro = T.cast(T.log(floatX(2 * np.pi) * T.exp(volume)), dtype='floatX') # log_likehood =T.mean(log_pro) # volume = T.cast(T.sum(floatX(-0.5) *(T.log(floatX(2 * np.pi)) + (rep ** 2)) , axis=1, dtype='floatX'), dtype='floatX') # log_pro = T.cast(T.exp(volume), dtype='floatX') # log_likehood = T.mean(log_pro) # # calculate entropy throught Kernel Density Estimation # rep_tranpose = T.transpose(rep, (1, 0)) # rep_reshape = T.reshape(rep_tranpose, [rep_dim, Cfg.batch_size, 1]) # transfer_vector = theano.shared(np.ones([1, Cfg.batch_size], dtype='float32')) # result = T.dot(rep_reshape, transfer_vector) # result1 = T.transpose(result, (0, 2, 1)) # subtract = result - result1 # KL_volume = T.cast(floatX(-0.5) * (T.log(floatX(2 * np.pi)) + (subtract ** 2)), dtype='floatX') # KL_volume = T.sum(KL_volume, axis=0) # KL_pro = T.cast(T.exp(KL_volume), dtype='floatX') # KL_pro_average = T.mean(KL_pro, axis=1) # log_KL_pro = T.log(KL_pro_average) # entropy = T.mean(log_KL_pro) # nnet.cvar = shared(floatX(np.ones(rep_dim) * (1. / (rep_dim ** 0.5))), # name="c") nnet.cvar = shared(floatX(np.ones(rep_dim) * 0.5), name="c") dist = T.sum(((rep - nnet.cvar.dimshuffle('x', 0))**2), axis=1, dtype='floatX') scores = dist - nnet.Rvar stack = T.stack([T.zeros_like(scores), scores], axis=1) loss = T.cast(T.sum(T.max(stack, axis=1)) / (inputs.shape[0] * nu), dtype='floatX') y_pred = T.argmax(stack, axis=1) acc = T.cast((T.sum(T.eq(y_pred.flatten(), targets), dtype='int32') * 1. / targets.shape[0]), 'floatX') # Network weight decay if Cfg.weight_decay: l2_penalty = (1 / C) * get_l2_penalty(nnet) else: l2_penalty = T.cast(0, dtype='floatX') # Reconstruction regularization if Cfg.reconstruction_penalty: reconstruction = lasagne.layers.get_output(final_layer, inputs=inputs, deterministic=False) # use l2 or binary crossentropy loss (features are scaled to [0,1]) if Cfg.ae_loss == "l2": rec_loss = lasagne.objectives.squared_error(reconstruction, inputs) if Cfg.ae_loss == "ce": rec_loss = lasagne.objectives.binary_crossentropy( reconstruction, inputs) rec_loss = T.sum(rec_loss, axis=range(1, ndim), dtype='floatX') rec_penalty = (1 / C_rec) * T.mean(rec_loss) else: rec_penalty = T.cast(0, dtype='floatX') trainable_params = lasagne.layers.get_all_params(final_layer, trainable=True) #Deep Gaussian Model updates_deep_kde = get_updates(nnet, deep_kde_loss, trainable_params, solver=nnet.solver) # nnet.backprop_deep_kde = theano.function([inputs, targets], [log_likehood, entropy], updates=updates_deep_kde, # on_unused_input='warn') # Backpropagation (hard-margin: only minimizing everything to a ball centered at c) if not Cfg.center_fixed: trainable_params.append( nnet.cvar ) # add center c to trainable parameters if it should not be fixed. 
avg_dist = T.mean(dist, dtype="floatX") obj_ball = T.cast(floatX(0.5) * (l2_penalty + rec_penalty) + avg_dist, dtype='floatX') updates_ball = get_updates(nnet, obj_ball, trainable_params, solver=nnet.solver) nnet.backprop_ball = theano.function([inputs, targets], [obj_ball, acc], updates=updates_ball, on_unused_input='warn') # Backpropagation (without training R) obj = T.cast(floatX(0.5) * (l2_penalty + rec_penalty) + nnet.Rvar + loss, dtype='floatX') updates = get_updates(nnet, obj, trainable_params, solver=nnet.solver) nnet.backprop_without_R = theano.function([inputs, targets], [obj, acc], updates=updates, on_unused_input='warn') # Backpropagation (with training R) trainable_params.append(nnet.Rvar) # add radius R to trainable parameters updates = get_updates(nnet, obj, trainable_params, solver=nnet.solver) nnet.backprop = theano.function([inputs, targets], [obj, acc], updates=updates, on_unused_input='warn') # Forwardpropagation test_rep = lasagne.layers.get_output(feature_layer, inputs=inputs, deterministic=True) test_rep_norm = test_rep.norm(L=2, axis=1) test_dist = T.sum(((test_rep - nnet.cvar.dimshuffle('x', 0))**2), axis=1, dtype='floatX') test_scores = test_dist - nnet.Rvar test_stack = T.stack([T.zeros_like(test_scores), test_scores], axis=1) test_loss = T.cast(T.sum(T.max(test_stack, axis=1)) / (inputs.shape[0] * nu), dtype='floatX') test_y_pred = T.argmax(test_stack, axis=1) test_acc = T.cast( (T.sum(T.eq(test_y_pred.flatten(), targets), dtype='int32') * 1. / targets.shape[0]), dtype='floatX') # Reconstruction regularization (with determinisitc=True) if Cfg.reconstruction_penalty: test_reconstruction = lasagne.layers.get_output(final_layer, inputs=inputs, deterministic=True) # use l2 or binary crossentropy loss (features are scaled to [0,1]) if Cfg.ae_loss == "l2": test_rec_loss = lasagne.objectives.squared_error( test_reconstruction, inputs) if Cfg.ae_loss == "ce": test_rec_loss = lasagne.objectives.binary_crossentropy( test_reconstruction, inputs) test_rec_loss = T.sum(test_rec_loss, axis=range(1, ndim), dtype='floatX') test_rec_penalty = (1 / C_rec) * T.mean(test_rec_loss) else: test_reconstruction = lasagne.layers.get_output(final_layer, inputs=inputs, deterministic=True) test_rec_penalty = T.cast(0, dtype='floatX') test_obj = T.cast(floatX(0.5) * (l2_penalty + test_rec_penalty) + nnet.Rvar + test_loss, dtype='floatX') nnet.forward = theano.function([inputs, targets], [ test_obj, test_acc, test_scores, floatX(0.5) * l2_penalty, floatX(0.5) * test_rec_penalty, test_rep, test_rep_norm, test_reconstruction, test_loss, nnet.Rvar ], on_unused_input='warn')
def ready(self): embedding_layer = self.embedding_layer args = self.args padding_id = embedding_layer.vocab_map["<padding>"] dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX)) # len*batch x = self.x = T.imatrix() z = self.z = T.bmatrix() z = z.dimshuffle((0, 1, "x")) # batch*nclasses y = self.y = T.fmatrix() n_d = args.hidden_dimension n_e = embedding_layer.n_d activation = get_activation_by_name(args.activation) layers = self.layers = [] depth = args.depth layer_type = args.layer.lower() for i in xrange(depth): if layer_type == "rcnn": l = ExtRCNN(n_in=n_e if i == 0 else n_d, n_out=n_d, activation=activation, order=args.order) elif layer_type == "lstm": l = ExtLSTM(n_in=n_e if i == 0 else n_d, n_out=n_d, activation=activation) layers.append(l) # len * batch * 1 masks = T.cast( T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z, theano.config.floatX) # batch * 1 cnt_non_padding = T.sum(masks, axis=0) + 1e-8 # (len*batch)*n_e embs = embedding_layer.forward(x.ravel()) # len*batch*n_e embs = embs.reshape((x.shape[0], x.shape[1], n_e)) embs = apply_dropout(embs, dropout) pooling = args.pooling lst_states = [] h_prev = embs for l in layers: # len*batch*n_d h_next = l.forward_all(h_prev, z) if pooling: # batch * n_d masked_sum = T.sum(h_next * masks, axis=0) lst_states.append(masked_sum / cnt_non_padding) # mean pooling else: lst_states.append(h_next[-1]) # last state h_prev = apply_dropout(h_next, dropout) if args.use_all: size = depth * n_d # batch * size (i.e. n_d*depth) h_final = T.concatenate(lst_states, axis=1) else: size = n_d h_final = lst_states[-1] h_final = apply_dropout(h_final, dropout) output_layer = self.output_layer = Layer(n_in=size, n_out=self.nclasses, activation=sigmoid) # batch * nclasses preds = self.preds = output_layer.forward(h_final) # batch loss_mat = self.loss_mat = (preds - y)**2 loss = self.loss = T.mean(loss_mat) pred_diff = self.pred_diff = T.mean( T.max(preds, axis=1) - T.min(preds, axis=1)) params = self.params = [] for l in layers + [output_layer]: for p in l.params: params.append(p) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in params) say("total # parameters: {}\n".format(nparams)) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost = l2_cost + T.sum(p**2) l2_cost = l2_cost * args.l2_reg self.l2_cost = l2_cost cost = self.cost = loss * 10 + l2_cost
def set_network_trainer(input_data, input_mask, target_data, target_mask, num_outputs, network, updater, learning_rate, grad_max_norm=10., l2_lambda=1e-5, load_updater_params=None): # get one hot target one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1), nb_class=num_outputs, dtype=floatX) # get network output data predict_data = get_output(network, deterministic=False) num_seqs = predict_data.shape[0] # get prediction cost predict_data = T.reshape(x=predict_data, newshape=(-1, num_outputs), ndim=2) predict_data = predict_data - T.max(predict_data, axis=-1, keepdims=True) predict_data = predict_data - T.log( T.sum(T.exp(predict_data), axis=-1, keepdims=True)) train_predict_cost = -T.sum(T.mul(one_hot_target_data, predict_data), axis=-1) train_predict_cost = train_predict_cost * T.flatten(target_mask, 1) train_model_cost = train_predict_cost.sum() / num_seqs train_frame_cost = train_predict_cost.sum() / target_mask.sum() # get regularizer cost train_regularizer_cost = regularize_network_params(network, penalty=l2) # get network parameters network_params = get_all_params(network, trainable=True) # get network gradients network_grads = theano.grad(cost=train_model_cost + train_regularizer_cost * l2_lambda, wrt=network_params) if grad_max_norm > 0.: network_grads, network_grads_norm = total_norm_constraint( tensor_vars=network_grads, max_norm=grad_max_norm, return_norm=True) else: network_grads_norm = T.sqrt( sum(T.sum(grad**2) for grad in network_grads)) # set updater train_updates, trainer_params = updater( loss_or_grads=network_grads, params=network_params, learning_rate=learning_rate, load_params_dict=load_updater_params) # get training (update) function training_fn = theano.function( inputs=[input_data, input_mask, target_data, target_mask], outputs=[train_frame_cost, network_grads_norm], updates=train_updates) return training_fn, trainer_params
def initialize_network(self): """ :description: this method initializes the network, updates, and theano functions for training and retrieving q values. Here's an outline: 1. build the q network and target q network 2. initialize theano symbolic variables used for compiling functions 3. initialize the theano numeric variables used as input to functions 4. formulate the symbolic loss 5. formulate the symbolic updates 6. compile theano functions for training and for getting q_values """ build_network = self.get_build_network() batch_size, input_shape = self.batch_size, self.input_shape lasagne.random.set_rng(self.rng) # 1. build the q network and target q network self.l_out = build_network(input_shape, self.sequence_length, batch_size, self.num_actions) self.next_l_out = build_network(input_shape, self.sequence_length, batch_size, self.num_actions) self.reset_target_network() # 2. initialize theano symbolic variables used for compiling functions states = T.tensor3('states') actions = T.icol('actions') rewards = T.col('rewards') next_states = T.tensor3('next_states') # terminals are used to indicate a terminal state in the episode and hence a mask over the future # q values i.e., Q(s',a') terminals = T.icol('terminals') # 3. initialize the theano numeric variables used as input to functions or in functions self.states_shape = (batch_size,) + (self.sequence_length,) + (self.input_shape, ) self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # 4. formulate the symbolic loss q_vals = lasagne.layers.get_output(self.l_out, states) next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # a lot of the recent work clips the td error at 1 so we do that here # the problem is that gradient backpropagating through this minimum node # will be zero if diff is larger then 1.0 (because changing params before # the minimum does not impact the output of the minimum). To account for # this we take the part of the td error (magnitude) greater than 1.0 and simply # add it to the loss, which allows gradient to backprop but just linearly # in the td error rather than quadratically quadratic_part = T.minimum(abs(diff), 1.0) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + linear_part loss = T.sum(loss) # 5. formulate the symbolic updates params = lasagne.layers.helper.get_all_params(self.l_out) updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) # 6. 
compile theano functions for training and for getting q_values and hid init givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._get_q_values = theano.function([], [q_vals], givens={states: self.states_shared})
def get_output_for(self, deterministic=False): if deterministic: deterministic_flag = T.constant(1) else: deterministic_flag = T.constant(0) batch_size = self.pred.shape[0] time_steps = self.pred.shape[1] label_num = self.pred.shape[2] ## the start state to first label pred_t1 = self.pred[:, 0] # shape: (batch size, label num) gs_t1 = self.gs[:, 0] - 1 mask_t1 = self.masks[:, 0] score_t0 = T.zeros((batch_size, label_num)) index_t0 = T.zeros((batch_size, label_num), dtype='int64') init_flag = T.constant(1) # return shape: (batch size, label num), (batch size, label num) score_t1, index_t1 = self.score_one_step(pred_t1, gs_t1, mask_t1, score_t0, index_t0, self.init_t, self.tran_t, deterministic_flag, init_flag) print 'score_t1', score_t1.eval() print 'index_t1', index_t1.eval() pred = self.pred.dimshuffle(1, 0, 2) gs = self.gs.dimshuffle(1, 0) mask = self.masks.dimshuffle(1, 0) init_flag = T.constant(0) # init_flag = T.constant(0) # score_t2, index_t2 = self.score_one_step(pred[1], gs[1]-1, # mask[1], score_t1, index_t1, self.init_t, self.tran_t, deterministic_flag, init_flag) # print 'score_t2', score_t2.eval() # print 'index_t2', index_t2.eval() # print pred[1:].eval().shape # print (gs[1:]-1).eval().shape # print mask[1:].eval().shape # return shape: (time steps - 1, batch size, label num) ..., (time steps - 1, batch size) step_scores, step_indexs = theano.scan( fn=self.score_one_step, outputs_info=[score_t1, index_t1], sequences=[pred[1:], gs[1:] - 1, mask[1:]], non_sequences=[ self.init_t, self.tran_t, deterministic_flag, init_flag ])[0] # # print step_scores.eval().shape # # print step_indexs.eval().shape print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval() print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval() print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval() print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval() # shape: (batch size, ) last_step_max_score = T.max(step_scores[-1], axis=-1) last_step_max_index = T.argmax(step_scores[-1], axis=-1) def track_one_step(index_t, max_index_t): # example_indexs shape: (batch size, label num) # step_max_index shape: (batch size, ) def scan_example(index_t_e, max_index_t_e): max_index_tm1_e = index_t_e[max_index_t_e] return max_index_tm1_e # return shape: (batch size, ) max_index_tm1 = theano.scan(fn=scan_example, sequences=[index_t, max_index_t])[0] return max_index_tm1 # reverse time step, shape: (time steps - 1, batch size, label num) #step_indexs = step_indexs[::-1] # return shape: (time steps - 1, batch size) index_chain = theano.scan(fn=track_one_step, sequences=step_indexs, outputs_info=last_step_max_index, go_backwards=True)[0] # return shape: (batch size, time steps - 1) index_chain = index_chain.dimshuffle(1, 0) # shape: (batch size, time steps) index_chain_reverse = self.aggregateTensor(last_step_max_index, index_chain) # add 1 for label index (which index from 1) # return shape: (batch size, time steps) index_chain = (index_chain_reverse + T.ones_like(index_chain_reverse))[:, ::-1] print 'index chain', index_chain.eval() def one_step_cost(step_index, pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1, index_chain_tm1, init_tran, tran): # step_index: (1,) # pred_t: (batch size, label num) # gs_t_e: (batch size, ) # index_chain_t: (batch size, ) # mask_t: (batch size, ) # cost_tm1: (batch size, ) # gs_tm1: (batch size, ) # index_chain_tm1: (batch size, ) def scan_example(pred_t_e, gs_t_e, index_chain_t_e, mask_t_e, cost_tm1_e, gs_tm1_e, index_chain_tm1_e, step_index, init_tran, tran): # 
pred_t_e: (label num, ) # gs_t_e: (1, ) # index_chain_t_e: (1, ) # mask_t_e: (1, ) # gs_tm1_e: (1, ) # index_chain_tm1_e: (1, ) # init_tran: (label num, ) # tran: (label num, label num) cost_t_e = None cost_t_e = theano.ifelse.ifelse( T.eq(step_index, 0), theano.printing.Print('\ninit step pred_t_e\n')(pred_t_e[ theano.printing.Print('\ninit step index_chain_t_e\n') (index_chain_t_e)]) + theano.printing.Print('\n initstep init_tran\n')( init_tran[index_chain_t_e]) - theano.printing.Print('\ninit step pred_t_e\n')( pred_t_e[theano.printing.Print('\ninit step gs_t_e\n') (gs_t_e)]) - theano.printing.Print('\ninit step init_tran\n')( init_tran[gs_t_e]), theano.printing.Print('\nother pred_t_e\n')( pred_t_e[theano.printing.Print( '\nother index_chain_t_e\n')(index_chain_t_e)]) + theano.printing.Print('\nother tran\n') (tran[theano.printing.Print('\nother index_chain_tm1_e\n') (index_chain_tm1_e)][index_chain_t_e]) - theano.printing.Print('\nother pred_t_e\n')(pred_t_e[ theano.printing.Print('\nother gs_t_e\n')(gs_t_e)]) - theano.printing.Print('\nother tran\n')( tran[theano.printing.Print('\nother gs_tm1_e\n') (gs_tm1_e)][gs_t_e])) # if T.eq(step_index, 0) == T.constant(1): # cost_t_e = pred_t_e[index_chain_t_e] + init_tran[index_chain_t_e]\ # - pred_t_e[gs_t_e] - init_tran[gs_t_e] # else: # cost_t_e = pred_t_e[index_chain_t_e] + tran[index_chain_t_e][index_chain_tm1_e]\ # - pred_t_e[gs_t_e] - tran[gs_tm1_e][gs_t_e] cost_t_e = cost_t_e * mask_t_e # return shape: (1, ) return theano.printing.Print('\ncost_t_e\n')( cost_t_e), gs_t_e, index_chain_t_e # return shape: (batch size, )... cost_t, _, _ = theano.scan( fn=scan_example, sequences=[ pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1, index_chain_tm1 ], non_sequences=[step_index, init_tran, tran])[0] # return shape: (batch size, )... return cost_t, gs_t, index_chain_t # return shape: (time steps, batch size) index_chain_sff = index_chain.dimshuffle(1, 0) gs_t0 = T.zeros((batch_size, ), dtype='int64') cost_t0 = T.zeros((batch_size, ), dtype='float64') index_chain_t0 = T.zeros((batch_size, ), dtype='int64') # return shape: (time steps, batch size) print(gs - 1).eval() print(index_chain_sff - 1).eval() steps_cost, _, _ = theano.scan( fn=one_step_cost, outputs_info=[cost_t0, gs_t0, index_chain_t0], sequences=[ T.arange(time_steps), pred, gs - 1, index_chain_sff - 1, mask ], non_sequences=[self.init_t, self.tran_t])[0] # return shape: (batch size, ) cost = T.sum(steps_cost.dimshuffle(1, 0), axis=-1) # # return shape: (batch size, time steps - 1) # step_gs_scores = step_gs_scores.dimshuffle(1, 0) # # return shape: (batch size, ) # last_gs_score = step_gs_scores[:, -1] # print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval() # print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval() # print 'gs_score_t2', step_gs_scores[:, 0].eval() # print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval() # print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval() # print 'gs_score_t3', step_gs_scores[:, 1].eval() # print index_chain.eval() # print last_step_max_score.eval() # print last_gs_score.eval() # return shape: (exmaple num, time steps), (batch size, ), (batch size, ) #return [index_chain, last_step_max_score, last_gs_score] print 'cost', cost.eval() # return shape: (batch size, ) return cost
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], type_hidden_units=[200, 100, 6], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5, sen_reg=False, L2=False): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") type_y = T.ivector("y_type") pop_y = T.ivector("y_pop") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(( x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm )) ######################### # Construct Sen Vec ##### ######################### conv_layers = [] filter_shape = (num_maps, 1, filter_hs[0], emb_dm) pool_size = (input_height - filter_hs[0] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape((x.shape[0], x.shape[1], num_maps)) conv_layers.append(conv_layer) ######################## ## Task 1: populaiton### ######################## pop_layer_sizes = zip(hidden_units, hidden_units[1:]) pop_layer_input = sen_vecs pop_drop_input = sen_vecs pop_hidden_outs = [] pop_drop_outs = [] pop_hidden_layers = [] pop_drop_layers = [] droprate = 0.5 for layer_size in pop_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1],), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) pop_hidden_layers.append(pop_hidden_layer) pop_drop_layers.append(pop_drop_hidden_layer) pop_hidden_out = pop_hidden_layer.output pop_drop_out = pop_drop_hidden_layer.output pop_layer_input = pop_hidden_out pop_drop_input = pop_drop_out pop_hidden_outs.append(pop_hidden_out) pop_drop_outs.append(pop_drop_out) # construct pop classifier n_in, n_out = pop_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out,), dtype=theano.config.floatX) pop_W = theano.shared(W_value, borrow=True, name="pop_W") pop_b = theano.shared(b_value, borrow=True, name="pop_b") pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b pop_drop_act = T.dot(pop_drop_outs[-1], pop_W) + pop_b pop_max_act = 
T.max(pop_act, axis=1).flatten(2) pop_drop_max_act = T.max(pop_drop_act, axis=1).flatten(2) pop_sen_max = T.argmax(T.max(pop_act, axis=2).flatten(2), axis=1) pop_drop_sen_max = T.argmax(T.max(pop_drop_act, axis=2).flatten(2), axis=1) pop_probs = T.nnet.softmax(pop_max_act) pop_drop_probs = T.nnet.softmax(pop_drop_max_act) pop_y_pred = T.argmax(pop_probs, axis=1) pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1) pop_neg_loglikelihood = -T.mean(T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_drop_neg_loglikelihood = -T.mean(T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_errors = T.mean(T.neq(pop_y_pred, pop_y)) pop_errors_detail = T.neq(pop_y_pred, pop_y) pop_cost = pop_neg_loglikelihood pop_drop_cost = pop_drop_neg_loglikelihood ######################## ## Task 1: event type### ######################## type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:]) type_layer_input = sen_vecs type_drop_input = sen_vecs type_hidden_outs = [] type_drop_outs = [] type_hidden_layers = [] type_drop_layers = [] droprate = 0.5 for layer_size in type_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1],), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") type_hidden_layer = nn.HiddenLayer(rng, type_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) type_hidden_layers.append(type_hidden_layer) type_drop_layers.append(type_drop_hidden_layer) type_hidden_out = type_hidden_layer.output type_drop_out = type_drop_hidden_layer.output type_layer_input = type_hidden_out type_drop_input = type_drop_out type_hidden_outs.append(type_hidden_out) type_drop_outs.append(type_drop_out) # construct pop classifier n_in, n_out = type_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out,), dtype=theano.config.floatX) type_W = theano.shared(W_value, borrow=True, name="pop_W") type_b = theano.shared(b_value, borrow=True, name="pop_b") type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b type_max_act = T.max(type_act, axis=1).flatten(2) type_drop_max_act = T.max(type_drop_act, axis=1).flatten(2) type_sen_max = T.argmax(T.max(type_act, axis=2).flatten(2), axis=1) type_drop_sen_max = T.argmax(T.max(type_drop_act, axis=2).flatten(2), axis=1) type_probs = T.nnet.softmax(type_max_act) type_drop_probs = T.nnet.softmax(type_drop_max_act) type_y_pred = T.argmax(type_probs, axis=1) type_drop_y_pred = T.argmax(type_drop_probs, axis=1) type_neg_loglikelihood = -T.mean(T.log(type_probs)[T.arange(type_y.shape[0]), type_y]) type_drop_neg_loglikelihood = -T.mean(T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y]) type_errors = T.mean(T.neq(type_y_pred, type_y)) type_errors_detail = T.neq(type_y_pred, type_y) type_cost = type_neg_loglikelihood type_drop_cost = type_drop_neg_loglikelihood ################################### ## Choose the max sens in two task# ################################### pop_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), pop_drop_sen_max] type_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_drop_sen_max] simi_drop_cost = T.mean(T.sum((pop_drop_choosed_sens - type_drop_choosed_sens) ** 2, axis=1)) pop_choosed_sens = 
sen_vecs[T.arange(sen_vecs.shape[0]), pop_sen_max] type_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_sen_max] simi_cost = T.mean(T.sum((pop_choosed_sens - type_choosed_sens) ** 2, axis=1)) ################################## # Collect all the parameters ##### ################################## params = [] # convolution layer params for conv_layer in conv_layers: params += conv_layer.params # params for population task for layer in pop_drop_layers: params += layer.params params.append(pop_W) params.append(pop_b) # params for event type task for layer in type_drop_layers: params += layer.params params.append(type_W) params.append(type_b) if non_static: params.append(words) total_cost = pop_cost + type_cost total_drop_cost = pop_drop_cost + type_drop_cost if sen_reg: simi_weight = 0.05 total_cost += simi_weight * simi_cost total_drop_cost += simi_weight * simi_drop_cost if L2: l2_norm = 0.1 * T.sum(pop_W ** 2) + 0.1 * T.sum(type_W ** 2) for drop_layer in type_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W ** 2) for drop_layer in pop_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W ** 2) total_cost += l2_norm total_drop_cost += l2_norm total_grad_updates = sgd_updates_adadelta(params, total_drop_cost, lr_decay, 1e-6, sqr_norm_lim) total_preds = [pop_y_pred, type_y_pred] total_errors_details = [pop_errors_detail, type_errors_detail] total_choosed_sens = [pop_sen_max, type_sen_max] total_out = total_preds + total_errors_details + total_choosed_sens ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_pop_y, train_type_y = shared_dataset(dataset[0]) test_x, test_pop_y, test_type_y = shared_dataset(dataset[1]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function([index], total_drop_cost, updates=total_grad_updates, givens={ x: train_x[index*batch_size:(index+1)*batch_size], pop_y: train_pop_y[index*batch_size:(index+1)*batch_size], type_y:train_type_y[index*batch_size:(index+1)*batch_size] }) train_pred_detail = function([index], total_out, givens={ x:train_x[index*batch_size:(index+1)*batch_size], pop_y:train_pop_y[index*batch_size:(index+1)*batch_size], type_y:train_type_y[index*batch_size:(index+1)*batch_size] }) test_pred_detail = function([index], total_out, givens={ x:test_x[index*batch_size:(index+1)*batch_size], pop_y:test_pop_y[index*batch_size:(index+1)*batch_size], type_y:test_type_y[index*batch_size:(index+1)*batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_test = len(dataset[1][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'w') print "Start to train the model....." 
total_score = 0.0 while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) if epoch % print_freq == 0: # do test pop_preds = [] type_preds = [] pop_errors = [] type_errors = [] pop_sens = [] type_sens = [] for i in xrange(n_test_batches): test_pop_pred, test_type_pred, test_pop_error, test_type_error, test_pop_sen, test_type_sen = test_pred_detail(i) pop_preds.append(test_pop_pred) type_preds.append(test_type_pred) pop_errors.append(test_pop_error) type_errors.append(test_type_error) pop_sens.append(test_pop_sen) type_sens.append(test_type_sen) pop_preds = np.concatenate(pop_preds) type_preds = np.concatenate(type_preds) pop_errors = np.concatenate(pop_errors) type_errors = np.concatenate(type_errors) pop_sens = np.concatenate(pop_sens) type_sens = np.concatenate(type_sens) pop_perf = 1 - np.mean(pop_errors) type_perf = 1 - np.mean(type_errors) # dumps the predictions and the choosed sentences with open(os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf: for p in pop_preds: epf.write("%d\n" % int(p)) with open(os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf: for p in type_preds: epf.write("%d\n" % int(p)) with open(os.path.join(perf_fn, "%s_%d.test_pop_sens" % (exp_name, epoch)), 'w') as epf: for s in pop_sens: epf.write("%d\n" % int(s)) with open(os.path.join(perf_fn, "%s_%d.test_type_sens" % (exp_name, epoch)), 'w') as epf: for s in type_sens: epf.write("%d\n" % int(s)) train_pop_sens = [] train_type_sens = [] for i in xrange(n_train_batches): train_pop_pred, train_type_pred, train_pop_error, train_type_error, train_pop_sen, train_type_sen = train_pred_detail(i) train_pop_sens.append(train_pop_sen) train_type_sens.append(train_type_sen) pop_sens = np.concatenate(train_pop_sens) type_sens = np.concatenate(train_type_sens) with open(os.path.join(perf_fn, "%s_%d.train_pop_sens" % (exp_name, epoch)), 'w') as epf: for s in pop_sens: epf.write("%d\n" % int(s)) with open(os.path.join(perf_fn, "%s_%d.train_type_sens" % (exp_name, epoch)), 'w') as epf: for s in type_sens: epf.write("%d\n" % int(s)) message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % (epoch, pop_perf, type_perf, np.mean(costs)) print message log_file.write(message + "\n") log_file.flush() if (pop_perf + type_perf) > total_score and False: total_score = pop_perf + type_perf # save the model model_name = os.path.join(perf_fn, "%s_%d.best_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) end_time = timeit.default_timer() print "Finish one iteration using %f m" % ((end_time - start_time)/60.) # output the final model params print "Output the final model" model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) log_file.flush() log_file.close()
def target_function(self, x_neg, word_emb, x_local, x, x_g):
    score = self.forward(word_emb, x_local, x, x_g)
    score_neg = self.forward(word_emb, x_local, x_neg, x_g)
    return T.max([0, 1 - score + score_neg])
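# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: `target_function` above is a
# margin ranking loss, max(0, 1 - score_pos + score_neg).  A minimal standalone
# check of that expression with plain Theano scalars:
import theano
import theano.tensor as T

score_pos = T.dscalar('score_pos')
score_neg = T.dscalar('score_neg')
ranking_loss = T.max([0, 1 - score_pos + score_neg])  # same form as above
f = theano.function([score_pos, score_neg], ranking_loss)
# zero loss once the positive score beats the negative one by the margin
assert f(2.0, 0.5) == 0.0
assert abs(f(0.5, 0.3) - 0.8) < 1e-12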
def comp_one(self, param, info):
    return T.max(T.abs_(get_p(param)))
def comp_one(self, param, grad=None, diff=None):
    return T.max(T.abs_(get_p(param)))
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=40, cand_size=10, hidden_size=[300, 300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'} root = '/save/wenpeng/datasets/FEVER/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train( sent_len, claim_len, cand_size) train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev( sent_len, claim_len, cand_size, word2id) test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) train_sents = np.asarray(train_sents, dtype='int32') train_3th_sents = np.asarray(train_3th_sents, dtype='int32') joint_train_sents = np.concatenate((train_sents, train_3th_sents)) test_sents = np.asarray(test_sents, dtype='int32') test_3th_sents = np.asarray(test_3th_sents, dtype='int32') joint_test_sents = np.concatenate((test_sents, test_3th_sents)) train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX) train_3th_sent_masks = np.asarray(train_3th_sent_masks, dtype=theano.config.floatX) joint_train_sent_masks = np.concatenate( (train_sent_masks, train_3th_sent_masks)) test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX) test_3th_sent_masks = np.asarray(test_3th_sent_masks, dtype=theano.config.floatX) joint_test_sent_masks = np.concatenate( (test_sent_masks, test_3th_sent_masks)) train_sent_labels = np.asarray(train_sent_labels, dtype='int32') train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32') joint_train_sent_labels = np.concatenate( (train_sent_labels, train_3th_sent_labels)) test_sent_labels = np.asarray(test_sent_labels, dtype='int32') test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32') joint_test_sent_labels = np.concatenate( (test_sent_labels, test_3th_sent_labels)) train_claims = np.asarray(train_claims, dtype='int32') train_3th_claims = np.asarray(train_3th_claims, dtype='int32') joint_train_claims = np.concatenate((train_claims, train_3th_claims)) test_claims = np.asarray(test_claims, dtype='int32') test_3th_claims = np.asarray(test_3th_claims, dtype='int32') joint_test_claims = np.concatenate((test_claims, test_3th_claims)) train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX) train_3th_claim_mask = np.asarray(train_3th_claim_mask, dtype=theano.config.floatX) joint_train_claim_mask = np.concatenate( (train_claim_mask, train_3th_claim_mask)) test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX) test_3th_claim_mask = np.asarray(test_3th_claim_mask, dtype=theano.config.floatX) joint_test_claim_mask = np.concatenate( (test_claim_mask, test_3th_claim_mask)) train_labels = np.asarray(train_labels, dtype='int32') train_3th_labels = np.asarray(train_3th_labels, dtype='int32') 
joint_train_labels = np.concatenate((train_labels, train_3th_labels)) test_labels = np.asarray(test_labels, dtype='int32') test_3th_labels = np.asarray(test_3th_labels, dtype='int32') joint_test_labels = np.concatenate((test_labels, test_3th_labels)) joint_train_size = len(joint_train_claims) joint_test_size = len(joint_test_claims) train_size = len(train_claims) test_size = len(test_claims) test_3th_size = len(test_3th_claims) vocab_size = len(word2id) + 1 print 'joint_train size: ', joint_train_size, ' joint_test size: ', joint_test_size print 'train size: ', train_size, ' test size: ', test_size print 'vocab size: ', vocab_size rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids = T.itensor3() #(batch, cand_size, sent_len) sents_mask = T.ftensor3() sents_labels = T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.fmatrix() joint_sents_ids = T.itensor3() #(batch, cand_size, sent_len) joint_sents_mask = T.ftensor3() joint_sents_labels = T.imatrix() #(batch, cand_size) joint_claim_ids = T.imatrix() #(batch, claim_len) joint_claim_mask = T.fmatrix() joint_labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_att_conv_W, task1_att_conv_b = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_conv_W_context, task1_conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W, att_conv_b, task1_conv_W_context, conv_W_context ] conv_model_sents = Conv_with_Mask( rng, input_tensor3=embed_input_sents, mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_sent_emb = sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) conv_model_claims = Conv_with_Mask( rng, input_tensor3=embed_input_claim, mask_matrix=claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), 
filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero claim_embeddings = conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1) ''' attentive conv for task1 ''' task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= embed_input_sents, #batch_size*cand_size, emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=task1_att_conv_W, b=task1_att_conv_b, W_context=task1_conv_W_context, b_context=task1_conv_b_context) task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r concate_claim_sent = T.concatenate([ batch_claim_emb, batch_sent_emb, T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x') ], axis=2) concate_2_matrix = concate_claim_sent.reshape( (batch_size * cand_size, hidden_size[0] * 2 + 1)) LR_input = T.concatenate([ concate_2_matrix, task1_attentive_sent_embeddings_l, task1_attentive_sent_embeddings_r ], axis=1) LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2 # LR_input = concate_2_matrix # LR_input_size = hidden_size[0]*2+1 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 1, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para = [U_a] # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(LR_input.dot(U_a)) #batch * 12 inter_matrix = score_matrix.reshape((batch_size, cand_size)) # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1) # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size))) ''' maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix) ''' # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix) # loss = -T.mean(T.log(prob_pos)) #f1 as loss batch_overlap = T.sum(sents_labels * inter_matrix, axis=1) batch_recall = batch_overlap / T.sum(sents_labels, axis=1) batch_precision = batch_overlap / T.sum(inter_matrix, axis=1) batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall + batch_precision) loss = -T.mean(T.log(batch_f1)) # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean() ''' training task2, predict 3 labels ''' joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) 
#the input format can be adapted into CNN or GRU or LSTM joint_embed_input_claim = init_embeddings[ joint_claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) joint_conv_model_sents = Conv_with_Mask( rng, input_tensor3=joint_embed_input_sents, mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_sent_embeddings = joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_batch_sent_emb = joint_sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) joint_premise_emb = T.sum(joint_batch_sent_emb * joint_sents_labels.dimshuffle(0, 1, 'x'), axis=1) #(batch, hidden_size) joint_conv_model_claims = Conv_with_Mask( rng, input_tensor3=joint_embed_input_claim, mask_matrix=joint_claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_claim_embeddings = joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_premise_hypo_emb = T.concatenate( [joint_premise_emb, joint_claim_embeddings], axis=1) #(batch, 2*hidden_size) ''' attentive conv in task2 ''' joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) joint_sents_dot = T.batched_dot( joint_sents_tensor3, joint_sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) joint_sents_dot_2_matrix = T.nnet.softmax( joint_sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) joint_sents_context = T.batched_dot( joint_sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= joint_add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0), mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) 
masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle( 0, 1, 'x') masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle( 0, 1, 'x') fine_max = T.concatenate([ T.max(masked_sents_attconv, axis=1), T.max(masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) "Logistic Regression layer" joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1) joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0] joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3) joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class joint_LR_para = [joint_U_a, joint_LR_b] joint_layer_LR = LogisticRegression( rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector joint_loss = joint_layer_LR.negative_log_likelihood( joint_labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. ''' testing ''' # binarize_prob = T.where( inter_matrix > 0.5, 1, 0) #(batch_size, cand_size masked_inter_matrix = inter_matrix * sents_labels #(batch, cand_size) test_premise_emb = T.sum(batch_sent_emb * masked_inter_matrix.dimshuffle(0, 1, 'x'), axis=1) test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings], axis=1) #fine-maxsum sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) sents_dot_2_matrix = T.nnet.softmax( sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) sents_context = T.batched_dot( sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) add_sents_context = embed_input_sents + sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) test_attentive_sent_embeddings_r = 
test_attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_fine_max = T.concatenate([ T.max(test_masked_sents_attconv, axis=1), T.max(test_masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max], axis=1) test_LR_input_size = joint_LR_input_size test_layer_LR = LogisticRegression( rng, input=test_LR_input, n_in=test_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector params = [init_embeddings] + NN_para + LR_para + joint_LR_para cost = loss + joint_loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False joint_n_train_batches = joint_train_size / batch_size joint_train_batch_start = list( np.arange(joint_n_train_batches) * batch_size) + [joint_train_size - batch_size] n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] n_test_3th_batches = test_3th_size / batch_size test_3th_batch_start = list(np.arange(n_test_3th_batches) * batch_size) + [test_3th_size - batch_size] max_strict_acc = 0.0 max_test_f1 = 0.0 max_all_acc = 0.0 cost_i = 0.0 joint_train_indices = range(joint_train_size) train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( joint_train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed random.Random(100).shuffle(train_indices) iter_accu = 0 for joint_batch_id in joint_train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1 iter_accu += 1 joint_train_id_batch = joint_train_indices[ joint_batch_id:joint_batch_id + batch_size] for i in range(3): batch_id = random.choice(train_batch_start) train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents[train_id_batch], train_sent_masks[train_id_batch], train_sent_labels[train_id_batch], train_claims[train_id_batch], train_claim_mask[train_id_batch], #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels joint_train_sents[joint_train_id_batch], joint_train_sent_masks[joint_train_id_batch], joint_train_sent_labels[joint_train_id_batch], joint_train_claims[joint_train_id_batch], joint_train_claim_mask[joint_train_id_batch], joint_train_labels[joint_train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] for test_batch_id in test_batch_start: # for each test batch batch_prob, error_i, pred_i = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_sent_masks[test_batch_id:test_batch_id + batch_size], test_sent_labels[test_batch_id:test_batch_id + batch_size], test_claims[test_batch_id:test_batch_id + batch_size], test_claim_mask[test_batch_id:test_batch_id + batch_size], test_labels[test_batch_id:test_batch_id + batch_size]) error_sum += error_i batch_sent_labels = test_sent_labels[ test_batch_id:test_batch_id + batch_size] batch_sent_names = test_sent_names[ test_batch_id:test_batch_id + batch_size] batch_ground_names = test_ground_names[ test_batch_id:test_batch_id + batch_size] batch_ground_labels = test_labels[ test_batch_id:test_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = 
[(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 if strict_score > max_strict_acc and f1 > max_test_f1: max_strict_acc = strict_score max_test_f1 = f1 writefile_2class = codecs.open( root + 'class_2_erroranalysis.txt', 'w', 'utf-8') for dic in predictions: json.dump(dic, writefile_2class) writefile_2class.write('\n') writefile_2class.close() print 'writefile_2class write over' for test_batch_id in test_3th_batch_start: # for each test batch _, error_i, pred_i = test_model( test_3th_sents[test_batch_id:test_batch_id + batch_size], test_3th_sent_masks[test_batch_id:test_batch_id + batch_size], test_3th_sent_labels[test_batch_id:test_batch_id + batch_size], test_3th_claims[test_batch_id:test_batch_id + batch_size], test_3th_claim_mask[test_batch_id:test_batch_id + batch_size], test_3th_labels[test_batch_id:test_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 if label_accuracy > max_all_acc: max_all_acc = label_accuracy writefile_3class = codecs.open( root + 'class_3_erroranalysis.txt', 'w', 'utf-8') for dic in predictions: json.dump(dic, writefile_3class) writefile_3class.write('\n') writefile_3class.close() print 'writefile_3class write over' print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
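# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: the evaluation loop above
# ranks the `cand_size` candidate sentences of one claim by their sigmoid score
# and keeps those scoring above 0.5 as predicted evidence.  A tiny NumPy
# illustration of that selection step; the names below are hypothetical.
import numpy as np

def select_evidence_sketch(sent_probs, sent_names, threshold=0.5):
    """Return candidate sentence names sorted by score, keeping scores > threshold."""
    order = np.argsort(-np.asarray(sent_probs))  # highest score first
    return [sent_names[k] for k in order if sent_probs[k] > threshold]

# e.g. three candidates for one claim
probs = [0.9, 0.2, 0.7]
names = [('PageA', 0), ('PageB', 3), ('PageC', 1)]
assert select_evidence_sketch(probs, names) == [('PageA', 0), ('PageC', 1)]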
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, action_selection, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng if action_selection == 'epsilon-greedy': self.choose_action = self.choose_action_epsilon_greedy elif action_selection == 'softmax': self.choose_action = self.choose_action_softmax else: raise ValueError( "Unrecognized action selection: {}".format(action_selection)) lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part else: loss = 0.5 * diff**2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
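# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: the `clip_delta` branch above
# is the standard trick of making the squared TD error linear beyond the clip
# point, so the gradient magnitude saturates at `clip_delta` instead of being
# clipped to zero.  A standalone NumPy check of that piecewise (Huber-style) loss:
import numpy as np

def clipped_td_loss_sketch(diff, clip_delta=1.0):
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

d = np.array([0.3, -0.3, 2.0])
# inside the clip region the loss is 0.5*d^2, outside it grows linearly
assert np.allclose(clipped_td_loss_sketch(d), [0.045, 0.045, 1.5])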
def logsumexp(x, axis=None):
    # Adapted from https://github.com/Theano/Theano/issues/1563
    x_max = tt.max(x, axis=axis, keepdims=True)
    return tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max
def get_output(self, train=False):
    X = self.get_input(train)
    # -- don't need activation since it's just linear.
    output = T.max(T.dot(X, self.W) + self.b, axis=1)
    return output
def log_sum_exp(x, axis=None):
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis, keepdims=True)) + x_max
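# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: a quick numerical check of
# the max-shift trick used by `log_sum_exp` above -- subtracting the row max
# before exponentiating avoids overflow but leaves the result unchanged.
import numpy as np

def log_sum_exp_np(x, axis=None):
    # NumPy mirror of the Theano expression above
    x_max = x.max(axis=axis, keepdims=True)
    return np.log(np.exp(x - x_max).sum(axis=axis, keepdims=True)) + x_max

data = np.array([[1000.0, 1000.0], [0.0, np.log(3.0)]])
# the naive np.log(np.exp(data).sum(axis=1)) overflows on the first row,
# while the shifted version returns the exact answers
assert np.allclose(log_sum_exp_np(data, axis=1),
                   [[1000.0 + np.log(2.0)], [np.log(4.0)]])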
def max(x, axis=None, keepdims=False):
    return T.max(x, axis=axis, keepdims=keepdims)
def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, clip=None, grad_fn=None): """ Adam optimizer. Returns a set of gradient descent updates. This is ported from the GitHub Gist by Alec Radford https://gist.github.com/Newmu/acb738767acb4788bac3 (MIT License). TODO: Track which parameter(s) triggers the rescaling. This would help debugging / setting fitting parameters: if it's always the same parameter triggering clipping, its learning rate should probably be reduced. .. Caution:: The values of `b1` and `b2` are equivalent to 1-β1, 1-β2, where β1 and β2 are their corresponding values in Kingma et al. (2014). Parameters ---------- cost: theano variable We want to minimize this cost. params: List[Shared] | List[Tuple[Shared, mask]] | Dict[Shared, mask] List of Theano shared variables. Any element may be specified instead as a tuple pair, whose first element is the shared variable, and the second is a boolean mask array. If given, the mask array should be of the same shape as the shared variable – False entries indicate that we are not fitting for this parameter component, and so its gradient is to be set to zero. lr: float, > 0 Learning rate. b1, b2: float, between 0 (exclusive) and 1 (inclusive) Decay rates for the mean (`b1`) and variance (`b2`) of the gradient. Specifically, if we think of the optimization step i as continuous, then the gradient mean `m` decays roughly as dm/di = -b m & m(i) = m(0) exp(-bi) A plain SGD optimizer with no momentum can be obtained by setting both `b1` and `b2` to zero. e: float, >0. Default: 1e-8 Epsilon. This value is used in the following calculation to ensure numerical stability:: g_t = m_t / (tt.sqrt(v_t) + e) where `g_t` is the ultimately returned gradient, `m_t` its inertial mean and `v_t` its inertial variance. clip: positive float Clip gradients such that no components are greater than this value. ADAM provides some automatic adjustment of the gradient based. For cases where the cost exhibits cliffs however (as is common with RNNs), this might not be sufficient, as very large gradients can overpower ADAM's adaptation. In this case clipping the final gradient can help stabilize the optimization algorithm. Clipping is done on the gradient's L∞ norm, so the direction is conserved. Specifically, the gradient for each parameter `p` is independently divided by `clip`; the largest of these ratios, if it exceeds 1, is used to rescale the whole gradient. This allows us to have different learning rates for different parameters, and for the clipping to scale reasonably with the number of parameters. Clip value can be chosen by what we think is the maximum reasonable parameter change in one iteration, since this change is roughly bounded by `lr` x `clip`. Note that we clip the raw gradient, so the internal `m` and `v` variables are updated with the clipped gradient; this is why we say "roughly bounded" above. We do this because `m` and `v` are momentum variable, and so should reflect the actual movement of the 'particle'. We haven't however made extensive tests to check whether this is the most reasonable choice in practice. Setting `clip` to `None` disables clipping completely. This is the default. grad_fn: function If specified, use this instead of `T.grad` to compute the cost's gradient. Should have the same signature (i.e. `grad_fn(cost, params)`) and return a result of the same shape as `T.grad`. 
Returns ------- Theano update dictionary for the parameters in `params` """ # The MIT License (MIT) # Copyright (c) 2015 Alec Radford # Copyright (c) 2018-2020 Alexandre René # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. tmpparams = [] param_masks = [] # Standardize the form of params if isinstance(params, shim.config.GraphTypes): params = [params] if isinstance(params, dict): # Convert dictionary to a list of (param, mask_descriptor) tuples params = list(params.items()) else: _params = [] for p in params: if isinstance(p, tuple): _params.append(p) else: # Param has no mask: set it to None _params.append((p, None)) params = _params # `params` is a list of size 2 tuples assert all(isinstance(p, tuple) and len(p) == 2 for p in params) # Standardize the learning rate form errmsg = ("Learning rate must be specified either as a scalar, " "or as a dictionary with a key matching each parameter. 
" "Provided learning rate: {}".format(lr)) if shim.isscalar(lr): lr = {p[0]: lr for p in params} elif not isinstance(lr, dict): raise ValueError(errmsg) _lr = lr.copy() for key, plr in _lr.items(): if isinstance(key, str): # We expect lr to be indexed by variable, not variable name for p, mask in params: if p.name == key: lr[p] = plr del lr[key] break if not isinstance(lr, dict) or not all(p[0] in lr for p in params): raise ValueError(errmsg) # Extract the gradient mask for each parameter for p in params: tmpparams.append(p[0]) if p[1] is not None: if isinstance(p[1], bool): param_masks.append( np.ones(p[0].get_value().shape, dtype=int) * p[1]) else: if p[1].shape != p[0].get_value().shape: raise ValueError( "Provided mask (shape {}) for parameter {} " "(shape {}) has a different shape.".format( p[1].shape, p[0].name, p[0].get_value().shape)) param_masks.append(p[1]) else: param_masks.append(None) params = tmpparams updates = OrderedDict() gs = {} lrs = {} if grad_fn is None: try: grads = tt.grad(cost, params) except theano.gradient.DisconnectedInputError as e: disconnected_inputs = set(params).difference( shim.graph.shared_inputs(cost)) raise theano.gradient.DisconnectedInputError( "The following parameters do not appear in the expression for " "the cost: {}.".format(disconnected_inputs)) else: grads = grad_fn(cost, params) # Clip gradients if clip is not None: # Rescale is set by the component which most exceeds `clip` rescale = tt.max([1] + [tt.max(abs(g / clip)) for g in grads]) rescale.name = "rescale" # rescale = shim.print(rescale) for i in range(len(grads)): grads[i] /= rescale # DEBUG This is useful for finding which gradients are returning NaN, # but is this the best place / way ? newp = {p: p for p in params} # Need to keep handle to original shared var # which may be overwritten by print if 'print grads' in debug_flags: for i, p in enumerate(params): if (debug_flags['print grads'] is True or p.name in debug_flags['print grads']): newp[p] = shim.print(p) grads[i] = shim.ifelse( shim.eq(rescale, 1), shim.print(grads[i], 'gradient ' + p.name), shim.print(grads[i], 'gradient ' + p.name + ' RESCALED')) # for p in params: # gs[p] = shim.ifelse(shim.eq(rescale, 1), # shim.print(gs[p], 'g_t (' + p.name + ')'), # shim.print(gs[p], 'g_t (' + p.name + ') RESCALED') # ) # Mask out the gradient for parameters we aren't fitting for i, mask in enumerate(param_masks): if mask is not None: grads[i] = grads[i] * mask # `mask` is an array of ones and zeros i = theano.shared(shim.cast_floatX(0.), name='adam_i') i_t = i + 1. fix1 = 1. - (1. - b1)**i_t fix2 = 1. - (1. - b2)**i_t for p, g in zip(params, grads): g = shim.cast_floatX(g) # FIXME: prior logp's still have dtype='float64', # no matter the value of floatX. # This is probably due to some internal constants # which are double precision. # Until this is fixed we need the explicit cast if not shim.all((0 < shim.eval(b1) <= 1)) and shim.all( (0 < shim.eval(b2) <= 1)): raise ValueError("Arguments `b1` and `b2` to the Adam optimizer " "must be within (0, 1]. Received:\n" f"b1: {b1}\nb2: {b2}") lr_t = lr[p] * (tt.sqrt(fix2) / fix1) initval = shim.cast_floatX(p.get_value() * 0.) 
if p.name is not None: namem = 'adam_' + p.name + '_m' namev = 'adam_' + p.name + '_v' else: p.name = "" namem = namev = None if hasattr(p, 'broadcastable'): m = shim.shared(initval, broadcastable=p.broadcastable, name=namem) v = shim.shared(initval, broadcastable=p.broadcastable, name=namev) else: m = shim.shared(initval, name=namem) v = shim.shared(initval, name=namev) m_t = (b1 * g) + ((1. - b1) * m) # m_t = shim.print(m_t, 'm_t (' + p.name + ')') v_t = (b2 * tt.sqr(g)) + ((1. - b2) * v) g_t = m_t / (tt.sqrt(v_t) + e) # ms[p] = [m, m_t] # vs[p] = [v, v_t] updates[m] = m_t updates[v] = v_t # lrs[p] = lr_t # gs[p] = g_t # lr_t = shim.print(lr_t, 'lr_t (' + p.name + ')') p_t = newp[p] - (lr_t * g_t) # Using newp allows printing, if it was requested if newp[p] != p: # We printed p, so also print the updated value p_t = shim.print(p_t, p.name + ' (updated)') updates[p] = shim.cast(p_t, p.dtype) updates[i] = i_t return updates
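# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: the per-parameter update
# performed by the Adam routine above, written out with plain NumPy for one
# step.  Note the docstring's convention: `b1`, `b2` here play the role of
# 1-beta1 and 1-beta2 in Kingma & Ba (2014).
import numpy as np

def adam_step_sketch(p, g, m, v, i, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t      # bias correction for the mean
    fix2 = 1.0 - (1.0 - b2) ** i_t      # bias correction for the variance
    lr_t = lr * np.sqrt(fix2) / fix1
    m_t = b1 * g + (1.0 - b1) * m       # inertial mean of the gradient
    v_t = b2 * g ** 2 + (1.0 - b2) * v  # inertial variance of the gradient
    g_t = m_t / (np.sqrt(v_t) + e)
    return p - lr_t * g_t, m_t, v_t, i_t

p, m, v, i = 1.0, 0.0, 0.0, 0.0
p, m, v, i = adam_step_sketch(p, 0.5, m, v, i)
# with zero-initialised moments the first step has size ~lr, opposite to the gradient
assert p < 1.0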
def __init__(self, num_actions, phi_length, width, height, discount=.9, learning_rate=.01, batch_size=32, approximator='none'): self._batch_size = batch_size self._num_input_features = phi_length self._phi_length = phi_length self._img_width = width self._img_height = height self._discount = discount self.num_actions = num_actions self.learning_rate = learning_rate self.scale_input_by = 255.0 # CONSTRUCT THE LAYERS self.q_layers = [] self.q_layers.append( layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_height, self._img_width, self.scale_input_by)) if approximator == 'cuda_conv': self.q_layers.append( cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1])) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=16, filter_size=8, stride=4, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=32, filter_size=4, stride=2, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1])) elif approximator == 'conv': self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=16, filter_width=8, filter_height=8, stride_x=4, stride_y=4, weights_std=.01, init_bias_value=0.01)) self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=32, filter_width=4, filter_height=4, stride_x=2, stride_y=2, weights_std=.01, init_bias_value=0.01)) if approximator == 'cuda_conv' or approximator == 'conv': self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=256, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.rectify)) self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=num_actions, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.identity)) if approximator == 'none': self.q_layers.append(\ layers.DenseLayerNoBias(self.q_layers[-1], n_outputs=num_actions, weights_std=0.00, dropout=0, nonlinearity=layers.identity)) self.q_layers.append(layers.OutputLayer(self.q_layers[-1])) for i in range(len(self.q_layers) - 1): print self.q_layers[i].get_output_shape() # Now create a network (using the same weights) # for next state q values self.next_layers = copy_layers(self.q_layers) self.next_layers[0] = layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_width, self._img_height, self.scale_input_by) self.next_layers[1].input_layer = self.next_layers[0] self.rewards = T.col() self.actions = T.icol() # Build the loss function ... q_vals = self.q_layers[-1].predictions() next_q_vals = self.next_layers[-1].predictions() next_maxes = T.max(next_q_vals, axis=1, keepdims=True) target = self.rewards + discount * next_maxes target = theano.gradient.consider_constant(target) diff = target - q_vals # Zero out all entries for actions that were not chosen... 
mask = build_mask(T.zeros_like(diff), self.actions, 1.0) diff_masked = diff * mask error = T.mean(diff_masked**2) self._loss = error * diff_masked.shape[1] # self._parameters = layers.all_parameters(self.q_layers[-1]) self._idx = T.lscalar('idx') # CREATE VARIABLES FOR INPUT AND OUTPUT self.states_shared = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.states_shared_next = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (1, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'), broadcastable=(False, True)) self._givens = \ {self.q_layers[0].input_var: self.states_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.next_layers[0].input_var: self.states_shared_next[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.rewards: self.rewards_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :], self.actions: self.actions_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :] } self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\ self._loss, self._parameters, learning_rate=self.learning_rate, rho=0.9, momentum=0.9, epsilon=1e-6) self._train = theano.function([self._idx], self._loss, givens=self._givens, updates=self._updates) self._compute_loss = theano.function([self._idx], self._loss, givens=self._givens) self._compute_q_vals = \ theano.function([self.q_layers[0].input_var], self.q_layers[-1].predictions(), on_unused_input='ignore')
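# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: the target used in both
# Q-network constructors above is the one-step Bellman backup
#     target = r + gamma * max_a' Q(s', a')
# (with the next-state term dropped on terminal transitions in the first
# version).  A tiny NumPy illustration:
import numpy as np

rewards = np.array([[1.0], [0.0]])
terminals = np.array([[0], [1]])
next_q_vals = np.array([[0.5, 2.0, 1.0],
                        [3.0, 0.1, 0.2]])
discount = 0.9

target = rewards + (1 - terminals) * discount * next_q_vals.max(axis=1, keepdims=True)
# first transition: 1.0 + 0.9 * 2.0; second is terminal, so only the reward remains
assert np.allclose(target, [[2.8], [0.0]])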
def SoftMax(x):
    x = T.exp(x - T.max(x, axis=x.ndim - 1, keepdims=True))
    return x / T.sum(x, axis=x.ndim - 1, keepdims=True)
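# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: subtracting the row max
# inside `SoftMax` above is purely for numerical stability -- the result is
# identical to the unshifted softmax, as this NumPy check illustrates.
import numpy as np

x = np.array([[1.0, 2.0, 3.0]])
naive = np.exp(x) / np.exp(x).sum(axis=-1, keepdims=True)
shifted = np.exp(x - x.max(axis=-1, keepdims=True))
shifted = shifted / shifted.sum(axis=-1, keepdims=True)
assert np.allclose(naive, shifted)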
def _max_along_time(input, **kwargs):
    return T.max(input, axis=1)
def SSL2(coding_dist, true_dist): def set_inf_in2dim(j, coding_dist, true_label_id): """ Search true_label_id==j,and set coding_dist[i][j]="-inf" """ return T.switch(T.eq(j, true_label_id), T.constant(float("-inf")), coding_dist[j]) def set_inf_in1dim(i, coding_dist, true_label_id): # coding_dist[:,label_id] doesn't become "-0.0" loss_margin, updates = theano.scan(set_inf_in2dim, \ outputs_info=None, \ sequences=T.arange(coding_dist.shape[1]), \ non_sequences=[coding_dist[i], true_label_id[i]]) return loss_margin if true_dist.ndim == coding_dist.ndim: ''' #Calculation: predictioin to true_label y_pre2true=T.sum(true_dist * coding_dist, axis=1) #Calculation: prediction to false_label y_pre2false=T.max((1-true_dist) * coding_dist, axis=1) loss=1+y_pre2true-y_pre2false ''' # Calculation: predictioin to true_label # y_pre2true=T.sum(true_dist * T.log(1+T.exp(2*(3-coding_dist))),axis=1) y_pre2true_softmax = T.sum(true_dist * T.nnet.softmax(coding_dist), axis=1) true_pre = T.sum(true_dist * coding_dist, axis=1) y_pre2true = T.sum(true_dist * T.exp((3 - coding_dist)), axis=1) # #Negative loss in y_pre2true # y_pre2true=T.nnet.sigmoid(y_pre2true)*y_pre2true # search the true label id true_label_id = T.argmax(true_dist, axis=1) # persist the false label in coding_dist coding_dist = (1 - true_dist) * coding_dist # set true label to "-inf" coding_dist_true2inf, updates = theano.scan(set_inf_in1dim, \ outputs_info=None, \ sequences=T.arange(coding_dist.shape[0]), \ non_sequences=[coding_dist, true_label_id]) # search the max in false label coding_dist_true2inf = T.max(coding_dist_true2inf, axis=1) # Calculation: predictioin to false_label # y_pre2false=T.log(1+T.exp(2*(0.5+coding_dist_true2inf))) y_pre2false = T.exp((0.5 + coding_dist_true2inf)) # Negative loss in y_pre2false # y_pre2false=T.nnet.sigmoid(k*y_pre2false)*y_pre2false stimulative = T.exp(2 + coding_dist_true2inf - true_pre) loss = 4 * T.nnet.sigmoid(y_pre2true) * T.nnet.sigmoid( y_pre2false) * stimulative * T.log(1 + y_pre2true + y_pre2false) # loss=2*T.nnet.sigmoid(y_pre2true)*T.nnet.sigmoid(y_pre2false)*T.log(1+y_pre2true+y_pre2false) return loss, stimulative, y_pre2false else: print "true_dist.ndim != coding_dist.ndim"
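# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: the nested `theano.scan` in
# SSL2 above masks out the true-label column before taking the row max over
# the remaining (false) labels.  When `true_dist` is one-hot, the same masked
# max can be written without scan by pushing the true-label entries far below
# everything else; a small check with concrete numbers:
import numpy as np
import theano
import theano.tensor as T

coding = T.dmatrix('coding_dist')
onehot = T.dmatrix('true_dist')
# entries where true_dist == 1 are shifted to a very negative value,
# so the row max only sees the false-label scores
max_false_score = T.max(coding + onehot * np.float64(-1e9), axis=1)
f = theano.function([coding, onehot], max_false_score)

scores = np.array([[0.2, 0.9, 0.4],
                   [0.8, 0.1, 0.3]])
labels = np.array([[0.0, 1.0, 0.0],
                   [1.0, 0.0, 0.0]])
assert np.allclose(f(scores, labels), [0.4, 0.3])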
def MyLogSumExp(x, axis=None):
    x_max = tt.max(x, axis=axis, keepdims=True)
    return tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max
def SSL_mutual2(coding_dist, true_dist): def set_inf_in2dim(j, coding_dist, true_label_id): """ Search true_label_id==j,and set coding_dist[i][j]="-inf" """ return T.switch(T.eq(j, true_label_id), T.constant(float("-inf")), coding_dist[j]) def set_inf_in1dim(i, coding_dist, true_label_id): # coding_dist[:,label_id] doesn't become "-0.0" loss_margin, updates = theano.scan(set_inf_in2dim, \ outputs_info=None, \ sequences=T.arange(coding_dist.shape[1]), \ non_sequences=[coding_dist[i], true_label_id[i]]) return loss_margin if true_dist.ndim == coding_dist.ndim: """""" coding_dist1 = T.tanh(coding_dist) y_pre2true = T.sum(true_dist * T.exp((-coding_dist1)), axis=1) # search the true label id true_label_id = T.argmax(true_dist, axis=1) # persist the false label in coding_dist coding_dist_false = (1 - true_dist) * coding_dist1 # set true label to "-inf" coding_dist_true2inf, updates = theano.scan(set_inf_in1dim, \ outputs_info=None, \ sequences=T.arange(coding_dist_false.shape[0]), \ non_sequences=[coding_dist_false, true_label_id]) # search the max in false label coding_dist_true2inf = T.max(coding_dist_true2inf, axis=1) y_pre2false = T.exp((coding_dist_true2inf)) """stimulative""" coding_dist = T.nnet.softmax(coding_dist) # Calculation: predictioin to true_label true_pre = T.sum(true_dist * coding_dist, axis=1) # y_pre2true=T.sum(true_dist * T.exp((3-coding_dist)),axis=1) # search the true label id true_label_id = T.argmax(true_dist, axis=1) # persist the false label in coding_dist coding_dist_false = (1 - true_dist) * coding_dist # set true label to "-inf" coding_dist_true2inf, updates = theano.scan(set_inf_in1dim, \ outputs_info=None, \ sequences=T.arange(coding_dist_false.shape[0]), \ non_sequences=[coding_dist_false, true_label_id]) # search the max in false label coding_dist_true2inf = T.max(coding_dist_true2inf, axis=1) # y_pre2false=T.exp((0.25+coding_dist_true2inf)) # SSL stimulative = 1 + coding_dist_true2inf - true_pre # loss=stimulative*(-T.log(1e-8+true_pre)) loss = stimulative * T.log(1 + y_pre2true + y_pre2false) return loss, y_pre2false, y_pre2true, stimulative else: print "true_dist.ndim != coding_dist.ndim"
def get_output_for(self, input, **kwargs):
    R = (T.max(input, axis=1) - T.min(input, axis=1)).dimshuffle(0, 'x')
    input = self.k * input / T.maximum(R, 0.1)
    return T.nnet.softmax(input)
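# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: the layer above rescales
# each row by `k / range(row)` before the softmax, so the peakedness of the
# output is governed by `k` rather than by the raw scale of the activations
# (the 0.1 floor guards against near-constant rows).  NumPy illustration, with
# a hypothetical k:
import numpy as np

def range_scaled_softmax_sketch(x, k=10.0):
    R = x.max(axis=1, keepdims=True) - x.min(axis=1, keepdims=True)
    z = k * x / np.maximum(R, 0.1)
    z = np.exp(z - z.max(axis=1, keepdims=True))
    return z / z.sum(axis=1, keepdims=True)

small = np.array([[0.0, 0.5]])      # small spread
large = np.array([[0.0, 1000.0]])   # huge spread
# once the spread exceeds the 0.1 floor, rows with very different scales
# produce the same output distribution
assert np.allclose(range_scaled_softmax_sketch(small),
                   range_scaled_softmax_sketch(large))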
def get_output_for(self, input, **kwargs):
    return T.clip(T.max(input, axis=1), 0.01, 0.99)
def max_cross_corrs(filters, things_to_scan, min_overlap, batch_size=50, func_params_size=1000000, progress_update=1000): """ func_params_size: when compiling functions """ #reverse the patterns as the func is a conv not a cross corr assert len(filters.shape) == 3, "Did you pass in filters of unequal len?" assert filters.shape[-1] == things_to_scan.shape[-1] filters = filters.astype("float32")[:, ::-1, ::-1] to_return = np.zeros((filters.shape[0], len(things_to_scan))) #compile the number of filters that result in a function with #params equal to func_params_size params_per_filter = np.prod(filters[0].shape) filter_batch_size = int(func_params_size / params_per_filter) filter_length = filters.shape[1] filter_idx = 0 while filter_idx < filters.shape[0]: if (progress_update is not None): print("On filters", filter_idx, "to", min((filter_idx + filter_batch_size), len(filters))) sys.stdout.flush() filter_batch = filters[filter_idx:min(( filter_idx + filter_batch_size), len(filters))] padding_amount = int((filter_length) * (1 - min_overlap)) padded_input = [ np.pad(array=x, pad_width=((padding_amount, padding_amount), (0, 0)), mode="constant") for x in things_to_scan ] input_var = theano.tensor.TensorType(dtype=theano.config.floatX, broadcastable=[False] * 3)("input") theano_filters = theano.tensor.as_tensor_variable(x=filter_batch, name="filters") conv_out = theano.tensor.nnet.conv2d( input=input_var[:, None, :, :], filters=theano_filters[:, None, ::-1, ::-1], border_mode='valid')[:, :, :, 0] max_out = T.max(conv_out, axis=-1) max_cross_corr_func = theano.function([input_var], max_out, allow_input_downcast=True) max_cross_corrs = np.array( run_function_in_batches(func=max_cross_corr_func, input_data_list=[padded_input], batch_size=batch_size, progress_update=progress_update)) assert len(max_cross_corrs.shape) == 2, max_cross_corrs.shape to_return[filter_idx: min((filter_idx+filter_batch_size),len(filters)),:] =\ np.transpose(max_cross_corrs) filter_idx += filter_batch_size return to_return
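# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: an assumed call shape for
# `max_cross_corrs` above -- `filters` is (n_filters, filter_len, n_channels),
# `things_to_scan` is (n_sequences, seq_len, n_channels), and the result is
# (n_filters, n_sequences): the best cross-correlation of each filter against
# each padded sequence.  The values below are random placeholders.
import numpy as np

filters = np.random.RandomState(0).randn(4, 8, 4).astype("float32")
sequences = np.random.RandomState(1).randn(10, 100, 4).astype("float32")
scores = max_cross_corrs(filters, sequences, min_overlap=0.5,
                         batch_size=5, progress_update=None)
assert scores.shape == (4, 10)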
def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis)
    return m + T.log(T.sum(T.exp(x - m.dimshuffle(0, 'x')), axis=axis))
def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis, keepdims=True)
    # keep the reduced axis on the sum as well so it broadcasts against m
    # instead of producing an unintended outer sum
    return m + T.log(T.sum(T.exp(x - m), axis=axis, keepdims=True) + 1e-9)
def fmask(df: pandas.DataFrame) -> pandas.DataFrame: # ############################### DataFrame column aliases ########################################## df_swir1 = df['band6_reflectance_corrected'] df_cirrus = df['band9_reflectance_corrected'] df_bt1 = df['band10_bt'] # ################ Formulae from [Zhu 2012] ####################################################### """ This test cuts out pixels that are clearly just snow or vegetation, or that are too warm to be clouds. """ print('Formula1') """ Formula 5 from [Zhu 2012] is split into three parts, as each may be useful in its own right. This one is true if the pixel suggests thin clouds over water. """ print('Formula5a') """ This one suggests clear skies over water if true. """ print('Formula5b') """ This one evaluates to True if it is definitely water, either with clear skies or thin cloud. False if it is land, thick clouds over land, or thick clouds over water. """ print('Formula5c') """ This test produces true values for pixels that have a high probability of being cloud. It labels it as a Potential Cloud Pixel (PCP). """ print('Formula6') df['pcp'] = df['basic_test'] & df['whiteness_test'] & df['hot_test'] & df[ 'b4b5_test'] """ This further refines the Water Test of formula 5 to take advantage of the newer, second short-wave infrared band. """ # TODO: Shouldn't this just be folded into the original test then? print('Formula7') """ For pixels which are water under clear skies, estimate the temperature """ print('Formula8') # TODO: What if all the water is under clouds? What if there's no water at all? # noinspection PyTypeChecker """ """ print('Formula10') e10_brightness_prob = tt.clip(v0_swir1 / C10_MAX_WATER_REFLECTANCE, -999999, 1.0) df['brightness_prob'] = theano.function([v0_swir1], e10_brightness_prob)(df_swir1) """ From [Zhu, 2015] This uses the cirrus cloud band 9 to account for high-altitude clouds. See: https://landsat.usgs.gov/how-is-landsat-8s-cirrus-band-9-used """ print('2015, Formula1') e20151_cirrus_cloud_probability = v0_cirrus / C20151_CIRRUS_REFLECTANCE_THRESHOLD df['cirrus_cloud_probability'] = theano.function( [v0_cirrus], e20151_cirrus_cloud_probability)(df_cirrus) """ """ print('Formula11 replaced by 2015 Formula 2') e11_w_cloud_prob = v10_brightness_prob + v20151_cirrus_cloud_probability df['w_cloud_prob'] = theano.function( [v10_brightness_prob, v20151_cirrus_cloud_probability], e11_w_cloud_prob)(df['brightness_prob'], df['cirrus_cloud_probability']) """ """ print('Formula12') df['clearsky_land'] = ~df['pcp'] & ~df['water_test'] """ """ print('Formula13') df13_clearskyland = df[df['clearsky_land']] df13_clearskyland_bt = df13_clearskyland['band6_reflectance_corrected'] # noinspection PyTypeChecker c13_t_lo = numpy.percentile(df13_clearskyland_bt, C13_LOWER_PERCENTILE_FOR_CLEARSKY_LAND) # noinspection PyTypeChecker c13_t_hi = numpy.percentile(df13_clearskyland_bt, C13_UPPER_PERCENTILE_FOR_CLEARSKY_LAND) """ """ print('Formula14') c14_temperature_magnitude = c13_t_hi - c13_t_lo e14_l_temperature_prob = (c13_t_hi + 4 - v0_bt1) / c14_temperature_magnitude df['l_temperature_prob'] = theano.function([v0_bt1], e14_l_temperature_prob)(df_bt1) """ """ print("Formula15") # TODO: The whitepaper explanation is weird about this one. It's talking about saturation of one band, and another # band being larger than the other... but I think it's basically just saying that negative values for ndvi and # ndsi are cropped to zero. At which point the absolute values don't do anything. 
And we don't even need to modify # the ndsi/ndvi values, we can just make zero a minimum for our max function. Is that right??? e15_variability_prob = (tt.max([0, v1_ndvi, v1_ndsi, v2_whiteness])) df['variability_prob'] = theano.function( [v1_ndvi, v1_ndsi, v2_whiteness], e15_variability_prob)(df['ndvi'], df['ndsi'], df['whiteness']) """ """ print("Formula16") e16_l_cloud_prob = v14_l_temperature_prob * v15_variability_prob df['l_cloud_prob'] = theano.function( [v14_l_temperature_prob, v15_variability_prob], e16_l_cloud_prob)(df['l_temperature_prob'], df['variability_prob']) return df
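# ----------------------------------------------------------------------------
# Illustrative sketch, not from the original code: Formula 15 wants a per-pixel
# maximum of the (floored-at-zero) NDVI, NDSI and whiteness values.  One way to
# express that elementwise maximum in Theano is to chain `tt.maximum`, which
# operates column-wise; a quick check against NumPy:
import numpy as np
import theano
import theano.tensor as tt

ndvi = tt.dvector('ndvi')
ndsi = tt.dvector('ndsi')
whiteness = tt.dvector('whiteness')
variability_prob = tt.maximum(0.0, tt.maximum(ndvi, tt.maximum(ndsi, whiteness)))
f = theano.function([ndvi, ndsi, whiteness], variability_prob)

a = np.array([-0.2, 0.3])
b = np.array([0.1, -0.5])
c = np.array([0.05, 0.6])
assert np.allclose(f(a, b, c), np.maximum(0.0, np.maximum(a, np.maximum(b, c))))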
def normalize(input, newmin=-1, newmax=1):
    mini = T.min(input)
    maxi = T.max(input)
    return (input - mini) * (newmax - newmin) / (maxi - mini) + newmin
    t_b.append(_shared(ones((n_out,), dtype=floatX) * 0.1))
    t_conv = t_conv + t_b[-1].dimshuffle('x', 0)
    t_conv = activation(t_conv)
    conv_length = prod(traj_shape[1:-1]) * trajconv.res_shape
    t_conv = t_conv.reshape((batch.micro, conv_length))
    if trajconv.append:
        traj_ = T.concatenate([t.flatten(2), t_conv.flatten(2)], axis=1)
    else:
        traj_ = t_conv.flatten(2)
    n_in_MLP -= traj_size
    n_in_MLP += conv_length
elif use.traj:
    traj_ = t.flatten(2)

insp = T.stack(T.min(vid_), T.mean(vid_), T.max(vid_), T.std(vid_))  # , T.min(traj_), T.mean(traj_), T.max(traj_), T.std(traj_))

# dropout
if use.drop:
    if use.traj:
        traj_ = DropoutLayer(traj_, rng=rng, p=drop.p_traj).output
    vid_ = DropoutLayer(vid_, rng=rng, p=drop.p_vid).output

# MLP
# ------------------------------------------------------------------------------
# fusion
if net.fusion == "early":
    if use.traj:
        out = T.concatenate([vid_, traj_], axis=1)
    else:
        out = vid_

# hidden layer
layers.append(HiddenLayer(out, n_in=n_in_MLP, n_out=net.hidden, rng=rng,