import itertools
import math
from collections import Counter

import numpy as np
import theano
import theano.tensor as T

# PathTransitionLayer and init_rng come from this project's own modules (not shown here).


def test_path_layer_train():
    init_rng()
    class_num = 3
    trans_mat = np.zeros((class_num, class_num))
    trans_mat[0, 0] = 0.8
    trans_mat[0, 1] = 0.1
    trans_mat[0, 2] = 0.1
    trans_mat[1, 0] = 0.1
    trans_mat[1, 1] = 0.1
    trans_mat[1, 2] = 0.8
    trans_mat[2, 0] = 0.1
    trans_mat[2, 1] = 0.1
    trans_mat[2, 2] = 0.8

    def gen_train(trans_mat, sample_num, class_num):
        # uniform observation scores; labels sampled from the Markov chain trans_mat
        X = []
        Y = []
        x = [1.0 / class_num for i in xrange(class_num)]
        pre_y = 0
        for _i in xrange(sample_num):
            X.append(x)
            trans_prob = trans_mat[pre_y]
            y = np.random.choice(class_num, 1, p=trans_prob)[0]
            Y.append(y)
            pre_y = y
        return np.asarray(X, dtype=np.float32), np.asarray(Y, dtype=np.int32)

    X, Y = gen_train(trans_mat, 100, class_num)
    print Counter(Y)

    # row 0 holds the start scores, rows 1: the tag-to-tag transitions
    trans_mat_prior = np.zeros((class_num + 1, class_num))
    trans_mat_prior[0] = X[0]
    trans_mat_prior[1:] = trans_mat
    print "init trans mat"
    print trans_mat_prior

    # layer = PathTransitionLayer(class_num, trans_mat_prior)
    layer = PathTransitionLayer(class_num)

    train_x = T.fmatrix("x")
    train_y = T.ivector("y")
    cost = layer.cost(train_x, train_y)
    params = layer.params()
    gparams = T.grad(cost, params)

    learning_rate = 0.01
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(params, gparams)]

    # compile once, outside the loop; recompiling each iteration is wasteful
    foo = theano.function(inputs=[train_x, train_y], outputs=[cost], updates=updates)

    iternum = 10
    for i in xrange(iternum):
        X, Y = gen_train(trans_mat, 100, class_num)
        print "%d,cost=%f" % (i, foo(X, Y)[0])

    trained_trans_mat = layer.params()[0].eval()
    print trained_trans_mat
    print "training done."
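
# A sanity check one can run after training: since gen_train samples labels
# from a known Markov chain, the row-normalized bigram counts are the
# maximum-likelihood estimate of trans_mat, which rows 1: of the learned
# (softmaxed) transition matrix should roughly approach. This is a minimal
# sketch; empirical_trans_mat is our own helper, not part of the layer's API.
def empirical_trans_mat(Y, class_num):
    counts = np.zeros((class_num, class_num))
    for prev_y, y in zip(Y[:-1], Y[1:]):
        counts[prev_y, y] += 1
    # normalize each row; guard against rows with no observations
    row_sums = counts.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    return counts / row_sums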
def test_path_transition_layer():
    init_rng()
    sample_num = 5
    class_num = 10
    X = theano.tensor.nnet.softmax(np.random.random((sample_num, class_num)))
    y = (9, 9, 9, 9, 9)

    layer = PathTransitionLayer(class_num)
    cost = layer.cost(X, y).eval()
    y_pred = layer.predict(X).eval()
    y_pred_cost = layer.cost(X, y_pred).eval()
    print "optimized cost = ", cost
    print "y_pred = ", y_pred
    print "y_pred_cost", y_pred_cost
    # the predicted path maximizes the path score, so its cost must be lower
    assert y_pred_cost < cost

    # brute-force the CRF cost: enumerate all class_num ** sample_num paths,
    # accumulate exp(score) for the log-partition, and record the gold path score
    y_score = 1000  # placeholder; always overwritten since y is among the enumerated paths
    logadd_score = 0.0
    trans_mat = theano.tensor.nnet.softmax(layer.tag_trans_matrix).eval()
    X = X.eval()
    for path in itertools.product(range(class_num), repeat=sample_num):
        score = trans_mat[0, path[0]] + X[0, path[0]]
        for idx in range(1, sample_num):
            score += trans_mat[path[idx - 1] + 1, path[idx]] + X[idx, path[idx]]
        logadd_score += math.exp(score)
        if path == y:
            y_score = score
    logadd_score = math.log(logadd_score)
    bruteforce_cost = logadd_score - y_score
    print "bruteforce cost = {0} with logadd = {1} and selected_path_score = {2}".format(
        bruteforce_cost, logadd_score, y_score)

    # because trans_mat is a constant (uniform) matrix at init, the best path
    # is simply the per-position argmax of the observation scores
    bruteforce_y_pred = np.argmax(X, axis=1)
    print "bruteforce y_pred = ", bruteforce_y_pred
    assert math.fabs(bruteforce_cost - cost) < 1e-6
    assert not np.any(y_pred - bruteforce_y_pred)
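
# The brute force above is O(class_num ** sample_num). The same log-partition
# ("logadd") can be computed in O(sample_num * class_num ** 2) with the
# standard forward recursion in log space. A minimal numpy sketch using the
# same (class_num + 1, class_num) layout, where row 0 holds the start scores;
# forward_logadd is our own helper, not part of the layer's API.
def forward_logadd(X, trans_mat):
    # alpha[j] = log sum of exp(path score) over all prefixes ending in tag j
    alpha = trans_mat[0] + X[0]
    for idx in range(1, X.shape[0]):
        # scores[i, j] = alpha[i] + trans(i -> j) + emission(idx, j)
        scores = alpha[:, None] + trans_mat[1:] + X[idx][None, :]
        m = scores.max(axis=0)
        alpha = m + np.log(np.exp(scores - m).sum(axis=0))
    m = alpha.max()
    return m + np.log(np.exp(alpha - m).sum())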
def test_path_transition_layer2():
    init_rng()
    sample_num = 5
    class_num = 10

    y1 = (9, 9, 9, 9, 9)
    X1 = np.zeros((sample_num, class_num))
    X1[range(sample_num), y1] = 1
    y2 = (0, 1, 2, 3, 4)
    X2 = np.zeros((sample_num, class_num))
    X2[range(sample_num), y2] = 1

    layer = PathTransitionLayer(class_num)
    cost1 = layer.cost(X1, y1).eval()
    cost2 = layer.cost(X2, y2).eval()
    cost1_2 = layer.cost(X1, y2).eval()
    cost2_1 = layer.cost(X2, y1).eval()
    y_pred1 = layer.predict(X1).eval()
    y_pred2 = layer.predict(X2).eval()
    print "X1 = ", X1
    print "X2 = ", X2
    print "y1 = ", y1
    print "y2 = ", y2
    print "cost1 = ", cost1
    print "cost2 = ", cost2
    print "cost1_2 = ", cost1_2
    print "cost2_1 = ", cost2_1
    print "y_pred1 = ", y_pred1
    print "y_pred2 = ", y_pred2
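
# For reference, layer.predict is expected to perform Viterbi decoding over
# the same scores as the cost. A minimal numpy sketch under the same
# (class_num + 1, class_num) transition layout (row 0 = start scores);
# viterbi_decode is our own helper, not part of the layer's API.
def viterbi_decode(X, trans_mat):
    sample_num, class_num = X.shape
    delta = trans_mat[0] + X[0]  # best score of any path ending in each tag
    backptr = np.zeros((sample_num, class_num), dtype=np.int32)
    for idx in range(1, sample_num):
        # scores[i, j] = delta[i] + trans(i -> j) + emission(idx, j)
        scores = delta[:, None] + trans_mat[1:] + X[idx][None, :]
        backptr[idx] = np.argmax(scores, axis=0)
        delta = np.max(scores, axis=0)
    # follow back-pointers from the best final tag
    path = [int(np.argmax(delta))]
    for idx in range(sample_num - 1, 0, -1):
        path.append(int(backptr[idx, path[-1]]))
    return list(reversed(path))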
def __init__(self, rng, x, y, sent_length, masks, model_params):
    # x shape: (batch_size, max_term_per_sent + 3, max_sentence_length),
    # where max_sentence_length = max_term_per_sent + window_size - 1
    self.L1_reg = model_params['L1_reg']
    self.L2_reg = model_params['L2_reg']
    self.x = x
    self.y = y
    self.sent_length = sent_length
    self.masks = masks

    self.max_sentence_length = model_params['max_sentence_length']
    self.window_size = model_params['window_size']
    self.max_term_per_sent = self.max_sentence_length - self.window_size + 1

    self.word_num = model_params['word_num']
    self.POS_num = model_params['POS_num']
    self.verbpos_num = model_params['verbpos_num']
    self.wordpos_num = model_params['wordpos_num']

    self.word_feature_num = model_params['word_feature_num']
    self.POS_feature_num = model_params['POS_feature_num']
    self.wordpos_feature_num = model_params['wordpos_feature_num']
    self.verbpos_feature_num = model_params['verbpos_feature_num']

    self.conv_window = model_params['conv_window']
    self.conv_hidden_feature_num = model_params['conv_hidden_feature_num']

    self.hidden_layer_size = model_params['hidden_layer_size']
    self.tags_num = model_params['tags_num']

    # We have 4 lookup tables here:
    # 1. word vector
    #    output shape: (batch_size, 1, max_sentence_length * word_feature_num)
    # 2. POS tag vector
    #    output shape: (batch_size, 1, max_sentence_length * POS_feature_num)
    # 3. verb position vector
    #    output shape: (batch_size, 1, max_sentence_length * verbpos_feature_num)
    # 4. word position vector
    #    output shape: (batch_size, max_term_per_sent, 1, max_sentence_length * wordpos_feature_num)
    self.wordvec = LookupTableLayer(inputs=x[:, 0:1, :], table_size=self.word_num,
                                    window_size=self.max_sentence_length,
                                    feature_num=self.word_feature_num,
                                    reshp=(x.shape[0], 1, 1, x.shape[2] * self.word_feature_num))
    self.POSvec = LookupTableLayer(inputs=x[:, 1:2, :], table_size=self.POS_num,
                                   window_size=self.max_sentence_length,
                                   feature_num=self.POS_feature_num,
                                   reshp=(x.shape[0], 1, 1, x.shape[2] * self.POS_feature_num))
    self.verbpos_vec = LookupTableLayer(inputs=x[:, 2:3, :], table_size=self.verbpos_num,
                                        window_size=self.max_sentence_length,
                                        feature_num=self.verbpos_feature_num,
                                        reshp=(x.shape[0], 1, 1, x.shape[2] * self.verbpos_feature_num))
    self.wordpos_vec = LookupTableLayer(inputs=x[:, 3:, :], table_size=self.wordpos_num,
                                        window_size=self.max_sentence_length,
                                        feature_num=self.wordpos_feature_num,
                                        reshp=(x.shape[0], self.max_term_per_sent, 1, x.shape[2] * self.wordpos_feature_num))

    # conv_word.output shape:    (batch_size, 1, conv_hidden_feature_num, max_sentence_length - conv_window + 1)
    # conv_POS.output shape:     (batch_size, 1, conv_hidden_feature_num, max_sentence_length - conv_window + 1)
    # conv_verbpos.output shape: (batch_size, 1, conv_hidden_feature_num, max_sentence_length - conv_window + 1)
    # conv_wordpos.output shape: (batch_size, max_term_per_sent, conv_hidden_feature_num, max_sentence_length - conv_window + 1)
    # Note: all outputs above have already been 'dimshuffle'd.
    self.conv_word = Conv1DLayer('conv_word', rng, self.wordvec.output,
                                 self.conv_hidden_feature_num, 1, self.conv_window, self.word_feature_num)
    self.conv_POS = Conv1DLayer('conv_POS', rng, self.POSvec.output,
                                self.conv_hidden_feature_num, 1, self.conv_window, self.POS_feature_num)
    self.conv_verbpos = Conv1DLayer('conv_verbpos', rng, self.verbpos_vec.output,
                                    self.conv_hidden_feature_num, 1, self.conv_window, self.verbpos_feature_num)
    self.conv_wordpos = Conv1DLayer('conv_wordpos', rng, self.wordpos_vec.output,
                                    self.conv_hidden_feature_num, self.max_term_per_sent, self.conv_window, self.wordpos_feature_num)

    # The first max_term_per_sent axis indexes the word being predicted;
    # the last axis holds the per-position conv outputs.
    # conv_out shape: (batch_size, max_term_per_sent, conv_hidden_feature_num, max_term_per_sent)
    self.conv_out = self.conv_word.output + self.conv_POS.output + self.conv_verbpos.output + self.conv_wordpos.output
    self.conv_out = self.conv_out.dimshuffle(1, 0, 2, 3, 4).reshape(
        (x.shape[0], self.max_term_per_sent, self.conv_hidden_feature_num, -1))

    # max_out shape: (batch_size, max_term_per_sent, conv_hidden_feature_num)
    self.max_out = T.max(self.conv_out, axis=3).reshape(
        (self.conv_out.shape[0], self.max_term_per_sent, -1))

    # Hidden layer: one linear map followed by a nonlinear transform; the
    # likelihood layer then applies another linear map. This is what SENNA
    # does (p. 7, figure 1).
    # hidden_layer output shape: (batch_size, max_term_per_sent, hidden_layer_size)
    self.hidden_layer = HiddenLayer(rng=rng, input=self.max_out,
                                    n_in=self.conv_hidden_feature_num,
                                    n_out=self.hidden_layer_size,
                                    activation=T.tanh)

    # TODO: we use pointwise likelihood here
    self.sentce_loglikelihood = PathTransitionLayer(rng, self.hidden_layer.output,
                                                    self.y, self.masks,
                                                    self.max_term_per_sent,
                                                    self.hidden_layer_size,
                                                    self.tags_num)

    self._likelihood = self.sentce_loglikelihood.negative_log_likelihood_pointwise()
    self._errors = self.sentce_loglikelihood.errors()
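
# A minimal usage sketch for the constructor above. All numeric values are
# illustrative, the symbolic input types are guesses, and `SRLNetwork` stands
# in for the enclosing class, whose name is not shown in this section; only
# the model_params keys are taken from the __init__ body.
def build_model_example():
    rng = np.random.RandomState(1234)
    model_params = {
        'L1_reg': 0.0, 'L2_reg': 1e-4,
        'max_sentence_length': 104, 'window_size': 5,  # so max_term_per_sent = 100
        'word_num': 30000, 'POS_num': 50,
        'verbpos_num': 200, 'wordpos_num': 200,
        'word_feature_num': 50, 'POS_feature_num': 10,
        'wordpos_feature_num': 10, 'verbpos_feature_num': 10,
        'conv_window': 3, 'conv_hidden_feature_num': 100,
        'hidden_layer_size': 300, 'tags_num': 20,
    }
    x = T.itensor3('x')  # (batch_size, max_term_per_sent + 3, max_sentence_length)
    y = T.imatrix('y')
    sent_length = T.ivector('sent_length')
    masks = T.fmatrix('masks')
    return SRLNetwork(rng, x, y, sent_length, masks, model_params)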