def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], type_hidden_units=[200, 100, 6], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5, sen_reg=False, L2=False): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") type_y = T.ivector("y_type") pop_y = T.ivector("y_pop") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm)) ######################### # Construct Sen Vec ##### ######################### conv_layers = [] filter_shape = (num_maps, 1, filter_hs[0], emb_dm) pool_size = (input_height - filter_hs[0] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape((x.shape[0], x.shape[1], num_maps)) conv_layers.append(conv_layer) ######################## ## Task 1: populaiton### ######################## pop_layer_sizes = zip(hidden_units, hidden_units[1:]) pop_layer_input = sen_vecs pop_drop_input = sen_vecs pop_hidden_outs = [] pop_drop_outs = [] pop_hidden_layers = [] pop_drop_layers = [] droprate = 0.5 for layer_size in pop_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) pop_hidden_layers.append(pop_hidden_layer) pop_drop_layers.append(pop_drop_hidden_layer) pop_hidden_out = pop_hidden_layer.output pop_drop_out = pop_drop_hidden_layer.output pop_layer_input = pop_hidden_out pop_drop_input = pop_drop_out pop_hidden_outs.append(pop_hidden_out) pop_drop_outs.append(pop_drop_out) # construct pop classifier n_in, n_out = pop_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out, ), dtype=theano.config.floatX) pop_W = theano.shared(W_value, borrow=True, name="pop_W") pop_b = theano.shared(b_value, borrow=True, name="pop_b") pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b pop_drop_act = T.dot(pop_drop_outs[-1], pop_W) + pop_b #pop_max_act = 
T.max(pop_act, axis=1).flatten(2) #pop_drop_max_act = T.max(pop_drop_act, axis=1).flatten(2) pop_sum_act = T.sum(pop_act, axis=1).flatten(2) pop_drop_sum_act = T.sum(pop_drop_act, axis=1).flatten(2) pop_sen_max = T.argmax(T.max(pop_act, axis=2).flatten(2), axis=1) pop_drop_sen_max = T.argmax(T.max(pop_drop_act, axis=2).flatten(2), axis=1) #pop_probs = T.nnet.softmax(pop_max_act) #pop_drop_probs = T.nnet.softmax(pop_drop_max_act) pop_probs = T.nnet.softmax(pop_sum_act) pop_drop_probs = T.nnet.softmax(pop_drop_sum_act) pop_y_pred = T.argmax(pop_probs, axis=1) pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1) pop_neg_loglikelihood = -T.mean( T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_drop_neg_loglikelihood = -T.mean( T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_errors = T.mean(T.neq(pop_y_pred, pop_y)) pop_errors_detail = T.neq(pop_y_pred, pop_y) pop_cost = pop_neg_loglikelihood pop_drop_cost = pop_drop_neg_loglikelihood ######################## ## Task 1: event type### ######################## type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:]) type_layer_input = sen_vecs type_drop_input = sen_vecs type_hidden_outs = [] type_drop_outs = [] type_hidden_layers = [] type_drop_layers = [] droprate = 0.5 for layer_size in type_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") type_hidden_layer = nn.HiddenLayer(rng, type_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) type_hidden_layers.append(type_hidden_layer) type_drop_layers.append(type_drop_hidden_layer) type_hidden_out = type_hidden_layer.output type_drop_out = type_drop_hidden_layer.output type_layer_input = type_hidden_out type_drop_input = type_drop_out type_hidden_outs.append(type_hidden_out) type_drop_outs.append(type_drop_out) # construct pop classifier n_in, n_out = type_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out, ), dtype=theano.config.floatX) type_W = theano.shared(W_value, borrow=True, name="pop_W") type_b = theano.shared(b_value, borrow=True, name="pop_b") type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b #type_max_act = T.max(type_act, axis=1).flat2en(2) #type_drop_max_act = T.max(type_drop_act, axis=1).flatten(2) type_sum_act = T.sum(type_act, axis=1).flatten(2) type_drop_sum_act = T.sum(type_drop_act, axis=1).flatten(2) type_sen_max = T.argmax(T.max(type_act, axis=2).flatten(2), axis=1) type_drop_sen_max = T.argmax(T.max(type_drop_act, axis=2).flatten(2), axis=1) #type_probs = T.nnet.softmax(type_max_act) #type_drop_probs = T.nnet.softmax(type_drop_max_act) type_probs = T.nnet.softmax(type_sum_act) type_drop_probs = T.nnet.softmax(type_drop_sum_act) type_y_pred = T.argmax(type_probs, axis=1) type_drop_y_pred = T.argmax(type_drop_probs, axis=1) type_neg_loglikelihood = -T.mean( T.log(type_probs)[T.arange(type_y.shape[0]), type_y]) type_drop_neg_loglikelihood = -T.mean( T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y]) type_errors = T.mean(T.neq(type_y_pred, type_y)) type_errors_detail = T.neq(type_y_pred, type_y) type_cost = type_neg_loglikelihood type_drop_cost = 
type_drop_neg_loglikelihood ################################### ## Choose the max sens in two task# ################################### pop_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), pop_drop_sen_max] type_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_drop_sen_max] simi_drop_cost = T.mean( T.exp( T.sum((pop_drop_choosed_sens - type_drop_choosed_sens)**2, axis=1))) pop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), pop_sen_max] type_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_sen_max] simi_cost = T.mean( T.exp(T.sum((pop_choosed_sens - type_choosed_sens)**2, axis=1))) ################################## # Collect all the parameters ##### ################################## params = [] # convolution layer params for conv_layer in conv_layers: params += conv_layer.params # params for population task for layer in pop_drop_layers: params += layer.params params.append(pop_W) params.append(pop_b) # params for event type task for layer in type_drop_layers: params += layer.params params.append(type_W) params.append(type_b) if non_static: params.append(words) total_cost = pop_cost + type_cost total_drop_cost = pop_drop_cost + type_drop_cost if sen_reg: simi_weight = 0.05 total_cost += simi_weight * simi_cost total_drop_cost += simi_drop_cost if L2: l2_norm = 0.1 * T.sum(pop_W**2) + 0.1 * T.sum(type_W**2) for drop_layer in type_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W**2) for drop_layer in pop_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W**2) total_cost += l2_norm total_drop_cost += l2_norm total_grad_updates = sgd_updates_adadelta(params, total_drop_cost, lr_decay, 1e-6, sqr_norm_lim) total_preds = [pop_y_pred, type_y_pred] total_errors_details = [pop_errors_detail, type_errors_detail] total_choosed_sens = [pop_sen_max, type_sen_max] total_out = total_preds + total_errors_details + total_choosed_sens ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_pop_y, train_type_y = shared_dataset(dataset[0]) valid_x, valid_pop_y, valid_type_y = shared_dataset(dataset[1]) test_x, test_pop_y, test_type_y = shared_dataset(dataset[2]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], total_drop_cost, updates=total_grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], pop_y: train_pop_y[index * batch_size:(index + 1) * batch_size], type_y: train_type_y[index * batch_size:(index + 1) * batch_size] }) valid_train_func = function( [index], total_drop_cost, updates=total_grad_updates, givens={ x: valid_x[index * batch_size:(index + 1) * batch_size], pop_y: valid_pop_y[index * batch_size:(index + 1) * batch_size], type_y: valid_type_y[index * batch_size:(index + 1) * batch_size] }) test_pred_detail = function( [index], total_out, givens={ x: test_x[index * batch_size:(index + 1) * batch_size], pop_y: test_pop_y[index * batch_size:(index + 1) * batch_size], type_y: test_type_y[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_valid = len(dataset[1][0]) n_test = len(dataset[2][0]) epoch = 0 best_params = None best_validation_score = 0. 
test_perf = 0 done_loop = False log_file = open(log_fn, 'w') print "Start to train the model....." total_score = 0.0 while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) # do validatiovalidn valid_cost = [ valid_train_func(i) for i in np.random.permutation(xrange(n_valid_batches)) ] if epoch % print_freq == 0: # do test pop_preds = [] type_preds = [] pop_errors = [] type_errors = [] pop_sens = [] type_sens = [] for i in xrange(n_test_batches): test_pop_pred, test_type_pred, test_pop_error, test_type_error, test_pop_sen, test_type_sen = test_pred_detail( i) pop_preds.append(test_pop_pred) type_preds.append(test_type_pred) pop_errors.append(test_pop_error) type_errors.append(test_type_error) pop_sens.append(test_pop_sen) type_sens.append(test_type_sen) pop_preds = np.concatenate(pop_preds) type_preds = np.concatenate(type_preds) pop_errors = np.concatenate(pop_errors) type_errors = np.concatenate(type_errors) pop_sens = np.concatenate(pop_sens) type_sens = np.concatenate(type_sens) pop_perf = 1 - np.mean(pop_errors) type_perf = 1 - np.mean(type_errors) # dumps the predictions and the choosed sentences with open( os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf: for p in pop_preds: epf.write("%d\n" % int(p)) with open( os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf: for p in type_preds: epf.write("%d\n" % int(p)) print pop_sens with open( os.path.join(perf_fn, "%s_%d.pop_sens" % (exp_name, epoch)), 'w') as epf: for s in pop_sens: epf.write("%d\n" % int(s)) with open( os.path.join(perf_fn, "%s_%d.type_sens" % (exp_name, epoch)), 'w') as epf: for s in type_sens: epf.write("%d\n" % int(s)) message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % ( epoch, pop_perf, type_perf, np.mean(costs)) print message log_file.write(message + "\n") log_file.flush() if (pop_perf + type_perf) > total_score: total_score = pop_perf + type_perf # save the model model_name = os.path.join( perf_fn, "%s_%d.best_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) end_time = timeit.default_timer() print "Finish one iteration using %f m" % ( (end_time - start_time) / 60.) # output the final model params print "Output the final model" model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) log_file.flush() log_file.close()
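# The run_cnn above depends on a `shared_dataset` helper that is not defined
# in this file. A minimal sketch of what it is assumed to do, following the
# standard Theano pattern: copy the numpy arrays into shared variables (so the
# data can live on the GPU) and cast the label vectors to int32. The exact
# three-element tuple layout (x, pop_y, type_y) is taken from the call site
# above; other functions in this file unpack different arities.
def shared_dataset(data_xyz, borrow=True):
    data_x, data_pop_y, data_type_y = data_xyz
    shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_pop_y = theano.shared(np.asarray(data_pop_y,
                                            dtype=theano.config.floatX),
                                 borrow=borrow)
    shared_type_y = theano.shared(np.asarray(data_type_y,
                                             dtype=theano.config.floatX),
                                  borrow=borrow)
    # labels are stored as floatX on the device but used as int32 indices
    return (shared_x,
            T.cast(shared_pop_y, 'int32'),
            T.cast(shared_type_y, 'int32'))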
def run_experiment(self, dataset, word_embedding, exp_name): # load parameters num_maps_word = self.options["num_maps_word"] drop_rate_word = self.options["drop_rate_word"] drop_rate_sentence = self.options["drop_rate_sentence"] word_window = self.options["word_window"] word_dim = self.options["word_dim"] k_max_word = self.options["k_max_word"] batch_size = self.options["batch_size"] rho = self.options["rho"] epsilon = self.options["epsilon"] norm_lim = self.options["norm_lim"] max_iteration = self.options["max_iteration"] sentence_len = len(dataset[0][0][0][0]) # compute the sentence flags train_flags, test_flags = construct_sentence_flag(dataset) train_flags = theano.shared(value=np.asarray(train_flags, dtype=theano.config.floatX), borrow=True) test_flags = theano.shared(value=np.asarray(test_flags, dtype=theano.config.floatX), borrow=True) # define the parameters x = T.tensor3("x") y = T.ivector("y") sen_flags = T.matrix("flag") rng = np.random.RandomState(1234) words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX), name="embedding", borrow=True) zero_vector_tensor = T.vector() zero_vec = np.zeros(word_dim, dtype=theano.config.floatX) set_zero = theano.function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))]) x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape((x.shape[0]*x.shape[1], 1, x.shape[2], words.shape[1])) dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word) # compute convolution on words layer word_filter_shape = (num_maps_word, 1, word_window, word_dim) word_pool_size = (sentence_len - word_window + 1, 1) dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word) sent_vec_dim = num_maps_word*k_max_word dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim)) word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb*(1 - drop_rate_word), input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word, W=dropout_word_conv.W, b=dropout_word_conv.b) sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim)) # construct sentence level classifier n_in = sent_vec_dim n_out = 1 sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX) sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W") sen_b_value = nn.as_floatX(0.0) sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b") drop_sent_prob = T.nnet.sigmoid(T.dot(dropout_sent_vec, sen_W) + sen_b) sent_prob = T.nnet.sigmoid(T.dot(sent_vec, sen_W*(1-drop_rate_sentence)) + sen_b) # reform the sent vec to doc level drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1])) sent_prob = sent_prob.reshape((x.shape[0], x.shape[1])) # the pos probability bag label is the avg of the probs drop_doc_prob = T.sum(drop_sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1) doc_prob = T.sum(sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1) drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7 )) doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7 )) """ # the pos probability bag label equals to 1 - all negative drop_doc_prob = T.prod(drop_sent_prob, axis=1) drop_doc_prob = T.set_subtensor(drop_doc_prob[:,1], 1 - drop_doc_prob[:,0]) doc_prob = T.prod(sent_prob, axis=1) doc_prob = T.set_subtensor(doc_prob[:,1], 1 - doc_prob[:,0]) # the pos probability bag label is 
the most positive probability
        drop_doc_prob = T.max(drop_sent_prob, axis=1)
        drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
        doc_prob = T.max(sent_prob, axis=1)
        doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
        """
        doc_preds = doc_prob > 0.5

        # instance-level hinge cost on the sentence scores
        drop_sent_cost = T.sum(
            T.maximum(0.0,
                      nn.as_floatX(.5) -
                      T.sgn(drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) -
                            nn.as_floatX(0.6)) *
                      T.dot(dropout_sent_vec, sen_W)) *
            sen_flags.reshape((x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags)

        # we require the most positive instance to score at least 0.7 in positive bags
        # and at most 0.1 in negative bags;
        # we also want at least a few positive instances in each positive bag
        # and none in the negative bags.
        # compute the number of positive instances per bag
        positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1)
        pos_cost = T.maximum(nn.as_floatX(0.0), nn.as_floatX(2) - positive_count)
        neg_cost = T.maximum(nn.as_floatX(0.0), positive_count)
        """
        most_positive_prob = T.max(drop_sent_prob, axis=1)
        pos_cost = T.maximum(0.0, nn.as_floatX(0.6) - most_positive_prob)
        neg_cost = T.maximum(0.0, most_positive_prob - nn.as_floatX(0.05))
        """
        penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y))

        # add the sentence similarity constraint:
        # sentences that are close in the embedding space should get similar scores
        sen_sen = T.dot(dropout_sent_vec, dropout_sent_vec.T)
        sen_sqr = T.sum(dropout_sent_vec ** 2, axis=1)
        sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
        sen_sqr_right = sen_sqr.dimshuffle('x', 0)
        # pairwise squared Euclidean distances: ||a||^2 - 2*a.b + ||b||^2
        sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
        sen_sim_matrix = T.exp(-1 * sen_sim_matrix)
        sen_sim_prob = drop_sent_prob.reshape((x.shape[0] * x.shape[1], 1)) - drop_sent_prob.flatten()
        sen_sim_prob = sen_sim_prob ** 2
        sen_sim_flag = T.dot(sen_flags.reshape((x.shape[0] * x.shape[1], 1)),
                             sen_flags.reshape((1, x.shape[0] * x.shape[1])))
        sen_sim_cost = T.sum(sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag)

        # bag-level cost (weighted cross entropy)
        drop_bag_cost = T.mean(-y * T.log(drop_doc_prob) * nn.as_floatX(0.6) -
                               (1 - y) * T.log(1 - drop_doc_prob) * nn.as_floatX(0.4))
        #drop_cost = drop_bag_cost * nn.as_floatX(3.0) + drop_sent_cost + nn.as_floatX(2.0) * penal_cost
        drop_cost = (drop_bag_cost * nn.as_floatX(0.6) +
                     drop_sent_cost * nn.as_floatX(0.1) +
                     penal_cost * nn.as_floatX(0.5) +
                     sen_sim_cost * nn.as_floatX(0.0001))

        # collect parameters
        self.params.append(words)
        self.params += dropout_word_conv.params
        self.params.append(sen_W)
        self.params.append(sen_b)

        grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho,
                                               epsilon, norm_lim)

        # construct the dataset
        train_x, train_y = nn.shared_dataset(dataset[0])
        test_x, test_y = nn.shared_dataset(dataset[1])
        test_cpu_y = dataset[1][1]

        n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
        n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

        # construct the model
        index = T.iscalar()
        train_func = theano.function(
            [index],
            [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost, sen_sim_cost],
            updates=grad_updates,
            givens={
                x: train_x[index * batch_size:(index + 1) * batch_size],
                y: train_y[index * batch_size:(index + 1) * batch_size],
                sen_flags: train_flags[index * batch_size:(index + 1) * batch_size]
            })

        test_func = theano.function(
            [index], doc_preds,
            givens={
                x: test_x[index * batch_size:(index + 1) * batch_size],
                sen_flags: test_flags[index * batch_size:(index + 1) * batch_size]
            })

        get_train_sent_prob = theano.function(
            [index], sent_prob,
            givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

        get_test_sent_prob = theano.function(
            [index], sent_prob,
            givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

        epoch = 0
        best_score = 0

        log_file = open("./log/%s.log" % exp_name, 'w')

        while epoch <= max_iteration:
            start_time = timeit.default_timer()
            epoch += 1
            costs = []

            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_func(minibatch_index)
                costs.append(cost_epoch)
                set_zero(zero_vec)

            total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip(*costs)
            print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim cost %f\n" % (
                epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
                np.mean(train_sent_cost), np.mean(train_penal_cost),
                np.mean(train_sim_cost))

            if epoch % 1 == 0:
                test_preds = []
                for i in xrange(n_test_batches):
                    test_y_pred = test_func(i)
                    test_preds.append(test_y_pred)
                test_preds = np.concatenate(test_preds)
                test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

                precision, recall, beta, support = precision_recall_fscore_support(
                    test_cpu_y, test_preds, pos_label=1)

                if beta[1] > best_score or epoch % 5 == 0:
                    best_score = beta[1]
                    # save the sentence probabilities
                    train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                    test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]
                    train_sens = np.concatenate(train_sens, axis=0)
                    test_sens = np.concatenate(test_sens, axis=0)

                    out_train_sent_file = "./results/%s_train_sent_%d.vec" % (exp_name, epoch)
                    out_test_sent_file = "./results/%s_test_sent_%d.vec" % (exp_name, epoch)
                    with open(out_test_sent_file, 'w') as test_f, open(out_train_sent_file, 'w') as train_f:
                        cPickle.dump(train_sens, train_f)
                        cPickle.dump(test_sens, test_f)

                    print "Get best performance at %d iteration %f" % (epoch, test_score)
                    log_file.write("Get best performance at %d iteration %f \n" % (epoch, test_score))

                end_time = timeit.default_timer()
                print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta
                log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
                    epoch, precision[0], precision[1], recall[0], recall[1],
                    beta[0], beta[1], np.mean(total_train_cost),
                    np.mean(train_bag_cost), np.mean(train_sent_cost),
                    np.mean(train_penal_cost)))
                print "Using time %f m" % ((end_time - start_time) / 60.)
                log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))

            end_time = timeit.default_timer()
            print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.)
            log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))
            log_file.flush()

        log_file.close()
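# `construct_sentence_flag` is called by run_experiment above but is not
# defined in this file. A minimal sketch of the assumed behaviour (names and
# details are assumptions): for every document, emit a 0/1 flag per sentence,
# where a sentence counts as real if it contains at least one non-padding
# word id (id 0 is reserved for the zero padding vector, see set_zero above).
def construct_sentence_flag(dataset):
    def flags_for(docs):
        # docs: [num_docs][num_sentences][sentence_len] word ids
        return [[1.0 if np.any(np.asarray(sen) != 0) else 0.0 for sen in doc]
                for doc in docs]
    train_flags = flags_for(dataset[0][0])
    test_flags = flags_for(dataset[1][0])
    return train_flags, test_flags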
def construct_model(params, datasets, filter_hs=[3, 4, 5], batch_size=200):
    rng = np.random.RandomState(1234)
    input_height = len(datasets[0][0]) - 2
    input_width = params["embedding"].shape[1]
    filter_shapes = [p[0].shape for p in params["convs"]]
    pool_sizes = [(input_height - s[2] + 1, input_width - s[3] + 1)
                  for s in filter_shapes]
    param_sizes = {
        "input_height": input_height,
        "input_width": input_width,
        "filter_shapes": filter_shapes,
        "pool_sizes": pool_sizes
    }
    print "Param sizes: ", param_sizes

    index = T.iscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    print '....Construct model'
    word_embedding = params["embedding"]
    words = shared(word_embedding, name='embedding')

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], words.shape[1]))

    # construct layers
    conv_layers = []
    conv_params = params["convs"]
    layer1_inputs = []
    for i, filter_h in enumerate(filter_hs):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_W = shared(value=np.asarray(conv_params[i][0],
                                         dtype=theano.config.floatX),
                        borrow=True, name='conv_W')
        conv_b = shared(value=np.asarray(conv_params[i][1],
                                         dtype=theano.config.floatX),
                        borrow=True, name='conv_b')
        conv_layer = nn.ConvPoolLayer(rng, input=layer0_input,
                                      input_shape=(batch_size, 1,
                                                   input_height, input_width),
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=ReLU,
                                      W=conv_W, b=conv_b)
        conv_layers.append(conv_layer)
        layer1_input = conv_layer.output.flatten(2)
        layer1_inputs.append(layer1_input)
    layer1_input = T.concatenate(layer1_inputs, 1)

    # population classifier
    pop_hidden_units = [300, 13]
    clf_w, clf_b = params["clf"]
    Ws = [shared(value=np.asarray(clf_w, dtype=theano.config.floatX),
                 borrow=True, name='logis_w')]
    bs = [shared(value=np.asarray(clf_b, dtype=theano.config.floatX),
                 borrow=True, name='logis_b')]
    pop_classifier = nn.MLPDropout(rng,
                                   input=layer1_input,
                                   layer_sizes=pop_hidden_units,
                                   dropout_rates=[0.5],
                                   activations=[ReLU],
                                   Ws=Ws, bs=bs)
    pop_loss = pop_classifier.errors(y)
    pop_pred = pop_classifier.preds

    # construct data set
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))

    train_set = new_data[:n_train_batches * batch_size, :]
    train_set_x = theano.shared(np.asarray(train_set[:, :input_height],
                                           dtype=theano.config.floatX),
                                borrow=True)
    train_set_pop_y = T.cast(
        theano.shared(np.asarray(train_set[:, -2],
                                 dtype=theano.config.floatX),
                      borrow=True), 'int32')

    print '...construct test function'
    test_fn = function(
        inputs=[index],
        outputs=[pop_loss, pop_pred],
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_pop_y[index * batch_size:(index + 1) * batch_size]
        })
    results = [test_fn(i) for i in xrange(n_train_batches)]
    pop_losses = [r[0] for r in results]
    pop_train_perf = 1 - np.mean(pop_losses)
    pop_predictions = np.concatenate([r[1] for r in results])

    rs = {}
    rs["pop_preds"] = list(pop_predictions)
    rs["pop_truth"] = list(map(int, train_set[:, -2]))
    print "Population Train Performance %f" % pop_train_perf
    return rs
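# `ReLU`, `Tanh` and `as_floatX` are referenced throughout this file but not
# defined here. A sketch of the assumed element-wise helpers, in the usual
# Theano style (np / theano / T follow the imports the rest of this file
# already assumes):
def as_floatX(value):
    # cast a scalar / numpy value to the configured float type
    return np.asarray(value, dtype=theano.config.floatX)

def ReLU(x):
    # element-wise rectified linear unit
    return T.maximum(0.0, x)

def Tanh(x):
    # element-wise hyperbolic tangent
    return T.tanh(x)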
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, k=0, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." word_x = T.tensor3("word_x") freq_x = T.tensor3("freq_x") pos_x = T.tensor3("pos_x") sent_x = T.matrix("sent_x") y_event = T.ivector("y_event") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) sym_dim = 20 # the frequency embedding is 21 * sym_dim matrix freq_val = np.random.random((21, sym_dim)).astype(theano.config.floatX) freqs = shared(value=freq_val, borrow=True, name="freqs") pos_val = np.random.random((21, sym_dim)).astype(theano.config.floatX) poss = shared(value=pos_val, borrow=True, name="poss") # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(emb_dm, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) freq_zero_tensor = T.vector() freq_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX) freq_set_zero = function([freq_zero_tensor], updates=[(freqs, T.set_subtensor(freqs[0, :], freq_zero_tensor))]) pos_zero_tensor = T.vector() pos_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX) pos_set_zero = function([pos_zero_tensor], updates=[(poss, T.set_subtensor(poss[0, :], pos_zero_tensor))]) word_x_emb = words[T.cast(word_x.flatten(), dtype="int32")].reshape( (word_x.shape[0] * word_x.shape[1], 1, word_x.shape[2], emb_dm)) freq_x_emb = freqs[T.cast(freq_x.flatten(), dtype="int32")].reshape( (freq_x.shape[0] * freq_x.shape[1], 1, freq_x.shape[2], sym_dim)) pos_x_emb = poss[T.cast(pos_x.flatten(), dtype="int32")].reshape( (pos_x.shape[0] * pos_x.shape[1], 1, pos_x.shape[2], sym_dim)) layer0_input = T.concatenate([word_x_emb, freq_x_emb, pos_x_emb], axis=3) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = (num_maps, 1, filter_hs[i], emb_dm + sym_dim + sym_dim) pool_size = (input_height - filter_hs[i] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape( (word_x.shape[0], 1, word_x.shape[1], num_maps)) # construct multi-layer sentence vectors conv_layers.append(conv_layer) layer1_inputs.append(sen_vecs) sen_vec = T.concatenate(layer1_inputs, 3) # score the sentences theta_value = np.random.random((len(filter_hs) * num_maps, 1)) theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX), name="theta", borrow=True) weighted_sen_vecs, sen_score = keep_max(sen_vec, theta, k, sent_x) sen_score_cost = T.mean(T.sum(sen_score, axis=2).flatten(1)) doc_vec = T.sum(weighted_sen_vecs, axis=2) layer1_input 
= doc_vec.flatten(2) final_sen_score = sen_score.flatten(2) ############## # classifier pop# ############## params = [] for conv_layer in conv_layers: params += conv_layer.params params.append(theta) params.append(words) params.append(freqs) params.append(poss) gamma = as_floatX(0.001) beta1 = as_floatX(0.000) beta2 = as_floatX(0.000) total_cost = gamma * sen_score_cost total_dropout_cost = gamma * sen_score_cost print "Construct classifier ...." hidden_units[0] = num_maps * len(filter_hs) model = nn.MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=[dropout_rate], activations=[activation]) params += model.params cost = model.negative_log_likelihood(y_event) dropout_cost = model.dropout_negative_log_likelihood(y_event) total_cost += cost + beta1 * model.L1 total_dropout_cost += dropout_cost + beta1 * model.L1 # using adagrad total_grad_updates = sgd_updates_adadelta(params, total_dropout_cost, lr_decay, 1e-6, sqr_norm_lim) total_preds = model.preds ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" train_word_x, train_freq_x, train_pos_x, train_sent_x, train_event_y = shared_dataset( dataset[0]) test_word_x, test_freq_x, test_pos_x, test_sent_x, test_event_y = shared_dataset( dataset[1]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], total_cost, updates=total_grad_updates, givens={ word_x: train_word_x[index * batch_size:(index + 1) * batch_size], freq_x: train_freq_x[index * batch_size:(index + 1) * batch_size], pos_x: train_pos_x[index * batch_size:(index + 1) * batch_size], sent_x: train_sent_x[index * batch_size:(index + 1) * batch_size], y_event: train_event_y[index * batch_size:(index + 1) * batch_size], }) test_pred = function( [index], total_preds, givens={ word_x: test_word_x[index * batch_size:(index + 1) * batch_size], freq_x: test_freq_x[index * batch_size:(index + 1) * batch_size], pos_x: test_pos_x[index * batch_size:(index + 1) * batch_size], sent_x: test_sent_x[index * batch_size:(index + 1) * batch_size] }) test_sentence_est = function( [index], final_sen_score, givens={ word_x: test_word_x[index * batch_size:(index + 1) * batch_size], freq_x: test_freq_x[index * batch_size:(index + 1) * batch_size], pos_x: test_pos_x[index * batch_size:(index + 1) * batch_size], sent_x: test_sent_x[index * batch_size:(index + 1) * batch_size] }) train_sentence_est = function( [index], final_sen_score, givens={ word_x: train_word_x[index * batch_size:(index + 1) * batch_size], freq_x: train_freq_x[index * batch_size:(index + 1) * batch_size], pos_x: train_pos_x[index * batch_size:(index + 1) * batch_size], sent_x: train_sent_x[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_test = len(dataset[1][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'w') print "Start to train the model....." 
    cpu_tst_event_y = np.asarray(dataset[1][4])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score

    best_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)
            freq_set_zero(freq_zero_vec)
            pos_set_zero(pos_zero_vec)

        if epoch % 1 == 0:
            # do test
            test_event_preds = np.concatenate(
                [test_pred(i) for i in xrange(n_test_batches)])
            test_event_score = compute_score(cpu_tst_event_y, test_event_preds)
            precision, recall, beta, support = precision_recall_fscore_support(
                cpu_tst_event_y, test_event_preds, pos_label=1)

            with open(os.path.join(perf_fn,
                                   "%s_%d.event_pred" % (exp_name, epoch)),
                      'w') as epf:
                for p in test_event_preds:
                    epf.write("%d\n" % int(p))

            message = "Epoch %d test event perf %f, precision [%f, %f], recall[%f %f] , f1[%f, %f], train cost %f" % (
                epoch, test_event_score,
                precision[0], precision[1],
                recall[0], recall[1],
                beta[0], beta[1],
                np.mean(costs))
            evl_score = beta[1]

            print message
            log_file.write(message + "\n")
            log_file.flush()

            if (evl_score > best_score):
                best_score = evl_score
                # save the sentence score
                test_sen_score = [
                    test_sentence_est(i) for i in xrange(n_test_batches)
                ]
                score_file = "./results/%s_%d_test.score" % (exp_name, epoch)
                with open(score_file, "wb") as sm:
                    cPickle.dump(test_sen_score, sm)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)
        log_file.flush()

    log_file.close()
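# `keep_max` is called above (keep_max(sen_vec, theta, k, sent_x)) but defined
# elsewhere. A sketch of the assumed behaviour, with hypothetical details:
# score every sentence with the vector theta, mask out padding sentences,
# keep only the k highest-scoring sentences per document (all of them when
# k == 0), and weight the sentence vectors by their scores. Only the shapes
# follow from the call sites; the masking scheme is an assumption.
def keep_max_sketch(sen_vec, theta, k, sent_mask=None):
    # sen_vec: (n_docs, 1, n_sens, dim), theta: (dim, 1)
    sen_score = T.nnet.sigmoid(T.dot(sen_vec, theta))  # (n_docs, 1, n_sens, 1)
    if sent_mask is not None:
        # sent_mask: (n_docs, n_sens) with 1 for real sentences
        sen_score = sen_score * sent_mask.dimshuffle(0, 'x', 1, 'x')
    if k > 0:
        # zero out everything below the k-th largest score per document
        kth = T.sort(sen_score, axis=2)[:, :, -k, :].dimshuffle(0, 1, 'x', 2)
        sen_score = sen_score * (sen_score >= kth)
    weighted_sen_vecs = sen_vec * sen_score
    return weighted_sen_vecs, sen_score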
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, sen_weight=False): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) # number of words in the sentences num_sens = len(dataset[0][0][0]) # number of sentences print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") y = T.ivector("y") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) # the input for the sentence level conv layers layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm)) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = (num_maps, 1, filter_hs[i], emb_dm) pool_size = (input_height - filter_hs[i] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape( (x.shape[0], x.shape[1], num_maps)) sen_vecs = sen_vecs.dimshuffle(0, 2, 1) # construct the weighted sentences if sen_weight: # using sentence weight #s_w = 1. / T.arange(1, x.shape[1] + 1) s_w = T.arange(1, x.shape[1] + 1) s_w = (1.0 * x.shape[0] - s_w) / T.sum(s_w) sen_vecs = sen_vecs * s_w # using max in each dimension to represent the document vec doc_vec = T.sum(sen_vecs, axis=2).flatten(2) layer1_inputs.append(doc_vec) conv_layers.append(conv_layer) """ doc_filter_shape = (num_maps, 1, 2, num_maps) doc_pool_size = (num_sens - 2 + 1, 1) doc_conv_layer = nn.ConvPoolLayer(rng, input=sen_vecs, input_shape=None, filter_shape=doc_filter_shape, pool_size=doc_pool_size, activation=activation) layer1_input = doc_conv_layer.output.flatten(2) conv_layers.append(conv_layer) conv_layers.append(doc_conv_layer) layer1_inputs.append(layer1_input) """ layer1_input = T.concatenate(layer1_inputs, 1) ############## # classifier # ############## print "Construct classifier ...." 
hidden_units[0] = num_maps * len(filter_hs) model = nn.MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=[dropout_rate], activations=[activation]) params = model.params for conv_layer in conv_layers: params += conv_layer.params if non_static: params.append(words) cost = model.negative_log_likelihood(y) dropout_cost = model.dropout_negative_log_likelihood(y) grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_y = shared_dataset(dataset[0]) valid_x, valid_y = shared_dataset(dataset[1]) test_x, test_y = shared_dataset(dataset[2]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], cost, updates=grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size] }) valid_train_func = function( [index], cost, updates=grad_updates, givens={ x: valid_x[index * batch_size:(index + 1) * batch_size], y: valid_y[index * batch_size:(index + 1) * batch_size] }) train_pred = function( [index], model.preds, givens={x: train_x[index * batch_size:(index + 1) * batch_size]}) valid_pred = function([index], model.preds, givens={ x: valid_x[index * batch_size:(index + 1) * batch_size], }) test_pred = function([index], model.preds, givens={ x: test_x[index * batch_size:(index + 1) * batch_size], }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_valid = len(dataset[1][0]) n_test = len(dataset[2][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'a') print "Start to train the model....." cpu_trn_y = np.asarray(dataset[0][1]) cpu_val_y = np.asarray(dataset[1][1]) cpu_tst_y = np.asarray(dataset[2][1]) def compute_score(true_list, pred_list): mat = np.equal(true_list, pred_list) score = np.mean(mat) return score best_test_score = 0. while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) # do validatiovalidn valid_cost = [ valid_train_func(i) for i in np.random.permutation(xrange(n_valid_batches)) ] if epoch % 5 == 0: # do test test_preds = np.concatenate( [test_pred(i) for i in xrange(n_test_batches)]) test_score = compute_score(cpu_tst_y, test_preds) with open(os.path.join(perf_fn, "%s_%d.pred" % (exp_name, epoch)), 'w') as epf: for p in test_preds: epf.write("%d\n" % int(p)) message = "Epoch %d test perf %f" % (epoch, test_score) print message log_file.write(message + "\n") log_file.flush() # store the best model if test_score > best_test_score: best_test_score = test_score # save the model model_name = "%s_%d.model" % (exp_name, epoch) with open(model_name, 'wb') as bm: for p in params: cPickle.dump(p.get_value(), bm) end_time = timeit.default_timer() print "Finish one iteration using %f m" % ( (end_time - start_time) / 60.) log_file.flush() log_file.close()
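# The training loops above checkpoint models by pickling each parameter value
# in order (see the `for p in params: cPickle.dump(p.get_value(), bm)` loops).
# A sketch of how such a file could be read back into a parameter list built
# in the same order; the function name and file argument are hypothetical.
def load_model_params(model_file, params):
    with open(model_file, 'rb') as mf:
        for param in params:
            # values were dumped in the same order as `params`
            param.set_value(cPickle.load(mf))
    return params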
def train_cnn_encoder(datasets, word_embedding, input_width=64, filter_hs=[3, 4, 5], hidden_units=[100, 2], dropout_rate=[0.5], shuffle_batch=True, n_epochs=100, batch_size=50, lr_decay=0.95, activations=[ReLU], sqr_norm_lim=9, non_static=True): start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(datasets[0][0]) - 2 filter_width = input_width feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_width)) pool_sizes.append( (input_height - filter_h + 1, input_width - filter_width + 1)) parameters = [("Input Shape", input_height, input_width), ("Filter Shape", filter_shapes), ("Pool Sizes", pool_sizes), ("dropout rate", dropout_rate), ("hidden units", hidden_units), ("shuffle_batch", shuffle_batch), ("n_epochs", n_epochs), ("batch size", batch_size)] print parameters # construct the model index = T.iscalar() x = T.matrix("x") y = T.ivector("y") words = shared(value=word_embedding, name="embedding") zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0], 1, x.shape[1], words.shape[1])) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=(batch_size, 1, input_height, input_width), filter_shape=filter_shape, pool_size=pool_size, activation=ReLU) layer1_input = conv_layer.output.flatten(2) conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) layer1_input = T.concatenate(layer1_inputs, 1) ################### # Population Task # ################### hidden_units[0] = feature_maps * len(filter_hs) pop_classifier = nn.MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=dropout_rate, activations=activations) pop_params = pop_classifier.params for conv_layer in conv_layers: pop_params += conv_layer.params if non_static: pop_params.append(words) pop_cost = pop_classifier.negative_log_likelihood(y) pop_dropout_cost = pop_classifier.dropout_negative_log_likelihood(y) pop_grad_updates = sgd_updates_adadelta(pop_params, pop_dropout_cost, lr_decay, 1e-6, sqr_norm_lim) ################### # EventType Task # ################### event_type_hidden_units = [feature_maps * len(filter_hs), 12] type_classifier = nn.MLPDropout(rng, input=layer1_input, layer_sizes=event_type_hidden_units, dropout_rates=dropout_rate, activations=activations) type_params = type_classifier.params for conv_layer in conv_layers: type_params += conv_layer.params if non_static: type_params.append(words) type_cost = type_classifier.negative_log_likelihood(y) type_dropout_cost = type_classifier.dropout_negative_log_likelihood(y) type_grad_updates = sgd_updates_adadelta(type_params, type_dropout_cost, lr_decay, 1e-6, sqr_norm_lim) ###################### # Construct Data Set # ###################### np.random.seed(1234) if datasets[0].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[0].shape[0] % batch_size train_set = np.random.permutation(datasets[0]) extra_data = train_set[:extra_data_num] new_data = np.append(datasets[0], extra_data, axis=0) else: new_data = datasets[0] new_data = np.random.permutation(new_data) n_batches = new_data.shape[0] / batch_size n_train_batches = int(np.round(n_batches * 
0.9)) # divide the train set intp train/val sets if datasets[1].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[1].shape[0] % batch_size test_set = np.random.permutation(datasets[1]) extra_data = test_set[:extra_data_num] new_test_data = np.append(datasets[1], extra_data, axis=0) else: new_test_data = datasets[1] test_set_x = new_test_data[:, :input_height] test_set_pop_y = np.asarray(new_test_data[:, -2], "int32") test_set_type_y = np.asarray(new_test_data[:, -1], "int32") train_set = new_data[:n_train_batches * batch_size, :] val_set = new_data[n_train_batches * batch_size:, :] print train_set[:, -1] borrow = True train_set_x = theano.shared(np.asarray(train_set[:, :input_height], dtype=theano.config.floatX), borrow=borrow) train_set_pop_y = T.cast( theano.shared(np.asarray(train_set[:, -2], dtype=theano.config.floatX), borrow=borrow), 'int32') train_set_type_y = T.cast( theano.shared(np.asarray(train_set[:, -1], dtype=theano.config.floatX), borrow=borrow), 'int32') val_set_x = theano.shared(np.asarray(val_set[:, :input_height], dtype=theano.config.floatX), borrow=borrow) val_set_pop_y = T.cast( theano.shared(np.asarray(val_set[:, -2], dtype=theano.config.floatX), borrow=borrow), 'int32') val_set_type_y = T.cast( theano.shared(np.asarray(val_set[:, -1], dtype=theano.config.floatX), borrow=borrow), 'int32') n_val_batches = n_batches - n_train_batches n_test_batches = test_set_x.shape[0] / batch_size print 'n_test_batches: %d' % n_test_batches # transform the data into shared varibale for GPU computing test_set_x = theano.shared(np.asarray(test_set_x, dtype=theano.config.floatX), borrow=borrow) test_set_pop_y = theano.shared(test_set_pop_y, borrow=True) test_set_type_y = theano.shared(test_set_type_y, borrow=True) #################### # Train Model Func # #################### # population model val_pop_model = function( [index], pop_classifier.errors(y), givens={ x: val_set_x[index * batch_size:(index + 1) * batch_size], y: val_set_pop_y[index * batch_size:(index + 1) * batch_size] }) test_pop_model = function( [index], pop_classifier.errors(y), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_pop_y[index * batch_size:(index + 1) * batch_size] }) real_test_pop_model = function( [index], pop_classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_pop_y[index * batch_size:(index + 1) * batch_size] }) train_pop_model = function( [index], pop_cost, updates=pop_grad_updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_pop_y[index * batch_size:(index + 1) * batch_size] }) # event type model val_type_model = function( [index], type_classifier.errors(y), givens={ x: val_set_x[index * batch_size:(index + 1) * batch_size], y: val_set_type_y[index * batch_size:(index + 1) * batch_size] }) test_type_model = function( [index], type_classifier.errors(y), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_type_y[index * batch_size:(index + 1) * batch_size] }) real_test_type_model = function( [index], type_classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_type_y[index * batch_size:(index + 1) * batch_size] }) train_type_model = function( [index], type_cost, updates=type_grad_updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_type_y[index * batch_size:(index + 1) * batch_size] }) """ test_pred_layers = [] test_size = test_set_x.shape[0] 
test_layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((test_size, 1, input_height, input_width)) for conv_layer in conv_layers: test_layer0_output = conv_layer.predict(test_layer0_input, test_size) test_pred_layers.append(test_layer0_output.flatten(2)) test_layer1_input = T.concatenate(test_pred_layers, 1) test_pop_y_pred = pop_classifier.predict(test_layer1_input) test_pop_error = T.mean(T.neq(test_pop_y_pred, y)) test_pop_model_all = function([x, y], test_pop_error) test_type_y_pred = type_classifier.predict(test_layer1_input) test_type_error = T.mean(T.neq(test_type_y_pred, y)) test_type_model_all = function([x, y], test_type_error) """ # start to training the model print "Start training the model...." epoch = 0 best_pop_val_perf = 0 best_type_val_perf = 0 while (epoch < n_epochs): epoch += 1 if shuffle_batch: for minibatch_index in np.random.permutation( range(n_train_batches)): if minibatch_index % 10 == 0: print minibatch_index cost_pop_epoch = train_pop_model(minibatch_index) set_zero(zero_vec) cost_type_epoch = train_type_model(minibatch_index) set_zero(zero_vec) else: for minibatch_index in xrange(n_train_batches): cost_pop_epoch = train_pop_model(minibatch_index) set_zero(zero_vec) cost_type_epoch = train_type_model(minibatch_index) set_zero(zero_vec) train_pop_losses = [test_pop_model(i) for i in xrange(n_train_batches)] train_pop_perf = 1 - np.mean(train_pop_losses) train_type_losses = [ test_type_model(i) for i in xrange(n_train_batches) ] train_type_perf = 1 - np.mean(train_type_losses) val_pop_losses = [val_pop_model(i) for i in xrange(n_val_batches)] val_pop_perf = 1 - np.mean(val_pop_losses) val_type_losses = [val_type_model(i) for i in xrange(n_val_batches)] val_type_perf = 1 - np.mean(val_type_losses) print('epoch %i, train pop perf %f %%, val pop perf %f' % (epoch, train_pop_perf * 100., val_pop_perf * 100.)) print('epoch %i, train type perf %f %%, val type perf %f' % (epoch, train_type_perf * 100., val_type_perf * 100.)) if val_pop_perf >= best_pop_val_perf: best_pop_val_perf = val_pop_perf #test_pop_losses = test_pop_model_all(test_set_x, test_set_pop_y) test_pop_losses = [ real_test_pop_model(i) for i in xrange(n_test_batches) ] test_pop_perf = 1 - np.mean(test_pop_losses) print "Test POP Performance %f under Current Best Valid perf %f" % ( test_pop_perf, val_pop_perf) if val_type_perf >= best_type_val_perf: best_type_val_perf = val_type_perf #test_type_losses = test_type_model_all(test_set_x, test_set_type_y) test_type_losses = [ real_test_type_model(i) for i in xrange(n_test_batches) ] test_type_perf = 1 - np.mean(test_type_losses) print "Test Type Performance %f under Current Best Valid perf %f" % ( test_type_perf, val_type_perf) end_time = timeit.default_timer() print "Epoch %d finish take time %fm " % (epoch, (end_time - start_time) / 60.) start_time = timeit.default_timer() return test_pop_perf, test_type_perf
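# `sgd_updates_adadelta` drives every training function in this file but is
# defined elsewhere. A sketch of a standard Adadelta update rule with the
# max-norm rescaling suggested by the sqr_norm_lim argument; exactly which
# parameters receive the norm constraint is an assumption.
from collections import OrderedDict

def sgd_updates_adadelta_sketch(params, cost, rho=0.95, epsilon=1e-6,
                                norm_lim=9):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        acc_grad = theano.shared(np.zeros_like(value), borrow=True)
        acc_delta = theano.shared(np.zeros_like(value), borrow=True)
        # accumulate squared gradients and squared updates (Adadelta)
        new_acc_grad = rho * acc_grad + (1 - rho) * grad ** 2
        step = -T.sqrt(acc_delta + epsilon) / T.sqrt(new_acc_grad + epsilon) * grad
        new_acc_delta = rho * acc_delta + (1 - rho) * step ** 2
        new_param = param + step
        # max-norm rescaling on weight matrices (not biases or the embedding)
        if param.ndim == 2 and param.name != "embedding":
            col_norms = T.sqrt(T.sum(T.sqr(new_param), axis=0))
            desired = T.clip(col_norms, 0, T.sqrt(norm_lim))
            new_param = new_param * (desired / (1e-7 + col_norms))
        updates[acc_grad] = new_acc_grad
        updates[acc_delta] = new_acc_delta
        updates[param] = new_param
    return updates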
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, k=0, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") y_type = T.ivector("y_type") y_pop = T.ivector("y_pop") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(( x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm )) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = (num_maps, 1, filter_hs[i], emb_dm) pool_size = (input_height - filter_hs[i] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape((x.shape[0], 1, x.shape[1], num_maps)) # construct multi-layer sentence vectors conv_layers.append(conv_layer) layer1_inputs.append(sen_vecs) sen_vec = T.concatenate(layer1_inputs, 3) # score the sentences theta_value = np.random.random((len(filter_hs) * num_maps, 1)) theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX), name="theta", borrow=True) weighted_sen_vecs, sen_score = keep_max(sen_vec, theta, k) doc_vec = T.max(weighted_sen_vecs, axis=2) layer1_input = doc_vec.flatten(2) final_sen_score = sen_score.flatten(2) ############## # classifier pop# ############## print "Construct classifier ...." 
hidden_units[0] = num_maps * len(filter_hs) model = nn.MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=[dropout_rate], activations=[activation]) params = model.params for conv_layer in conv_layers: params += conv_layer.params params.append(theta) if non_static: params.append(words) cost = model.negative_log_likelihood(y_pop) dropout_cost = model.dropout_negative_log_likelihood(y_pop) ####################### # classifier Type ##### ####################### type_hidden_units = [num for num in hidden_units] type_hidden_units[-1] = 5 type_model = nn.MLPDropout(rng, input=layer1_input, layer_sizes=type_hidden_units, dropout_rates=[dropout_rate], activations=[activation]) params += type_model.params type_cost = type_model.negative_log_likelihood(y_type) type_dropout_cost = type_model.dropout_negative_log_likelihood(y_type) total_cost = cost + type_cost total_dropout_cost = dropout_cost + type_dropout_cost # using adagrad lr = 0.01 """ total_grad_updates = nn.optimizer(total_dropout_cost, params, lr, method="adadelta" ) """ total_grad_updates = sgd_updates_adadelta(params, total_dropout_cost, lr_decay, 1e-6, sqr_norm_lim) total_preds = [model.preds, type_model.preds] ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_pop_y, train_type_y = shared_dataset(dataset[0]) test_x, test_pop_y, test_type_y = shared_dataset(dataset[1]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function([index], total_cost, updates=total_grad_updates, givens={ x: train_x[index*batch_size:(index+1)*batch_size], y_pop: train_pop_y[index*batch_size:(index+1)*batch_size], y_type:train_type_y[index*batch_size:(index+1)*batch_size] }) test_pred = function([index], total_preds, givens={ x:test_x[index*batch_size:(index+1)*batch_size], }) test_sentence_est = function([index], final_sen_score, givens={ x: test_x[index*batch_size:(index+1)*batch_size] }) train_sentence_est = function([index], final_sen_score, givens={ x: train_x[index*batch_size:(index+1)*batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_test = len(dataset[1][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'w') print "Start to train the model....." 
cpu_tst_pop_y = np.asarray(dataset[1][1]) cpu_tst_type_y = np.asarray(dataset[1][2]) def compute_score(true_list, pred_list): mat = np.equal(true_list, pred_list) score = np.mean(mat) return score total_score = 0.0 while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) if epoch % print_freq == 0: # do test test_pop_preds, test_type_preds = map(np.concatenate, zip(*[test_pred(i) for i in xrange(n_test_batches)])) test_pop_score = compute_score(cpu_tst_pop_y, test_pop_preds) test_type_score = compute_score(cpu_tst_type_y, test_type_preds) with open(os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf: for p in test_pop_preds: epf.write("%d\n" % int(p)) with open(os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf: for p in test_type_preds: epf.write("%d\n" % int(p)) message = "Epoch %d test pop perf %f, type perf %f" % (epoch, test_pop_score, test_type_score) print message log_file.write(message + "\n") log_file.flush() if ((test_pop_score + test_type_score) > total_score) or (epoch % 15 == 0): total_score = test_pop_score + test_type_score # save the sentence score test_sen_score = [test_sentence_est(i) for i in xrange(n_test_batches)] score_file = "./results/%s_%d_test.score" % (exp_name, epoch) with open(score_file, "wb") as sm: cPickle.dump(test_sen_score, sm) train_sen_score = [train_sentence_est(i) for i in xrange(n_train_batches)] score_file = "./results/%s_%d_train.score" % (exp_name, epoch) with open(score_file, "wb") as sm: cPickle.dump(train_sen_score, sm) end_time = timeit.default_timer() print "Finish one iteration using %f m" % ((end_time - start_time)/60.) log_file.flush() log_file.close()
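# The sentence scores above are written with cPickle.dump as a list of
# per-batch numpy arrays. A sketch of reading one of these .score files back
# and flattening it into a single (n_docs, n_sens) array; the helper name and
# file argument are hypothetical.
def load_sentence_scores(score_file):
    with open(score_file, "rb") as sm:
        batch_scores = cPickle.load(sm)
    return np.concatenate(batch_scores, axis=0)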
def run_experiment(self, dataset, word_embedding, exp_name): # load parameters num_maps_word = self.options["num_maps_word"] drop_rate_word = self.options["drop_rate_word"] word_window = self.options["word_window"] word_dim = self.options["word_dim"] k_max_word = self.options["k_max_word"] num_maps_sentence = self.options["num_maps_sentence"] drop_rate_sentence = self.options["drop_rate_sentence"] sentence_window = self.options["sentence_window"] k_max_sentence = self.options["k_max_sentence"] batch_size = self.options["batch_size"] rho = self.options["rho"] epsilon = self.options["epsilon"] norm_lim = self.options["norm_lim"] max_iteration = self.options["max_iteration"] sentence_len = len(dataset[0][0][0][0]) sentence_num = len(dataset[0][0][0]) # define the parameters x = T.tensor3("x") y = T.ivector("y") rng = np.random.RandomState(1234) words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX), name="embedding", borrow=True) zero_vector_tensor = T.vector() zero_vec = np.zeros(word_dim, dtype=theano.config.floatX) set_zero = theano.function( [zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1])) dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word) # compute convolution on words layer word_filter_shape = (num_maps_word, 1, word_window, word_dim) word_pool_size = (sentence_len - word_window + 1, 1) dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word) sent_vec_dim = num_maps_word * k_max_word dropout_sent_vec = dropout_word_conv.output.reshape( (x.shape[0], 1, x.shape[1], sent_vec_dim)) dropout_sent_vec = nn.dropout_from_layer(rng, dropout_sent_vec, drop_rate_sentence) word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb * (1 - drop_rate_word), input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word, W=dropout_word_conv.W, b=dropout_word_conv.b) sent_vec = word_conv.output.reshape( (x.shape[0], 1, x.shape[1], sent_vec_dim)) # construct the convolution layer on sentences sent_filter_shape = (num_maps_sentence, 1, sentence_window, sent_vec_dim) sent_pool_size = (sentence_num - sentence_window + 1, 1) dropout_sent_conv = nn.ConvPoolLayer(rng, input=dropout_sent_vec, input_shape=None, filter_shape=sent_filter_shape, pool_size=sent_pool_size, activation=Tanh, k=k_max_sentence) sent_conv = nn.ConvPoolLayer(rng, input=sent_vec * (1 - drop_rate_sentence), input_shape=None, filter_shape=sent_filter_shape, pool_size=sent_pool_size, activation=Tanh, k=k_max_sentence, W=dropout_sent_conv.W, b=dropout_sent_conv.b) dropout_doc_vec = dropout_sent_conv.output.flatten(2) doc_vec = sent_conv.output.flatten(2) doc_vec_dim = num_maps_sentence * k_max_sentence # construct classifier dropout_logistic_layer = nn.LogisticRegressionLayer( input=dropout_doc_vec, n_in=doc_vec_dim, n_out=2) logistic_layer = nn.LogisticRegressionLayer(input=doc_vec, n_in=doc_vec_dim, n_out=2, W=dropout_logistic_layer.W, b=dropout_logistic_layer.b) dropout_cost = dropout_logistic_layer.negative_log_likelihood(y) cost = logistic_layer.negative_log_likelihood(y) preds = logistic_layer.y_pred errors = logistic_layer.errors(y) # collect parameters self.params.append(words) self.params += dropout_word_conv.params self.params += dropout_sent_conv.params self.params += 
dropout_logistic_layer.params grad_updates = nn.sgd_updates_adadelta(self.params, dropout_cost, rho, epsilon, norm_lim) # construct the dataset train_x, train_y = nn.shared_dataset(dataset[0]) test_x, test_y = nn.shared_dataset(dataset[1]) test_cpu_y = dataset[1][1] n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) # construt the model index = T.iscalar() train_func = theano.function( [index], dropout_cost, updates=grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size] }) test_func = theano.function( [index], preds, givens={x: test_x[index * batch_size:(index + 1) * batch_size]}) get_train_sentvec = theano.function( [index], sent_vec, givens={x: train_x[index * batch_size:(index + 1) * batch_size]}) get_test_sentvec = theano.function( [index], sent_vec, givens={x: test_x[index * batch_size:(index + 1) * batch_size]}) epoch = 0 best_score = 0 raw_train_x = dataset[0][0] raw_test_x = dataset[1][0] # get the sentence number for each document number_train_sens = [] number_test_sens = [] for doc in raw_train_x: sen_num = 0 for sen in doc: if np.any(sen): sen_num += 1 number_train_sens.append(sen_num) for doc in raw_test_x: sen_num = 0 for sen in doc: if np.any(sen): sen_num += 1 number_test_sens.append(sen_num) log_file = open("./log/%s.log" % exp_name, 'w') while epoch <= max_iteration: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation( range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) if epoch % 5 == 0: test_preds = [] for i in xrange(n_test_batches): test_y_pred = test_func(i) test_preds.append(test_y_pred) test_preds = np.concatenate(test_preds) test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds)) precision, recall, beta, support = precision_recall_fscore_support( test_cpu_y, test_preds, pos_label=1) if test_score > best_score: best_score = test_score # save the sentence vectors train_sens = [ get_train_sentvec(i) for i in range(n_train_batches) ] test_sens = [ get_test_sentvec(i) for i in range(n_test_batches) ] train_sens = np.concatenate(train_sens, axis=0) test_sens = np.concatenate(test_sens, axis=0) out_train_sent_file = "./results/%s_train_sent.vec" % exp_name out_test_sent_file = "./results/%s_test_sent.vec" % exp_name with open(out_train_sent_file, 'w') as train_f, open(out_test_sent_file, 'w') as test_f: for i in range(len(train_sens)): tr_doc_vect = train_sens[i][ 0][:number_train_sens[i]] train_f.write( json.dumps(tr_doc_vect.tolist()) + "\n") for i in range(len(test_sens)): te_doc_vect = test_sens[i][0][:number_test_sens[i]] test_f.write( json.dumps(te_doc_vect.tolist()) + "\n") print "Get best performace at %d iteration" % epoch log_file.write("Get best performance at %d iteration\n" % epoch) end_time = timeit.default_timer() print "Iteration %d , precision, recall, support" % epoch, precision, recall, support log_file.write( "Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f \n" % (epoch, precision[0], precision[1], recall[0], recall[1])) print "Using time %f m" % ((end_time - start_time) / 60.) log_file.write("Uing time %f m\n" % ((end_time - start_time) / 60.)) end_time = timeit.default_timer() print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.) 
log_file.write("Uing time %f m\n" % ((end_time - start_time) / 60.)) log_file.flush() log_file.close()
def train_cnn_encoder(datasets, word_embedding, input_width=64, filter_hs=[3, 4, 5], hidden_units=[100, 2], dropout_rate=[0.5], shuffle_batch=True, n_epochs=100, batch_size=50, lr_decay=0.95, activations=[ReLU], sqr_norm_lim=9, non_static=True): rng = np.random.RandomState(1234) input_height = len(datasets[0][0]) - 1 filter_width = input_width feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_width)) pool_sizes.append((input_height-filter_h+1, input_width-filter_width+1)) parameters = [("Input Shape", input_height, input_width), ("Filter Shape", filter_shapes), ("Pool Sizes", pool_sizes), ("dropout rate", dropout_rate), ("hidden units", hidden_units), ("shuffle_batch", shuffle_batch), ("n_epochs", n_epochs), ("batch size", batch_size)] print parameters # construct the model index = T.iscalar() x = T.matrix("x") y = T.ivector("y") words = shared(value=word_embedding, name="embedding") zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((x.shape[0],1,x.shape[1],words.shape[1])) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=(batch_size, 1, input_height, input_width), filter_shape=filter_shape, pool_size=pool_size, activation=ReLU) layer1_input = conv_layer.output.flatten(2) conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) layer1_input = T.concatenate(layer1_inputs, 1) hidden_units[0] = feature_maps * len(filter_hs) classifier = nn.MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=dropout_rate, activations=activations) params = classifier.params for conv_layer in conv_layers: params += conv_layer.params if non_static: params.append(words) cost = classifier.negative_log_likelihood(y) dropout_cost = classifier.dropout_negative_log_likelihood(y) grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) np.random.seed(1234) if datasets[0].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[0].shape[0] % batch_size train_set = np.random.permutation(datasets[0]) extra_data = train_set[:extra_data_num] new_data = np.append(datasets[0], extra_data, axis=0) else: new_data = datasets[0] new_data = np.random.permutation(new_data) n_batches = new_data.shape[0]/batch_size n_train_batches = int(np.round(n_batches*0.9)) # divide the train set intp train/val sets test_set_x = datasets[1][:,:input_height] test_set_y = np.asarray(datasets[1][:,-1], "int32") train_set = new_data[:n_train_batches*batch_size,:] val_set = new_data[n_train_batches*batch_size:,:] print train_set[:,-1] train_set_x, train_set_y = shared_dataset((train_set[:,:input_height],train_set[:,-1])) val_set_x, val_set_y = shared_dataset((val_set[:,:input_height],val_set[:,-1])) n_val_batches = n_batches - n_train_batches val_model = function([index], classifier.errors(y), givens={ x: val_set_x[index * batch_size: (index + 1) * batch_size], y: val_set_y[index * batch_size: (index + 1) * batch_size] }) test_model = function([index], classifier.errors(y), givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] }) train_model = function([index], cost, 
                         updates=grad_updates,
                         givens={
                             x: train_set_x[index * batch_size:(index + 1) * batch_size],
                             y: train_set_y[index * batch_size:(index + 1) * batch_size]
                         })

    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (test_size, 1, input_height, input_width))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))
    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = classifier.predict(test_layer1_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = function([x, y], test_error)

    # start training the model
    print "Start training the model...."
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    cost_epoch = 0
    while epoch < n_epochs:
        epoch += 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_train_batches)):
                print minibatch_index
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        print('epoch %i, train perf %f %%, val perf %f' %
              (epoch, train_perf * 100., val_perf * 100.))
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_losses = test_model_all(test_set_x, test_set_y)
            test_perf = 1 - test_losses
            print "Test Performance %f under Current Best Valid perf %f" % (test_perf, val_perf)
    return test_perf
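# ----------------------------------------------------------------------------
# Hedged sketch: the data preparation above pads the training matrix so its
# row count is a multiple of batch_size by recycling randomly permuted rows,
# then shuffles once more before splitting into train/val batches.  The helper
# below is a stand-alone numpy illustration of that trick; the name and usage
# are hypothetical, not part of the original code.
# ----------------------------------------------------------------------------
import numpy as np


def pad_to_batch_multiple(data, batch_size, rng=np.random):
    n_extra = (-len(data)) % batch_size            # rows missing from the last batch
    if n_extra:
        extra = rng.permutation(data)[:n_extra]    # reuse shuffled rows as padding
        data = np.append(data, extra, axis=0)
    return rng.permutation(data)                   # final shuffle before batching

# e.g. pad_to_batch_multiple(np.arange(103)[:, None], 50).shape == (150, 1)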
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) # number of words in the sentences num_sens = len(dataset[0][0][0]) # number of sentences print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") y = T.ivector("y") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))]) # the input for the sentence level conv layers layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(( x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm )) conv_layers = [] filter_shape = (num_maps, 1, filter_hs[0], emb_dm) pool_size = (input_height - filter_hs[0] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape((x.shape[0] * x.shape[1], num_maps)) conv_layers.append(conv_layer) # compute preactivation for each sentences layer_sizes = zip(hidden_units, hidden_units[1:]) full_layer_input = sen_vecs dropout_input = sen_vecs hidden_outs = [] drophidden_outs = [] hidden_layers = [] dropout_layers = [] droprate = 0.5 for lay_size in layer_sizes[:-1]: U_value = np.random.random(lay_size).astype(theano.config.floatX) b_value = np.zeros((lay_size[1],), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") hiddenLayer = nn.HiddenLayer(rng, full_layer_input, lay_size[0], lay_size[1], ReLU, U * (1 - droprate), b) dropHiddenLayer = nn.DropoutHiddenLayer(rng, dropout_input, lay_size[0], lay_size[1], ReLU, droprate, U, b) hidden_layers.append(hiddenLayer) dropout_layers.append(dropHiddenLayer) hidden_out = hiddenLayer.output drophidden_out = dropHiddenLayer.output hidden_outs.append(hidden_out) drophidden_outs.append(drophidden_out) full_layer_input = hidden_out dropout_input = drophidden_out # get the max value for each class n_in, n_out = layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out,), dtype=theano.config.floatX) W = theano.shared(W_value, borrow=True, name="logis_W") b = theano.shared(b_value, borrow=True, name="logis_b") full_act = T.dot(hidden_outs[-1], W*(1 - droprate)) + b dropout_act = nn.dropout_from_layer(rng, T.dot(drophidden_outs[-1], W) + b, droprate) # compute the probability sen_full_probs = T.nnet.softmax(full_act) sen_dropout_probs = T.nnet.softmax(dropout_act) # compute the sentence similarity sen_sen = T.dot(sen_vecs, 
sen_vecs.T) sen_sqr = T.sum(sen_vecs ** 2, axis=1) sen_sqr_left = sen_sqr.dimshuffle(0, 'x') sen_sqr_right = sen_sqr.dimshuffle('x', 0) sen_smi_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right sen_smi_matrix = T.exp(-1 * sen_smi_matrix) # compute the delta between sentence probabilities prob_prob_full = T.dot(sen_full_probs, sen_full_probs.T) prob_sqr_full = T.sum(sen_full_probs ** 2, axis=1) prob_sqr_left_full = prob_sqr_full.dimshuffle(0, 'x') prob_sqr_right_full = prob_sqr_full.dimshuffle('x', 0) prob_delta_full = prob_sqr_left_full - 2 * prob_prob_full + prob_sqr_right_full sen_cost_full = T.sum(sen_smi_matrix * prob_delta_full) prob_prob_drop = T.dot(sen_dropout_probs, sen_dropout_probs.T) prob_sqr_drop = T.sum(sen_dropout_probs ** 2, axis=1) prob_sqr_left_drop = prob_sqr_drop.dimshuffle(0, 'x') prob_sqr_right_drop = prob_sqr_drop.dimshuffle('x', 0) prob_delta_drop = prob_sqr_left_drop - 2 * prob_prob_drop + prob_sqr_right_drop sen_cost_drop = T.sum(sen_smi_matrix * prob_delta_drop) # transform the sen probs to doc probs # by using average probs doc_full_probs = sen_full_probs.reshape((x.shape[0], x.shape[1], n_out)) doc_full_probs = T.mean(doc_full_probs, axis=1) doc_dropout_probs = sen_dropout_probs.reshape((x.shape[0], x.shape[1], n_out)) doc_dropout_probs = T.mean(doc_dropout_probs, axis=1) doc_full_y_pred = T.argmax(doc_full_probs, axis=1) doc_dropout_y_pred = T.argmax(doc_dropout_probs, axis=1) full_negative_likelihood = T.sum(-T.log(doc_full_probs)[T.arange(y.shape[0]), y]) dropout_negative_likelihood = T.sum(-T.log(doc_dropout_probs)[T.arange(y.shape[0]), y]) full_errors = T.mean(T.neq(doc_full_y_pred, y)) gamma = 2 full_cost = full_negative_likelihood + gamma * sen_cost_full dropout_cost = dropout_negative_likelihood + gamma * sen_cost_drop params = [] for conv_layer in conv_layers: params += conv_layer.params for dropout_layer in dropout_layers: params += dropout_layer.params params.append(W) params.append(b) if non_static: params.append(words) grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_y = shared_dataset(dataset[0]) valid_x, valid_y = shared_dataset(dataset[1]) test_x, test_y = shared_dataset(dataset[2]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function([index], full_cost, updates=grad_updates, givens={ x: train_x[index*batch_size:(index+1)*batch_size], y: train_y[index*batch_size:(index+1)*batch_size] }) train_error = function([index], full_errors, givens={ x: train_x[index*batch_size:(index+1)*batch_size], y: train_y[index*batch_size:(index+1)*batch_size] }) valid_train_func = function([index], [full_negative_likelihood, sen_cost_full], updates=grad_updates, givens={ x: valid_x[index*batch_size:(index+1)*batch_size], y: valid_y[index*batch_size:(index+1)*batch_size] }) test_pred = function([index], doc_full_y_pred, givens={ x:test_x[index*batch_size:(index+1)*batch_size], }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_valid = len(dataset[1][0]) n_test = len(dataset[2][0]) epoch = 0 best_params = None best_validation_score = 0. 
    test_perf = 0
    done_loop = False

    log_file = open(log_fn, 'w')

    print "Start to train the model....."

    cpu_trn_y = np.asarray(dataset[0][1])
    cpu_val_y = np.asarray(dataset[1][1])
    cpu_tst_y = np.asarray(dataset[2][1])

    def compute_score(true_list, pred_list):
        mat = np.equal(true_list, pred_list)
        score = np.mean(mat)
        return score

    best_test_score = 0.
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        # do validation
        valid_cost, valid_sen_cost = zip(*[valid_train_func(i)
                                           for i in np.random.permutation(xrange(n_valid_batches))])

        if epoch % print_freq == 0:
            # do test
            test_preds = np.concatenate([test_pred(i) for i in xrange(n_test_batches)])
            test_score = compute_score(cpu_tst_y, test_preds)

            with open(os.path.join(perf_fn, "%s_%d.pred" % (exp_name, epoch)), 'w') as epf:
                for p in test_preds:
                    epf.write("%d\n" % int(p))

            message = "Epoch %d test perf %f train cost %f, valid_sen_cost %f, valid_doc_cost %f" % (
                epoch, test_score, np.mean(costs), np.mean(valid_sen_cost), np.mean(valid_cost))
            print message
            log_file.write(message + "\n")
            log_file.flush()

            """
            # store the best model
            if (test_score > best_test_score) or (epoch % 25 == 0):
                best_test_score = test_score
                # save the model
                model_name = "%s_%d.model" % (exp_name, epoch)
                with open(model_name, 'wb') as bm:
                    for p in params:
                        cPickle.dump(p.get_value(), bm)
            """

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % ((end_time - start_time) / 60.)
        log_file.flush()

    log_file.close()
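# ----------------------------------------------------------------------------
# Hedged sketch: the sentence-similarity regulariser above relies on the
# standard pairwise expansion  ||a_i - a_j||^2 = ||a_i||^2 - 2 a_i.a_j + ||a_j||^2,
# applied once to sentence vectors (then passed through exp(-d) as an RBF
# similarity) and once to the per-sentence class probabilities; the cost
# sum_ij K_ij * ||p_i - p_j||^2 pushes similar sentences toward similar
# predictions.  The numpy check below only verifies the expansion on toy data;
# names and shapes are illustrative.
# ----------------------------------------------------------------------------
import numpy as np


def _pairwise_sq_dists(a):
    sq = np.sum(a ** 2, axis=1)
    return sq[:, None] - 2 * np.dot(a, a.T) + sq[None, :]


def _check_pairwise_expansion(num_sens=4, dim=3, seed=0):
    rng = np.random.RandomState(seed)
    vecs = rng.rand(num_sens, dim)
    direct = np.array([[np.sum((u - v) ** 2) for v in vecs] for u in vecs])
    assert np.allclose(_pairwise_sq_dists(vecs), direct)
    return np.exp(-_pairwise_sq_dists(vecs))   # the RBF similarity matrix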
def run_experiment(self, dataset, word_embedding, exp_name): # load parameters num_maps_word = self.options["num_maps_word"] drop_rate_word = self.options["drop_rate_word"] drop_rate_sentence = self.options["drop_rate_sentence"] word_window = self.options["word_window"] word_dim = self.options["word_dim"] k_max_word = self.options["k_max_word"] batch_size = self.options["batch_size"] rho = self.options["rho"] epsilon = self.options["epsilon"] norm_lim = self.options["norm_lim"] max_iteration = self.options["max_iteration"] k = self.options["k_max"] sentence_len = len(dataset[0][0][0][0]) sentence_num = len(dataset[0][0][0]) # define the parameters x = T.tensor3("x") y = T.ivector("y") rng = np.random.RandomState(1234) words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX), name="embedding", borrow=True) zero_vector_tensor = T.vector() zero_vec = np.zeros(word_dim, dtype=theano.config.floatX) set_zero = theano.function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))]) x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape((x.shape[0]*x.shape[1], 1, x.shape[2], words.shape[1])) dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word) # compute convolution on words layer word_filter_shape = (num_maps_word, 1, word_window, word_dim) word_pool_size = (sentence_len - word_window + 1, 1) dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word) sent_vec_dim = num_maps_word*k_max_word dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim)) word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb*(1 - drop_rate_word), input_shape=None, filter_shape=word_filter_shape, pool_size=word_pool_size, activation=Tanh, k=k_max_word, W=dropout_word_conv.W, b=dropout_word_conv.b) sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim)) theta_value = np.random.random((sent_vec_dim,1)) theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX), name="theta", borrow=True) weighted_drop_sent_vec, weighted_sen_score = keep_max(dropout_sent_vec.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim)), theta, k) drop_doc_vec = T.sum(weighted_drop_sent_vec, axis=2).flatten(2) weighted_sent_vec, sen_score = keep_max(sent_vec.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim)), theta, k) doc_vec = T.sum(weighted_sent_vec, axis=2).flatten(2) # we need to constrain the number of positive sentences in positive # collect parameters self.params.append(words) self.params += dropout_word_conv.params self.params.append(sen_W) self.params.append(sen_b) grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho, epsilon, norm_lim) # construct the dataset train_x, train_y = nn.shared_dataset(dataset[0]) test_x, test_y = nn.shared_dataset(dataset[1]) test_cpu_y = dataset[1][1] n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) # construt the model index = T.iscalar() train_func = theano.function([index], [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost], updates=grad_updates, givens={ x: train_x[index*batch_size:(index+1)*batch_size], y: train_y[index*batch_size:(index+1)*batch_size] }) test_func = theano.function([index], doc_preds, givens={ x:test_x[index*batch_size:(index+1)*batch_size] }) get_train_sent_prob = theano.function([index], sent_prob, givens={ 
                x: train_x[index * batch_size:(index + 1) * batch_size]
            })

        get_test_sent_prob = theano.function(
            [index], sent_prob,
            givens={
                x: test_x[index * batch_size:(index + 1) * batch_size]
            })

        epoch = 0
        best_score = 0

        raw_train_x = dataset[0][0]
        raw_test_x = dataset[1][0]

        # get the sentence number for each document
        number_train_sens = []
        number_test_sens = []

        log_file = open("./log/%s.log" % exp_name, 'w')
        while epoch <= max_iteration:
            start_time = timeit.default_timer()
            epoch += 1
            costs = []

            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_func(minibatch_index)
                costs.append(cost_epoch)
                set_zero(zero_vec)

            total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost = zip(*costs)
            print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
                epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
                np.mean(train_sent_cost), np.mean(train_penal_cost))

            if epoch % 5 == 0:
                test_preds = []
                for i in xrange(n_test_batches):
                    test_y_pred = test_func(i)
                    test_preds.append(test_y_pred)
                test_preds = np.concatenate(test_preds)
                test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

                precision, recall, beta, support = precision_recall_fscore_support(
                    test_cpu_y, test_preds, pos_label=1)

                if test_score > best_score:
                    best_score = test_score
                    # save the sentence vectors
                    train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                    test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]

                    train_sens = np.concatenate(train_sens, axis=0)
                    test_sens = np.concatenate(test_sens, axis=0)

                    out_train_sent_file = "./results/%s_train_sent.vec" % exp_name
                    out_test_sent_file = "./results/%s_test_sent.vec" % exp_name

                    # open in binary mode so cPickle.dump works on all platforms
                    with open(out_train_sent_file, 'wb') as train_f, open(out_test_sent_file, 'wb') as test_f:
                        cPickle.dump(train_sens, train_f)
                        cPickle.dump(test_sens, test_f)

                    print "Get best performance at %d iteration %f" % (epoch, test_score)
                    log_file.write("Get best performance at %d iteration %f \n" % (epoch, test_score))

                end_time = timeit.default_timer()
                print "Iteration %d , precision, recall, support" % epoch, precision, recall, support
                log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
                    epoch, precision[0], precision[1], recall[0], recall[1],
                    np.mean(total_train_cost), np.mean(train_bag_cost),
                    np.mean(train_sent_cost), np.mean(train_penal_cost)))
                print "Using time %f m" % ((end_time - start_time) / 60.)
                log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))

            end_time = timeit.default_timer()
            print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.)
            log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))
            log_file.flush()

        log_file.close()
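# ----------------------------------------------------------------------------
# Hedged sketch: keep_max (used above with the learned scoring vector theta)
# is defined elsewhere in this code base; it is assumed to score every
# sentence vector with theta, keep only the k highest-scoring sentences per
# document and zero out the rest, returning both the masked vectors and the
# scores.  The numpy version below illustrates that reading for a single
# document and is not the real Theano implementation.
# ----------------------------------------------------------------------------
import numpy as np


def keep_max_sketch(sent_vecs, theta, k):
    """sent_vecs: (num_sens, dim), theta: (dim, 1); returns masked vecs, scores."""
    scores = np.dot(sent_vecs, theta).ravel()      # one attention score per sentence
    keep = np.argsort(scores)[-k:]                 # indices of the k best sentences
    mask = np.zeros_like(scores)
    mask[keep] = 1.0
    return sent_vecs * mask[:, None], scores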
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") y = T.matrix("y") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm)) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = (num_maps, 1, filter_hs[i], emb_dm) pool_size = (input_height - filter_hs[i] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape( (x.shape[0], x.shape[1], num_maps)) sen_vecs = sen_vecs.dimshuffle(0, 2, 1) doc_vec = T.sum(sen_vecs, axis=2).flatten(2) layer1_inputs.append(doc_vec) conv_layers.append(conv_layer) layer1_input = T.concatenate(layer1_inputs, 1) ############## # Task pop# ############## print "Construct classifier ...." 
hidden_units[0] = num_maps * len(filter_hs) pop_factor = nn.MLDropout( rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=[dropout_rate for i in range(len(hidden_units) - 1)], activations=[activation for i in range(len(hidden_units) - 1)]) pop_factor_output = pop_factor.output.dimshuffle(0, 1, 'x') pop_factor_dropout_output = pop_factor.dropout_output.dimshuffle(0, 1, 'x') ####################### # Task Type ##### ####################### type_hidden_units = [num for num in hidden_units] type_hidden_units[-1] = 5 type_factor = nn.MLDropout( rng, input=layer1_input, layer_sizes=type_hidden_units, dropout_rates=[ dropout_rate for i in range(len(type_hidden_units) - 1) ], activations=[activation for i in range(len(type_hidden_units) - 1)]) type_factor_output = type_factor.output.dimshuffle(0, 'x', 1) type_factor_dropout_output = type_factor.dropout_output.dimshuffle( 0, 'x', 1) ###################### ## Joint Y matrix ### ##################### # construct V matrix to model pop type dependency V_value = np.random.random((hidden_units[-1], type_hidden_units[-1])) V = theano.shared(value=np.asarray(V_value, dtype=theano.config.floatX), name="V", borrow=True) # compute the Joint propability joint_act = T.batched_dot(pop_factor_output, type_factor_output) + V joint_act_dropout = T.batched_dot(pop_factor_dropout_output, type_factor_dropout_output) + V joint_probs = T.nnet.softmax(joint_act.flatten(2)) joint_probs_dropout = T.nnet.softmax(joint_act_dropout.flatten(2)) neg_likelihood = -T.mean(T.log(T.sum(joint_probs * y, axis=1))) neg_likelihood_dropout = -T.mean( T.log(T.sum(joint_probs_dropout * y, axis=1))) joint_preds = T.argmax(joint_probs, axis=1) pop_preds = joint_preds // type_hidden_units[-1] type_preds = joint_preds % type_hidden_units[-1] y_index = T.argmax(y, axis=1) pop_y = y_index // type_hidden_units[-1] type_y = y_index % type_hidden_units[-1] pop_error = T.mean(T.neq(pop_preds, pop_y)) type_error = T.mean(T.neq(type_preds, type_y)) params = pop_factor.params params += type_factor.params params.append(V) for conv_layer in conv_layers: params += conv_layer.params if non_static: params.append(words) grad_updates = sgd_updates_adadelta(params, neg_likelihood_dropout, lr_decay, 1e-6, sqr_norm_lim) ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_y = shared_dataset(dataset[0]) test_x, test_y = shared_dataset(dataset[1]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], neg_likelihood_dropout, updates=grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size] }) test_pred = function( [index], [pop_error, type_error], givens={ x: test_x[index * batch_size:(index + 1) * batch_size], y: test_y[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_test = len(dataset[1][0]) epoch = 0 best_params = None best_validation_score = 0. 
    test_perf = 0
    done_loop = False

    log_file = open(log_fn, 'a')

    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        if epoch % 5 == 0:
            # do test
            test_pop_errors = []
            test_type_errors = []

            for i in xrange(n_test_batches):
                test_pop_error, test_type_error = test_pred(i)
                test_pop_errors.append(test_pop_error)
                test_type_errors.append(test_type_error)

            test_pop_score = 1 - np.mean(test_pop_errors)
            test_type_score = 1 - np.mean(test_type_errors)

            message = "Epoch %d test pop perf %f, type perf %f" % (
                epoch, test_pop_score, test_type_score)
            print message
            log_file.write(message + "\n")
            log_file.flush()

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % ((end_time - start_time) / 60.)
        log_file.flush()

    log_file.close()
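# ----------------------------------------------------------------------------
# Hedged sketch: the joint model above flattens a (population, event type)
# label pair into a single index of a one-hot row, so that
#     joint_index = pop_index * num_types + type_index,
# and recovers the two labels with // and %.  The toy example below uses
# num_pop=11 and num_types=5 to mirror hidden_units[-1] and
# type_hidden_units[-1]; it is only an illustration of the encoding.
# ----------------------------------------------------------------------------
import numpy as np


def _joint_label_demo(pop_idx=3, type_idx=2, num_pop=11, num_types=5):
    joint_idx = pop_idx * num_types + type_idx
    one_hot_y = np.zeros(num_pop * num_types, dtype="float32")
    one_hot_y[joint_idx] = 1.0                      # one row of the y matrix
    assert joint_idx // num_types == pop_idx        # population label recovered
    assert joint_idx % num_types == type_idx        # event-type label recovered
    return one_hot_y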
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, alpha=0.0001): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() input_height = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." word_x = T.matrix("word_x") freq_x = T.matrix("freq_x") pos_x = T.matrix("pos_x") y = T.ivector("y") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) sym_dim = 20 # the frequency embedding is 21 * 50 matrix freq_val = np.random.random((21, sym_dim)) freqs = shared(value=np.asarray(freq_val, dtype=theano.config.floatX), borrow=True, name="freqs") # the position embedding is 31 * 50 matrix poss_val = np.random.random((31, sym_dim)) poss = shared(value=np.asarray(poss_val, dtype=theano.config.floatX), borrow=True, name="poss") # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) freq_zero_tensor = T.vector() freq_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX) freq_set_zero = function([freq_zero_tensor], updates=[(freqs, T.set_subtensor(freqs[0, :], freq_zero_tensor))]) pos_zero_tensor = T.vector() pos_zero_vec = np.zeros(sym_dim, dtype=theano.config.floatX) pos_set_zero = function([pos_zero_tensor], updates=[(poss, T.set_subtensor(poss[0, :], pos_zero_tensor))]) word_x_emb = words[T.cast(word_x.flatten(), dtype="int32")].reshape( (word_x.shape[0], 1, word_x.shape[1], emb_dm)) freq_x_emb = freqs[T.cast(freq_x.flatten(), dtype="int32")].reshape( (freq_x.shape[0], 1, freq_x.shape[1], sym_dim)) pos_x_emb = poss[T.cast(pos_x.flatten(), dtype="int32")].reshape( (pos_x.shape[0], 1, pos_x.shape[1], sym_dim)) layer0_input = T.concatenate([word_x_emb, freq_x_emb, pos_x_emb], axis=3) conv_layers = [] layer1_inputs = [] rng = np.random.RandomState() for i in xrange(len(filter_hs)): filter_shape = (num_maps, 1, filter_hs[i], emb_dm + sym_dim + sym_dim) pool_size = (input_height - filter_hs[i] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) layer1_input = conv_layer.output.flatten(2) conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) layer1_input = T.concatenate(layer1_inputs, 1) ############## # classifier # ############## print "Construct classifier ...." 
hidden_units[0] = num_maps * len(filter_hs) model = nn.MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=[dropout_rate], activations=[activation]) params = model.params for conv_layer in conv_layers: params += conv_layer.params params.append(words) params.append(freqs) params.append(poss) cost = model.negative_log_likelihood(y) + alpha * model.L2 dropout_cost = model.dropout_negative_log_likelihood(y) + alpha * model.L2 grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" train_word_x, train_freq_x, train_pos_x, train_y = shared_dataset( dataset[0]) test_word_x, test_freq_x, test_pos_x, test_y = shared_dataset(dataset[1]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], cost, updates=grad_updates, givens={ word_x: train_word_x[index * batch_size:(index + 1) * batch_size], freq_x: train_freq_x[index * batch_size:(index + 1) * batch_size], pos_x: train_pos_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size] }) test_pred = function( [index], model.preds, givens={ word_x: test_word_x[index * batch_size:(index + 1) * batch_size], freq_x: test_freq_x[index * batch_size:(index + 1) * batch_size], pos_x: test_pos_x[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_test = len(dataset[1][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'a') print "Start to train the model....." cpu_trn_y = np.asarray(dataset[0][3]) cpu_tst_y = np.asarray(dataset[1][3]) def compute_score(true_list, pred_list): mat = np.equal(true_list, pred_list) score = np.mean(mat) return score while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) freq_set_zero(freq_zero_vec) pos_set_zero(pos_zero_vec) if epoch % 5 == 0: # do test test_preds = np.concatenate( [test_pred(i) for i in xrange(n_test_batches)]) test_score = compute_score(cpu_tst_y, test_preds) with open(os.path.join(perf_fn, "%s_%d.pred" % (exp_name, epoch)), 'w') as epf: for p in test_preds: epf.write("%d\n" % int(p)) message = "Epoch %d test perf %f with train cost %f" % ( epoch, test_score, np.mean(costs)) print message log_file.write(message + "\n") log_file.flush() end_time = timeit.default_timer() print "Finish one iteration using %f m" % ( (end_time - start_time) / 60.) log_file.flush() log_file.close()
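# ----------------------------------------------------------------------------
# Hedged sketch: the model above looks up three embedding tables (word,
# frequency bin, relative position) and concatenates them along the feature
# axis before the convolution.  The numpy illustration below rebuilds that
# input for a single toy sentence; table sizes and ids are made up, only the
# shapes mirror the code (words: |V| x emb_dm, freqs: 21 x sym_dim,
# poss: 31 x sym_dim).
# ----------------------------------------------------------------------------
import numpy as np


def _concat_lookup_demo(emb_dm=6, sym_dim=2, seed=0):
    rng = np.random.RandomState(seed)
    word_table = rng.rand(10, emb_dm)          # toy vocabulary of 10 words
    freq_table = rng.rand(21, sym_dim)
    pos_table = rng.rand(31, sym_dim)
    word_ids = np.array([1, 4, 7, 0])          # index 0 is the zero padding row
    freq_ids = np.array([2, 2, 5, 0])
    pos_ids = np.array([1, 2, 3, 0])
    sent_input = np.concatenate([word_table[word_ids],
                                 freq_table[freq_ids],
                                 pos_table[pos_ids]], axis=1)
    assert sent_input.shape == (len(word_ids), emb_dm + 2 * sym_dim)
    return sent_input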