def weighted_model(path, valid_label_file, test_label_file, valid_group_file, test_group_file):
    """Grid-search the ensemble weights (p, q, 1-p-q) over the question/lcs/overlap scores and report MRR."""
    valid_data_overlap = pickle.load(open(path + "valid_ensemble_overlap.pkl"))
    test_data_overlap = pickle.load(open(path + "test_ensemble_overlap.pkl"))
    valid_data_lcs = pickle.load(open(path + "valid_ensemble_lcs.pkl"))
    test_data_lcs = pickle.load(open(path + "test_ensemble_lcs.pkl"))
    valid_data_question = pickle.load(open(path + "valid_ensemble_question.pkl"))
    test_data_question = pickle.load(open(path + "test_ensemble_question.pkl"))
    valid_label_list = pickle.load(open(valid_label_file))
    valid_group_list = pickle.load(open(valid_group_file))
    test_label_list = pickle.load(open(test_label_file))
    test_group_list = [int(x.strip()) for x in open(test_group_file, "r")]
    mrr_dic = {}
    p = 0.
    while p <= 1.:
        q = 0.
        while q < 1. - p:
            # weighted combination of the three single-model scores.
            valid_score_list = []
            for i in xrange(len(valid_data_lcs)):
                score = p * valid_data_question[i] + q * valid_data_lcs[i] + (1. - p - q) * valid_data_overlap[i]
                valid_score_list.append(score)
            valid_mrr = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")
            test_score_list = []
            for i in xrange(len(test_data_lcs)):
                score = p * test_data_question[i] + q * test_data_lcs[i] + (1. - p - q) * test_data_overlap[i]
                test_score_list.append(score)
            test_mrr = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")
            mrr_dic[str(p) + "-" + str(q) + "-" + str(1 - p - q)] = str(valid_mrr) + "-" + str(test_mrr)
            q += 0.1
        p += 0.1
    # report the grid-search results, best weight settings last.
    mrr_dic = sorted(mrr_dic.iteritems(), key=lambda x: x[1])
    print mrr_dic
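
# Example call (a minimal sketch, assuming the result/ and pkl/ files written by build_model and
# ensemble below, and that the *_ensemble_{overlap,lcs,question}.pkl score files already exist;
# adjust the paths to your own checkout):
#
#   path = "/Users/chenjun/PycharmProjects/DBQA/"
#   weighted_model(path + "result/",
#                  path + "result/valid_label.pkl",
#                  path + "result/test_label.pkl",
#                  path + "pkl/valid_group.pkl",
#                  path + "data/dbqa-data-test.txt.group")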

def train_rnn(n_epoch, batch_size, sequence_length, n_hidden_rnn, n_in_mlp, n_hidden_mlp, n_out,
              L1_reg, L2_reg, learning_rate, random=False, non_static=True):
    ###############
    # LOAD DATA   #
    ###############
    print "loading the data... "
    path = "/Users/chenjun/PycharmProjects/DBQA/"
    loader = data_loader(path + "pkl/data-train-nn.pkl",
                         path + "pkl/data-valid-nn.pkl",
                         path + "pkl/data-test-nn.pkl",
                         path + "pkl/index2vec.pkl")
    valid_group_list = pickle.load(open(path + "pkl/valid_group.pkl"))
    test_group_list = [int(x.strip()) for x in open(path + "data/dbqa-data-test.txt.group")]
    datasets, emb_words = loader.get_input_by_model(model="tensorflow", random=random)
    train_q_data, valid_q_data, test_q_data = datasets[0]
    train_a_data, valid_a_data, test_a_data = datasets[1]
    train_l_data, valid_l_data, test_l_data = datasets[2]
    # calculate the number of batches.
    n_train_batches = train_q_data.shape[0] // batch_size
    n_valid_batches = valid_q_data.shape[0] // batch_size
    n_test_batches = test_q_data.shape[0] // batch_size
    print "batch_size: %i, n_train_batches: %i, n_valid_batches: %i, n_test_batches: %i" % \
        (batch_size, n_train_batches, n_valid_batches, n_test_batches)
    ###############
    # BUILD MODEL #
    ###############
    print "building the model... "
    rnn_model = QARNNModel(sequence_length=sequence_length, n_hidden_rnn=n_hidden_rnn, n_in_mlp=n_in_mlp,
                           n_hidden_mlp=n_hidden_mlp, n_out=n_out, L1_reg=L1_reg, L2_reg=L2_reg,
                           learning_rate=learning_rate, word_embedding=emb_words, non_static=non_static)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    ###############
    # TRAIN MODEL #
    ###############
    print "training the model... "
    epoch = 0
    while epoch < n_epoch:
        epoch += 1
        batch_cost = 0.
        for batch_index1 in xrange(n_train_batches):
            qsent_batch = train_q_data[batch_size * batch_index1: batch_size * (batch_index1 + 1)]
            asent_batch = train_a_data[batch_size * batch_index1: batch_size * (batch_index1 + 1)]
            label_batch = train_l_data[batch_size * batch_index1: batch_size * (batch_index1 + 1)]
            batch_cost += rnn_model.train_batch(sess=sess, qsent_batch=qsent_batch, asent_batch=asent_batch,
                                                label_batch=label_batch, keep_prop=0.5)
            # validation & test every 100 batches
            if batch_index1 % 100 == 0:
                print "epoch %i/%i, batch %d/%d, cost %f" % (epoch, n_epoch, batch_index1, n_train_batches,
                                                             batch_cost / n_train_batches)
                valid_score_data = []
                for batch_index in xrange(n_valid_batches):
                    qsent_batch = valid_q_data[batch_size * batch_index: batch_size * (batch_index + 1)]
                    asent_batch = valid_a_data[batch_size * batch_index: batch_size * (batch_index + 1)]
                    label_batch = valid_l_data[batch_size * batch_index: batch_size * (batch_index + 1)]
                    valid_pred = rnn_model.eval_batch(sess=sess, qsent_batch=qsent_batch, asent_batch=asent_batch,
                                                      label_batch=label_batch, keep_prop=1.0)
                    valid_score_data.append(valid_pred)
                valid_score_list = (np.concatenate(np.asarray(valid_score_data), axis=0)).tolist()
                valid_label_list = valid_l_data.tolist()
                # pad the tail examples dropped by batching with random scores.
                for i in xrange(len(valid_score_list), len(valid_label_list)):
                    valid_score_list.append(np.random.random())
                _eval = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")  # one-hot -> label=[0,1]
                print "---valid mrr: ", _eval
                test_score_data = []
                for batch_index in xrange(n_test_batches):
                    qsent_batch = test_q_data[batch_size * batch_index: batch_size * (batch_index + 1)]
                    asent_batch = test_a_data[batch_size * batch_index: batch_size * (batch_index + 1)]
                    label_batch = test_l_data[batch_size * batch_index: batch_size * (batch_index + 1)]
                    test_pred = rnn_model.eval_batch(sess=sess, qsent_batch=qsent_batch, asent_batch=asent_batch,
                                                     label_batch=label_batch, keep_prop=1.0)
                    test_score_data.append(test_pred)
                test_score_list = (np.concatenate(np.asarray(test_score_data), axis=0)).tolist()
                test_label_list = test_l_data.tolist()
                for i in xrange(len(test_score_list), len(test_label_list)):
                    test_score_list.append(np.random.random())
                _eval = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")  # one-hot -> label=[0,1]
                print "---test mrr: ", _eval
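
# Example call for the TensorFlow trainer (a minimal sketch; the hyper-parameter values below are
# illustrative guesses, not values taken from the project):
#
#   train_rnn(n_epoch=10, batch_size=64, sequence_length=50, n_hidden_rnn=100,
#             n_in_mlp=200, n_hidden_mlp=100, n_out=2, L1_reg=0.0, L2_reg=0.0001,
#             learning_rate=0.001, random=False, non_static=True)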

def train_rnn(n_epochs, batch_size, sequence_length, n_in_rnn, n_hidden_rnn, n_in_mlp, n_hidden_mlp, n_out,
              learning_rate, random=False, non_static=True):
    print "loading the data... "
    path = "/Users/chenjun/PycharmProjects/DBQA/"
    loader = data_loader(path + "pkl/data-train-nn.pkl",
                         path + "pkl/data-valid-nn.pkl",
                         path + "pkl/data-test-nn.pkl",
                         path + "pkl/index2vec.pkl")
    valid_group_list = pickle.load(open(path + "pkl/valid_group.pkl"))
    test_group_list = [int(x.strip()) for x in open(path + "data/dbqa-data-test.txt.group")]
    datasets, emb_words = loader.get_input_by_model(model="pytorch", random=random)
    train_q_data, valid_q_data, test_q_data = datasets[0]
    train_a_data, valid_a_data, test_a_data = datasets[1]
    train_l_data, valid_l_data, test_l_data = datasets[2]
    # calculate the number of batches.
    n_train_batches = train_q_data.size(0) // batch_size
    n_valid_batches = valid_q_data.size(0) // batch_size
    n_test_batches = test_q_data.size(0) // batch_size
    print "batch_size: %i, n_train_batches: %i, n_valid_batches: %i, n_test_batches: %i" % \
        (batch_size, n_train_batches, n_valid_batches, n_test_batches)
    ###############
    # BUILD MODEL #
    ###############
    print "building the model... "
    embedding = nn.Embedding(emb_words.size(0), emb_words.size(1))
    embedding.weight = nn.Parameter(emb_words, requires_grad=non_static)
    qa_rnn_model = QARNNModel(embedding, batch_size, sequence_length, n_in_rnn, n_hidden_rnn, n_in_mlp,
                              n_hidden_mlp, n_out)
    # only optimize parameters that require gradients (the embedding may be frozen).
    parameters = filter(lambda p: p.requires_grad, qa_rnn_model.parameters())
    print qa_rnn_model
    ###############
    # TRAIN MODEL #
    ###############
    print "training the model... "
    criterion = nn.CrossEntropyLoss()
    optimizer = opt.RMSprop(parameters, learning_rate)
    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        train_loss = 0.0
        for index1 in xrange(n_train_batches):
            train_q_batch = Variable(train_q_data[batch_size * index1: batch_size * (index1 + 1)])
            train_a_batch = Variable(train_a_data[batch_size * index1: batch_size * (index1 + 1)])
            train_l_batch = Variable(train_l_data[batch_size * index1: batch_size * (index1 + 1)])
            train_prop_batch, _ = qa_rnn_model(train_q_batch, train_a_batch, drop_rate=0.5)
            loss = criterion(train_prop_batch, train_l_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0] / batch_size
            if index1 % 100 == 0:
                print "epoch: %d/%d, batch: %d/%d, cost: %f" % (epoch, n_epochs, index1, n_train_batches, train_loss)
                # valid
                valid_score_data = []
                for index in xrange(n_valid_batches):
                    valid_q_batch = Variable(valid_q_data[batch_size * index: batch_size * (index + 1)])
                    valid_a_batch = Variable(valid_a_data[batch_size * index: batch_size * (index + 1)])
                    valid_prop_batch, _ = qa_rnn_model(valid_q_batch, valid_a_batch, drop_rate=0.0)
                    valid_score_data.append(valid_prop_batch.data.numpy()[:, 1])
                valid_score_list = (np.concatenate(np.asarray(valid_score_data), axis=0)).tolist()
                valid_label_list = valid_l_data.numpy().tolist()
                # pad the tail examples dropped by batching with random scores.
                for i in xrange(len(valid_score_list), len(valid_label_list)):
                    valid_score_list.append(np.random.random())
                _eval = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")
                print "---valid mrr: ", _eval
                # test
                test_score_data = []
                for index in xrange(n_test_batches):
                    test_q_batch = Variable(test_q_data[batch_size * index: batch_size * (index + 1)])
                    test_a_batch = Variable(test_a_data[batch_size * index: batch_size * (index + 1)])
                    test_prop_batch, _ = qa_rnn_model(test_q_batch, test_a_batch, drop_rate=0.0)
                    test_score_data.append(test_prop_batch.data.numpy()[:, 1])
                test_score_list = (np.concatenate(np.asarray(test_score_data), axis=0)).tolist()
                test_label_list = test_l_data.numpy().tolist()
                for i in xrange(len(test_score_list), len(test_label_list)):
                    test_score_list.append(np.random.random())
                _eval = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")
                print "---test mrr: ", _eval
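
# Note on the random-score padding used above (an explanatory sketch, not part of the training loop):
# both RNN trainers iterate over n_*_batches = N // batch_size full batches, so the final incomplete
# batch is never scored and the score list can end up shorter than the label list. The padding keeps
# the two lists the same length for qa_evaluate, roughly:
#
#   n_scored = (len(label_list) // batch_size) * batch_size
#   score_list += [np.random.random() for _ in xrange(len(label_list) - n_scored)]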

def build_model(batch_size, img_h, img_w, filter_windows, filter_num, n_in, n_hidden, n_out, L1_reg, L2_reg,
                conv_non_linear, learning_rate, n_epochs, random=False, non_static=False):
    """
    Build the CNN model for QA.
    :param batch_size: batch size
    :param img_h: sentence length
    :param img_w: word vector dimension [100]
    :param filter_windows: filter window sizes
    :param filter_num: the number of feature maps (per filter window)
    :param n_in: number of input units
    :param n_hidden: number of hidden units
    :param n_out: number of output units
    :param L1_reg: MLP L1 loss weight
    :param L2_reg: MLP L2 loss weight
    :param conv_non_linear: convolution activation
    :param learning_rate: learning rate
    :param n_epochs: number of epochs
    :param random: bool, use random embeddings or pre-trained embeddings
    :param non_static: bool, treat the word embeddings as trainable parameters or not
    :return:
    """
    global rng
    ###############
    # LOAD DATA   #
    ###############
    print "loading the data... "
    path = "/Users/chenjun/PycharmProjects/DBQA/"
    loader = data_loader(path + "pkl/data-train-nn.pkl",
                         path + "pkl/data-valid-nn.pkl",
                         path + "pkl/data-test-nn.pkl",
                         path + "pkl/index2vec.pkl")
    valid_group_list = pickle.load(open(path + "pkl/valid_group.pkl"))
    test_group_list = [int(x.strip()) for x in open(path + "data/dbqa-data-test.txt.group")]
    datasets, emb_words = loader.get_input_by_model(model="theano", random=random)
    train_q_data, valid_q_data, test_q_data = datasets[0]
    train_a_data, valid_a_data, test_a_data = datasets[1]
    train_l_data, valid_l_data, test_l_data = datasets[2]
    features = get_overlap(path, length=img_h)
    train_overlap_q, valid_overlap_q, test_overlap_q = features[0]
    train_overlap_a, valid_overlap_a, test_overlap_a = features[1]
    # calculate the number of batches
    n_train_batches = train_q_data.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_q_data.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_q_data.get_value(borrow=True).shape[0] // batch_size
    print "batch_size: %i, n_train_batches: %i, n_valid_batches: %i, n_test_batches: %i" % \
        (batch_size, n_train_batches, n_valid_batches, n_test_batches)
    ###############
    # BUILD MODEL #
    ###############
    print "building the model... "
    # define the input variables
    index = T.lscalar(name="index")
    drop_rate = T.fscalar(name="drop_rate")
    x1 = T.matrix(name='x1', dtype='int64')
    x2 = T.matrix(name='x2', dtype='int64')
    y = T.lvector(name='y')
    x1_overlap = T.tensor4(name="x1_overlap", dtype='float32')
    x2_overlap = T.tensor4(name="x2_overlap", dtype='float32')
    # map word indices to embedding vectors and append the overlap feature as the last column.
    _x1 = emb_words[x1.flatten()].reshape((x1.shape[0], 1, img_h, img_w - 1))
    emb_x1 = T.concatenate([_x1, x1_overlap], axis=3)
    _x2 = emb_words[x2.flatten()].reshape((x2.shape[0], 1, img_h, img_w - 1))
    emb_x2 = T.concatenate([_x2, x2_overlap], axis=3)
    # convolution layers, one per filter window size.
    conv_layers = []
    q_input = []
    a_input = []
    for i, filter_h in enumerate(filter_windows):
        filter_w = img_w
        filter_shape = (filter_num, 1, filter_h, filter_w)
        pool_size = (img_h - filter_h + 1, img_w - filter_w + 1)
        conv_layer = CNNModule(rng, filter_shape=filter_shape, pool_size=pool_size, non_linear=conv_non_linear)
        q_conv_output, a_conv_output = conv_layer(emb_x1, emb_x2)
        q_conv_output = q_conv_output.flatten(2)  # [batch_size, filter_num]
        a_conv_output = a_conv_output.flatten(2)  # [batch_size, filter_num]
        q_input.append(q_conv_output)
        a_input.append(a_conv_output)
        conv_layers.append(conv_layer)
    q_input = T.concatenate(q_input, axis=1)  # [batch_size, filter_num * len(filter_windows)]
    a_input = T.concatenate(a_input, axis=1)  # [batch_size, filter_num * len(filter_windows)]
    num_filters = len(filter_windows) * filter_num
    interact_layer = InteractLayer(rng, num_filters, num_filters, dim=n_in)
    qa_vec = interact_layer(q_input, a_input)
    bn_layer = BatchNormLayer(n_in=n_in, inputs=qa_vec)
    # classifier = MLP(rng, input=bn_layer.out, n_in=n_in, n_hidden=n_hidden, n_out=n_out)
    classifier = MLPDropout(rng, input=bn_layer.out, n_in=n_in, n_hidden=n_hidden, n_out=n_out,
                            dropout_rate=drop_rate)
    # model params
    params = classifier.params + interact_layer.params + bn_layer.params
    for i in xrange(len(conv_layers)):
        params += conv_layers[i].params
    if non_static:
        print "---CNN-NON-STATIC---"
        params += [emb_words]
    else:
        print "---CNN-STATIC---"
    opt = Optimizer()
    cost = (classifier.cross_entropy(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)
    # updates = opt.sgd_updates_adadelta(params, cost, 0.95, 1e-6, 9)
    updates = opt.RMSprop(params, cost)
    train_model = theano.function(
        inputs=[index, drop_rate],
        updates=updates,
        outputs=cost,
        givens={
            x1: train_q_data[index * batch_size: (index + 1) * batch_size],
            x2: train_a_data[index * batch_size: (index + 1) * batch_size],
            y: train_l_data[index * batch_size: (index + 1) * batch_size],
            x1_overlap: train_overlap_q[index * batch_size: (index + 1) * batch_size],
            x2_overlap: train_overlap_a[index * batch_size: (index + 1) * batch_size]
        },
    )
    valid_model = theano.function(
        inputs=[index, drop_rate],
        outputs=classifier.pred_prob(),
        givens={
            x1: valid_q_data[index * batch_size: (index + 1) * batch_size],
            x2: valid_a_data[index * batch_size: (index + 1) * batch_size],
            x1_overlap: valid_overlap_q[index * batch_size: (index + 1) * batch_size],
            x2_overlap: valid_overlap_a[index * batch_size: (index + 1) * batch_size]
        },
    )
    test_model = theano.function(
        inputs=[index, drop_rate],
        outputs=classifier.pred_prob(),
        givens={
            x1: test_q_data[index * batch_size: (index + 1) * batch_size],
            x2: test_a_data[index * batch_size: (index + 1) * batch_size],
            x1_overlap: test_overlap_q[index * batch_size: (index + 1) * batch_size],
            x2_overlap: test_overlap_a[index * batch_size: (index + 1) * batch_size]
        },
    )
    ###############
    # TRAIN MODEL #
    ###############
    print "training the model... "
    epoch = 0
    valid_dic = OrderedDict()
    eval_dic = OrderedDict()
    while epoch < n_epochs:
        epoch += 1
        batch_cost = 0.
        for batch_index1 in xrange(n_train_batches):
            batch_cost += train_model(batch_index1, 0.5)  # with dropout
            if batch_index1 % 100 == 0:
                print 'epoch %i/%i, batch %i/%i, cost %f' % (epoch, n_epochs, batch_index1, n_train_batches,
                                                             batch_cost / n_train_batches)
                ###############
                # VALID MODEL #
                ###############
                valid_score_data = []
                for batch_index2 in xrange(n_valid_batches):
                    batch_pred = valid_model(batch_index2, 0.0)  # no dropout
                    valid_score_data.append(batch_pred)
                valid_score_list = (np.concatenate(np.asarray(valid_score_data), axis=0)).tolist()
                valid_label_list = valid_l_data.get_value(borrow=True).tolist()
                # pad the tail examples dropped by batching with random scores.
                for i in xrange(len(valid_score_list), len(valid_label_list)):
                    valid_score_list.append(np.random.random())
                _eval = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")
                print "---valid mrr: ", _eval
                valid_dic[str(epoch) + "-" + str(batch_index1)] = _eval
                ###############
                # TEST MODEL  #
                ###############
                test_score_data = []
                for batch_index3 in xrange(n_test_batches):
                    batch_pred = test_model(batch_index3, 0.0)  # no dropout
                    test_score_data.append(batch_pred)
                test_score_list = (np.concatenate(np.asarray(test_score_data), axis=0)).tolist()
                test_label_list = test_l_data.get_value(borrow=True).tolist()
                for i in xrange(len(test_score_list), len(test_label_list)):
                    test_score_list.append(np.random.random())
                _eval = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")
                print "---test mrr: ", _eval
                eval_dic[str(epoch) + "-" + str(batch_index1)] = _eval
                # dump the per-checkpoint scores for later ensembling.
                pickle.dump(valid_score_list,
                            open(path + "result/cnn-overlap-valid.pkl." + str(epoch) + "-" + str(batch_index1), "w"))
                pickle.dump(test_score_list,
                            open(path + "result/cnn-overlap-test.pkl." + str(epoch) + "-" + str(batch_index1), "w"))
    pickle.dump(test_label_list, open(path + "result/test_label.pkl", "w"))
    pickle.dump(valid_label_list, open(path + "result/valid_label.pkl", "w"))
    # keep the ten checkpoints with the highest validation MRR and ensemble them.
    _valid_dic = sorted(valid_dic.items(), key=lambda x: x[1])[-10:]
    _eval_dic = sorted(eval_dic.items(), key=lambda x: x[1])[-10:]
    print "valid dic: ", _valid_dic
    print "eval dic: ", _eval_dic
    valid_score_file = [path + "result/cnn-overlap-valid.pkl." + x[0] for x in _valid_dic]
    test_score_file = [path + "result/cnn-overlap-test.pkl." + x[0] for x in _valid_dic]  # checkpoints chosen by validation MRR
    valid_label_file = path + "result/valid_label.pkl"
    test_label_file = path + "result/test_label.pkl"
    test_ensemble_file = path + "result/test_ensemble_overlap.pkl"
    valid_ensemble_file = path + "result/valid_ensemble_overlap.pkl"
    valid_mrr = ensemble(valid_score_file, valid_label_file, valid_group_list, valid_ensemble_file)
    test_mrr = ensemble(test_score_file, test_label_file, test_group_list, test_ensemble_file)
    print "---ensemble valid mrr: ", valid_mrr
    print "---ensemble test mrr: ", test_mrr
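
# Example call for the Theano CNN (a minimal sketch; all hyper-parameter values below are
# illustrative guesses, not values taken from the project):
#
#   build_model(batch_size=64, img_h=50, img_w=101, filter_windows=[1, 2, 3], filter_num=100,
#               n_in=200, n_hidden=100, n_out=2, L1_reg=0.0, L2_reg=0.0001,
#               conv_non_linear="tanh", learning_rate=0.001, n_epochs=10,
#               random=False, non_static=True)
#
# Note that the embedding lookup supplies img_w - 1 columns and the overlap feature supplies the
# final column, so img_w should be the word-vector dimension plus one.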

def evaluate(path, valid_label_file, test_label_file, valid_group_file, test_group_file):
    """Report valid/test MRR separately for each single ensembled model (question, lcs, overlap)."""
    valid_data_overlap = pickle.load(open(path + "valid_ensemble_overlap.pkl"))
    test_data_overlap = pickle.load(open(path + "test_ensemble_overlap.pkl"))
    valid_data_lcs = pickle.load(open(path + "valid_ensemble_lcs.pkl"))
    test_data_lcs = pickle.load(open(path + "test_ensemble_lcs.pkl"))
    valid_data_question = pickle.load(open(path + "valid_ensemble_question.pkl"))
    test_data_question = pickle.load(open(path + "test_ensemble_question.pkl"))
    valid_label_list = pickle.load(open(valid_label_file))
    valid_group_list = pickle.load(open(valid_group_file))
    test_label_list = pickle.load(open(test_label_file))
    test_group_list = [int(x.strip()) for x in open(test_group_file, "r")]
    # question model
    valid_score_list = [valid_data_question[i] for i in xrange(len(valid_data_lcs))]
    valid_mrr = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")
    test_score_list = [test_data_question[i] for i in xrange(len(test_data_lcs))]
    test_mrr = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")
    print "question: %s-%s " % (valid_mrr, test_mrr)
    # lcs model
    valid_score_list = [valid_data_lcs[i] for i in xrange(len(valid_data_lcs))]
    valid_mrr = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")
    test_score_list = [test_data_lcs[i] for i in xrange(len(test_data_lcs))]
    test_mrr = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")
    print "lcs: %s-%s " % (valid_mrr, test_mrr)
    # overlap model
    valid_score_list = [valid_data_overlap[i] for i in xrange(len(valid_data_lcs))]
    valid_mrr = qa_evaluate(valid_score_list, valid_label_list, valid_group_list, label=1, mod="mrr")
    test_score_list = [test_data_overlap[i] for i in xrange(len(test_data_lcs))]
    test_mrr = qa_evaluate(test_score_list, test_label_list, test_group_list, label=1, mod="mrr")
    print "overlap: %s-%s " % (valid_mrr, test_mrr)

def ensemble(score_file_list, label_file, group_list, ensemble_file):
    """Average the scores of several checkpoints, dump the ensembled scores, and return their MRR."""
    score_list = average_score(score_file_list)
    pickle.dump(score_list, open(ensemble_file, "w"))
    label_list = pickle.load(open(label_file))
    mrr = qa_evaluate(score_list, label_list, group_list, label=1, mod="mrr")
    return mrr
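
# average_score is defined elsewhere in the project. A minimal sketch of what it is assumed to do
# (load each checkpoint's pickled score list and average them element-wise):
#
#   def average_score(score_file_list):
#       score_lists = [pickle.load(open(f)) for f in score_file_list]
#       return [sum(scores) / float(len(scores)) for scores in zip(*score_lists)]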