def validation(validate_model, testList, vocab, batch_size, words_num_dim):
    index, score_list = int(0), []
    while True:
        x1, x2, x3, m1, m2, m3 = load_test_data(testList, vocab, index,
                                                batch_size, words_num_dim)
        batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3)
        for score in batch_scores:
            score_list.append(score)
        index += batch_size
        if index >= len(testList):
            break
        #log('Evaluation ' + str(index), logfile_path)
        print 'Evaluation ...', str(index)
    sdict, index = {}, int(0)
    qa_count = 0
    for items in testList:
        qid = items[1].split(':')[1]
        if not qid in sdict:
            sdict[qid] = []
        sdict[qid].append((score_list[index], items[0]))
        index += 1
        if int(qid) > qa_count:
            qa_count = int(qid)
    qa_count += 1
    top1 = float(0)
    map_sum = float(0)
    mrr_sum = float(0)
    for qid, items in sdict.items():
        # rank the candidates for this question by model score, best first
        items.sort(key=operator.itemgetter(0), reverse=True)
        # for top-1: is the highest-scored candidate a correct answer?
        score, flag = items[0]
        if flag == '1':
            top1 += 1
        # for mrr: add the reciprocal rank of each correct answer
        mrr_index = 0
        for score, flag in items:
            mrr_index += 1
            if flag == '1':
                mrr_sum += float(1) / float(mrr_index)
        # for map: average precision@rank over the correct answers
        map_index_down = 0
        map_index_up = 0
        temp_map_sum = 0
        for score, flag in items:
            map_index_down += 1
            if flag == '1':
                map_index_up += 1
                temp_map_sum += float(map_index_up) / float(map_index_down)
        if map_index_up > 0:  # guard: a question may have no correct answer
            temp_map_sum /= float(map_index_up)
        map_sum += temp_map_sum
        log('qid = ' + str(qid) + ' / top1 count = ' + str(top1), logfile_path)
    top1 /= float(qa_count)
    mrr_sum /= float(qa_count)
    map_sum /= float(qa_count)
    log('top-1 = ' + str(top1) + ' / ' + 'mrr = ' + str(mrr_sum)
        + ' / ' + 'map = ' + str(map_sum), logfile_path)
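# A minimal worked example of the three metrics computed above (the helper
# name _metrics_example and the toy scores are illustrative, not part of the
# original code). For flags ['0', '1', '0', '1'] in ranked order: top-1 is 0,
# MRR (summed over every correct answer, as above) is 1/2 + 1/4 = 0.75, and
# average precision is (1/2 + 2/4) / 2 = 0.5.
def _metrics_example():
    items = [(0.9, '0'), (0.8, '1'), (0.6, '0'), (0.4, '1')]  # sorted by score
    top1 = 1.0 if items[0][1] == '1' else 0.0
    mrr = sum(1.0 / (i + 1) for i, (_, flag) in enumerate(items) if flag == '1')
    hits, precision_sum = 0, 0.0
    for i, (_, flag) in enumerate(items):
        if flag == '1':
            hits += 1
            precision_sum += float(hits) / (i + 1)
    ap = precision_sum / hits if hits else 0.0
    return top1, mrr, ap  # (0.0, 0.75, 0.5)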
def train():
    global logfile_path
    global train1file
    global train0file
    global test1file
    global idf_file_path
    batch_size = int(256)
    filter_sizes = [1, 2, 3]
    num_filters = 1000
    words_num_dim = 50
    # normal embedding size
    embedding_size = 300
    # new embedding size with idf
    #embedding_size = 301
    learning_rate = 0.001
    n_epochs = 20000
    validation_freq = 50
    keep_prob_value = 0.7
    margin_size = 0.05
    logfile_path = os.path.join(logfile_path, 'CNN-' \
                                + GetNowTime() + '-' \
                                + '-log.txt')
    log("New start ...", logfile_path)
    log(str(time.asctime(time.localtime(time.time()))), logfile_path)
    log("batch_size = " + str(batch_size), logfile_path)
    log("filter_sizes = " + str(filter_sizes), logfile_path)
    log("num_filters = " + str(num_filters), logfile_path)
    log("embedding_size = " + str(embedding_size), logfile_path)
    log("learning_rate = " + str(learning_rate), logfile_path)
    log("n_epochs = " + str(n_epochs), logfile_path)
    log("margin_size = " + str(margin_size), logfile_path)
    log("words_num_dim = " + str(words_num_dim), logfile_path)
    log("validation_freq = " + str(validation_freq), logfile_path)
    log("keep_prob_value = " + str(keep_prob_value), logfile_path)
    log("train_1_file = " + str(train1file.split('/')[-1]), logfile_path)
    log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path)
    log("test_file = " + str(test1file.split('/')[-1]), logfile_path)
    log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path)
    log("idf_file_path = " + str(idf_file_path.split('/')[-1]), logfile_path)
    log("lda_train_file_path = " + str(lda_train_file_path.split('/')[-1]), logfile_path)
    log("lda_test_file_path = " + str(lda_test_file_path.split('/')[-1]), logfile_path)
    vocab = build_vocab()
    # word_embeddings is a list of shape numOfWords * embedding_size
    # for normal embeddings
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    # for new embeddings with idf features
    #word_embeddings = load_word_embeddings_with_idf(vocab, embedding_size, idf_file_path)
    trainList = load_train1_list()
    testList = load_test_list()
    train0Dict = load_train0_dict()
    # train_x1.shape = batch_size * words_num_dim
    #train_x1, train_x2, train_x3 = load_train_data(trainList, vocab, batch_size, words_num_dim)
    train_x1, train_x2, train_x3 = load_train_data_from_2files(
        train0Dict, trainList, vocab, batch_size, words_num_dim)
    x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3')
    keep_prob = T.fscalar('keep_prob')
    model = QACnn(
        input1=x1, input2=x2, input3=x3,
        keep_prob=keep_prob,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[1],
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        margin_size=margin_size)
    dbg_x1 = model.dbg_x1
    dbg_outputs_1 = model.dbg_outputs_1
    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    # plain SGD parameter updates
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3')
    prob = T.fscalar('prob')
    train_model = theano.function(
        [p1, p2, p3, prob],
        [cost, accuracy, dbg_x1, dbg_outputs_1],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, keep_prob: prob
        })
    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, prob],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, keep_prob: prob
        })
    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        # resample a fresh training batch every epoch
        #train_x1, train_x2, train_x3 = load_train_data(trainList, vocab, batch_size)
        train_x1, train_x2, train_x3 = load_train_data_from_2files(
            train0Dict, trainList, vocab, batch_size, words_num_dim)
        cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(
            train_x1, train_x2, train_x3, keep_prob_value)
        log('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij)
            + ', acc:' + str(acc), logfile_path)
        if epoch % validation_freq == 0:
            log('Evaluation ......', logfile_path)
            validation(validate_model, testList, vocab, batch_size, words_num_dim)
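# The log(msg, path) helper used throughout this file is defined elsewhere in
# the project. A minimal sketch of its assumed behavior (append the message to
# the logfile and echo it to stdout); the real implementation may differ.
def log(msg, logfile_path):
    print msg
    with open(logfile_path, 'a') as f:
        f.write(str(msg) + '\n')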
def validation(validate_model, testList, vocab, batch_size, words_num_dim):
    index, score_list = int(0), []
    while True:
        x1, x2, x3 = load_test_data(testList, vocab, index, batch_size, words_num_dim)
        # keep_prob = 1.0: dropout is disabled at evaluation time
        batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
        for score in batch_scores:
            # the last batch may wrap past the end of testList
            if len(score_list) < len(testList):
                score_list.append(score)
            else:
                break
        index += batch_size
        #log('Evaluation ' + str(index), logfile_path)
        print 'Evaluation ...', str(index)
        if index >= len(testList):
            break
    sdict, index = {}, int(0)
    qa_count = 0
    for items in testList:
        qid = items[1].split(':')[1]
        question = items[2].strip('_<a>')
        answer = items[3].strip('_<a>')
        if not qid in sdict:
            sdict[qid] = []
        sdict[qid].append((score_list[index], items[0], question, answer))
        index += 1
        if int(qid) > qa_count:
            qa_count = int(qid)
    qa_count += 1
    top1 = float(0)
    map_sum = float(0)
    mrr_sum = float(0)
    for qid, items in sdict.items():
        items.sort(key=operator.itemgetter(0), reverse=True)
        # just for analysis
        '''
        global logfile_path
        analysis_log_file_path = logfile_path + '.analysis'
        for score, flag, question, answer in items:
            log('[' + str(qid) + ']' + question, analysis_log_file_path)
            log('[Predicted][' + '1:' + str(len(items)) + '] ' + answer, analysis_log_file_path)
            break
        expected_answer_index = 0
        expected_answer_flag = False
        for score, flag, question, answer in items:
            expected_answer_index += 1
            if flag == '1':
                log('[Expected][' + str(expected_answer_index) + ':' + str(len(items)) + '] ' + answer, analysis_log_file_path)
                expected_answer_flag = True
                break
        if expected_answer_flag == False:
            log('[Expected][' + str(qid) + '/' + flag + '/' + 'Not Exist!', analysis_log_file_path)
        log('', analysis_log_file_path)
        '''
        # for top-1
        score, flag, question, answer = items[0]
        if flag == '1':
            top1 += 1
        # for mrr
        mrr_index = 0
        for score, flag, question, answer in items:
            mrr_index += 1
            if flag == '1':
                mrr_sum += float(1) / float(mrr_index)
        # for map
        map_index_down = 0
        map_index_up = 0
        temp_map_sum = 0
        for score, flag, question, answer in items:
            map_index_down += 1
            if flag == '1':
                map_index_up += 1
                temp_map_sum += float(map_index_up) / float(map_index_down)
        if map_index_up > 0:  # guard: a question may have no correct answer
            temp_map_sum /= float(map_index_up)
        map_sum += temp_map_sum
        #log('qid = ' + str(qid) + ' / top1 count = ' + str(top1), logfile_path)
    top1 /= float(qa_count)
    mrr_sum /= float(qa_count)
    map_sum /= float(qa_count)
    log('top-1 = ' + str(top1) + ' / ' + 'mrr = ' + str(mrr_sum)
        + ' / ' + 'map = ' + str(map_sum), logfile_path)
def validation(validate_model, testList, vocab, batch_size, words_num_dim):
    index, score_list = int(0), []
    while True:
        x1, x2, x3 = load_test_data(testList, vocab, index, batch_size, words_num_dim)
        batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
        for score in batch_scores:
            # the last batch may wrap past the end of testList
            if len(score_list) < len(testList):
                score_list.append(score)
            else:
                break
        index += batch_size
        #log('Evaluation ' + str(index), logfile_path)
        print 'Evaluation ...', str(index)
        if index >= len(testList):
            break
    sdict, index = {}, int(0)
    qa_count = 0
    for items in testList:
        qid = items[1].split(':')[1]
        if not qid in sdict:
            sdict[qid] = []
        sdict[qid].append((score_list[index], items[0]))
        index += 1
        if int(qid) > qa_count:
            qa_count = int(qid)
    qa_count += 1
    dcg3 = float(0)
    dcg5 = float(0)
    dcg_all = float(0)
    idcg3 = float(0)
    idcg5 = float(0)
    idcg_all = float(0)
    #top_1 = float(0)
    #top_3 = float(0)
    for qid, items in sdict.items():
        items.sort(key=operator.itemgetter(0), reverse=True)
        label_list = []
        index = 0
        top_3_flag = False
        for score, label in items:
            label_list.append(int(label))
            index += 1
            #if index <= 1 and str(label) == '2':
            #    top_1 += 1
            #if index <= 3 and str(label) == '2' and top_3_flag == False:
            #    top_3 += 1
            #    top_3_flag = True
        top_k_3 = min(3, len(label_list))
        top_k_5 = min(5, len(label_list))
        top_k_all = len(label_list)
        dcg3 += DCG(label_list, top_k_3)
        dcg5 += DCG(label_list, top_k_5)
        dcg_all += DCG(label_list, top_k_all)
        # ideal DCG: the same labels sorted into the best possible order
        temp = sorted(label_list, reverse=True)
        idcg3 += DCG(temp, top_k_3)
        idcg5 += DCG(temp, top_k_5)
        idcg_all += DCG(temp, top_k_all)
        log('qid = ' + str(qid) + ' / dcg3 = ' + str(DCG(label_list, top_k_3))
            + ' / idcg3 = ' + str(idcg3), logfile_path)
        log('qid = ' + str(qid) + ' / dcg5 = ' + str(DCG(label_list, top_k_5))
            + ' / idcg5 = ' + str(idcg5), logfile_path)
        log('qid = ' + str(qid) + ' / dcg_all = ' + str(DCG(label_list, top_k_all))
            + ' / idcg_all = ' + str(idcg_all), logfile_path)
    log('ALL-DCG: all-dcg3 = ' + str(dcg3) + ' / ' + 'all-dcg5 = ' + str(dcg5)
        + ' / ' + 'all-dcg = ' + str(dcg_all), logfile_path)
    log('ALL-iDCG: all-idcg3 = ' + str(idcg3) + ' / ' + 'all-idcg5 = ' + str(idcg5)
        + ' / ' + 'all-idcg = ' + str(idcg_all), logfile_path)
    log('AVG-DCG3 = ' + str(dcg3 / idcg3) + ' / AVG-DCG5 = ' + str(dcg5 / idcg5)
        + ' / AVG-DCG = ' + str(dcg_all / idcg_all), logfile_path)
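# DCG(label_list, top_k) is called above but defined elsewhere in the project.
# A standard definition consistent with that call signature, assuming graded
# relevance labels and the common (2^rel - 1) / log2(rank + 1) gain; the
# project's actual implementation may differ.
import math

def DCG(label_list, top_k):
    dcg = float(0)
    for i in range(top_k):
        # rank is i + 1, so the discount is log2(i + 2)
        dcg += (math.pow(2, label_list[i]) - 1) / math.log(i + 2, 2)
    return dcg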
def train():
    global logfile_path
    global trainfile
    global train0file
    global test1file
    batch_size = int(256)
    embedding_size = 300
    learning_rate = 0.005
    n_epochs = 20000
    words_num_dim = 1200
    validation_freq = 10
    filter_sizes = [1, 2, 3, 5]
    num_filters = 500
    margin_size = 0.05
    logfile_path = os.path.join(logfile_path, 'LSTM-' + GetNowTime() + '-' \
                                + 'batch_size-' + str(batch_size) + '-' \
                                + 'num_filters-' + str(num_filters) + '-' \
                                + 'embedding_size-' + str(embedding_size) + '-' \
                                + 'n_epochs-' + str(n_epochs) + '-' \
                                + 'freq-' + str(validation_freq) + '-' \
                                + '-log.txt')
    log("New start ...", logfile_path)
    log(str(time.asctime(time.localtime(time.time()))), logfile_path)
    log("batch_size = " + str(batch_size), logfile_path)
    log("filter_sizes = " + str(filter_sizes), logfile_path)
    log("num_filters = " + str(num_filters), logfile_path)
    log("embedding_size = " + str(embedding_size), logfile_path)
    log("learning_rate = " + str(learning_rate), logfile_path)
    log("words_num_dim = " + str(words_num_dim), logfile_path)
    log("n_epochs = " + str(n_epochs), logfile_path)
    log("margin_size = " + str(margin_size), logfile_path)
    log("validation_freq = " + str(validation_freq), logfile_path)
    log("train_1_file = " + str(trainfile.split('/')[-1]), logfile_path)
    log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path)
    log("test_file = " + str(test1file.split('/')[-1]), logfile_path)
    log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path)
    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    train0Dict = load_train0_dict()
    train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(
        train0Dict, trainList, vocab, batch_size, words_num_dim)
    x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
    m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
    model = LSTM(
        input1=x1, input2=x2, input3=x3,
        mask1=m1, mask2=m2, mask3=m3,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[0],  # rows are time steps (sequence_len)
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        margin_size=margin_size)
    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    # plain SGD parameter updates
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
    q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
    train_model = theano.function(
        [p1, p2, p3, q1, q2, q3],
        [cost, accuracy],
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3,
            m1: q1, m2: q2, m3: q3
        })
    # use fmatrix here as well so the givens substitution type-checks
    v1, v2, v3 = T.fmatrix('v1'), T.fmatrix('v2'), T.fmatrix('v3')
    u1, u2, u3 = T.fmatrix('u1'), T.fmatrix('u2'), T.fmatrix('u3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, u1, u2, u3],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3,
            m1: u1, m2: u2, m3: u3
        })
    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        # resample a fresh training batch every epoch
        train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(
            train0Dict, trainList, vocab, batch_size, words_num_dim)
        cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
        log('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij)
            + ', acc:' + str(acc), logfile_path)
        if epoch % validation_freq == 0:
            log('Evaluation ......', logfile_path)
            validation(validate_model, testList, vocab, batch_size, words_num_dim)
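# Both QACnn and LSTM expose cost, cos12, cos13 and take a margin_size. A
# plausible reading of the training objective (an assumption; the models'
# source is not shown in this file) is the usual pairwise hinge loss over
# cosine similarities, where cos12 scores the (question, correct answer)
# pair and cos13 the (question, wrong answer) pair.
import numpy as np

def hinge_loss_sketch(cos12, cos13, margin=0.05):
    # zero loss once the positive pair beats the negative by at least margin
    return np.maximum(0.0, margin - cos12 + cos13).mean()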