def main():
    setup_logging()
    logger = logging.getLogger(__name__)
    # Build word and tag vocabularies from the dataset (words lowercased).
    processing_word = get_processing_word(lowercase=True)
    dataset = Dataset('./data/test.txt', processing_word=processing_word)
    create_vocabulary([dataset], './data/words.txt', './data/tags.txt')
    # Build the character vocabulary from the raw (unprocessed) text.
    dataset = Dataset('./data/test.txt')
    create_char_vocabulary([dataset], './data/chars.txt')
def main(_):
    # 1. Load data (X: list of token ids, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_vocabulary(
            word2vec_model_path=FLAGS.word2vec_model_path,
            name_scope="transformer_classification")
        vocab_size = len(vocabulary_word2index)
        print("transformer.vocab_size:", vocab_size)
        train, test, _ = load_data_multilabel_new(
            vocabulary_word2index, training_data_path=FLAGS.training_data_path)
        compare_train_data = WikiQA(word2vec=Word2Vec(),
                                    max_len=FLAGS.max_len_compare)
        compare_train_data.open_file(mode="train")
        compare_test_data = WikiQA(word2vec=Word2Vec(),
                                   max_len=FLAGS.max_len_compare)
        compare_test_data.open_file(mode="valid")
        trainX, trainY = train
        testX, testY = test
        # Pad every sequence with zeros up to the fixed sequence length.
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)
    # 2. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate the model.
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate,
                            FLAGS.batch_size, FLAGS.decay_steps,
                            FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model,
                            FLAGS.d_k, FLAGS.d_v, FLAGS.h, FLAGS.num_layer,
                            FLAGS.is_training, compare_train_data.num_features,
                            di=50, s=compare_train_data.max_len, w=4,
                            l2_reg=0.0004, l2_lambda=FLAGS.l2_lambda)
        print("=" * 50)
        print("List of Variables:")
        for v in tf.trainable_variables():
            print(v.name)
        print("=" * 50)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embeddings
                assign_pretrained_word_embedding(
                    sess, vocabulary_index2word, vocab_size, model,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)
        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)
        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            compare_train_data.reset_index()
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                batch_x1, batch_x2, _, batch_features = compare_train_data.next_batch(
                    batch_size=end - start)
                feed_dict = {
                    model.input_x: trainX[start:end],
                    model.dropout_keep_prob: 0.9,
                    model.x1: batch_x1,
                    model.x2: batch_x2,
                    model.features: batch_features
                }
                feed_dict[model.input_y_label] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [model.loss_val, model.accuracy, model.train_op],
                    feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print(
                        "transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))
                ################## VALIDATION PART ##################
                if FLAGS.batch_size != 0 and (
                        start % (FLAGS.validate_step * FLAGS.batch_size) == 0):
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY,
                                                  compare_test_data, batch_size)
                    print(
                        "transformer.classification.validation.part. previous_eval_loss:",
                        previous_eval_loss, ";current_eval_loss:", eval_loss)
                    if eval_loss > previous_eval_loss:  # if loss is not decreasing
                        # reduce the learning rate by a factor of 0.5
                        print(
                            "transformer.classification.==>validation.part.going to reduce the learning rate."
                        )
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print(
                            "transformer.classification==>validation.part.learning_rate1:",
                            learning_rate1, " ;learning_rate2:", learning_rate2)
                        # print("HierAtten==>Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                    else:  # loss is decreasing
                        if eval_loss < best_eval_loss:
                            print(
                                "transformer.classification==>going to save the model.eval_loss:",
                                eval_loss, ";best_eval_loss:", best_eval_loss)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                    compare_test_data.reset_index()
                #####################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
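# --- Hedged sketch (not from the original source): the validation block above
# implements a reduce-learning-rate-on-plateau policy. The same idea in
# isolation, with hypothetical callables halve_lr / save_checkpoint standing in
# for model.learning_rate_decay_half_op and saver.save:
def reduce_lr_on_plateau(eval_loss, state, halve_lr, save_checkpoint):
    if eval_loss > state['previous_eval_loss']:
        halve_lr()  # validation loss stopped decreasing: halve the learning rate
    elif eval_loss < state['best_eval_loss']:
        save_checkpoint()  # new best validation loss: persist the model
        state['best_eval_loss'] = eval_loss
    state['previous_eval_loss'] = eval_loss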
def main(_):
    # if FLAGS.use_pingyin:
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        FLAGS.traning_data_path, FLAGS.vocab_size, name_scope=FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    train, valid, test, true_label_percent = load_data(
        FLAGS.traning_data_path, vocabulary_word2index,
        vocabulary_label2index, FLAGS.sentence_len, FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    trainX1, trainX2, trainBlueScores, trainY = train
    validX1, validX2, validBlueScores, validY = valid
    testX1, testX2, testBlueScores, testY = test
    length_data_mining_features = len(trainBlueScores[0])
    print("length_data_mining_features:", length_data_mining_features)
    # print some message for debug purpose
    print("model_name:", FLAGS.model_name, ";length of training data:",
          len(trainX1), ";length of validation data:", len(testX1),
          ";true_label_percent:", true_label_percent, ";tokenize_style:",
          FLAGS.tokenize_style, ";vocabulary size:", vocab_size)
    print("train_x1:", trainX1[0], ";train_x2:", trainX2[0])
    print("data mining features.length:", len(trainBlueScores[0]),
          "data_mining_features:", trainBlueScores[0], ";train_y:", trainY[0])
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = DualBilstmCnnModel(
            filter_sizes, FLAGS.num_filters, num_classes, FLAGS.learning_rate,
            FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,
            FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
            FLAGS.hidden_size, FLAGS.is_training, model=FLAGS.model_name,
            similiarity_strategy=FLAGS.similiarity_strategy, top_k=FLAGS.top_k,
            max_pooling_style=FLAGS.max_pooling_style,
            length_data_mining_features=length_data_mining_features)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            if FLAGS.decay_lr_flag:
                # trainX1, trainX2, trainY = shuffle_data(trainX1, trainX2, trainY)
                for i in range(2):  # decay learning rate if necessary.
                    print(i, "Going to decay learning rate by half.")
                    sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if not os.path.exists(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            if FLAGS.use_pretrained_embedding:  # load pre-trained word embedding
                print("===>>>going to use pretrained word embeddings...")
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN,
                                                 FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX1)
        batch_size = FLAGS.batch_size
        iteration = 0
        best_acc = 0.60
        best_f1_score = 0.20
        weights_dict = init_weights_dict(vocabulary_label2index)  # init weights dict.
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            print("Auto.Going to shuffle data")
            trainX1, trainX2, trainBlueScores, trainY = shuffle_data(
                trainX1, trainX2, trainBlueScores, trainY)
            loss, eval_acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                input_x1, input_x2, input_bluescores, input_y = generate_batch_training_data(
                    trainX1, trainX2, trainBlueScores, trainY,
                    number_of_training_data, batch_size)
                # input_x1 = trainX1[start:end]
                # input_x2 = trainX2[start:end]
                # input_bluescores = trainBlueScores[start:end]
                # input_y = trainY[start:end]
                weights = get_weights_for_current_batch(input_y, weights_dict)
                feed_dict = {
                    textCNN.input_x1: input_x1,
                    textCNN.input_x2: input_x2,
                    textCNN.input_bluescores: input_bluescores,
                    textCNN.input_y: input_y,
                    textCNN.weights: np.array(weights),
                    textCNN.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                curr_loss, curr_acc, lr, _, _ = sess.run([
                    textCNN.loss_val, textCNN.accuracy, textCNN.learning_rate,
                    textCNN.update_ema, textCNN.train_op
                ], feed_dict)
                loss, eval_acc, counter = loss + curr_loss, eval_acc + curr_acc, counter + 1
                if counter % 100 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\tLearning rate:%.5f"
                        % (epoch, counter, loss / float(counter),
                           eval_acc / float(counter), lr))
                # middle checkpoint (disabled):
                # if start != 0 and start % (500 * FLAGS.batch_size) == 0:  # eval every 500 batches.
                #     eval_loss, acc, f1_score, precision, recall, _ = do_eval(sess, textCNN, validX1, validX2, validY, iteration)
                #     print("[Validation] Epoch %d Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f" % (epoch, eval_loss, acc, f1_score, precision, recall))
                #     # save model to checkpoint
                #     save_path = FLAGS.ckpt_dir + "model.ckpt"
                #     saver.save(sess, save_path, global_step=epoch)
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_accc, f1_scoree, precision, recall, weights_label = do_eval(
                    sess, textCNN, validX1, validX2, validBlueScores, validY,
                    iteration, vocabulary_index2word)
                weights_dict = get_weights_label_as_standard_dict(weights_label)
                print("label accuracy(used for label weight):==========>>>>", weights_dict)
                print(
                    "[Validation] Epoch %d\tLoss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, eval_accc, f1_scoree, precision, recall))
                # save model to checkpoint on improvement
                if eval_accc * 1.05 > best_acc and f1_scoree > best_f1_score:
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    print("going to save model. eval_f1_score:", f1_scoree,
                          ";previous best f1 score:", best_f1_score,
                          ";eval_acc", str(eval_accc), ";previous best_acc:",
                          str(best_acc))
                    saver.save(sess, save_path, global_step=epoch)
                    best_acc = eval_accc
                    best_f1_score = f1_scoree
            if FLAGS.decay_lr_flag and (epoch != 0 and
                                        (epoch == 1 or epoch == 3 or
                                         epoch == 5 or epoch == 8)):
                # TODO print("Auto.Restoring Variables from Checkpoint.")
                # TODO saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
                for i in range(2):  # decay learning rate if necessary.
print(i, "Going to decay learning rate by half.") sess.run(textCNN.learning_rate_decay_half_op) # 5.最后在测试集上做测试,并报告测试准确率 Test test_loss, acc_t, f1_score_t, precision, recall, weights_label = do_eval( sess, textCNN, testX1, testX2, testBlueScores, testY, iteration, vocabulary_index2word) print( "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:" % (test_loss, acc_t, f1_score_t, precision, recall)) pass
def main(_):
    training_data_path = '/Users/liyangyang/Downloads/bdci/train.txt'
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = \
        data_util.create_vocabulary(training_data_path, 17259, name_scope='cnn')
    vocab_size = len(vocabulary_word2index) + 1
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    print(vocabulary_index2label)
    train, test = data_util.load_data_multilabel(training_data_path,
                                                 vocabulary_word2index,
                                                 vocabulary_label2index, 200)
    trainX, trainY = train
    testX, testY = test
    # trainX = trainX[0:8000]
    # trainY = trainY[0:8000]
    # testX = testX[0:500]
    # testY = testY[0:500]
    # print some message for debug purpose
    print("length of training data:", len(trainX),
          ";length of validation data:", len(testX))
    print("trainX.shape", np.array(trainX).shape)
    print("trainY.shape", np.array(trainY).shape)
    print("trainX[1]:", trainX[1])
    print("trainY[1]:", trainY[1])
    print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            # for i in range(3):  # decay learning rate if necessary.
            #     print(i, "Going to decay learning rate by half.")
            #     sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.dropout_keep_prob: 0.5,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, lr, curr_acc, _ = sess.run([
                    textCNN.loss_val, textCNN.learning_rate, textCNN.accuracy,
                    textCNN.train_op
                ], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 2 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter), lr,
                           acc / float(counter)))
                ########################################################################
                # Mid-epoch evaluation (disabled):
                # if start % (2000 * FLAGS.batch_size) == 0:  # eval every 2000 batches.
                # eval_loss, f1_score, precision, recall = do_eval(sess, textCNN, testX, testY, iteration)
                # print("Epoch %d Validation Loss:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f" % (
                #     epoch, eval_loss, f1_score, precision, recall))
                # # save model to checkpoint
                # save_path = FLAGS.ckpt_dir + "model.ckpt"
                # saver.save(sess, save_path, global_step=epoch)
                ########################################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
                eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY,
                                              iteration, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
        # 5. Finally, evaluate on the test set and report the test loss.
        eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY, iteration,
                                      batch_size)
        print("Test Loss:%.3f" % (eval_loss))
    pass
def main(_):
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        FLAGS.traning_data_path, FLAGS.vocab_size, name_scope=FLAGS.name_scope)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    train, test = load_data_multilabel(FLAGS.traning_data_path,
                                       vocabulary_word2index,
                                       vocabulary_label2index,
                                       FLAGS.sentence_len)
    trainX, trainY = train
    testX, testY = test
    # print some message for debug purpose
    print("length of training data:", len(trainX),
          ";length of validation data:", len(testX))
    print("trainX[0]:", trainX[0])
    print("trainY[0]:", trainY[0])
    train_y_short = get_target_label_short(trainY[0])
    print("train_y_short:", train_y_short)
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            # for i in range(3):  # decay learning rate if necessary.
            #     print(i, "Going to decay learning rate by half.")
            #     sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN,
                                                 FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, counter = 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.dropout_keep_prob: 0.5,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, lr, _, _ = sess.run([
                    textCNN.loss_val, textCNN.learning_rate,
                    textCNN.update_ema, textCNN.train_op
                ], feed_dict)
                loss, counter = loss + curr_loss, counter + 1
                if counter % 50 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f"
                        % (epoch, counter, loss / float(counter), lr))
                ########################################################################
                if start % (2000 * FLAGS.batch_size) == 0:  # eval every 2000 batches.
                    eval_loss, f1_score, precision, recall = do_eval(
                        sess, textCNN, testX, testY, iteration)
                    print(
                        "Epoch %d Validation Loss:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                        % (epoch, eval_loss, f1_score, precision, recall))
                    # save model to checkpoint
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    saver.save(sess, save_path, global_step=epoch)
                ########################################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, f1_score, precision, recall = do_eval(
                    sess, textCNN, testX, testY, iteration)
                print(
                    "Epoch %d Validation Loss:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, f1_score, precision, recall))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5. Finally, evaluate on the test set and report the test loss.
        test_loss, _, _, _ = do_eval(sess, textCNN, testX, testY, iteration)
        print("Test Loss:%.3f" % (test_loss))
    pass
def main(_):
    # 1. Load data (X: list of token ids, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        # vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab'
        # print("end padding & transform to one hot...")
        # x_train, y = data_helpers.load_data_and_labels(FLAGS.data_file)
        # # vocab_processor = learn.preprocessing.VocabularyProcessor(2000, min_frequency=2)
        # # x = np.array(list(vocab_processor.fit_transform(x_train)))
        # # vocab_processor.save(vocab_processor_path)
        # vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path)
        # x = np.array(list(vocab_processor.transform(x_train)))
        # trainX = x[:100000]
        # testX = x[100000:]
        # trainY = y[:100000]
        # testY = y[100000:]
        # vocab_size = len(vocab_processor.vocabulary_)
        # print('vocab_size', vocab_size)
        # print("trainX[0]:", trainX[0]); print("trainY[0]:", trainY[0])
        # # Converting labels to binary vectors
        # print("end padding & transform to one hot...")
        training_data_path = '/Users/liyangyang/Downloads/dwb/new_data/train_set.txt'
        vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = \
            data_util.create_vocabulary(training_data_path, 345325, name_scope='cnn')
        vocab_size = len(vocabulary_word2index) + 1
        print("cnn_model.vocab_size:", vocab_size)
        num_classes = len(vocabulary_index2label)
        print("num_classes:", num_classes)
        print(vocabulary_index2label)
        train, test = data_util.load_data_multilabel(training_data_path,
                                                     vocabulary_word2index,
                                                     vocabulary_label2index,
                                                     5000)
        trainX, trainY = train
        testX, testY = test
        # Use a small subset for quick debugging.
        trainX = trainX[0:1000]
        trainY = trainY[0:1000]
        testX = testX[0:500]
        testY = testY[0:500]
        # print some message for debug purpose
        print("length of training data:", len(trainX),
              ";length of validation data:", len(testX))
        print("trainX.shape", np.array(trainX).shape)
        print("trainY.shape", np.array(trainY).shape)
        print("trainX[1]:", trainX[1])
        print("trainY[1]:", trainY[1])
        print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textRNN = TextRNN(FLAGS.num_classes, FLAGS.learning_rate,
                          FLAGS.batch_size, FLAGS.decay_steps,
                          FLAGS.decay_rate, FLAGS.sequence_length, vocab_size,
                          FLAGS.embed_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint for rnn model.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textRNN)
        curr_epoch = sess.run(textRNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    # print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [textRNN.loss_val, textRNN.accuracy, textRNN.train_op],
                    feed_dict={
                        textRNN.input_x: trainX[start:end],
                        textRNN.input_y: trainY[start:end],
                        textRNN.dropout_keep_prob: 1
                    })
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 1 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textRNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textRNN, testX, testY,
                                              batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5. Finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, textRNN, testX, testY, batch_size)
    pass
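# --- Hedged sketch (not from the original source): do_eval is defined
# elsewhere; a minimal version consistent with the (loss, accuracy) calls
# above, assuming the model exposes loss_val / accuracy and single-label
# inputs (the real helper may differ):
def do_eval_sketch(sess, model, evalX, evalY, batch_size):
    n = len(evalX)
    total_loss, total_acc, batches = 0.0, 0.0, 0
    for start, end in zip(range(0, n, batch_size),
                          range(batch_size, n, batch_size)):
        curr_loss, curr_acc = sess.run(
            [model.loss_val, model.accuracy],
            feed_dict={model.input_x: evalX[start:end],
                       model.input_y: evalY[start:end],
                       model.dropout_keep_prob: 1.0})  # no dropout at eval time
        total_loss, total_acc, batches = total_loss + curr_loss, total_acc + curr_acc, batches + 1
    return total_loss / float(batches), total_acc / float(batches)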
tf.app.flags.DEFINE_boolean("is_training_flag", True,
                            "is training.true:training,false:testing/inference")
tf.app.flags.DEFINE_integer("num_epochs", 15, "number of epochs to run.")
tf.app.flags.DEFINE_integer("validate_every", 1,
                            "Validate every validate_every epochs.")  # run validation every validate_every epochs
tf.app.flags.DEFINE_boolean("use_embedding", False,
                            "whether to use embedding or not.")
tf.app.flags.DEFINE_integer("num_filters", 128, "number of filters")  # 256--->512
tf.app.flags.DEFINE_string("word2vec_model_path", "word2vec-title-desc.bin",
                           "word2vec's vocabulary and vectors")
tf.app.flags.DEFINE_string("name_scope", "cnn", "name scope value.")
tf.app.flags.DEFINE_boolean("multi_label_flag", False,
                            "use multi label or single label.")
filter_sizes = [6, 7, 8]

# NOTE: the restore/loading fragment below assumes `sess` and `saver` are
# already defined in the enclosing scope (it was lifted from a larger script).
print("Restoring Variables from Checkpoint.")
saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
trainX, trainY, testX, testY = None, None, None, None
vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, _ = create_vocabulary(
    FLAGS.traning_data_path, FLAGS.vocab_size, name_scope=FLAGS.name_scope)
vocab_size = len(vocabulary_word2index)
print("cnn_model.vocab_size:", vocab_size)
num_classes = len(vocabulary_label2index)
print("num_classes:", num_classes)
# num_examples, FLAGS.sentence_len = trainX.shape
# print("num_examples of training:", num_examples, ";sentence_len:", FLAGS.sentence_len)
train, test = load_data_multilabel(FLAGS.traning_data_path,
                                   vocabulary_word2index,
                                   vocabulary_label2index,
                                   FLAGS.sentence_len)
trainX, trainY = train
testX, testY = test
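# --- Hedged sketch (not from the original source): how the DEFINE_* flags
# above are typically consumed in TF1.x. tf.app.flags parses argv once, the
# parsed values are exposed on the FLAGS object, and tf.app.run() invokes
# main(_) with any remaining arguments:
FLAGS = tf.app.flags.FLAGS

if __name__ == "__main__":
    tf.app.run()  # parses the flags, then calls main(_)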
def main(_):
    '''Main function: preprocess the data, then iterate over batches to train the model.'''
    print("Data preprocessing stage: ......")
    trainX, trainY, testX, testY = None, None, None, None
    # Load the word-to-index vocabulary.
    vocabulary_word2index, vocabulary_index2word = create_vocabulary(
        word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn")
    vocab_size = len(vocabulary_word2index)
    # Label indices.
    vocabulary_word2index_label, vocabulary_index2word_label = create_vocabulary_label(
        vocabulary_label=FLAGS.training_data_path, name_scope="cnn")
    # Convert the text to vector form and split into train/test sets.
    train, test, _ = load_data_multilabel_new(
        vocabulary_word2index, vocabulary_word2index_label,
        training_data_path=FLAGS.training_data_path)
    trainX, trainY = train
    testX, testY = test
    # Pad short sentences with zeros.
    trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0)
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0)
    print("Data preprocessing finished.....")
    print("Creating session.......")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        textCNN = TextCNN(filter_size, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.sequence_length, vocab_size, FLAGS.embed_size,
                          FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.is_decay,
                          FLAGS.is_dropout, FLAGS.is_l2)
        # Merge summaries for TensorBoard.
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter(FLAGS.tensorboard_dir, sess.graph)
        # Initialize the saver.
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):  # check whether a saved model exists
            print("Restoring variables from checkpoint")
            # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))  # automatically picks the latest checkpoint
        else:
            print("Initializing variables")
            sess.run(tf.global_variables_initializer())  # initialize all variables
        if FLAGS.use_embedding:
            # Load pre-trained word embeddings.
            assign_pretrained_word_embedding(
                sess, vocabulary_index2word, vocab_size, textCNN,
                word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # Split the training data into batches.
        num_train_data = len(trainX)
        batch_size = FLAGS.batch_size
        index = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, num_train_data, batch_size),
                                  range(batch_size, num_train_data, batch_size)):
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.input_y: trainY[start:end],
                    textCNN.dropout_keep_prob: 0.9
                }
                curr_loss, curr_acc, logits, _ = sess.run(
                    [textCNN.loss_val, textCNN.accuracy, textCNN.logits,
                     textCNN.train_op], feed_dict)
                index += 1
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 100 == 0:
                    rs = sess.run(merged, feed_dict)  # record the merged summaries
                    writer.add_summary(rs, index)
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f\tGlobal Step %d"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter), sess.run(textCNN.global_step)))
                    # print("Train Logits{}".format(logits))
            # Increment the epoch counter (note: building this assign op inside
            # the loop adds a node to the graph every epoch; creating it once
            # outside the loop would be cheaper).
            epoch_increment = tf.assign(textCNN.epoch_step,
                                        tf.add(textCNN.epoch_step, tf.constant(1)))
            sess.run(epoch_increment)
            # Validation.
            print("Epoch: {}".format(epoch))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY, batch_size)
                print("Epoch: {}\tValidation loss: {}\tAccuracy: {}".format(
                    epoch, eval_loss, eval_acc))
                # Save the model.
                # save_path = FLAGS.ckpt_dir + "model.ckpt"
                # saver.save(sess, save_path, global_step=epoch)
        print("Computing loss and accuracy on the test set.....")
        test_loss, test_acc = do_eval(sess, textCNN, testX, testY, batch_size)
        print("Test loss: {}\tAccuracy: {}".format(test_loss, test_acc))
def main(_):
    vocab_word2index, accusation_label2index, articles_label2index = create_vocabulary(
        FLAGS.data_path, FLAGS.traning_data_file, FLAGS.vocab_size,
        name_scope=FLAGS.name_scope, test_mode=FLAGS.test_mode)
    deathpenalty_label2index = {True: 1, False: 0}
    lifeimprisonment_label2index = {True: 1, False: 0}
    vocab_size = len(vocab_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    accusation_num_classes = len(accusation_label2index)
    article_num_classes = len(articles_label2index)
    deathpenalty_num_classes = len(deathpenalty_label2index)
    lifeimprisonment_num_classes = len(lifeimprisonment_label2index)
    print("accusation_num_classes:", accusation_num_classes)
    print("article_num_classes:", article_num_classes)
    train, valid, test = load_data_multilabel(
        FLAGS.traning_data_file, FLAGS.valid_data_file, FLAGS.test_data_path,
        vocab_word2index, accusation_label2index, articles_label2index,
        deathpenalty_label2index, lifeimprisonment_label2index,
        FLAGS.sentence_len, name_scope=FLAGS.name_scope,
        test_mode=FLAGS.test_mode)
    train_X, train_Y_accusation, train_Y_article, train_Y_deathpenalty, train_Y_lifeimprisonment, train_Y_imprisonment = train
    valid_X, valid_Y_accusation, valid_Y_article, valid_Y_deathpenalty, valid_Y_lifeimprisonment, valid_Y_imprisonment = valid
    test_X, test_Y_accusation, test_Y_article, test_Y_deathpenalty, test_Y_lifeimprisonment, test_Y_imprisonment = test
    # print some message for debug purpose
    print("length of training data:", len(train_X), ";valid data:",
          len(valid_X), ";test data:", len(test_X))
    print("trainX_[0]:", train_X[0])
    train_Y_accusation_short = get_target_label_short(train_Y_accusation[0])
    train_Y_article_short = get_target_label_short(train_Y_article[0])
    print("train_Y_accusation_short:", train_Y_accusation_short,
          ";train_Y_article_short:", train_Y_article_short)
    print("train_Y_deathpenalty:", train_Y_deathpenalty[0],
          ";train_Y_lifeimprisonment:", train_Y_lifeimprisonment[0],
          ";train_Y_imprisonment:", train_Y_imprisonment[0])
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        model = HierarchicalAttention(
            accusation_num_classes, article_num_classes,
            deathpenalty_num_classes, lifeimprisonment_num_classes,
            FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
            FLAGS.decay_rate, FLAGS.sentence_len, FLAGS.num_sentences,
            vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            for i in range(2):  # decay learning rate if necessary.
print(i, "Going to decay learning rate by half.") sess.run(model.learning_rate_decay_half_op) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding vocabulary_index2word = { index: word for word, index in vocab_word2index.items() } assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, FLAGS.word2vec_model_path) curr_epoch = sess.run(model.epoch_step) #3.feed data & training number_of_training_data = len(train_X) batch_size = FLAGS.batch_size iteration = 0 for epoch in range(curr_epoch, FLAGS.num_epochs): loss_total, counter = 0.0, 0 for start, end in zip( range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)): iteration = iteration + 1 if epoch == 0 and counter == 0: print("trainX[start:end]:", train_X[start:end], "train_X.shape:", train_X.shape) feed_dict = { model.input_x: train_X[start:end], model.input_y_accusation: train_Y_accusation[start:end], model.input_y_article: train_Y_article[start:end], model.input_y_deathpenalty: train_Y_deathpenalty[start:end], model.input_y_lifeimprisonment: train_Y_lifeimprisonment[start:end], model.input_y_imprisonment: train_Y_imprisonment[start:end], model.dropout_keep_prob: FLAGS.keep_dropout_rate } #model.iter: iteration,model.tst: not FLAGS.is_training current_loss, lr, loss_accusation, loss_article, loss_deathpenalty, loss_lifeimprisonment, loss_imprisonment, l2_loss, _ = sess.run( [ model.loss_val, model.learning_rate, model.loss_accusation, model.loss_article, model.loss_deathpenalty, model.loss_lifeimprisonment, model.loss_imprisonment, model.l2_loss, model.train_op ], feed_dict) #model.update_ema loss_total, counter = loss_total + current_loss, counter + 1 if counter % 100 == 0: print( "Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f" % (epoch, counter, float(loss_total) / float(counter), lr)) if counter % 400 == 0: print( "Loss_accusation:%.3f\tLoss_article:%.3f\tLoss_deathpenalty:%.3f\tLoss_lifeimprisonment:%.3f\tLoss_imprisonment:%.3f\tL2_loss:%.3f\tCurrent_loss:%.3f\t" % (loss_accusation, loss_article, loss_deathpenalty, loss_lifeimprisonment, loss_imprisonment, l2_loss, current_loss)) ######################################################################################################## if start != 0 and start % ( 1000 * FLAGS.batch_size) == 0: # eval every 400 steps. loss, f1_macro_accasation, f1_micro_accasation, f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death, f1_a_life, f1_i_life, score_penalty = \ do_eval(sess, model, valid,iteration,accusation_num_classes,article_num_classes) accasation_score = ( (f1_macro_accasation + f1_micro_accasation) / 2.0) * 100.0 article_score = ( (f1_a_article + f1_i_aritcle) / 2.0) * 100.0 score_all = accasation_score + article_score + score_penalty print( "Epoch %d ValidLoss:%.3f\tMacro_f1_accasation:%.3f\tMicro_f1_accsastion:%.3f\tMacro_f1_article:%.3f Micro_f1_article:%.3f Macro_f1_deathpenalty:%.3f\t" "Micro_f1_deathpenalty:%.3f\tMacro_f1_lifeimprisonment:%.3f\tMicro_f1_lifeimprisonment:%.3f\t" % (epoch, loss, f1_macro_accasation, f1_micro_accasation, f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death, f1_a_life, f1_i_life)) print("1.Accasation Score:", accasation_score, ";2.Article Score:", article_score, ";3.Penalty Score:", score_penalty, ";Score ALL:", score_all) # save model to checkpoint #save_path = FLAGS.ckpt_dir + "model.ckpt" #TODO temp remove==>only save checkpoint for each epoch once. 
                    # saver.save(sess, save_path, global_step=epoch)
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                loss, f1_macro_accasation, f1_micro_accasation, f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death, f1_a_life, f1_i_life, score_penalty = \
                    do_eval(sess, model, valid, iteration,
                            accusation_num_classes, article_num_classes)
                accasation_score = ((f1_macro_accasation + f1_micro_accasation) / 2.0) * 100.0
                article_score = ((f1_a_article + f1_i_aritcle) / 2.0) * 100.0
                score_all = accasation_score + article_score + score_penalty
                print()
                print(
                    "Epoch %d ValidLoss:%.3f\tMacro_f1_accasation:%.3f\tMicro_f1_accsastion:%.3f\tMacro_f1_article:%.3f\tMicro_f1_article:%.3f\tMacro_f1_deathpenalty:%.3f\t"
                    "Micro_f1_deathpenalty:%.3f\tMacro_f1_lifeimprisonment:%.3f\tMicro_f1_lifeimprisonment:%.3f\t"
                    % (epoch, loss, f1_macro_accasation, f1_micro_accasation,
                       f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death,
                       f1_a_life, f1_i_life))
                print("===>1.Accasation Score:", accasation_score,
                      ";2.Article Score:", article_score,
                      ";3.Penalty Score:", score_penalty,
                      ";Score ALL:", score_all)
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5. Finally, evaluate on the test set and report test metrics (disabled):
        # test_loss, macrof1, microf1 = do_eval(sess, textCNN, testX, testY, iteration)
        # print("Test Loss:%.3f\tMacro f1:%.3f\tMicro f1:%.3f" % (test_loss, macrof1, microf1))
        print("training completed...")
    pass
def main(_):
    # 1. Load data with the vocabulary of words and labels.
    compare_test_data = WikiQA(word2vec=Word2Vec(),
                               max_len=FLAGS.max_compare_len)
    compare_test_data.open_file(mode="test")
    vocabulary_word2index, vocabulary_index2word = create_vocabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="transformer_classification")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("transformer_classification.vocab_size:", vocab_size)
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    print("list of total questions:", len(questionid_question_lists))
    test = load_data_predict(vocabulary_word2index, questionid_question_lists)
    print("list of total questions2:", len(test))
    testX = []
    question_id_list = []
    for item in test:
        question_id, question_string_list = item
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. Data preprocessing: sequence padding.
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    testY2 = load_data_predict_y(FLAGS.predict_source_file_y)
    print("list of total questions3:", len(testX2))
    print("end padding...")
    # 3. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. Instantiate Model.
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate,
                            len(testX2), FLAGS.decay_steps, FLAGS.decay_rate,
                            FLAGS.sequence_length, vocab_size,
                            FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k,
                            FLAGS.d_v, FLAGS.h, FLAGS.num_layer,
                            FLAGS.is_training, compare_test_data.num_features,
                            di=50, s=compare_test_data.max_len, w=4,
                            l2_reg=0.0004, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5. Feed data to get the logits.
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_x1, batch_x2, _, batch_features = compare_test_data.next_batch(
            batch_size=number_of_training_data)
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'w',
                                            'utf-8')
        logits = sess.run(model.return_logits,
                          feed_dict={
                              model.input_x: testX2,
                              model.input_y_label: testY2,
                              model.dropout_keep_prob: 1,
                              model.x1: batch_x1,
                              model.x2: batch_x2,
                              model.features: batch_features
                          })  # logits: [batch_size, self.num_classes]
        # 6. Group predictions by question id and compute MAP / MRR.
        answers = {}
        MAP, MRR = 0.0, 0.0
        total = len(logits)
        for i in range(total):
            # prob = logits[i][1] - logits[i][0]
            prob = logits[i][1]
            if question_id_list[i] in answers:
                answers[question_id_list[i]].append((testX[i], testY2[i], prob))
            else:
                answers[question_id_list[i]] = [(testX[i], testY2[i], prob)]
            predict_target_file_f.write(str(logits[i]) + "\n")
        for i in answers.keys():
            p, AP = 0, 0.0
            MRR_check = False
            # Rank this question's candidates by predicted probability.
            answers[i] = sorted(answers[i], key=lambda x: x[-1], reverse=True)
            for idx, (s, label, prob) in enumerate(answers[i]):
                if label == 1:
                    if not MRR_check:
                        MRR += 1.0 / (idx + 1)
                        MRR_check = True
                    p += 1
                    AP += float(p) / (idx + 1)
            AP /= p  # assumes every question has at least one positive candidate
            MAP += AP
        total_q = len(answers.keys())
        MAP /= total_q
        MRR /= total_q
        print("MAP", MAP, ",MRR", MRR)
        predict_target_file_f.close()
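# --- Hedged sketch (not from the original source): the MAP/MRR computation
# above, distilled into a standalone helper for checking the metric logic.
# ranked_labels_per_question holds each question's gold labels sorted by
# predicted score, best first; every question is assumed to have at least one
# positive label:
def mean_ap_and_mrr(ranked_labels_per_question):
    MAP, MRR = 0.0, 0.0
    for ranked_labels in ranked_labels_per_question:
        p, AP, first_hit_seen = 0, 0.0, False
        for idx, label in enumerate(ranked_labels):
            if label == 1:
                if not first_hit_seen:
                    MRR += 1.0 / (idx + 1)  # reciprocal rank of the first hit
                    first_hit_seen = True
                p += 1
                AP += float(p) / (idx + 1)  # precision at this hit
        AP /= p
        MAP += AP
    n = len(ranked_labels_per_question)
    return MAP / n, MRR / n

# Example: one question whose only positive candidate is ranked second:
# mean_ap_and_mrr([[0, 1, 0]]) returns (0.5, 0.5).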