logging.info("Load datas like this:\n") for query_a, id_a, query_b, id_b, label in zip(querys_a, ids_a, querys_b, ids_b, labels): print "query_a:\t%s" % " ".join(query_a) print "id_a:\t%s" % " ".join([str(id_) for id_ in id_a]) print "query_b:\t%s" % " ".join(query_b) print "id_b:\t%s" % " ".join([str(id_) for id_ in id_b]) if label[0] == 1: label_ = 0 else: label_ = 1 print "label:\t%d" % label_ print "*" * 100 train_data_helper = dh.TrainDataHelper(FLAGS.max_seq_len) train_data_helper.initialize() vocab_size = train_data_helper.get_vocab_size() # 词汇量大小 test_datas = train_data_helper.read_input_file(FLAGS.test_file, type="test") test_data_size = len(test_datas) test_batch_sum = test_data_size / FLAGS.batch_size if test_data_size % FLAGS.batch_size != 0: test_batch_sum += 1 graph = tf.Graph() with graph.as_default(): sess = tf.Session() with sess.as_default(): saver = tf.train.import_meta_graph("%s/model-%d.meta" %
hp = Hyperparams(prj_name)

tf.flags.DEFINE_string("test_file", hp.train_params["test_file"],
                       "Data file for the test data.")
tf.flags.DEFINE_string("model_path", hp.train_params["model_path"],
                       "Path to save model")
tf.flags.DEFINE_string("pretrained_embedding_file", hp.train_params["pretrained_embedding_file"],
                       "Pretrained embeddings for queries")
tf.flags.DEFINE_integer("embedding_size", hp.model_params["embedding_size"],
                        "Size of embedding for token/position")
tf.flags.DEFINE_integer("batch_size", hp.train_params["batch_size"],
                        "Batch size for evaluation")
FLAGS = tf.flags.FLAGS

train_data_helper = dh.TrainDataHelper(FLAGS.embedding_size)
train_data_helper.initialize(FLAGS.pretrained_embedding_file)
test_datas = train_data_helper.read_input_file(FLAGS.test_file, type="test")
test_data_size = len(test_datas)
test_batch_sum = test_data_size / FLAGS.batch_size
if test_data_size % FLAGS.batch_size != 0:
    test_batch_sum += 1

graph = tf.Graph()
with graph.as_default():
    sess = tf.Session()
    with sess.as_default():
        saver = tf.train.import_meta_graph("%s/model-%d.meta" % (FLAGS.model_path, model_id))
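# The evaluation script is truncated above, so the following is only a hedged
# sketch of how a graph imported via import_meta_graph is typically restored and
# queried in TF1. The helper name and the tensor names ("input_a:0", "input_b:0",
# "dropout_prob:0", "probs:0") are hypothetical placeholders, not the project's
# actual node names.
def restore_and_score(model_path, restore_id, batch_ids_a, batch_ids_b):
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # The training Saver writes checkpoints as "<model_path>/model-<epoch>".
            saver = tf.train.import_meta_graph("%s/model-%d.meta" % (model_path, restore_id))
            saver.restore(sess, "%s/model-%d" % (model_path, restore_id))
            # Look up the placeholders and output tensor by (assumed) name.
            input_a = graph.get_tensor_by_name("input_a:0")
            input_b = graph.get_tensor_by_name("input_b:0")
            dropout = graph.get_tensor_by_name("dropout_prob:0")
            probs = graph.get_tensor_by_name("probs:0")
            return sess.run(probs, feed_dict={input_a: batch_ids_a,
                                              input_b: batch_ids_b,
                                              dropout: 0.0})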
def train():
    # Initialize log and model paths
    OtherUtils.initPaths(FLAGS.model_path, FLAGS.log_path)

    # Initialize the input files
    train_data_helper = dh.TrainDataHelper(FLAGS.embedding_size)
    train_data_helper.initialize(FLAGS.pretrained_embedding_file)
    train_datas = train_data_helper.read_input_file(FLAGS.train_file, type="train")
    train_data_size = len(train_datas)
    valid_datas = train_data_helper.read_input_file(FLAGS.valid_file, type="valid")
    valid_data_size = len(valid_datas)

    # Build a graph and rnn object
    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            gpu_options=gpu_options,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = Model.Model(d_emb=FLAGS.embedding_size,
                                d_hiddens=dims_hidden,
                                d_fc=dims_fc)
            model.build()
            # Build the train operator
            train_op = ModelUtils.train_step(model.loss, FLAGS.learning_rate,
                                             model.global_step, decay=False)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            def train_step(batch_ids_a, batch_ids_b, batch_labels):
                batch_labels = np.array(batch_labels)
                feed_dict = {
                    model.inputs_a: batch_ids_a,
                    model.inputs_b: batch_ids_b,
                    model.input_y: batch_labels,
                    model.dropout_keep_rate: FLAGS.dropout_keep_rate,
                }
                _, loss, scores = sess.run(
                    [train_op, model.loss, model.scores], feed_dict=feed_dict)
                tp, fp, tn, fn = Metrix.get_accu(scores[:], batch_labels[:], FLAGS.accu_threshold)
                return loss, tp, fp, tn, fn

            def validation_step(batch_ids_a, batch_ids_b, batch_labels):
                batch_labels = np.array(batch_labels)
                feed_dict = {
                    model.inputs_a: batch_ids_a,
                    model.inputs_b: batch_ids_b,
                    model.input_y: batch_labels,
                    model.dropout_keep_rate: 1.0,
                }
                loss, scores = sess.run([model.loss, model.scores], feed_dict=feed_dict)
                tp, fp, tn, fn = Metrix.get_accu(scores[:], batch_labels[:], FLAGS.accu_threshold)
                return loss, tp, fp, tn, fn

            with tf.device("/gpu:0"):
                batch_per_epoch = train_data_size / FLAGS.batch_size
                if train_data_size % FLAGS.batch_size != 0:
                    batch_per_epoch += 1
                valid_batch_sum = valid_data_size / FLAGS.batch_size
                if valid_data_size % FLAGS.batch_size != 0:
                    valid_batch_sum += 1

                best_val_loss = 1000
                best_val_accu = 0.0
                best_val_recall = 0.0
                best_val_prec = 0.0
                best_val_f1 = -1
                best_epoch = -1

                for epoch in range(FLAGS.epochs):
                    total_loss = 0.0
                    tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0
                    batches = train_data_helper.batch_iter(train_datas, FLAGS.batch_size, shuffle=True)
                    for idx, batch in enumerate(batches):
                        batch_ids_a, batch_ids_b, batch_labels = train_data_helper.trans_batch_to_inputs(batch)
                        _loss, _tp, _fp, _tn, _fn = train_step(batch_ids_a, batch_ids_b, batch_labels)
                        total_loss += _loss
                        tp += _tp
                        tn += _tn
                        fp += _fp
                        fn += _fn
                        if idx != 0 and idx % (batch_per_epoch / 10) == 0:
                            tmp_loss = total_loss / idx
                            tmp_accu = (tp + tn) / (tp + tn + fp + fn)
                            per = idx / (batch_per_epoch / 10)
                            mess = "Epoch: %d, percent: %d0%%, loss: %f, accu: %f" % (
                                epoch, per, tmp_loss, tmp_accu)
                            logging.info(mess)
                            logging.info("Epoch: %d, percent: %d0%%, tp=%d, tn=%d, fp=%d, fn=%d" % (
                                epoch, per, int(tp), int(tn), int(fp), int(fn)))

                    total_loss = total_loss / batch_per_epoch
                    accu = (tp + tn) / (tp + tn + fp + fn)
                    mess = "Epoch %d: train result - loss %f, accu %f" % (epoch, total_loss, accu)
                    logging.info(mess)

                    total_loss = 0.0
                    tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0
                    batches = train_data_helper.batch_iter(valid_datas, FLAGS.batch_size, shuffle=False)
                    for batch in batches:
                        batch_ids_a, batch_ids_b, batch_labels = train_data_helper.trans_batch_to_inputs(batch)
                        loss_, _tp, _fp, _tn, _fn = validation_step(batch_ids_a, batch_ids_b, batch_labels)
                        total_loss += loss_
                        tp += _tp
                        tn += _tn
                        fp += _fp
                        fn += _fn

                    total_loss = total_loss / valid_batch_sum
                    accu, recall, f1, prec = Metrix.eva(tp, tn, fp, fn)
                    mess = "Evaluation: loss %f, acc %f, recall %f, precision %f, f1 %f" % \
                        (total_loss, accu, recall, prec, f1)
                    logging.info(mess)
                    logging.info("Evaluation: tp=%d, tn=%d, fp=%d, fn=%d" %
                                 (int(tp), int(tn), int(fp), int(fn)))

                    # checkpoint_prefix = "%s/model" % FLAGS.model_path
                    # path = saver.save(sess, checkpoint_prefix, global_step=epoch)
                    # print("Saved model checkpoint to {0}".format(path))

                    if best_val_loss > total_loss:
                        best_val_loss = total_loss
                        best_val_accu = accu
                        best_val_recall = recall
                        best_val_prec = prec
                        best_val_f1 = f1
                        best_epoch = epoch
                        checkpoint_prefix = "%s/model" % FLAGS.model_path
                        path = saver.save(sess, checkpoint_prefix, global_step=epoch)
                        print("Saved model checkpoint to {0}".format(path))

                logging.info("Best epoch=%d, loss=%f, accu=%.4f, recall=%.4f, prec=%.4f, f1=%.4f",
                             best_epoch, best_val_loss, best_val_accu, best_val_recall,
                             best_val_prec, best_val_f1)
                logging.info("Training done")
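# Metrix.get_accu is imported from elsewhere and not shown in this file. As a
# hedged sketch only (an assumption about its behaviour, not the project's actual
# implementation): scores at or above the threshold are treated as positive
# predictions and compared against the 0/1 labels to produce confusion counts in
# the (tp, fp, tn, fn) order consumed above.
def get_accu_sketch(scores, labels, threshold):
    tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
    for score, label in zip(scores, labels):
        pred = 1 if score >= threshold else 0
        if pred == 1 and label == 1:
            tp += 1
        elif pred == 1 and label == 0:
            fp += 1
        elif pred == 0 and label == 0:
            tn += 1
        else:
            fn += 1
    return tp, fp, tn, fn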
def train():
    # Initialize log and model paths
    OtherUtils.initPaths(FLAGS.model_path, FLAGS.log_path)

    # Initialize the input files
    train_data_helper = dh.TrainDataHelper(FLAGS.max_seq_len)
    train_data_helper.initialize()
    vocab_size = train_data_helper.get_vocab_size()  # vocabulary size
    train_datas = train_data_helper.read_input_file(FLAGS.train_file, type="train")
    train_data_size = len(train_datas)
    valid_datas = train_data_helper.read_input_file(FLAGS.valid_file, type="valid")
    valid_data_size = len(valid_datas)

    logging.info("Train start")

    # Build a graph and rnn object
    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            gpu_options=gpu_options,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = BILSTM_CONV_NETWORK(vocab_size=vocab_size,
                                        seq_length=FLAGS.max_seq_len,
                                        d_emb=FLAGS.embedding_size,
                                        d_hidden_lstm=dims_hidden_lstm,
                                        d_hidden_conv=dims_hidden_conv,
                                        d_fc=dims_fc)
            model.build()
            # Build the train operator
            train_op = ModelUtils.train_step(model.loss, FLAGS.learning_rate,
                                             model.global_step, decay=False)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            def train_step(batch_ids_a, batch_ids_b, batch_labels):
                batch_labels = np.array(batch_labels)
                feed_dict = {
                    model.input_a: batch_ids_a,
                    model.input_b: batch_ids_b,
                    model.input_y: batch_labels,
                    model.dropout_prob: FLAGS.dropout_rate,
                }
                _, loss, scores, preds, labels = sess.run(
                    [train_op, model.loss, model.probs, model.predictions, model.labels],
                    feed_dict=feed_dict)
                print "actual-{} predict-{}".format(labels[:32], preds[:32])
                tp, fp, tn, fn = Metrix.get_accu(scores[:, 1], batch_labels[:, 1], FLAGS.accu_threshold)
                return loss, tp, fp, tn, fn

            def validation_step(batch_ids_a, batch_ids_b, batch_labels):
                batch_labels = np.array(batch_labels)
                feed_dict = {
                    model.input_a: batch_ids_a,
                    model.input_b: batch_ids_b,
                    model.input_y: batch_labels,
                    model.dropout_prob: 0.0,
                }
                loss, scores = sess.run(
                    [model.softmax_score_losses, model.probs], feed_dict=feed_dict)
                tp, fp, tn, fn = Metrix.get_accu(scores[:, 1], batch_labels[:, 1], FLAGS.accu_threshold)
                return loss, tp, fp, tn, fn

            with tf.device("/gpu:0"):
                batch_per_epoch = train_data_size / FLAGS.batch_size
                if train_data_size % FLAGS.batch_size != 0:
                    batch_per_epoch += 1
                valid_batch_sum = valid_data_size / FLAGS.batch_size
                if valid_data_size % FLAGS.batch_size != 0:
                    valid_batch_sum += 1

                best_val_loss = 1000
                best_val_accu = 0.0
                best_val_recall = 0.0
                best_val_prec = 0.0
                best_val_f1 = -1
                best_epoch = -1

                for epoch in range(FLAGS.epochs):
                    total_loss = 0.0
                    tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0
                    batches = train_data_helper.batch_iter(train_datas, FLAGS.batch_size, shuffle=True)
                    for idx, batch in enumerate(batches):
                        batch_ids_a = [train_data_helper.get_input_ids(data[0]) for data in batch]
                        batch_ids_b = [train_data_helper.get_input_ids(data[1]) for data in batch]
                        batch_labels = [data[2:] for data in batch]
                        _loss, _tp, _fp, _tn, _fn = train_step(batch_ids_a, batch_ids_b, batch_labels)
                        total_loss += _loss
                        tp += _tp
                        tn += _tn
                        fp += _fp
                        fn += _fn
                        if idx != 0 and idx % (batch_per_epoch / 10) == 0:
                            tmp_loss = total_loss / idx
                            tmp_accu = (tp + tn) / (tp + tn + fp + fn)
                            per = idx / (batch_per_epoch / 10)
                            mess = "Epoch: %d, percent: %d0%%, loss: %f, accu: %f" % (
                                epoch, per, tmp_loss, tmp_accu)
                            logging.info(mess)
                            logging.info("Epoch: %d, percent: %d0%%, tp=%d, tn=%d, fp=%d, fn=%d" % (
                                epoch, per, int(tp), int(tn), int(fp), int(fn)))

                    total_loss = total_loss / batch_per_epoch
                    accu = (tp + tn) / (tp + tn + fp + fn)
                    mess = "Epoch %d: train result - loss %f, accu %f" % (epoch, total_loss, accu)
                    logging.info(mess)

                    total_loss = 0.0
                    tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0
                    batches = train_data_helper.batch_iter(valid_datas, FLAGS.batch_size, shuffle=False)
                    # for batch_ids_a, batch_ids_b, batch_labels in zip(batches_ids_a, batches_ids_b, batches_labels):
                    for batch in batches:
                        batch_ids_a = [train_data_helper.get_input_ids(data[0]) for data in batch]
                        batch_ids_b = [train_data_helper.get_input_ids(data[1]) for data in batch]
                        batch_labels = [data[2:] for data in batch]
                        loss_, _tp, _fp, _tn, _fn = validation_step(batch_ids_a, batch_ids_b, batch_labels)
                        total_loss += loss_
                        tp += _tp
                        tn += _tn
                        fp += _fp
                        fn += _fn

                    total_loss = total_loss / valid_batch_sum
                    accu, recall, f1, prec = Metrix.eva(tp, tn, fp, fn)
                    mess = "Evaluation: loss %f, acc %f, recall %f, precision %f, f1 %f" % \
                        (total_loss, accu, recall, prec, f1)
                    logging.info(mess)
                    logging.info("Evaluation: tp=%d, tn=%d, fp=%d, fn=%d" %
                                 (int(tp), int(tn), int(fp), int(fn)))

                    # checkpoint_prefix = "%s/model" % FLAGS.model_path
                    # path = saver.save(sess, checkpoint_prefix, global_step=epoch)
                    # print("Saved model checkpoint to {0}".format(path))

                    if best_val_loss > total_loss:
                        best_val_loss = total_loss
                        best_val_accu = accu
                        best_val_recall = recall
                        best_val_prec = prec
                        best_val_f1 = f1
                        best_epoch = epoch
                        checkpoint_prefix = "%s/model" % FLAGS.model_path
                        path = saver.save(sess, checkpoint_prefix, global_step=epoch)
                        print("Saved model checkpoint to {0}".format(path))

                model_conf = {"epoch": 0,
                              "maxSeqLength": FLAGS.max_seq_len,
                              "hiddenSize": sum([size[1] for size in dims_hidden_conv]),
                              "vocabDic": "vocab.txt",
                              "encoderModelPath": "encoder",
                              "similarityModelPath": "similairity"}
                train_data_helper.save_vocab_file("%s/%s" % (FLAGS.model_path, model_conf["vocabDic"]))
                model_conf_file = "%s/model.conf" % FLAGS.model_path
                with open(model_conf_file, 'w') as wp:
                    print >> wp, json.dumps(model_conf, ensure_ascii=False)

                logging.info("Best epoch=%d, loss=%f, accu=%.4f, recall=%.4f, prec=%.4f, f1=%.4f",
                             best_epoch, best_val_loss, best_val_accu, best_val_recall,
                             best_val_prec, best_val_f1)
                logging.info("Training done")
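# Metrix.eva is likewise external to this file. A minimal sketch of the standard
# metrics it appears to derive from the confusion counts, returned in the
# (accu, recall, f1, prec) order used above; this is an assumption about the
# helper, not its actual code. The max() guards only avoid division by zero.
def eva_sketch(tp, tn, fp, fn):
    accu = (tp + tn) / max(tp + tn + fp + fn, 1.0)
    recall = tp / max(tp + fn, 1.0)
    prec = tp / max(tp + fp, 1.0)
    f1 = 2 * prec * recall / max(prec + recall, 1e-8)
    return accu, recall, f1, prec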
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
import data_helper as dh
import hyperparams
import Modeling
from CommonLibs import OtherUtils

prj_name = sys.argv[1]
hp = hyperparams.Hyperparams(prj_name)
model_id = int(sys.argv[2])  # used as an integer checkpoint id ("model-%d.meta")

my_dh = dh.TrainDataHelper(hp.model_params["max_seq_len"])
my_dh.initialize()


def trans_line_to_inputs(line):
    datas = []
    splits = line.strip('\r\n').split('\t')
    q_b = splits[-2]
    label = int(float(splits[-1]))
    q_id_b = my_dh.trans_query_to_input_id(q_b)
    q_a = ",".join(splits[:-2])
    datas.append([my_dh.trans_query_to_input_id(q_a), q_id_b])
    for q in splits[:-2]:
        q_id = my_dh.trans_query_to_input_id(q)
        datas.append([q_id, q_id_b])
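# For reference, a hypothetical input line for trans_line_to_inputs, inferred from
# the slicing above: one or more candidate queries, then query_b, then a 0/1 label,
# all tab-separated. The query text is invented for illustration, and it is assumed
# that the function (truncated here) finishes by returning the accumulated `datas`
# list, whose first entry pairs the comma-joined candidates with query_b and whose
# remaining entries pair each individual candidate with query_b.
sample_line = "reset my password\tchange account password\tforgot password\t1"
pairs = trans_line_to_inputs(sample_line)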