def train(conf, _model):
    """Train ``_model`` and dump its validation predictions to disk.

    Args:
        conf: Configuration dict. Keys read here: ``rand_seed``,
            ``save_path``, ``data_path``, ``batch_size``, ``num_scan_data``,
            ``init_model`` and ``output_predictions_folder``. The keys
            ``train_steps``, ``save_step``, ``print_step``, ``ranker`` and
            ``seed`` are written back into it.
        _model: Model object exposing ``build_graph()``, ``init``, ``saver``,
            the placeholders used in the feed dicts, ``g_updates``, ``loss``
            and ``logits``.

    Side effects:
        Creates ``conf['save_path']`` and ``conf['output_predictions_folder']``
        when missing, and writes ``config.json`` and ``predictions.csv`` into
        the latter.
    """

    def _print_now():
        # strftime() without a time tuple formats the current local time.
        print(time.strftime('%Y-%m-%d %H:%M:%S'))

    def _make_feed(batches, index):
        # Bind every model placeholder to batch number `index` of `batches`.
        return {
            _model.turns: batches["turns"][index],
            _model.tt_turns_len: batches["tt_turns_len"][index],
            _model.every_turn_len: batches["every_turn_len"][index],
            _model.response: batches["response"][index],
            _model.response_len: batches["response_len"][index],
            _model.label: batches["label"][index],
        }

    if conf['rand_seed'] is not None:
        # Seed NumPy with the configured value as well; np.random.seed()
        # without an argument reseeds from OS entropy and ignores the config.
        np.random.seed(conf['rand_seed'])
        tf.set_random_seed(conf['rand_seed'])

    if not os.path.exists(conf['save_path']):
        os.makedirs(conf['save_path'])

    # load data
    print('starting loading data')
    _print_now()
    # NOTE: pickle can execute arbitrary code; only load trusted data files.
    with open(conf["data_path"], 'rb') as data_file:
        train_data, val_data, _test_data = pickle.load(data_file)
    print('finish loading data')
    _print_now()

    val_batches = reader.build_batches(val_data, conf)
    print("finish building test batches")
    _print_now()

    # refine conf
    batch_num = len(train_data['y']) // conf["batch_size"]
    val_batch_num = len(val_batches["response"])

    conf["train_steps"] = conf["num_scan_data"] * batch_num
    conf["save_step"] = 100000
    conf["print_step"] = 100000
    print('configurations: %s' % conf)

    print('model sucess')
    _print_now()

    _graph = _model.build_graph()
    print('build graph sucess')
    _print_now()

    with tf.Session(graph=_graph) as sess:
        _model.init.run()
        if conf["init_model"]:
            _model.saver.restore(sess, conf["init_model"])
            print("sucess init %s" % conf["init_model"])

        # Periodic loss printing, validation scoring and checkpointing are
        # disabled in this variant, so no running statistics are kept.
        for _epoch in tqdm(range(conf["num_scan_data"])):
            print('starting shuffle train data')
            shuffle_train = reader.unison_shuffle(train_data)
            train_batches = reader.build_batches(shuffle_train, conf)
            print('finish building train data')
            for batch_index in range(batch_num):
                sess.run([_model.g_updates, _model.loss],
                         feed_dict=_make_feed(train_batches, batch_index))

        # NOTE(review): the flattened source does not show whether this
        # section ran once after training or once per epoch; it is placed
        # after the epoch loop here -- confirm against the original layout.
        print("running evaluation on val")
        all_preds = []
        for val_index in range(val_batch_num):
            scores = sess.run(_model.logits,
                              feed_dict=_make_feed(val_batches, val_index))
            # One row per validation batch, one column per position in it.
            all_preds.append(list(scores))

        df = pd.DataFrame(
            all_preds,
            columns=['prediction_' + str(i) for i in range(len(all_preds[0]))])

        if not os.path.isdir(conf['output_predictions_folder']):
            os.makedirs(conf['output_predictions_folder'])
        with open(os.path.join(conf['output_predictions_folder'],
                               'config.json'), 'w') as f:
            conf['ranker'] = "DAM"
            conf['seed'] = str(conf['rand_seed'])
            args_dict = {}
            args_dict['args'] = conf
            f.write(json.dumps(args_dict, indent=4, sort_keys=True))
        df.to_csv(conf['output_predictions_folder'] + "/predictions.csv",
                  index=False)
def train(conf, _model):
    """Train ``_model``, periodically scoring the validation set.

    Every ``conf['save_step']`` training steps the validation batches are
    scored, the scores are evaluated with ``eva.evaluate`` and a checkpoint is
    saved whenever ``result[1] + result[2]`` improves on the best seen so far.

    Args:
        conf: Configuration dict. Keys read here: ``rand_seed``,
            ``save_path``, ``data_path``, ``batch_size``, ``num_scan_data``
            and ``init_model``. ``train_steps``, ``save_step`` and
            ``print_step`` are written back into it.
        _model: Model object exposing ``build_graph()``, ``init``, ``saver``,
            the placeholders used in the feed dicts, ``g_updates``, ``loss``,
            ``logits``, ``global_step`` and ``learning_rate``.

    Side effects:
        Writes ``score.<n>``, ``result.<n>`` and ``model.ckpt.<n>`` files
        whose names are built by appending to ``conf['save_path']``.
    """

    def _print_now():
        # strftime() without a time tuple formats the current local time.
        print(time.strftime('%Y-%m-%d %H:%M:%S'))

    def _make_feed(batches, index):
        # Bind every model placeholder to batch number `index` of `batches`.
        return {
            _model.turns: batches["turns"][index],
            _model.tt_turns_len: batches["tt_turns_len"][index],
            _model.every_turn_len: batches["every_turn_len"][index],
            _model.response: batches["response"][index],
            _model.response_len: batches["response_len"][index],
            _model.label: batches["label"][index],
        }

    if conf['rand_seed'] is not None:
        np.random.seed(conf['rand_seed'])

    if not os.path.exists(conf['save_path']):
        os.makedirs(conf['save_path'])

    # load data
    print('starting loading data')
    _print_now()
    # NOTE: pickle can execute arbitrary code; only load trusted data files.
    with open(conf["data_path"], 'rb') as data_file:
        train_data, val_data, _test_data = pickle.load(data_file)
    print('finish loading data')
    _print_now()

    val_batches = reader.build_batches(val_data, conf)
    print("finish building test batches")
    _print_now()

    # refine conf
    # Floor division keeps the batch count an int under true division too;
    # a float here would make range(batch_num) raise TypeError.
    batch_num = len(train_data['y']) // conf["batch_size"]
    val_batch_num = len(val_batches["response"])

    conf["train_steps"] = conf["num_scan_data"] * batch_num
    conf["save_step"] = max(1, batch_num // 10)
    conf["print_step"] = max(1, batch_num // 100)
    print('configurations: %s' % conf)

    print('model sucess')
    _print_now()

    _graph = _model.build_graph()
    print('build graph sucess')
    _print_now()

    with tf.Session(graph=_graph) as sess:
        _model.init.run()
        if conf["init_model"]:
            _model.saver.restore(sess, conf["init_model"])
            print("sucess init %s" % conf["init_model"])

        average_loss = 0.0
        step = 0
        best_result = [0, 0, 0, 0]

        for _epoch in range(conf["num_scan_data"]):
            print('starting shuffle train data')
            shuffle_train = reader.unison_shuffle(train_data)
            train_batches = reader.build_batches(shuffle_train, conf)
            print('finish building train data')

            for batch_index in range(batch_num):
                _, curr_loss = sess.run(
                    [_model.g_updates, _model.loss],
                    feed_dict=_make_feed(train_batches, batch_index))
                average_loss += curr_loss
                step += 1

                if step % conf["print_step"] == 0:
                    g_step, lr = sess.run(
                        [_model.global_step, _model.learning_rate])
                    print('step: %s, lr: %s' % (g_step, lr))
                    print("processed: [" + str(step * 1.0 / batch_num) +
                          "] loss: [" +
                          str(average_loss / conf["print_step"]) + "]")
                    average_loss = 0

                if step % conf["save_step"] != 0:
                    continue

                # Integer index so file names stay "score.3", not "score.3.0".
                index = step // conf['save_step']
                score_file_path = conf['save_path'] + 'score.' + str(index)
                print('save step: %s' % index)
                _print_now()

                # The context manager closes the file even if scoring fails.
                with open(score_file_path, 'w') as score_file:
                    for val_index in range(val_batch_num):
                        scores = sess.run(
                            _model.logits,
                            feed_dict=_make_feed(val_batches, val_index))
                        labels = val_batches["label"][val_index]
                        # Iterate over the scores actually returned rather
                        # than assuming a full batch of conf["batch_size"].
                        for i in range(len(scores)):
                            score_file.write(
                                str(scores[i]) + '\t' + str(labels[i]) + '\n')

                # write evaluation result
                result = eva.evaluate(score_file_path)
                result_file_path = conf["save_path"] + "result." + str(index)
                with open(result_file_path, 'w') as out_file:
                    for p_at in result:
                        out_file.write(str(p_at) + '\n')
                print('finish evaluation')
                _print_now()

                if result[1] + result[2] > best_result[1] + best_result[2]:
                    best_result = result
                    _save_path = _model.saver.save(
                        sess, conf["save_path"] + "model.ckpt." + str(index))
                    print("succ saving model in " + _save_path)
                    _print_now()
def train(conf, _model):
    """Train ``_model`` with dropout, periodically scoring the validation set.

    Every ``conf['save_step']`` training steps the validation batches are
    scored with dropout disabled, the scores are evaluated with
    ``eva.evaluate`` and a checkpoint is saved whenever
    ``result[1] + result[2]`` improves on the best seen so far.

    Args:
        conf: Configuration dict. Keys read here: ``rand_seed``,
            ``save_path``, ``data_path``, ``batch_size``, ``num_scan_data``,
            ``init_model`` and ``dropout_keep_prob``. ``train_steps``,
            ``save_step`` and ``print_step`` are written back into it.
        _model: Model object exposing ``build_graph()``, ``init``, ``saver``,
            ``emb_placeholder``, ``_word_embedding_init``, ``emb_init``, the
            placeholders used in the feed dicts, ``g_updates``, ``loss``,
            ``logits``, ``train_summary_op``, ``dev_summary_op``,
            ``global_step`` and ``learning_rate``.

    Side effects:
        Writes ``score.<n>``, ``result.<n>`` and ``model.ckpt.<n>`` files
        whose names are built by appending to ``conf['save_path']``.
    """

    def _print_now():
        # strftime() without a time tuple formats the current local time.
        print(time.strftime('%Y-%m-%d %H:%M:%S'))

    def _make_feed(batches, index, keep_prob):
        # Bind every model placeholder to batch number `index` of `batches`.
        return {
            _model.turns: batches["turns"][index],
            _model.tt_turns_len: batches["tt_turns_len"][index],
            _model.every_turn_len: batches["every_turn_len"][index],
            _model.response: batches["response"][index],
            _model.response_len: batches["response_len"][index],
            _model.label: batches["label"][index],
            _model.dropout_keep_prob: keep_prob,
        }

    if conf['rand_seed'] is not None:
        np.random.seed(conf['rand_seed'])

    if not os.path.exists(conf['save_path']):
        os.makedirs(conf['save_path'])

    # load data
    print('starting loading data')
    _print_now()
    # NOTE: pickle can execute arbitrary code; only load trusted data files.
    with open(conf["data_path"], 'rb') as data_file:
        train_data, val_data, test_data = pickle.load(data_file)
    print('train:', len(train_data['y']))
    print('dev:', len(val_data['y']))
    print('test:', len(test_data['y']))
    print('finish loading data')
    _print_now()

    # NOTE(review): the mode string 'train' is also passed for the
    # validation data -- confirm this is what reader.build_batches expects.
    val_batches = reader.build_batches('train', val_data, conf)
    print("finish building test batches")
    _print_now()

    # refine conf
    batch_num = len(train_data['y']) // conf["batch_size"]
    val_batch_num = len(val_batches["response"])

    conf["train_steps"] = conf["num_scan_data"] * batch_num
    conf["save_step"] = max(1, batch_num // 10)
    conf["print_step"] = max(1, batch_num // 100)
    print('configurations: %s' % conf)

    print('begin build model')
    _print_now()

    _graph = _model.build_graph()
    print('build graph sucess')
    _print_now()

    with tf.Session(graph=_graph) as sess:
        # The TensorBoard FileWriters are disabled in this variant; the
        # summary ops are still fetched below but their output is discarded.
        _model.init.run()
        if conf["init_model"]:
            _model.saver.restore(sess, conf["init_model"])
            print("sucess init %s" % conf["init_model"])
        else:
            # No checkpoint to restore: load the initial word embeddings.
            emb_feed = {_model.emb_placeholder: _model._word_embedding_init}
            sess.run(_model.emb_init, feed_dict=emb_feed)

        average_loss = 0.0
        step = 0
        # Original author's note on the metric order:
        # p1(2), p1(10), p2(10), p5(10)
        best_result = [0, 0, 0, 0]

        for _epoch in range(conf["num_scan_data"]):  # each epoch
            print('starting shuffle train data')
            shuffle_train = reader.unison_shuffle(train_data)
            train_batches = reader.build_batches('train', shuffle_train, conf)
            print('finish building train data')

            for batch_index in range(batch_num):  # each batch
                feed = _make_feed(train_batches, batch_index,
                                  conf["dropout_keep_prob"])
                _, curr_loss, _summaries = sess.run(
                    [_model.g_updates, _model.loss, _model.train_summary_op],
                    feed_dict=feed)
                average_loss += curr_loss
                step += 1

                if step % conf["print_step"] == 0:
                    g_step, lr = sess.run(
                        [_model.global_step, _model.learning_rate])
                    print('step: %s, lr: %s' % (g_step, lr))
                    print("processed: [" + str(step * 1.0 / batch_num) +
                          "] loss: [" +
                          str(average_loss / conf["print_step"]) + "]")
                    average_loss = 0

                if step % conf["save_step"] != 0:
                    continue

                # Floor division: true division would yield a float and
                # produce file names such as "score.1.0".
                index = step // conf['save_step']
                score_file_path = conf['save_path'] + 'score.' + str(index)
                print('save step: %s' % index)
                _print_now()

                # The context manager closes the file even if scoring fails.
                with open(score_file_path, 'w') as score_file:
                    for val_index in range(val_batch_num):
                        # Dropout is disabled (keep probability 1.0) here.
                        scores, _dev_loss, _summaries = sess.run(
                            [_model.logits, _model.loss,
                             _model.dev_summary_op],
                            feed_dict=_make_feed(val_batches, val_index, 1.0))
                        labels = val_batches["label"][val_index]
                        for i in range(len(scores)):  # logit, true_label
                            score_file.write(
                                str(scores[i]) + '\t' + str(labels[i]) + '\n')

                # write evaluation result
                result = eva.evaluate(score_file_path)
                result_file_path = conf["save_path"] + "result." + str(index)
                with open(result_file_path, 'w') as out_file:
                    for p_at in result:
                        out_file.write(str(p_at) + '\n')
                print('finish evaluation')
                _print_now()

                # Selection rule noted by the original author as "for
                # ubuntu"; the "douban" alternative compared
                # result[2] + result[3] instead.
                if result[1] + result[2] > best_result[1] + best_result[2]:
                    best_result = result
                    _save_path = _model.saver.save(
                        sess, conf["save_path"] + "model.ckpt." + str(index))
                    print("succ saving model in " + _save_path)
                    _print_now()
def train(args):
    """Build the train/test programs, then train and periodically evaluate.

    Args:
        args: Parsed options. Attributes read here: ``save_path``,
            ``batch_size``, ``max_turn_num``, ``max_turn_len``, ``_EOS_``,
            ``vocab_size``, ``emb_size``, ``stack_num``, ``channel1_num``,
            ``channel2_num``, ``use_pyreader``, ``learning_rate``,
            ``use_cuda``, ``word_emb_init``, ``data_path`` and
            ``num_scan_data``.

    Side effects:
        Creates ``args.save_path`` when missing. Every ``save_step`` steps it
        saves persistables under ``step_<n>`` and writes ``score.<n>`` and
        ``result.<n>`` files inside ``args.save_path``.
    """
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # data data_config
    data_conf = {
        "batch_size": args.batch_size,
        "max_turn_num": args.max_turn_num,
        "max_turn_len": args.max_turn_len,
        "_EOS_": args._EOS_,
    }

    dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
              args.emb_size, args.stack_num, args.channel1_num,
              args.channel2_num)

    train_program = fluid.Program()
    train_startup = fluid.Program()
    if "CE_MODE_X" in os.environ:
        train_program.random_seed = 110
        train_startup.random_seed = 110
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            if args.use_pyreader:
                train_pyreader = dam.create_py_reader(
                    capacity=10, name='train_reader')
            else:
                dam.create_data_layers()
            loss, logits = dam.create_network()
            loss.persistable = True
            logits.persistable = True
            # gradient clipping
            fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByValue(
                max=1.0, min=-1.0))

            optimizer = fluid.optimizer.Adam(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=args.learning_rate,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True))
            optimizer.minimize(loss)
            print("begin memory optimization ...")
            fluid.memory_optimize(train_program)
            print("end memory optimization ...")

    test_program = fluid.Program()
    test_startup = fluid.Program()
    if "CE_MODE_X" in os.environ:
        test_program.random_seed = 110
        test_startup.random_seed = 110
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            if args.use_pyreader:
                test_pyreader = dam.create_py_reader(
                    capacity=10, name='test_reader')
            else:
                dam.create_data_layers()

            # `loss` and `logits` are deliberately rebound here, exactly as
            # in the original; the names used below come from this network.
            loss, logits = dam.create_network()
            loss.persistable = True
            logits.persistable = True

    test_program = test_program.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("device count %d" % dev_count)
    print("theoretical memory usage: ")
    print(fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size))

    exe = fluid.Executor(place)
    exe.run(train_startup)
    exe.run(test_startup)

    train_exe = fluid.ParallelExecutor(
        use_cuda=args.use_cuda, loss_name=loss.name, main_program=train_program)

    test_exe = fluid.ParallelExecutor(
        use_cuda=args.use_cuda,
        main_program=test_program,
        share_vars_from=train_exe)

    # NOTE: pickle can execute arbitrary code; only load trusted files.
    if args.word_emb_init is not None:
        print("start loading word embedding init ...")
        # The context manager closes the embedding file after loading.
        with open(args.word_emb_init, 'rb') as emb_file:
            if six.PY2:
                raw_emb = pickle.load(emb_file)
            else:
                raw_emb = pickle.load(emb_file, encoding="bytes")
        word_emb = np.array(raw_emb).astype('float32')
        dam.set_word_embedding(word_emb, place)
        print("finish init word embedding ...")

    print("start loading data ...")
    with open(args.data_path, 'rb') as f:
        if six.PY2:
            train_data, val_data, test_data = pickle.load(f)
        else:
            train_data, val_data, test_data = pickle.load(f, encoding="bytes")
    print("finish loading data ...")

    val_batches = reader.build_batches(val_data, data_conf)

    batch_num = len(train_data[six.b('y')]) // args.batch_size
    val_batch_num = len(val_batches["response"])

    print_step = max(1, batch_num // (dev_count * 100))
    save_step = max(1, batch_num // (dev_count * 10))

    print("begin model training ...")
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    def _checkpoint_and_evaluate(step):
        # Save persistables, score the validation set and evaluate the scores.
        save_path = os.path.join(args.save_path, "step_" + str(step))
        print("Save model at step %d ... " % step)
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        fluid.io.save_persistables(exe, save_path, train_program)

        score_path = os.path.join(args.save_path, 'score.' + str(step))
        if args.use_pyreader:
            test_with_pyreader(test_exe, test_program, test_pyreader,
                               [logits.name], score_path, val_batches,
                               val_batch_num, dev_count)
        else:
            test_with_feed(test_exe, test_program,
                           dam.get_feed_names(), [logits.name], score_path,
                           val_batches, val_batch_num, dev_count)

        result_file_path = os.path.join(args.save_path, 'result.' + str(step))
        evaluate(score_path, result_file_path)

    def _after_step(step, ave_cost):
        # Log and/or checkpoint when due; return the running cost, which is
        # reset to zero right after it has been printed.
        if step % print_step == 0:
            print("processed: [" + str(step * dev_count * 1.0 / batch_num) +
                  "] ave loss: [" + str(ave_cost / print_step) + "]")
            ave_cost = 0.0
        if (args.save_path is not None) and (step % save_step == 0):
            _checkpoint_and_evaluate(step)
        return ave_cost

    # train on one epoch data by feeding
    def train_with_feed(step):
        ave_cost = 0.0
        # NaN is returned when no batch ran; the original raised on an
        # unbound `cost` in that case.
        last_batch_cost = float('nan')
        for it in six.moves.xrange(batch_num // dev_count):
            feed_list = []
            for dev in six.moves.xrange(dev_count):
                index = it * dev_count + dev
                batch_data = reader.make_one_batch_input(train_batches, index)
                feed_list.append(dict(zip(dam.get_feed_names(), batch_data)))

            cost = train_exe.run(feed=feed_list, fetch_list=[loss.name])

            last_batch_cost = np.array(cost[0]).mean()
            ave_cost += last_batch_cost
            step = step + 1
            ave_cost = _after_step(step, ave_cost)
        return step, last_batch_cost

    # train on one epoch with pyreader
    def train_with_pyreader(step):
        def data_provider():
            for index in six.moves.xrange(batch_num):
                yield reader.make_one_batch_input(train_batches, index)

        train_pyreader.decorate_tensor_provider(data_provider)

        ave_cost = 0.0
        last_batch_cost = float('nan')
        train_pyreader.start()
        while True:
            try:
                cost = train_exe.run(fetch_list=[loss.name])

                last_batch_cost = np.array(cost[0]).mean()
                ave_cost += last_batch_cost
                step = step + 1
                # Kept inside the try block, as in the original.
                ave_cost = _after_step(step, ave_cost)
            except fluid.core.EOFException:
                # The reader is exhausted: this epoch is finished.
                train_pyreader.reset()
                break
        return step, last_batch_cost

    # train over different epoches
    global_step, train_time = 0, 0.0
    # Defined up front so the KPI print below works with zero epochs.
    last_cost = float('nan')
    for epoch in six.moves.xrange(args.num_scan_data):
        shuffle_train = reader.unison_shuffle(
            train_data, seed=110 if ("CE_MODE_X" in os.environ) else None)
        # Read by the nested epoch runners above through their closure.
        train_batches = reader.build_batches(shuffle_train, data_conf)

        begin_time = time.time()
        if args.use_pyreader:
            global_step, last_cost = train_with_pyreader(global_step)
        else:
            global_step, last_cost = train_with_feed(global_step)

        pass_time_cost = time.time() - begin_time
        train_time += pass_time_cost
        print("Pass {0}, pass_time_cost {1}"
              .format(epoch, "%2.2f sec" % pass_time_cost))

    # For internal continuous evaluation
    if "CE_MODE_X" in os.environ:
        print("kpis train_cost %f" % last_cost)
        print("kpis train_duration %f" % train_time)
def train(conf, _model):
    """Train one of the ``cc`` / ``cr`` / ``ccr`` objectives of ``_model``.

    After every epoch the ``cc`` validation batches are scored, the scores
    are evaluated with ``eva.evaluate`` and a checkpoint is saved whenever
    ``result[1] + result[2]`` improves on the best seen so far.

    Args:
        conf: Configuration dict. Keys read here: ``train_type``,
            ``rand_seed``, ``save_path``, ``data_path``, ``batch_size``,
            ``num_scan_data``, ``init_model`` and ``print_step``.
        _model: Model object exposing ``build_graph()``, ``init``,
            ``saver_load``, ``saver_save``, the placeholders used in the feed
            dicts and, for the selected train type ``T``, ``g_updates_T``,
            ``loss_T``, ``global_step_T``, ``learning_rate_T`` and
            ``logits_T``.

    Raises:
        ValueError: If ``conf['train_type']`` is not ``"cc"``, ``"cr"`` or
            ``"ccr"``. The original used ``assert False``, which is stripped
            under ``python -O``.

    Side effects:
        Writes ``score.<epoch>`` files and ``model`` checkpoints under
        ``conf['save_path']``.
    """

    def _now():
        # strftime() without a time tuple formats the current local time.
        return time.strftime('%Y-%m-%d %H:%M:%S')

    def _make_feed(batches, index, with_label):
        # Bind the model placeholders to batch number `index` of `batches`.
        feed = {
            _model.turns1: batches["turns1"][index],
            _model.turns2: batches["turns2"][index],
            _model.tt_turns_len1: batches["tt_turns_len1"][index],
            _model.every_turn_len1: batches["every_turn_len1"][index],
            _model.tt_turns_len2: batches["tt_turns_len2"][index],
            _model.every_turn_len2: batches["every_turn_len2"][index],
            _model.response: batches["response"][index],
            _model.response_len: batches["response_len"][index],
            _model.keep_rate: 1.0,
        }
        if with_label:
            feed[_model.label] = batches["label"][index]
        return feed

    train_type = conf["train_type"]
    if train_type not in ("cc", "cr", "ccr"):
        # Fail before any data is loaded or any graph is built.
        raise ValueError("unknown train_type: %r" % (train_type,))

    if conf['rand_seed'] is not None:
        np.random.seed(conf['rand_seed'])

    if not os.path.exists(conf['save_path']):
        os.makedirs(conf['save_path'])

    # load data
    # NOTE: pickle can execute arbitrary code; only load trusted data files.
    print(_now(), 'starting loading data')
    with open(conf["data_path"] + "cc.pkl", 'rb') as cc_file:
        train_data_cc, val_data_cc, _test_cc, _human_cc = pickle.load(cc_file)
    if train_type == "cr":
        with open(conf["data_path"] + "cr.pkl", 'rb') as cr_file:
            train_data_cr, val_data_cr, _test_cr, _human_cr = pickle.load(
                cr_file)
    print(_now(), 'finish loading data')

    val_batches = reader.build_batches(val_data_cc, conf)
    if train_type == "cr":
        # NOTE(review): these batches are built but never used; validation
        # runs on the "cc" batches for every train type -- confirm intended.
        val_batches_cr = reader.build_batches(val_data_cr, conf)
    print(_now(), "finish building test batches")

    # refine conf
    train_data = train_data_cr if train_type == "cr" else train_data_cc
    batch_num = len(train_data['y']) // conf["batch_size"]
    val_batch_num = len(val_batches["response"])

    print('configurations: %s' % conf)

    print(_now(), 'model sucess')
    _graph = _model.build_graph()
    print(_now(), 'build graph sucess')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(graph=_graph, config=config) as sess:
        _model.init.run()
        if conf["init_model"]:
            _model.saver_load.restore(sess, conf["init_model"])
            print("sucess init %s" % conf["init_model"])

        # Ops for the selected objective, named "<op>_<train_type>".
        g_updates = getattr(_model, "g_updates_" + train_type)
        loss = getattr(_model, "loss_" + train_type)
        global_step = getattr(_model, "global_step_" + train_type)
        learning_rate = getattr(_model, "learning_rate_" + train_type)
        logits = getattr(_model, "logits_" + train_type)

        average_loss = 0.0
        step = 0
        best_result = (0, 0, 0, 0)

        for step_i in range(conf["num_scan_data"]):
            print('starting shuffle train data')
            shuffle_train = reader.unison_shuffle(train_data)
            train_batches = reader.build_batches(shuffle_train, conf)
            print('finish building train data')

            for batch_index in range(batch_num):
                _, curr_loss = sess.run(
                    [g_updates, loss],
                    feed_dict=_make_feed(train_batches, batch_index, True))
                average_loss += curr_loss
                step += 1

                if step % conf["print_step"] == 0:
                    g_step, lr = sess.run([global_step, learning_rate])
                    print(
                        _now(),
                        'epoch: %d, step: %.5d, lr: %-.6f, loss: %s' %
                        (step_i, g_step, lr,
                         average_loss / conf["print_step"]))
                    average_loss = 0

            #--------------------------evaluation-----------------------------
            # NOTE(review): the flattened source does not show the nesting of
            # this section; it is reconstructed as running once per epoch,
            # matching the per-epoch score file name -- confirm.
            score_file_path = conf['save_path'] + '/score.' + str(step_i)
            # The context manager closes the file even if scoring fails.
            with open(score_file_path, 'w') as score_file:
                for val_index in range(val_batch_num):
                    # The label placeholder is not fed during evaluation.
                    scores = sess.run(
                        logits,
                        feed_dict=_make_feed(val_batches, val_index, False))
                    labels = val_batches["label"][val_index]
                    for score, label in zip(scores, labels):
                        score_file.write(str(score) + '\t' + str(label) + '\n')

            result = eva.evaluate(score_file_path)
            print(time.strftime('%Y-%m-%d %H:%M:%S result: '), *result)

            if result[1] + result[2] > best_result[1] + best_result[2]:
                best_result = result
                _save_path = _model.saver_save.save(
                    sess, conf["save_path"] + "/model", global_step=step_i)
                print(_now(), "succ saving model in " + _save_path)
            print(time.strftime('%Y-%m-%d %H:%M:%S best result'),
                  *best_result)
def train(conf, _model):
    """Train ``_model`` (optionally with intent features) and validate.

    Every ``conf['save_step']`` training steps the validation batches are
    scored, the scores are evaluated with ``eva.evaluate`` and a checkpoint is
    saved whenever ``result[1] + result[2]`` improves on the best seen so far.
    Intent features are fed only when ``conf['model_name'] != 'dam'``.

    Args:
        conf: Configuration dict. Keys read here: ``rand_seed``,
            ``save_path``, ``data_path``, ``data_name``, ``model_name``,
            ``intent_vec_path``, ``batch_size``, ``num_scan_data`` and
            ``init_model``. ``intent_dict``, ``train_steps``, ``save_step``
            and ``print_step`` are written back into it.
        _model: Model object exposing ``build_graph()``, ``init``, ``saver``,
            the placeholders used in the feed dicts, ``g_updates``, ``loss``,
            ``logits``, ``global_step`` and ``learning_rate``.

    Side effects:
        Writes ``score.<n>``, ``result.<n>`` and ``model.ckpt.<n>`` files
        whose names are built by appending to ``conf['save_path']``.
    """
    uses_intent = conf['model_name'] != 'dam'

    def _print_now():
        # strftime() without a time tuple formats the current local time.
        print(time.strftime('%Y-%m-%d %H:%M:%S'))

    def _make_feed(batches, index):
        # Bind the model placeholders to batch number `index` of `batches`.
        feed = {
            _model.turns: batches["turns"][index],
            _model.tt_turns_len: batches["tt_turns_len"][index],
            _model.every_turn_len: batches["every_turn_len"][index],
            _model.response: batches["response"][index],
            _model.response_len: batches["response_len"][index],
            _model.label: batches["label"][index],
        }
        if uses_intent:
            feed[_model.turns_intent] = batches["turns_intent"][index]
            feed[_model.response_intent] = batches["response_intent"][index]
        return feed

    if conf['rand_seed'] is not None:
        np.random.seed(conf['rand_seed'])

    if not os.path.exists(conf['save_path']):
        os.makedirs(conf['save_path'])

    # load data
    print('starting loading data')
    _print_now()
    # NOTE: pickle can execute arbitrary code; only load trusted data files.
    with open(conf["data_path"], 'rb') as data_file:
        train_data, val_data, _test_data = pickle.load(data_file)
    print('lyang test: val_data: ', len(val_data), type(val_data),
          len(val_data['y']))
    print('lyang test: val_data[y]: ', val_data['y'][0:2])
    print('lyang test: val_data[c]: ', val_data['c'][0:2])
    print('lyang test: val_data[r]: ', val_data['r'][0:2])
    print('lyang test: val_data[qids]: ', val_data['qids'][0:2])
    print('lyang test: val_data[dids]: ', val_data['dids'][0:2])
    print('map id to words ...')
    id2word = reader.read_dict('../data/' + conf["data_name"] + '/word2id')
    response_ids = val_data['r'][0]
    context_ids = val_data['c'][0]
    print('lyang test: val_data[c]: ',
          [id2word[str(word_id)] for word_id in context_ids], val_data.keys())
    print('lyang test: val_data[r]: ',
          [id2word[str(word_id)] for word_id in response_ids], val_data.keys())
    print('finish loading data')
    _print_now()

    print('init intent_dict...')
    conf['intent_dict'] = (reader.read_intent(conf['intent_vec_path'])
                           if uses_intent else None)
    if uses_intent:
        # Guarded: intent_dict is None for 'dam' and len(None) would raise.
        print('lyang test len(conf[intent_dict])', len(conf['intent_dict']))

    val_batches = reader.build_batches(val_data, conf)
    if uses_intent:
        # check the example 0 and 1 in batch 0; guarded because intent
        # features are only fed for non-'dam' models.
        print('intent of val_batches context: ',
              val_batches['turns_intent'][0][0:2])
        print('intent of val_batches response: ',
              val_batches['response_intent'][0][0:2])

    print("finish building valid batches")
    _print_now()

    # refine conf
    # Floor division keeps these ints under true division as well; a float
    # batch_num would make range(batch_num) raise TypeError.
    batch_num = len(train_data['y']) // conf["batch_size"]
    print('batch_size: ', conf["batch_size"])
    print('total number of batches in one epoch: ', batch_num)
    val_batch_num = len(val_batches["response"])

    # total number of training steps: epoch_num * batch_num
    conf["train_steps"] = conf["num_scan_data"] * batch_num
    conf["save_step"] = max(1, batch_num // 10)  # at most save 10 times
    conf["print_step"] = max(1, batch_num // 100)  # at most print 100 times
    print('configurations:')
    # The intent dictionary is left out of the printed configuration.
    conf_copy = {k: conf[k] for k in conf if k != 'intent_dict'}
    print(conf_copy)

    print('model sucess')
    _print_now()

    _graph = _model.build_graph()
    print('build graph sucess')
    _print_now()

    with tf.Session(graph=_graph) as sess:
        # TensorBoard logging is disabled in this variant.
        _model.init.run()
        if conf["init_model"]:
            _model.saver.restore(sess, conf["init_model"])
            print("sucess init %s" % conf["init_model"])

        average_loss = 0.0
        step = 0
        best_result = [0, 0, 0, 0]

        for step_i in range(conf["num_scan_data"]):
            print('starting shuffle train data')
            shuffle_train = reader.unison_shuffle(train_data)
            train_batches = reader.build_batches(shuffle_train, conf)
            print('finish building train data')

            for batch_index in range(batch_num):
                _, curr_loss = sess.run(
                    [_model.g_updates, _model.loss],
                    feed_dict=_make_feed(train_batches, batch_index))
                average_loss += curr_loss
                step += 1

                if step % conf["print_step"] == 0:
                    g_step, lr = sess.run(
                        [_model.global_step, _model.learning_rate])
                    print('step: %s lr: %s, epoch: %s ' % (g_step, lr, step_i))
                    print("step: " + str(g_step) +
                          " processed current epoch: [" +
                          str(step * 1.0 / batch_num) + "] loss: " +
                          str(average_loss / conf["print_step"]))
                    average_loss = 0

                if step % conf["save_step"] != 0:
                    continue

                # Integer index: used in file names and by '{:d}' below.
                index = step // conf['save_step']
                score_file_path = conf['save_path'] + 'score.' + str(index)
                print('save step: %s' % index)
                _print_now()

                # The context manager closes the file even if scoring fails.
                with open(score_file_path, 'w') as score_file:
                    for val_index in range(val_batch_num):
                        scores = sess.run(
                            _model.logits,
                            feed_dict=_make_feed(val_batches, val_index))
                        labels = val_batches["label"][val_index]
                        # Iterate over the scores actually returned rather
                        # than assuming a full batch of conf["batch_size"].
                        for i in range(len(scores)):
                            score_file.write(
                                str(scores[i]) + '\t' + str(labels[i]) + '\n')

                # write evaluation result
                result = eva.evaluate(score_file_path)
                result_file_path = conf["save_path"] + "result." + str(index)
                with open(result_file_path, 'w') as out_file:
                    for m in result:
                        out_file.write(str(m) + '\n')
                print('finish evaluation')
                # lyang: also print metrics in log file
                print('save step:\t{:d}\t[current metrics (r2@1 r10@1 r10@2 r10@5 map)]\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}'.format(
                    index, result[0], result[1], result[2], result[3],
                    result[4]))
                _print_now()

                if result[1] + result[2] > best_result[1] + best_result[2]:
                    # save model only when find a model better than
                    # previously best model
                    best_result = result
                    _save_path = _model.saver.save(
                        sess, conf["save_path"] + "model.ckpt." + str(index))
                    print("succ saving model in " + _save_path)
                    _print_now()
def train(conf, _model):
    """Train ``_model`` and checkpoint it whenever validation improves.

    The function loads the pickled data collection, builds validation
    batches once, then for every epoch reshuffles and re-batches the training
    data, runs the optimiser over all batches, scores the validation set and
    saves a checkpoint when the validation result beats the best one so far.
    Training stops early after ``conf["early_stop_count"]`` consecutive
    epochs without improvement.

    Args:
        conf: dict of settings. Keys read here: ``rand_seed``, ``save_path``,
            ``train_type``, ``data_path``, ``batch_size``, ``init_model``,
            ``num_scan_data`` (number of epochs), ``keep_rate``,
            ``print_step``, ``early_stop_count`` and ``data_name``.
        _model: model object exposing ``build_graph()``, ``init``,
            ``saver``, ``saver_load``, ``trainops[train_type]`` (with
            ``g_updates``, ``loss``, ``global_step``, ``learning_rate`` and
            ``logits``) and the placeholders fed below.

    Returns:
        None. Results are printed; score files and checkpoints are written
        under ``conf["save_path"]``.
    """

    def now():
        # Timestamp prefix shared by every log line.
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    if conf['rand_seed'] is not None:
        np.random.seed(conf['rand_seed'])

    if not os.path.exists(conf['save_path']):
        os.makedirs(conf['save_path'])

    train_type = conf["train_type"]

    # load data
    print(now(), 'starting loading data')
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point conf["data_path"] at trusted files.
    with open(conf["data_path"], 'rb') as data_file:
        data_collections = pickle.load(data_file)
    print(now(), 'finish loading data')

    # The pickled collection is indexed in this fixed order.
    file_names = ["train.txt", "valid.txt", "test.txt"]

    train_data = data_collections[file_names.index("train.txt")]
    # int() keeps range(batch_num) valid even where math.ceil returns a float.
    batch_num = int(math.ceil(float(len(train_data['y'])) / conf["batch_size"]))

    valid_data = data_collections[file_names.index("valid.txt")]
    val_batches = reader.build_batches(valid_data, conf)
    val_batch_num = len(val_batches["response"])

    print(now(), "finish building test batches")
    print('configurations: %s' % conf)

    _graph = _model.build_graph()
    print(now(), 'build graph sucess')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Defined before the epoch loop so the final summary line also works when
    # no epoch is run at all.
    # NOTE: the multi-metric evaluators (eva_douban / eva) that used to be
    # selected here are disabled; only eva_2cands accuracy is reported.
    format_str = "Accuracy: {:01.4f}"

    with tf.Session(graph=_graph, config=config) as sess:
        _model.init.run()
        if conf["init_model"]:
            _model.saver_load.restore(sess, conf["init_model"])
            print("sucess init %s" % conf["init_model"])

        average_loss = 0.0
        steps_since_report = 0
        step = 0
        best_result = 0.0

        ops = _model.trainops[train_type]
        g_updates = ops["g_updates"]
        loss = ops["loss"]
        global_step = ops["global_step"]
        learning_rate = ops["learning_rate"]
        logits = ops["logits"]

        early_stop_count = 0
        for step_i in range(conf["num_scan_data"]):
            print(now(), 'starting shuffle train data')
            shuffle_train = reader.unison_shuffle(train_data)
            train_batches = reader.build_batches(shuffle_train, conf)
            print(now(), 'finish building train data')

            for batch_index in range(batch_num):
                feed = {
                    _model.turns1: train_batches["turns1"][batch_index],
                    _model.tt_turns_len1: train_batches["tt_turns_len1"][batch_index],
                    _model.every_turn_len1: train_batches["every_turn_len1"][batch_index],
                    _model.response: train_batches["response"][batch_index],
                    _model.response_len: train_batches["response_len"][batch_index],
                    _model.label: train_batches["label"][batch_index],
                    _model.keep_rate: conf["keep_rate"],
                }

                _, curr_loss = sess.run([g_updates, loss], feed_dict=feed)

                average_loss += curr_loss
                steps_since_report += 1
                step += 1

                # Log ten times more often during the first 500 steps. The
                # max() guard avoids a modulo-by-zero when print_step < 10.
                if step < 500:
                    print_step_time = max(1, int(conf["print_step"] / 10))
                else:
                    print_step_time = conf["print_step"]

                if step % print_step_time == 0:
                    g_step, lr = sess.run([global_step, learning_rate])
                    # Average over the losses actually accumulated, which
                    # differs from print_step_time when the interval changes.
                    print(now(),
                          'epoch: %d, step: %.5d, lr: %-.6f, loss: %s'
                          % (step_i, g_step, lr,
                             average_loss / steps_since_report))
                    average_loss = 0
                    steps_since_report = 0

            #--------------------------evaluation---------------------------------
            score_file_path = conf['save_path'] + '/score.' + str(step_i)
            # One "<score>\t<label>" line per validation example; the context
            # manager closes the file even if a session run fails.
            with open(score_file_path, 'w') as score_file:
                for val_index in range(val_batch_num):
                    feed = {
                        _model.turns1: val_batches["turns1"][val_index],
                        _model.tt_turns_len1: val_batches["tt_turns_len1"][val_index],
                        _model.every_turn_len1: val_batches["every_turn_len1"][val_index],
                        _model.response: val_batches["response"][val_index],
                        _model.response_len: val_batches["response_len"][val_index],
                        # Dropout is disabled for evaluation.
                        _model.keep_rate: 1.0,
                    }

                    scores = sess.run(logits, feed_dict=feed)

                    for score, label in zip(scores,
                                            val_batches["label"][val_index]):
                        score_file.write(str(score) + '\t' + str(label) + '\n')

            # write evaluation result
            result = eva_2cands.evaluate(score_file_path)
            print(now() + ' result: ' + format_str.format(result))

            if result > best_result:
                # Save the model only when it beats the previous best.
                early_stop_count = 0
                best_result = result
                _save_path = _model.saver.save(
                    sess, conf["save_path"] + "/model", global_step=step_i)
                print(now(), "succ saving model in " + _save_path)
            else:
                early_stop_count += 1
                if early_stop_count >= conf["early_stop_count"]:
                    break

        print(now() + ' ' + conf["data_name"] + ' best result: '
              + format_str.format(best_result))