def main(args):
    """Train a ``Seq2SeqModel`` as described by a YAML configuration file.

    The function reads the ``configuration`` section of the YAML file,
    builds the training model, restores the latest checkpoint from
    ``<workspace>/nn_models/`` (if any), then runs the training loop while
    periodically logging perplexities to TensorBoard and saving checkpoints.

    Args:
        args: Parsed command-line arguments. Only ``args.config`` (path to
            the YAML configuration file) is read here.

    Raises:
        Exception: Whatever ``load`` raises while restoring a checkpoint is
            re-raised after printing a message, so that an existing model is
            not silently overwritten.
    """
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    # set up workspace
    work_space = config["workspace"]
    tf_board = config["tf_board"]
    setup_workpath(work_space)
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    vocab_file = '%s/data/%s-%s' % (work_space, "vocab", vocab_size)
    print("\tDone.")

    # Build the model and compute losses
    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     attn_num_units, dec_num_layers, dec_num_units, dec_cell_type,
     state_pass, infer_max_iter,
     l2_regularize, learning_rate) = get_model_config(config)

    (train_s_file, train_t_file, dev_s_file, dev_t_file, max_length,
     gpu_fraction, gpu_id, checkpoint_every, max_checkpoints, print_every,
     train_steps, is_beam_search, batch_size,
     beam_size) = get_training_config(config)

    print("Building model architecture ...")
    train_model = Seq2SeqModel(
        mode='train', model_name=name,
        vocab_size=vocab_size, embedding_size=embed_size,
        enc_num_layers=enc_num_layers, enc_num_units=enc_num_units,
        enc_cell_type=enc_cell_type, enc_bidir=enc_bidir,
        attn_num_units=attn_num_units,
        dec_num_layers=dec_num_layers, dec_num_units=dec_num_units,
        dec_cell_type=dec_cell_type,
        batch_size=batch_size, beam_search=is_beam_search,
        beam_size=beam_size, infer_max_iter=infer_max_iter,
        l2_regularize=l2_regularize, learning_rate=learning_rate,
        max_to_keep=max_checkpoints)
    print("\tDone.")

    logdir = '%s/nn_models/' % work_space
    restore_from = '%s/nn_models/' % work_space
    # Check whether the save and restore directories differ. Both are built
    # from the same template here, so this is currently always False.
    is_overwritten_training = logdir != restore_from

    # Set up session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                visible_device_list=gpu_id)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # TensorBoard writers
    train_writer = tf.summary.FileWriter(tf_board + 'train/', sess.graph)
    test_writer = tf.summary.FileWriter(tf_board + 'test/', sess.graph)

    try:
        saved_global_step = load(train_model.saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1
    except Exception:
        print("Something went wrong while restoring checkpoint. "
              "Training is terminated to avoid the overwriting.")
        raise

    # ##### Training #####
    # Load data
    print("Loading data ...")

    # Load vocabularies, creating the vocabulary file first if it is missing.
    if not os.path.exists(vocab_file):
        create_vocab_file(train_s_file, train_t_file,
                          dev_s_file, dev_t_file, vocab_file, vocab_size)
    vocab_table, reverse_vocab_table = create_vocab_tables(vocab_file)

    train_set, dev_set = prepare_train_dev_data(
        train_s_file, train_t_file, dev_s_file, dev_t_file,
        vocab_table, max_length)

    # Training
    last_saved_step = saved_global_step
    # NOTE(review): the range end below is exclusive, so the loop executes
    # train_steps - 1 iterations. Confirm whether that is intended.
    num_steps = saved_global_step + train_steps
    # Bind `step` up front so the `finally` clause cannot hit a NameError
    # when the loop body never runs (e.g. train_steps <= 1).
    step = saved_global_step

    print("Start training ...")
    try:
        try:
            for step in range(saved_global_step + 1, num_steps):
                start_time = time.time()

                batch = get_train_batch(train_set, max_length, batch_size)
                loss_value = train_model.train(sess, batch)

                duration = (time.time() - start_time)
                if step % print_every == 0 and step != 0:
                    # train perplexity
                    t_perp = train_model.compute_perplexity(sess, batch)
                    add_summary(train_writer, step, 'train perplexity',
                                t_perp)

                    # eval perplexity
                    dev_str = ""
                    if dev_set is not None:
                        eval_batch = get_train_batch(dev_set, max_length,
                                                     batch_size)
                        eval_perp = train_model.compute_perplexity(
                            sess, eval_batch)
                        add_summary(test_writer, step, 'eval perplexity',
                                    eval_perp)
                        dev_str += "val_prep: {:.3f}\n".format(eval_perp)

                    info = ('step {:d}, loss = {:.6f},perp: {:.3f}\n{}'
                            '({:.3f} sec/step)')
                    print(info.format(step, loss_value, t_perp, dev_str,
                                      duration))

                if step % checkpoint_every == 0:
                    save(train_model.saver, sess, logdir, step)
                    last_saved_step = step

        except KeyboardInterrupt:
            # Introduce a line break after ^C so save message is on its own
            # line.
            print()

        finally:
            # Persist progress made since the last periodic checkpoint.
            if step > last_saved_step:
                save(train_model.saver, sess, logdir, step)
    finally:
        # Flush pending summaries and release the session's resources.
        train_writer.close()
        test_writer.close()
        sess.close()
x, y = x.to(device), y.to(device) pred, epis = model.compute_prediction_and_uncertainty(x) y, pred, epis = prepare_to_visualize(y, pred, epis, mean, std) fig = plt.figure(figsize=(5, 2.5), dpi=300) plt.subplots_adjust(wspace=0.25, hspace=0.4) add_subplot(fig, 111, y, pred, epis) plt.show() plt.close() if __name__ == "__main__": args = parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') postprocess = args.postprocess model_desc = get_model_description(args, postprocess) checkpoint_path = "params/{}/{}_model.pth".format(model_desc, args.load_type) te_dataset = ARTDataset(args.te_path) te_loader = DataLoader(te_dataset, batch_size=1, shuffle=False, num_workers=1) config = get_model_config(args, postprocess) model = Model(**config).to(device) checkpoint = torch.load(checkpoint_path, map_location=device) print("Load checkpoint from: {}".format(checkpoint_path)) model.load_state_dict(checkpoint["state_dict"]) model.eval() with torch.no_grad(): viz_regression(te_loader, model, args)
return loss_all_avg, MAE_avg if __name__ == "__main__": global global_step global_step = 0 args = parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') tr_dataset = ARTDataset(args.tr_path) val_dataset = ARTDataset(args.val_path) tr_loader = DataLoader(tr_dataset, batch_size=args.bsz, num_workers=args.num_workers, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=args.bsz, num_workers=args.num_workers, shuffle=False) config = get_model_config(args) model = Model(**config).to(device) optimizer = optim.Adam(model.parameters(), lr=args.lr) stats = pickle.load(open(args.stats_path, 'rb')) model_desc = get_model_description(args) save_dir, log_dir = make_dirs(model_desc) log_train, log_valid, log_info = get_logger(log_dir) writer = SummaryWriter(logdir=os.path.join(log_dir, 'runs', str(time.strftime('%Y-%m-%d_%H:%M:%S')))) write_experiment_info(log_info, args, model) loss_best = MAE_best = 987654321 for epoch in range(1, 987654321): print('# --- {}th epoch start --- # '.format(epoch)) train(epoch, writer, log_train, args) with torch.no_grad():
def main(args):
    """Train the placeholder-based seq2seq graph and plot its learning curves.

    Reads the ``configuration`` section of the YAML file, builds the graph
    through ``compute_loss``, optionally restores a checkpoint, trains on
    randomly sampled batches, and finally saves a loss figure and a
    perplexity figure to the paths given by the training configuration.

    Args:
        args: Parsed command-line arguments. Only ``args.config`` (path to
            the YAML configuration file) is read here.

    Raises:
        Exception: Whatever ``load`` raises while restoring a checkpoint is
            re-raised after printing a message, so that an existing model is
            not silently overwritten.
    """
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    embeddings = init_embeddings(vocab_size, embed_size, name=name)
    print("\tDone.")

    # Build the model and compute losses
    source_ids = tf.placeholder(tf.int32, [None, None], name="source")
    target_ids = tf.placeholder(tf.int32, [None, None], name="target")
    sequence_mask = tf.placeholder(tf.bool, [None, None], name="mask")

    attn_wrappers = {
        "None": None,
        "Attention": AttentionWrapper,
    }
    # Unknown wrapper names fall back to None (dict.get default).
    attn_wrapper = attn_wrappers.get(config["decoder"]["wrapper"])

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     dec_num_layers, dec_num_units, dec_cell_type, state_pass,
     infer_batch_size, infer_type, beam_size, max_iter,
     attn_num_units, l2_regularize) = get_model_config(config)

    print("Building model architecture ...")
    CE, loss, logits, infer_outputs = compute_loss(
        source_ids, target_ids, sequence_mask, embeddings,
        enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
        dec_num_layers, dec_num_units, dec_cell_type, state_pass,
        infer_batch_size, infer_type, beam_size, max_iter,
        attn_wrapper, attn_num_units, l2_regularize, name)
    print("\tDone.")

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    (logdir, restore_from, learning_rate, gpu_fraction, max_checkpoints,
     train_steps, batch_size, print_every, checkpoint_every,
     s_filename, t_filename, s_max_leng, t_max_leng,
     dev_s_filename, dev_t_filename,
     loss_fig, perp_fig) = get_training_config(config)

    is_overwritten_training = logdir != restore_from

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       epsilon=1e-4)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=max_checkpoints)

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1

    except Exception:
        print("Something went wrong while restoring checkpoint. "
              "Training is terminated to avoid the overwriting.")
        raise

    # ##### Training #####
    # Load data
    print("Loading data ...")

    # id_0, id_1, id_2 preserved for SOS, EOS, constant zero padding
    embed_shift = 3

    source_data = loadfile(s_filename, is_source=True,
                           max_length=s_max_leng) + embed_shift
    target_data = loadfile(t_filename, is_source=False,
                           max_length=t_max_leng) + embed_shift
    # Mask is True where the target holds a real (shifted) token; it is then
    # shifted right by one column: a True column is prepended and the last
    # column dropped.
    masks = (target_data >= embed_shift)
    masks = np.append(np.ones([len(masks), 1], dtype=bool), masks, axis=1)
    masks = masks[:, :-1]
    n_data = len(source_data)

    dev_source_data = None
    if dev_s_filename is not None:
        dev_source_data = loadfile(dev_s_filename, is_source=True,
                                   max_length=s_max_leng) + embed_shift
        dev_target_data = loadfile(dev_t_filename, is_source=False,
                                   max_length=t_max_leng) + embed_shift
        dev_masks = (dev_target_data >= embed_shift)
        dev_masks = np.append(
            np.ones([len(dev_masks), 1], dtype=bool), dev_masks, axis=1)
        dev_masks = dev_masks[:, :-1]
    print("\tDone.")

    # Training
    last_saved_step = saved_global_step
    # NOTE(review): the range end below is exclusive, so the loop executes
    # train_steps - 1 iterations. Confirm whether that is intended.
    num_steps = saved_global_step + train_steps
    losses = []
    steps = []
    perps = []
    dev_perps = []
    # Bind `step` up front so the `finally` clause cannot hit a NameError
    # when the loop body never runs (e.g. train_steps <= 1).
    step = saved_global_step

    print("Start training ...")
    try:
        for step in range(saved_global_step + 1, num_steps):
            start_time = time.time()
            # Sample a batch with replacement from the training data.
            rand_indexes = np.random.choice(n_data, batch_size)
            source_batch = source_data[rand_indexes]
            target_batch = target_data[rand_indexes]
            mask_batch = masks[rand_indexes]

            feed_dict = {
                source_ids: source_batch,
                target_ids: target_batch,
                sequence_mask: mask_batch,
            }

            loss_value, _ = sess.run([loss, optim], feed_dict=feed_dict)
            losses.append(loss_value)

            duration = time.time() - start_time

            if step % print_every == 0:
                # train perplexity
                t_perp = compute_perplexity(sess, CE, mask_batch, feed_dict)
                perps.append(t_perp)

                # dev perplexity
                dev_str = ""
                if dev_source_data is not None:
                    dev_inds = np.random.choice(
                        len(dev_source_data), batch_size)

                    dev_feed_dict = {
                        source_ids: dev_source_data[dev_inds],
                        target_ids: dev_target_data[dev_inds],
                        sequence_mask: dev_masks[dev_inds],
                    }

                    dev_perp = compute_perplexity(
                        sess, CE, dev_masks[dev_inds], dev_feed_dict)
                    dev_perps.append(dev_perp)
                    dev_str = "dev_prep: {:.3f}, ".format(dev_perp)

                # Appended last: `steps` is therefore never longer than
                # `perps` or `dev_perps`, even if interrupted mid-block.
                steps.append(step)
                info = 'step {:d}, loss = {:.6f}, '
                info += 'perp: {:.3f}, {}({:.3f} sec/step)'
                print(info.format(step, loss_value, t_perp,
                                  dev_str, duration))

            if step % checkpoint_every == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C so save message is on its own line.
        print()

    finally:
        try:
            # Persist progress made since the last periodic checkpoint.
            if step > last_saved_step:
                save(saver, sess, logdir, step)
        finally:
            sess.close()

    # plot loss
    plt.figure()
    plt.plot(losses)
    plt.title("Total loss")
    plt.xlabel("step")
    plt.savefig(loss_fig)

    # plot perplexity
    plt.figure()
    # An interrupt inside the logging block can leave `perps` and/or
    # `dev_perps` one entry longer than `steps`; trim both so the x and y
    # series always have matching lengths.
    n_logged = len(steps)
    perps = perps[:n_logged]
    dev_perps = dev_perps[:n_logged]

    # The first five logged points are skipped in the figure.
    plt.plot(steps[5:], perps[5:], label="train")
    if dev_source_data is not None:
        plt.plot(steps[5:], dev_perps[5:], label="dev")
    plt.title("Perplexity")
    plt.xlabel("step")
    plt.legend()
    plt.savefig(perp_fig)
def main(args):
    """Run batch inference with the placeholder-based seq2seq graph.

    Reads the ``configuration`` section of the YAML file, rebuilds the graph
    through ``compute_loss``, restores a checkpoint, decodes the configured
    source file in batches of ``infer_batch_size`` and writes one
    space-separated id sequence per input row to the configured CSV path.

    Args:
        args: Parsed command-line arguments. Only ``args.config`` (path to
            the YAML configuration file) is read here.

    Raises:
        ValueError: If ``load`` finds no checkpoint to restore from.
        Exception: Any other error raised while restoring is re-raised after
            printing a message.
    """
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    embeddings = init_embeddings(vocab_size, embed_size, name=name)
    print("\tDone.")

    # Build the model and compute losses
    source_ids = tf.placeholder(tf.int32, [None, None], name="source")
    target_ids = tf.placeholder(tf.int32, [None, None], name="target")
    sequence_mask = tf.placeholder(tf.bool, [None, None], name="mask")

    attn_wrappers = {
        "None": None,
        "Attention": AttentionWrapper,
    }
    # Unknown wrapper names fall back to None (dict.get default).
    attn_wrapper = attn_wrappers.get(config["decoder"]["wrapper"])

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     dec_num_layers, dec_num_units, dec_cell_type, state_pass,
     infer_batch_size, infer_type, beam_size, max_iter,
     attn_num_units, l2_regularize) = get_model_config(config)

    print("Building model architecture ...")
    CE, loss, logits, infer_outputs = compute_loss(
        source_ids, target_ids, sequence_mask, embeddings,
        enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
        dec_num_layers, dec_num_units, dec_cell_type, state_pass,
        infer_batch_size, infer_type, beam_size, max_iter,
        attn_wrapper, attn_num_units, l2_regularize, name)
    print("\tDone.")

    # Set up session
    restore_from = config["training"]["restore_from"]
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    try:
        init = tf.global_variables_initializer()
        sess.run(init)

        # Saver for storing checkpoints of the model.
        saver = tf.train.Saver(var_list=tf.trainable_variables())

        try:
            saved_global_step = load(saver, sess, restore_from)
            if saved_global_step is None:
                raise ValueError("Cannot find the checkpoint to restore from.")

        except Exception:
            print("Something went wrong while restoring checkpoint. ")
            raise

        # ##### Inference #####
        # Load data
        print("Loading inference data ...")

        # id_0, id_1, id_2 preserved for SOS, EOS, constant zero padding
        embed_shift = 3
        filename = config["inference"]["infer_source_file"]
        max_leng = config["inference"]["infer_source_max_length"]

        source_data = loadfile(filename, is_source=True,
                               max_length=max_leng) + embed_shift
        print("\tDone.")

        # Inference
        print("Start inferring ...")
        final_result = []
        n_data = source_data.shape[0]
        # Pad with all-zero rows so the data splits into whole batches; the
        # extra rows are dropped again after decoding.
        n_pad = n_data % infer_batch_size
        if n_pad > 0:
            n_pad = infer_batch_size - n_pad

        # Take width and dtype from the data itself so the concatenation
        # cannot fail or change dtype if they differ from (max_leng, int32).
        pad = np.zeros((n_pad, source_data.shape[1]),
                       dtype=source_data.dtype)
        source_data = np.concatenate((source_data, pad))

        for ith in range(len(source_data) // infer_batch_size):
            start = ith * infer_batch_size
            end = (ith + 1) * infer_batch_size
            batch = source_data[start:end]

            result = sess.run(infer_outputs, feed_dict={source_ids: batch})
            # Keep index 0 of the last axis
            # (presumably the best beam — TODO confirm).
            result = result.ids[:, :, 0]

            if result.shape[1] < max_iter:
                # Right-pad short outputs with id 1 up to max_iter columns.
                # The pad must share result's integer dtype: np.ones defaults
                # to float64, which would turn every id into a float and make
                # the CSV contain tokens such as "12.0".
                l_pad = max_iter - result.shape[1]
                filler = np.ones((result.shape[0], l_pad),
                                 dtype=result.dtype)
                result = np.concatenate((result, filler), axis=1)

            final_result.append(result)

        # Undo the id shift and collapse every reserved id to -1.
        final_result = np.concatenate(final_result)[:n_data] - embed_shift
        final_result[final_result < 0] = -1
        final_result = final_result.astype(str).tolist()
        final_result = [" ".join(tokens) for tokens in final_result]

        df = pd.DataFrame(data={"0": final_result})
        df.to_csv(config["inference"]["output_path"], header=None, index=None)
        print("\tDone.")
    finally:
        sess.close()
def main(args):
    """Run inference with a ``Seq2SeqModel`` and write readable predictions.

    Reads the ``configuration`` section of the YAML file, builds the model in
    ``'infer'`` mode, restores the checkpoint from
    ``<workspace>/nn_models/``, decodes sentences from the configured source
    file one at a time and writes the source/prediction text blocks to
    ``config["inference"]["output_path"]``.

    Args:
        args: Parsed command-line arguments. Only ``args.config`` (path to
            the YAML configuration file) is read here.

    Raises:
        ValueError: If ``load`` finds no checkpoint to restore from.
        Exception: Any other error raised while restoring is re-raised after
            printing a message.
    """
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    work_space = config["workspace"]
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    vocab_file = '%s/data/%s-%s' % (work_space, "vocab", vocab_size)
    print("\tDone.")

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     attn_num_units, dec_num_layers, dec_num_units, dec_cell_type,
     state_pass, infer_max_iter,
     l2_regularize, learning_rate) = get_model_config(config)

    (is_beam_search, beam_size, batch_size,
     infer_source_file, infer_source_max_length,
     output_path, gpu_fraction, gpu_id) = get_infer_config(config)

    print("Building model architecture ...")
    infer_model = Seq2SeqModel(
        mode='infer', model_name=name,
        vocab_size=vocab_size, embedding_size=embed_size,
        enc_num_layers=enc_num_layers, enc_num_units=enc_num_units,
        enc_cell_type=enc_cell_type, enc_bidir=enc_bidir,
        attn_num_units=attn_num_units,
        dec_num_layers=dec_num_layers, dec_num_units=dec_num_units,
        dec_cell_type=dec_cell_type,
        batch_size=batch_size, beam_search=is_beam_search,
        beam_size=beam_size, infer_max_iter=infer_max_iter,
        l2_regularize=l2_regularize, learning_rate=learning_rate)
    print("\tDone.")

    # Set up session
    restore_from = '%s/nn_models/' % work_space
    # NOTE(review): these override the gpu_fraction / gpu_id values returned
    # by get_infer_config above with the ones from the training section.
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_id = config["training"]["gpu_id"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                visible_device_list=gpu_id)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    try:
        init = tf.global_variables_initializer()
        sess.run(init)

        try:
            saved_global_step = load(infer_model.saver, sess, restore_from)
            if saved_global_step is None:
                raise ValueError("Cannot find the checkpoint to restore from.")

        except Exception:
            print("Something went wrong while restoring checkpoint. ")
            raise

        # ##### Inference #####
        # Load data
        print("Loading inference data ...")

        # Load vocabularies.
        vocab_table, reverse_vocab_table = create_vocab_tables(vocab_file)

        src_dataset = prepare_infer_data(infer_source_file, vocab_table,
                                         max_length=infer_source_max_length)
        print("\tDone.")

        # Inference
        print("Start inferring ...")
        final_result = []

        # NOTE(review): each iteration requests the slice [ith, ith + 1) while
        # the iteration count is len(src_dataset) // batch_size, so with
        # batch_size > 1 only a prefix of the dataset appears to be decoded.
        # Behaviour is kept as-is; confirm against get_infer_batch.
        for ith in range(len(src_dataset) // batch_size):
            start = ith
            end = ith + 1
            batch = get_infer_batch(src_dataset, start, end,
                                    infer_source_max_length)
            sentence = token_to_str(batch[0][0], reverse_vocab_table)

            start_time = time.time()
            result = infer_model.infer(sess, batch)
            duration = round((time.time() - start_time), 3)
            print("sentence:%s, cost:%s s" % (ith, duration))

            res = "src:{}\n".format(sentence)
            if is_beam_search is True:
                # One "pred" line per entry of the first result item.
                for idx, candidate in enumerate(result[0][0]):
                    reply = token_to_str(candidate, reverse_vocab_table)
                    res += "\tpred %s:%s\n" % (idx, reply)
                res += "\n"
            else:
                reply = result[0][0]
                reply = token_to_str(reply, reverse_vocab_table)
                res += "\tpred:%s\n\n" % reply
            print(res)
            final_result.append(res)

        with open(config["inference"]["output_path"], 'w') as f:
            for entry in final_result:
                f.write(entry + '\n')
        print("\tDone.")
    finally:
        sess.close()
def main(args):
    """Export the restored inference ``Seq2SeqModel`` as a TensorFlow SavedModel.

    Reads the ``configuration`` section of the YAML file, builds the model in
    ``'infer'`` mode, restores the checkpoint from
    ``<workspace>/nn_models/`` and writes a SavedModel tagged for serving to
    ``<workspace>/infer_model``.

    Args:
        args: Parsed command-line arguments. Only ``args.config`` (path to
            the YAML configuration file) is read here.

    Raises:
        ValueError: If ``load`` finds no checkpoint to restore from.
        Exception: Any other error raised while restoring is re-raised after
            printing a message.
    """
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    work_space = config["workspace"]
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    vocab_file = '%s/data/%s-%s' % (work_space, "vocab", vocab_size)
    print("\tDone.")

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     attn_num_units, dec_num_layers, dec_num_units, dec_cell_type,
     state_pass, infer_max_iter,
     l2_regularize, learning_rate) = get_model_config(config)

    (is_beam_search, beam_size, batch_size,
     infer_source_file, infer_source_max_length,
     output_path, gpu_fraction, gpu_id) = get_infer_config(config)

    print("Building model architecture ...")
    infer_model = Seq2SeqModel(
        mode='infer', model_name=name,
        vocab_size=vocab_size, embedding_size=embed_size,
        enc_num_layers=enc_num_layers, enc_num_units=enc_num_units,
        enc_cell_type=enc_cell_type, enc_bidir=enc_bidir,
        attn_num_units=attn_num_units,
        dec_num_layers=dec_num_layers, dec_num_units=dec_num_units,
        dec_cell_type=dec_cell_type,
        batch_size=batch_size, beam_search=is_beam_search,
        beam_size=beam_size, infer_max_iter=infer_max_iter,
        l2_regularize=l2_regularize, learning_rate=learning_rate)
    print("\tDone.")

    # Set up session
    restore_from = '%s/nn_models/' % work_space
    # NOTE(review): these override the gpu_fraction / gpu_id values returned
    # by get_infer_config above with the ones from the training section.
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_id = config["training"]["gpu_id"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                visible_device_list=gpu_id)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    try:
        init = tf.global_variables_initializer()
        sess.run(init)

        try:
            saved_global_step = load(infer_model.saver, sess, restore_from)
            if saved_global_step is None:
                raise ValueError("Cannot find the checkpoint to restore from.")

        except Exception:
            print("Something went wrong while restoring checkpoint. ")
            raise

        print('save model for infer ...')
        infer_model_dir = '%s/infer_model' % work_space
        builder = tf.saved_model.builder.SavedModelBuilder(infer_model_dir)
        # `tags` must be a collection of tag strings. Passing the bare string
        # SERVING would be iterated character by character, so the exported
        # meta graph would not be discoverable under the "serve" tag.
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING])
        builder.save()
        print('\tDone.')
    finally:
        sess.close()