import argparse
import itertools
import os

import numpy as np
import torch
from torch.autograd import Variable

# NOTE: the project-specific helpers used below (get_index2word, get_index2label,
# minibatch_of_one_de, ner, AdaptiveActorCritic, SharedAdam, eval_adaptive,
# decode_one_sentence_adaptive_rl) come from this repository's own modules;
# their exact import paths are assumed to be set up elsewhere.


def main():
    rnd_seed = None
    if rnd_seed:
        torch.manual_seed(rnd_seed)
        np.random.seed(rnd_seed)

    # ---------------------------------------
    # DATA LOADING
    # ---------------------------------------
    #result_path = "../result_lrn_0p001_rl/"
    dict_file = "../dataset/German/vocab1.de"
    entity_file = "../dataset/German/vocab1.en"
    index2word = get_index2word(dict_file)
    index2label = get_index2label(entity_file)
    vocab_size = len(index2word)
    label_size = len(index2label)
    train_X, train_Y = minibatch_of_one_de('train')
    val_X, val_Y = minibatch_of_one_de('valid')
    test_X, test_Y = minibatch_of_one_de('test')

    # ---------------------------------------
    # HYPER PARAMETERS
    # ---------------------------------------
    # Using word2vec pre-trained embedding
    #word_embedding_dim = 300
    hidden_dim = 64
    label_embedding_dim = 8
    max_epoch = 100
    # 0.001 is a good value
    ner_learning_rate = 0.001
    pretrained = 'de64'
    word_embedding_dim = 64

    # ---------------------------------------
    # GPU OR NOT?
    # ---------------------------------------
    gpu = False
    if gpu and rnd_seed:
        torch.cuda.manual_seed(rnd_seed)

    # ---------------------------------------
    # MODEL INSTANTIATION
    # ---------------------------------------
    #attention = None
    attention = "fixed"
    load_model_dir = "../result_lrn_0p001_atten/"
    load_model_filename = os.path.join(load_model_dir, "ckpt_46.pth")
    batch_size = 1
    machine = ner(word_embedding_dim, hidden_dim, label_embedding_dim,
                  vocab_size, label_size,
                  learning_rate=ner_learning_rate,
                  minibatch_size=batch_size,
                  max_epoch=max_epoch,
                  train_X=None, train_Y=None,
                  val_X=val_X, val_Y=val_Y,
                  test_X=test_X, test_Y=test_Y,
                  attention=attention,
                  gpu=gpu,
                  pretrained=pretrained,
                  load_model_filename=load_model_filename,
                  load_map_location="cpu")
    if gpu:
        machine = machine.cuda()

    initial_beam_size = 3
    # Starting from a single beam, it does not make sense to allow a
    # max_beam_size larger than the size of the label vocabulary.
    max_beam_size = label_size

    # ============ INIT RL =====================
    parser = argparse.ArgumentParser(description='A3C')
    parser.add_argument('--logdir', default='../result_lrn_0p001_atten_rl',
                        help='name of logging directory')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor for rewards (default: 0.99)')
    parser.add_argument('--n_epochs', type=int, default=2,
                        help='number of epochs for training the agent (default: 2)')
    parser.add_argument('--entropy-coef', type=float, default=0.01,
                        help='entropy term coefficient (default: 0.01)')
    parser.add_argument('--num-processes', type=int, default=2,
                        help='how many training processes to use (default: 2)')
    parser.add_argument('--num-steps', type=int, default=20,
                        help='number of forward steps in A3C (default: 20)')
    parser.add_argument('--tau', type=float, default=1.00,
                        help='parameter for GAE (default: 1.00)')
    parser.add_argument('--value-loss-coef', type=float, default=0.5,
                        help='value loss coefficient (default: 0.5)')
    parser.add_argument('--max-grad-norm', type=float, default=5,
                        help='max norm for gradient clipping (default: 5)')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--max-episode-length', type=int, default=1000000,
                        help='maximum length of an episode (default: 1000000)')
    parser.add_argument('--name', default='train',
                        help='name of the process')
    parser.add_argument('--no-shared', default=False,
                        help='use an optimizer without shared momentum.')
    args = parser.parse_args()

    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    # For the German dataset, f_score_index_begin = 5 (because O_INDEX = 4).
    # For the toy dataset, f_score_index_begin = 4
    # (because {0: '<s>', 1: '<e>', 2: '<p>', 3: '<u>', ...}).
    f_score_index_begin = 5

    # RL reward coefficients
    reward_coef_fscore = 1
    reward_coef_beam_size = 0.1

    logfile = open(os.path.join(args.logdir, "eval_test.txt"), "w+")

    model = AdaptiveActorCritic(max_beam_size=max_beam_size, action_space=3)
    # Put the agent in evaluation mode
    model.eval()

    load_map_location = "cpu"
    for epoch in range(0, args.n_epochs):
        load_model_filename = os.path.join(args.logdir,
                                           "ckpt_" + str(epoch) + ".pth")
        checkpoint = torch.load(load_model_filename,
                                map_location=load_map_location)
        model.load_state_dict(checkpoint["state_dict"])

        fscore, total_beam_number_in_dataset, avg_beam_size, time_used = \
            eval_adaptive(machine, max_beam_size, model,
                          test_X, test_Y, index2word, index2label,
                          "test", False, "adaptive",
                          initial_beam_size,
                          reward_coef_fscore, reward_coef_beam_size,
                          f_score_index_begin, args)

        log_msg = "%d\t%f\t%d\t%f\t%f" % (epoch, fscore,
                                          total_beam_number_in_dataset,
                                          avg_beam_size, time_used)
        print(log_msg)
        print(log_msg, file=logfile, flush=True)
    # End for epoch

    logfile.close()
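# The loop above writes one tab-separated line per epoch to eval_test.txt, in
# the order: epoch, fscore, total_beam_number_in_dataset, avg_beam_size,
# time_used. The helper below is a minimal sketch (not part of the original
# script) for reading such a log back; the function name is hypothetical.
def read_eval_log(path):
    rows = []
    with open(path) as f:
        for line in f:
            epoch, fscore, total_beams, avg_beam, time_used = \
                line.rstrip("\n").split("\t")
            rows.append((int(epoch), float(fscore), int(total_beams),
                         float(avg_beam), float(time_used)))
    return rows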
def main():
    rnd_seed = None
    if rnd_seed:
        torch.manual_seed(rnd_seed)
        np.random.seed(rnd_seed)

    # ---------------------------------------
    # DATA LOADING
    # ---------------------------------------
    #result_path = "../result_lrn_0p001_rl/"
    dict_file = "../dataset/CCGbank/dict_word"
    entity_file = "../dataset/CCGbank/dict_tag"
    index2word = get_index2word(dict_file)
    index2label = get_index2label(entity_file)
    vocab_size = len(index2word)
    label_size = len(index2label)
    #train_X, train_Y = minibatch_of_one_de('train')
    val_X, val_Y = minibatch_of_one_de('val')
    test_X, test_Y = minibatch_of_one_de('test')

    # ---------------------------------------
    # HYPER PARAMETERS
    # ---------------------------------------
    # Using word2vec pre-trained embedding
    word_embedding_dim = 300
    hidden_dim = 512
    label_embedding_dim = 512
    max_epoch = 30
    # 0.001 is a good value
    ner_learning_rate = 0.001
    pretrained = None

    # ---------------------------------------
    # GPU OR NOT?
    # ---------------------------------------
    gpu = True
    if gpu and rnd_seed:
        torch.cuda.manual_seed(rnd_seed)

    # ---------------------------------------
    # MODEL INSTANTIATION
    # ---------------------------------------
    #attention = None
    attention = "fixed"
    load_model_dir = "../result_ccg_lrn_0p001_atten/"
    load_model_filename = os.path.join(load_model_dir, "ckpt_11.pth")
    batch_size = 1
    machine = ner(word_embedding_dim, hidden_dim, label_embedding_dim,
                  vocab_size, label_size,
                  learning_rate=ner_learning_rate,
                  minibatch_size=batch_size,
                  max_epoch=max_epoch,
                  train_X=None, train_Y=None,
                  val_X=val_X, val_Y=val_Y,
                  test_X=test_X, test_Y=test_Y,
                  attention=attention,
                  gpu=gpu,
                  pretrained=pretrained,
                  load_model_filename=load_model_filename)
    if gpu:
        machine = machine.cuda()

    initial_beam_size = 3
    # Starting from a single beam, it does not make sense to allow a
    # max_beam_size larger than the size of the label vocabulary.
    max_beam_size = 10

    # ============ INIT RL =====================
    os.environ['OMP_NUM_THREADS'] = '4'
    #os.environ['CUDA_VISIBLE_DEVICES'] = ""
    parser = argparse.ArgumentParser(description='A3C')
    parser.add_argument('--logdir',
                        default='../result_ccg_atten_ckpt_11_rl_lrn_0p001_reward_0p02_beam_3_gpu',
                        help='name of logging directory')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor for rewards (default: 0.99)')
    parser.add_argument('--n_epochs', type=int, default=50,
                        help='number of epochs for training the agent (default: 50)')
    parser.add_argument('--entropy-coef', type=float, default=0.01,
                        help='entropy term coefficient (default: 0.01)')
    parser.add_argument('--num-processes', type=int, default=1,
                        help='how many training processes to use (default: 1)')
    parser.add_argument('--num-steps', type=int, default=20,
                        help='number of forward steps in A3C (default: 20)')
    parser.add_argument('--tau', type=float, default=1.00,
                        help='parameter for GAE (default: 1.00)')
    parser.add_argument('--value-loss-coef', type=float, default=0.5,
                        help='value loss coefficient (default: 0.5)')
    parser.add_argument('--max-grad-norm', type=float, default=5,
                        help='max norm for gradient clipping (default: 5)')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--max-episode-length', type=int, default=1000000,
                        help='maximum length of an episode (default: 1000000)')
    parser.add_argument('--name', default='test',
                        help='name of the process')
    parser.add_argument('--no-shared', default=False,
                        help='use an optimizer without shared momentum.')
    args = parser.parse_args()

    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    shared_model = AdaptiveActorCritic(max_beam_size=max_beam_size,
                                       action_space=3)
    #shared_model.share_memory()
    shared_model.eval()

    if args.no_shared:
        shared_optimizer = None
    else:
        # Default branch (args.no_shared defaults to False)
        shared_optimizer = SharedAdam(params=shared_model.parameters(),
                                      lr=args.lr)
        # optimizer = optim.Adam(shared_model.parameters(), lr=learning_rate)
        shared_optimizer.share_memory()

    # --------------------------------------------
    # RL TRAINING
    # --------------------------------------------
    # For the German dataset, f_score_index_begin = 5 (because O_INDEX = 4).
    # For the toy dataset, f_score_index_begin = 4
    # (because {0: '<s>', 1: '<e>', 2: '<p>', 3: '<u>', ...}).
    # For the CCG dataset, f_score_index_begin = 2 (because {0: _PAD, 1: _SOS, ...}).
    f_score_index_begin = 2

    # RL reward coefficients
    reward_coef_fscore = 1
    reward_coef_beam_size = 0.02

    load_map_location = "cpu"
    logfile = open(os.path.join(args.logdir, "eval_test.txt"), "w+")

    for epoch in range(0, args.n_epochs):
        print("Eval for epoch {}".format(epoch))
        load_model_filename = os.path.join(args.logdir,
                                           "ckpt_" + str(epoch) + ".pth")
        checkpoint = torch.load(load_model_filename,
                                map_location=load_map_location)
        shared_model.load_state_dict(checkpoint["state_dict"])

        print("\tEval now...")
        fscore, total_beam_number_in_dataset, avg_beam_size, time_used = \
            eval_adaptive(machine, max_beam_size, shared_model,
                          test_X, test_Y, index2word, index2label,
                          "test", args.name, "adaptive",
                          initial_beam_size,
                          reward_coef_fscore, reward_coef_beam_size,
                          f_score_index_begin, args)

        log_msg = "%d\t%f\t%d\t%f\t%f" % (epoch, fscore,
                                          total_beam_number_in_dataset,
                                          avg_beam_size, time_used)
        print(log_msg)
        logfile.write(log_msg + '\n')
        logfile.flush()
    # End for epoch

    logfile.close()
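# Assumption: each main() above is the entry point of its own evaluation
# script; the conventional guard below (not present in the original excerpt)
# is added so such a script can be executed directly.
if __name__ == "__main__":
    main()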
def test_adaptive(rank,
                  machine,
                  max_beam_size,
                  lr,
                  shared_model,
                  counter,
                  eval_data_X, eval_data_Y,
                  index2word, index2label,
                  suffix,
                  result_path,
                  decode_method,
                  beam_size,
                  reward_coef_fscore, reward_coef_beam_size,
                  f_score_index_begin,
                  args):
    torch.manual_seed(123 + rank)

    # Create the local adaptive actor-critic model
    model = AdaptiveActorCritic(max_beam_size=max_beam_size, action_space=3)
    model.eval()

    batch_num = len(eval_data_X)
    instance_num = 0
    beam_size_seqs = []
    for batch in eval_data_X:
        instance_num += len(batch)

    for epoch in range(1, args.n_epochs + 1):
        print("Epoch: {} of {} (rank {})".format(epoch, args.name, rank))
        if result_path:
            desc = result_path + '_process_' + args.name + '_' + str(epoch) + '_'
            f_sen = open(
                os.path.join(args.logdir, desc + "sen_" + suffix + ".txt"), 'w')
            f_pred = open(
                os.path.join(args.logdir, desc + "pred_" + suffix + ".txt"), 'w')
            f_label = open(
                os.path.join(args.logdir, desc + "label_" + suffix + ".txt"), 'w')
            f_result_processed = open(
                os.path.join(args.logdir,
                             desc + "result_processed_" + suffix + ".txt"), 'w')
            f_beam_size = open(
                os.path.join(args.logdir,
                             desc + "beam_size_" + suffix + ".txt"), 'w')

        # Counters for computing the F-score
        true_pos_count = 0
        pred_pos_count = 0
        true_pred_pos_count = 0

        for batch_idx in range(batch_num):
            if (batch_idx + 1) % 200 == 0:
                print("Batch {}/{}".format(batch_idx + 1, batch_num))
            sen = eval_data_X[batch_idx]
            label = eval_data_Y[batch_idx]
            current_batch_size = len(sen)
            current_sen_len = len(sen[0])
            # DEBUG
            # print(batch_idx, current_sen_len)
            if current_sen_len < 3:
                continue

            sen_var = Variable(torch.LongTensor(sen))
            label_var = Variable(torch.LongTensor(label))
            if machine.gpu:
                sen_var = sen_var.cuda()
                label_var = label_var.cuda()

            # Initialize the hidden and cell states.
            # The axes semantics are
            # (num_layers * num_directions, batch_size, hidden_size),
            # so the first dimension is 1 for a single-directional LSTM encoder
            # and 2 for a bi-directional LSTM encoder.
            init_enc_hidden = Variable(
                torch.zeros((2, current_batch_size, machine.hidden_dim)))
            init_enc_cell = Variable(
                torch.zeros((2, current_batch_size, machine.hidden_dim)))
            if machine.gpu:
                init_enc_hidden = init_enc_hidden.cuda()
                init_enc_cell = init_enc_cell.cuda()

            enc_hidden_seq, (enc_hidden_out, enc_cell_out) = machine.encode(
                sen_var, init_enc_hidden, init_enc_cell)

            # The semantics of enc_hidden_out are
            # (num_layers * num_directions, batch, hidden_size): it is the
            # "tensor containing the hidden state for t = seq_len".
            #
            # Here a linear layer transforms the two directions of
            # enc_hidden_out (and enc_cell_out) into a single hidden_dim
            # vector, used as the initial state of the decoder.
            init_dec_hidden = machine.enc2dec_hidden(
                torch.cat([enc_hidden_out[0], enc_hidden_out[1]], dim=1))
            init_dec_cell = machine.enc2dec_cell(
                torch.cat([enc_cell_out[0], enc_cell_out[1]], dim=1))

            # ===================================
            if decode_method == "adaptive":
                # The input argument "beam_size" serves as the initial beam
                # size here.
                label_pred_seq, accum_logP_pred_seq, logP_pred_seq, \
                    attention_pred_seq, episode, sen_beam_size_seq = \
                    decode_one_sentence_adaptive_rl(
                        machine, current_sen_len,
                        init_dec_hidden, init_dec_cell, enc_hidden_seq,
                        beam_size, max_beam_size,
                        model, shared_model,
                        reward_coef_fscore, reward_coef_beam_size,
                        label_var, f_score_index_begin,
                        counter, args)
            else:
                raise Exception("Not implemented!")
            # ===================================

            # Record the beam sizes used for this sentence
            beam_size_seqs += sen_beam_size_seq

            ### Debugging...
            # print("input sentence =", sen)
            # print("true label =", label)
            # print("predicted label =", label_pred_seq)
            # print("episode =", episode)

            for label_index in range(f_score_index_begin, machine.label_size):
                true_pos = (label_var == label_index)
                true_pos_count += true_pos.float().sum()

                pred_pos = (label_pred_seq == label_index)
                pred_pos_count += pred_pos.float().sum()

                true_pred_pos = true_pos & pred_pos
                true_pred_pos_count += true_pred_pos.float().sum()

            # Write results into files
            if result_path:
                if machine.gpu:
                    label_pred_seq = label_pred_seq.cpu()
                label_pred_seq = label_pred_seq.data.numpy().tolist()

                # Here label_pred_seq.shape = (batch size, sen len).
                # sen, label, and label_pred_seq are lists of lists,
                # so flatten them to make iteration easier.
                sen = list(itertools.chain.from_iterable(sen))
                label = list(itertools.chain.from_iterable(label))
                label_pred_seq = list(
                    itertools.chain.from_iterable(label_pred_seq))

                assert len(sen) == len(label) and len(label) == len(label_pred_seq)

                for i in range(len(sen)):
                    f_sen.write(str(sen[i]) + '\n')
                    f_label.write(str(label[i]) + '\n')
                    f_pred.write(str(label_pred_seq[i]) + '\n')

                    # Clean version (does not print <PAD>, prints a newline
                    # instead of <EOS>):
                    # if sen[i] != 0 and sen[i] != 2:  # not <PAD> and not <EOS>
                    # if sen[i] != 0:  # not <PAD>
                    result_sen = index2word[sen[i]]
                    result_label = index2label[label[i]]
                    result_pred = index2label[label_pred_seq[i]]
                    f_result_processed.write(
                        "%s %s %s\n" % (result_sen, result_label, result_pred))

                f_sen.flush()
                f_label.flush()
                f_pred.flush()
                f_result_processed.flush()

                if decode_method == "adaptive":
                    beam_size_seq_str = ' '.join(map(str, sen_beam_size_seq))
                    f_beam_size.write(beam_size_seq_str + '\n')
                    f_beam_size.flush()
        # End for batch_idx

        if machine.gpu:
            true_pos_count = true_pos_count.cpu()
            pred_pos_count = pred_pos_count.cpu()
            true_pred_pos_count = true_pred_pos_count.cpu()

        true_pos_count = true_pos_count.data.numpy()[0]
        pred_pos_count = pred_pos_count.data.numpy()[0]
        true_pred_pos_count = true_pred_pos_count.data.numpy()[0]

        precision = true_pred_pos_count / pred_pos_count if pred_pos_count > 0 else 0
        recall = true_pred_pos_count / true_pos_count if true_pos_count > 0 else 0
        fscore = 2 / (1 / precision + 1 / recall) if (precision > 0 and recall > 0) else 0
        fscore = fscore * 100

        if result_path:
            f_sen.close()
            f_pred.close()
            f_label.close()
            f_result_processed.close()
            f_beam_size.close()

        avg_beam_sizes = sum(beam_size_seqs) / float(len(beam_size_seqs))
        print("Epoch {}: Avg {} beam size: {} (rank {})".format(
            epoch, args.name, avg_beam_sizes, rank))
        print("Epoch {}: Avg {} F-score = {} (rank {})".format(
            epoch, args.name, fscore, rank))