def evaluate(env, agent, valid_test="valid"):
    env.seed(42)
    env.split_reset(valid_test)
    agent.eval()
    print_qa_acc, print_correct_state_acc, print_steps = [], [], []
    while True:
        obs, infos = env.reset(random=False)
        agent.init(obs, infos)
        quest_list = agent.get_game_quest_info(infos)
        input_quest, input_quest_char, quest_id_list = agent.get_agent_inputs(quest_list)
        tmp_replay_buffer = []
        for step_no in range(agent.eval_max_nb_steps_per_episode):
            commands, replay_info = agent.act_greedy(obs, infos, input_quest, input_quest_char, quest_id_list)
            tmp_replay_buffer.append(replay_info)
            obs, infos = env.step(commands)
            still_running = generic.to_np(replay_info[-1])
            if np.sum(still_running) == 0:
                break

        # The agent has exhausted all steps, now answer question.
        chosen_head_tails = agent.answer_question_act(agent.naozi.get(), quest_list)  # batch
        chosen_head_tails_np = generic.to_np(chosen_head_tails)
        chosen_answer_strings = generic.get_answer_strings(agent.naozi.get(), chosen_head_tails_np)
        answer_strings = [item["a"] for item in infos]
        masks_np = [generic.to_np(item[-1]) for item in tmp_replay_buffer]

        qa_reward_np = generic.get_qa_reward(chosen_answer_strings, answer_strings)
        correct_state_reward_np = generic.get_sufficient_info_reward(agent.naozi.get(), answer_strings)
        step_masks_np = np.sum(np.array(masks_np), 0)
        for i in range(len(qa_reward_np)):
            # if the answer is totally wrong, we assume it used all steps
            if qa_reward_np[i] == 0.0:
                step_masks_np[i] = agent.eval_max_nb_steps_per_episode
        print_qa_acc += qa_reward_np.tolist()
        print_correct_state_acc += correct_state_reward_np.tolist()
        print_steps += step_masks_np.tolist()
        if env.batch_pointer == 0:
            break

    print("===== Eval =====: qa acc: {:2.3f} | correct state: {:2.3f} | used steps: {:2.3f}".format(
        np.mean(np.array(print_qa_acc)),
        np.mean(np.array(print_correct_state_acc)),
        np.mean(np.array(print_steps))))
    return np.mean(np.array(print_qa_acc)), np.mean(np.array(print_correct_state_acc)), np.mean(np.array(print_steps))
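# Hedged usage sketch (not part of the original file): evaluate() is typically
# called from a training loop on the validation split, e.g.
#   eval_qa_acc, eval_correct_state_acc, eval_used_steps = evaluate(env, agent, "valid")
# after which the environment is switched back to training data with
# env.split_reset("train"), as done in the training loops later in this section.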
def update_interaction(self):
    # update neural model by replaying snapshots in replay memory
    interaction_loss, q_value = self.get_dqn_loss()
    if interaction_loss is None:
        return None, None
    loss = interaction_loss * self.interaction_loss_lambda
    # Backpropagate
    self.online_net.zero_grad()
    self.optimizer.zero_grad()
    loss.backward()
    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
    torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), self.clip_grad_norm)
    self.optimizer.step()  # apply gradients
    return to_np(torch.mean(interaction_loss)), to_np(torch.mean(q_value))
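# The `to_np` / `to_pt` helpers used throughout this section are not defined here.
# A minimal sketch of the assumed behavior (detach a tensor into a NumPy array, and
# wrap a NumPy array as a torch tensor, optionally on GPU) follows; the real helpers
# in the accompanying `generic` module may differ in detail.
import numpy as np
import torch

def to_np(x):
    # Assumed helper: return x as a NumPy array, moving it off the GPU if needed.
    if isinstance(x, np.ndarray):
        return x
    return x.detach().cpu().numpy()

def to_pt(np_matrix, enable_cuda=False, type='long'):
    # Assumed helper: wrap array-like data as a torch tensor of the requested dtype.
    dtype = torch.long if type == 'long' else torch.float
    tensor = torch.as_tensor(np.asarray(np_matrix), dtype=dtype)
    return tensor.cuda() if enable_cuda else tensor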
def evaluate_observation_generation_loss(env, agent, valid_test="valid"):
    env.split_reset(valid_test)
    agent.eval()
    ave_loss = []
    while True:
        observation_strings, prev_action_strings = env.get_batch()
        batch_size = len(observation_strings)
        lens = [len(elem) for elem in observation_strings]
        max_len = max(lens)
        padded_observation_strings = [elem + ["<pad>"] * (max_len - len(elem)) for elem in observation_strings]
        padded_prev_action_strings = [elem + ["<pad>"] * (max_len - len(elem)) for elem in prev_action_strings]
        eps_masks = torch.zeros((batch_size, max_len), dtype=torch.float)
        if agent.use_cuda:
            eps_masks = eps_masks.cuda()
        for i in range(batch_size):
            eps_masks[i, :lens[i]] = 1

        prev_h = None
        for j in range(max_len):
            batch_obs_string = [elem[j] for elem in padded_observation_strings]
            batch_prev_action_string = [elem[j] for elem in padded_prev_action_strings]
            with torch.no_grad():
                loss, _, prev_h = agent.observation_generation_teacher_force(
                    batch_obs_string, batch_prev_action_string, eps_masks[:, j], prev_h)
            ave_loss.append(to_np(loss))
        if env.batch_pointer == 0:
            break
    return np.mean(np.array(ave_loss))
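# Illustrative note (assumption about the data layout): for a batch of two episodes
# with 3 and 1 observations respectively, max_len is 3 and eps_masks is
#   [[1., 1., 1.],
#    [1., 0., 0.]]
# so eps_masks[:, j] zeroes out the teacher-forcing loss at padded time steps.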
def generate_commands(self, action_indices, ctrlf_indices):
    action_indices_np = to_np(action_indices)
    ctrlf_indices_np = to_np(ctrlf_indices)
    res_str = []
    batch_size = action_indices_np.shape[0]
    for i in range(batch_size):
        which = action_indices_np[i][0]
        if which == self.action2id["ctrl+f"]:
            which_word = ctrlf_indices_np[i][0]
            res_str.append("ctrl+f " + self.word_vocab[which_word])
        elif which < len(self.id2action):
            res_str.append(self.id2action[which])
        else:
            raise NotImplementedError
    return res_str
def choose_random_command(self, word_ranks, word_masks_np):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
    """
    batch_size = word_ranks[0].size(0)
    word_ranks_np = [to_np(item) for item in word_ranks]  # list of batch x n_vocab
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
    word_indices = []
    for i in range(len(word_ranks_np)):
        indices = []
        for j in range(batch_size):
            msk = word_masks_np[i][j]  # vocab
            indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))
        word_indices.append(np.array(indices))
    # word_indices: list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
def evaluate_deep_graph_infomax(env, agent, valid_test="valid", verbose=False):
    env.split_reset(valid_test)
    agent.eval()
    list_eval_acc, list_eval_loss = [], []
    # counter = 0
    # to_print = []
    while True:
        triplets = env.get_batch()
        with torch.no_grad():
            loss, labels, dgi_discriminator_logits, batch_nonzero_idx = agent.get_deep_graph_infomax_logits(triplets)
        # sigmoid
        dgi_discriminator_logits = 1.0 / (1.0 + np.exp(-dgi_discriminator_logits))
        for i in range(len(triplets)):
            gt = labels[i]  # num_node*2
            pred_idx = (dgi_discriminator_logits[i] >= 0.5).astype("float32")  # num_node*2
            nonzeros = np.array(batch_nonzero_idx[i].tolist() +
                                (batch_nonzero_idx[i] + len(agent.node_vocab)).tolist())
            gt = gt[nonzeros]  # num_nonzero
            pred_idx = pred_idx[nonzeros]  # num_nonzero
            correct = (pred_idx == gt).astype("float32").tolist()
            list_eval_acc += correct
        loss = to_np(loss)
        list_eval_loss.append(loss)
        if env.batch_pointer == 0:
            break
    return np.mean(list_eval_loss), np.mean(list_eval_acc)
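# Note: the manual sigmoid above can raise overflow warnings in np.exp for
# large-magnitude negative logits. A numerically stable drop-in replacement
# (an optional suggestion, not part of the original code) would be:
def stable_sigmoid(x):
    # Equivalent to scipy.special.expit, computed without overflow warnings.
    out = np.empty_like(x, dtype="float64")
    positive = x >= 0
    out[positive] = 1.0 / (1.0 + np.exp(-x[positive]))
    exp_x = np.exp(x[~positive])
    out[~positive] = exp_x / (1.0 + exp_x)
    return out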
def choose_maxQ_command(self, word_ranks, word_masks_np):
    """
    Generate a command by maximum q values, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun).
    """
    batch_size = word_ranks[0].size(0)
    word_ranks_np = [to_np(item) for item in word_ranks]  # list of batch x n_vocab
    word_ranks_np = [r - np.min(r) for r in word_ranks_np]  # minus the min value, so that all values are non-negative
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
    word_indices = [np.argmax(item, -1) for item in word_ranks_np]  # list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, self.use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
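# Hedged usage sketch (an assumption, not part of the original agent): the two
# selectors above are typically combined under epsilon-greedy exploration, where
# `epsilon` is the agent's current exploration rate.
import numpy as np

def choose_command_epsilon_greedy(agent, word_ranks, word_masks_np, epsilon):
    if np.random.rand() < epsilon:
        return agent.choose_random_command(word_ranks, word_masks_np)
    return agent.choose_maxQ_command(word_ranks, word_masks_np)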
def point_maxq_position(self, point_distribution, mask):
    """
    Generate a command by maximum q values, for epsilon greedy.

    Arguments:
        point_distribution: Q values for each position, batch x time x 2.
        mask: position masks.
    """
    point_distribution_np = to_np(point_distribution)  # batch x time x 2
    mask_np = to_np(mask)  # batch x time
    point_distribution_np = point_distribution_np - np.min(point_distribution_np) + 1e-2  # minus the min value, so that all values are non-negative
    point_distribution_np = point_distribution_np * np.expand_dims(mask_np, -1)  # batch x time x 2
    indices = np.argmax(point_distribution_np, 1)  # batch x 2
    indices = to_pt(np.array(indices), self.use_cuda)  # batch x 2
    return indices
def get_chosen_strings(self, chosen_indices):
    """
    Turns list of word indices into actual command strings.

    Arguments:
        chosen_indices: Word indices chosen by model.
    """
    chosen_indices_np = [to_np(item) for item in chosen_indices]
    res_str = []
    batch_size = chosen_indices_np[0].shape[0]
    for i in range(batch_size):
        verb, adj, noun = chosen_indices_np[0][i], chosen_indices_np[1][i], chosen_indices_np[2][i]
        res_str.append(self.word_ids_to_commands(verb, adj, noun))
    return res_str
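# word_ids_to_commands is not shown in this section. A hypothetical minimal
# version, assuming the vocabulary uses an explicit "<pad>" token for unused
# slots, might look like the sketch below; the real method may handle more cases.
def word_ids_to_commands(self, verb, adj, noun):
    # Hypothetical sketch: join the chosen words, skipping a padding adjective.
    words = [self.word_vocab[verb]]
    if self.word_vocab[adj] != "<pad>":
        words.append(self.word_vocab[adj])
    words.append(self.word_vocab[noun])
    return " ".join(words)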
def evaluate_state_prediction(env, agent, valid_test="valid", verbose=False):
    env.split_reset(valid_test)
    agent.eval()
    list_eval_acc, list_eval_loss = [], []
    counter = 0
    to_print = []
    while True:
        target_graph, previous_graph, action, admissible_graphs = env.get_batch()
        with torch.no_grad():
            loss, sp_ret, np_labels, admissible_graphs = agent.get_state_prediction_logits(
                previous_graph, action, target_graph, admissible_graphs)
        loss = to_np(loss)
        pred = np.argmax(sp_ret, -1)  # batch
        gt = np.argmax(np_labels, -1)  # batch
        correct = (pred == gt).astype("float32").tolist()
        list_eval_acc += correct
        list_eval_loss += [loss]

        if verbose:
            for i in range(len(previous_graph)):
                to_print.append(str(counter) + " -------------------------------------------- acc: " + str(correct[i]))
                trips = []
                for t in previous_graph[i]:
                    trips.append(t[0] + "-" + t[2] + "-" + t[1])
                to_print.append("PREV TRIPLETS: %s " % (" | ".join(trips)))
                to_print.append("ACTION: %s " % (action[i]))
                trips = []
                for t in admissible_graphs[i][pred[i]]:
                    trips.append(t[0] + "-" + t[2] + "-" + t[1])
                to_print.append("PRED TRIPLETS: %s " % (" | ".join(trips)))
                trips = []
                for t in target_graph[i]:
                    trips.append(t[0] + "-" + t[2] + "-" + t[1])
                to_print.append("GT TRIPLETS: %s " % (" | ".join(trips)))
                to_print.append("")
                counter += 1
        if env.batch_pointer == 0:
            break

    with open(agent.experiment_tag + "_output.txt", "w") as f:
        f.write("\n".join(to_print))
    print("Eval Loss: {:2.3f}, Eval accuracy: {:2.3f}".format(np.mean(list_eval_loss), np.mean(list_eval_acc)))
    return np.mean(list_eval_loss), np.mean(list_eval_acc)
def point_random_position(self, point_distribution, mask):
    """
    Generate a command by random, for epsilon greedy.

    Arguments:
        point_distribution: Q values for each position, batch x time x 2.
        mask: position masks.
    """
    batch_size = point_distribution.size(0)
    mask_np = to_np(mask)  # batch x time
    indices = []
    for i in range(batch_size):
        msk = mask_np[i]  # time
        indices.append(np.random.choice(len(msk), 2, p=msk / np.sum(msk, -1)))
    indices = to_pt(np.stack(indices, 0), self.use_cuda)  # batch x 2
    return indices
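# Hedged sketch (assumption, not part of the original agent): the two pointer
# strategies above can be mixed per batch element under epsilon-greedy, mirroring
# the word-level selectors earlier in this section; `epsilon` is the current
# exploration rate.
def point_position_epsilon_greedy(self, point_distribution, mask, epsilon):
    greedy_indices = self.point_maxq_position(point_distribution, mask)    # batch x 2
    random_indices = self.point_random_position(point_distribution, mask)  # batch x 2
    batch_size = point_distribution.size(0)
    coin = np.random.uniform(size=(batch_size, 1)) < epsilon               # batch x 1
    coin = to_pt(coin.astype("float32"), self.use_cuda, type="float")
    return (coin * random_indices.float() + (1.0 - coin) * greedy_indices.float()).long()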
def _choose_random_command(word_ranks, word_masks_np, use_cuda):
    """
    Generate a command randomly, for epsilon greedy.

    Arguments:
        word_ranks: Q values for each word by model.action_scorer.
        word_masks_np: Vocabulary masks for words depending on their type (verb, adj, noun, adj2, noun2).
    """
    batch_size = word_ranks[0].size(0)
    # print("batch_size=", batch_size, len(word_masks_np))
    assert len(word_ranks) == len(word_masks_np)
    word_ranks_np = [to_np(item) for item in word_ranks]  # list of (batch x n_vocab) arrays, len=5 (5 word output phrases)
    # word_ranks_np = [r - np.min(r) for r in word_ranks_np]  # minus the min value, so that all values are non-negative
    word_ranks_np = [r * m for r, m in zip(word_ranks_np, word_masks_np)]  # list of batch x n_vocab
    word_indices = []
    for i in range(len(word_ranks_np)):  # len=5 (verb, adj1, noun1, adj2, noun2)
        indices = []
        for j in range(batch_size):
            msk = word_masks_np[i][j]  # msk is of len = vocab, j is index into batch
            indices.append(np.random.choice(len(msk), p=msk / np.sum(msk, -1)))  # choose from non-zero entries of msk
        word_indices.append(np.array(indices))
    # word_indices: list of batch

    word_qvalues = [[] for _ in word_masks_np]
    for i in range(batch_size):
        for j in range(len(word_qvalues)):
            word_qvalues[j].append(word_ranks[j][i][word_indices[j][i]])
    word_qvalues = [torch.stack(item) for item in word_qvalues]
    word_indices = [to_pt(item, use_cuda) for item in word_indices]
    word_indices = [item.unsqueeze(-1) for item in word_indices]  # list of batch x 1
    return word_qvalues, word_indices
def train(): time_1 = datetime.datetime.now() with open("config.yaml") as reader: config = yaml.safe_load(reader) if config['general']['dataset'] == "squad": env = GamifiedSquad(config) else: env = GamifiedNewsQA(config) env.split_reset("train") agent = Agent() # visdom viz = visdom.Visdom() plt_win = None eval_plt_win = None plt_q_value_win = None plt_steps_win = None eval_plt_steps_win = None viz_avg_correct_state_acc, viz_avg_qa_acc = [], [] viz_avg_correct_state_q_value = [] viz_eval_correct_state_acc, viz_eval_qa_acc, viz_eval_steps = [], [], [] viz_avg_steps = [] step_in_total = 0 episode_no = 0 running_avg_qa_acc = HistoryScoreCache(capacity=50) running_avg_correct_state_acc = HistoryScoreCache(capacity=50) running_avg_qa_loss = HistoryScoreCache(capacity=50) running_avg_correct_state_loss = HistoryScoreCache(capacity=50) running_avg_correct_state_q_value = HistoryScoreCache(capacity=50) running_avg_steps = HistoryScoreCache(capacity=50) output_dir, data_dir = ".", "." json_file_name = agent.experiment_tag.replace(" ", "_") best_qa_acc_so_far = 0.0 # load model from checkpoint if agent.load_pretrained: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt") agent.update_target_net() elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"): agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt") agent.update_target_net() while (True): if episode_no > agent.max_episode: break np.random.seed(episode_no) env.seed(episode_no) obs, infos = env.reset() print( "====================================================================================", episode_no) print("-- Q: %s" % (infos[0]["q"].encode('utf-8'))) print("-- A: %s" % (infos[0]["a"][0].encode('utf-8'))) agent.train() agent.init(obs, infos) quest_list = agent.get_game_quest_info(infos) input_quest, input_quest_char, quest_id_list = agent.get_agent_inputs( quest_list) tmp_replay_buffer = [] print_cmds = [] batch_size = len(obs) act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode for step_no in range(agent.max_nb_steps_per_episode): # generate commands if agent.noisy_net: agent.reset_noise() # Draw a new set of noisy weights commands, replay_info = agent.act(obs, infos, input_quest, input_quest_char, quest_id_list, random=act_randomly) obs, infos = env.step(commands) if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0: agent.reset_noise() # Draw a new set of noisy weights if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0: interaction_loss, interaction_q_value = agent.update_interaction( ) if interaction_loss is not None: running_avg_correct_state_loss.push(interaction_loss) running_avg_correct_state_q_value.push(interaction_q_value) qa_loss = agent.update_qa() if qa_loss is not None: running_avg_qa_loss.push(qa_loss) step_in_total += 1 still_running = generic.to_np(replay_info[-1]) print_cmds.append(commands[0] if still_running[0] else "--") # force stopping if step_no == agent.max_nb_steps_per_episode - 1: replay_info[-1] = torch.zeros_like(replay_info[-1]) tmp_replay_buffer.append(replay_info) if np.sum(still_running) == 0: break print(" / ".join(print_cmds).encode('utf-8')) # The agent has exhausted all steps, now answer question. 
chosen_head_tails = agent.answer_question_act(agent.naozi.get(), quest_list) # batch chosen_head_tails_np = generic.to_np(chosen_head_tails) chosen_answer_strings = generic.get_answer_strings( agent.naozi.get(), chosen_head_tails_np) answer_strings = [item["a"] for item in infos] qa_reward_np = generic.get_qa_reward(chosen_answer_strings, answer_strings) correct_state_reward_np = generic.get_sufficient_info_reward( agent.naozi.get(), answer_strings) correct_state_reward = generic.to_pt(correct_state_reward_np, enable_cuda=agent.use_cuda, type='float') # batch # push qa experience into qa replay buffer for b in range(batch_size): # data points in batch is_prior = qa_reward_np[ b] > agent.qa_reward_prior_threshold * agent.qa_replay_memory.avg_rewards( ) # if the agent is not in the correct state, do not push it into replay buffer if np.mean(correct_state_reward_np[b]) == 0.0: continue agent.qa_replay_memory.push(is_prior, qa_reward_np[b], agent.naozi.get(b), quest_list[b], answer_strings[b]) # small positive reward whenever it answers question correctly masks_np = [generic.to_np(item[-1]) for item in tmp_replay_buffer] command_rewards_np = [] for i in range(len(tmp_replay_buffer)): if i == len(tmp_replay_buffer) - 1: r = correct_state_reward * tmp_replay_buffer[i][-1] r_np = correct_state_reward_np * masks_np[i] else: # give reward only at that one game step, not all r = correct_state_reward * (tmp_replay_buffer[i][-1] - tmp_replay_buffer[i + 1][-1]) r_np = correct_state_reward_np * (masks_np[i] - masks_np[i + 1]) tmp_replay_buffer[i].append(r) command_rewards_np.append(r_np) command_rewards_np = np.array(command_rewards_np) print(command_rewards_np[:, 0]) # push experience into replay buffer for b in range(len(correct_state_reward_np)): is_prior = np.sum(command_rewards_np, 0)[b] > 0.0 for i in range(len(tmp_replay_buffer)): batch_description_list, batch_chosen_indices, batch_chosen_ctrlf_indices, _, batch_rewards = tmp_replay_buffer[ i] is_final = True if masks_np[i][b] != 0: is_final = False agent.replay_memory.push(is_prior, batch_description_list[b], quest_list[b], batch_chosen_indices[b], batch_chosen_ctrlf_indices[b], batch_rewards[b], is_final) if masks_np[i][b] == 0.0: break qa_acc = np.mean(qa_reward_np) correct_state_acc = np.mean(correct_state_reward_np) step_masks_np = np.sum(np.array(masks_np), 0) # batch for i in range(len(qa_reward_np)): # if the answer is totally wrong, we assume it used all steps if qa_reward_np[i] == 0.0: step_masks_np[i] = agent.max_nb_steps_per_episode used_steps = np.mean(step_masks_np) running_avg_qa_acc.push(qa_acc) running_avg_correct_state_acc.push(correct_state_acc) running_avg_steps.push(used_steps) print_rewards = np.sum(np.mean(command_rewards_np, -1)) obs_string = agent.naozi.get(0) print("-- OBS: %s" % (obs_string.encode('utf-8'))) print("-- PRED: %s" % (chosen_answer_strings[0].encode('utf-8'))) # finish game agent.finish_of_episode(episode_no, batch_size) episode_no += batch_size time_2 = datetime.datetime.now() print( "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | interaction qvalue: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | sufficient info: {:2.3f}/{:2.3f} | used steps: {:2.3f}" .format(episode_no, str(time_2 - time_1).rsplit(".")[0], running_avg_correct_state_loss.get_avg(), running_avg_correct_state_q_value.get_avg(), running_avg_qa_loss.get_avg(), print_rewards, qa_acc, running_avg_qa_acc.get_avg(), correct_state_acc, running_avg_correct_state_acc.get_avg(), running_avg_steps.get_avg())) 
if episode_no < agent.learn_start_from_this_episode: continue if agent.report_frequency == 0 or ( episode_no % agent.report_frequency > (episode_no - batch_size) % agent.report_frequency): continue eval_qa_acc, eval_correct_state_acc, eval_used_steps = 0.0, 0.0, 0.0 # evaluate if agent.run_eval: eval_qa_acc, eval_correct_state_acc, eval_used_steps = evaluate.evaluate( env, agent, "valid") env.split_reset("train") # if run eval, then save model by eval accucacy if agent.save_frequency > 0 and ( episode_no % agent.report_frequency <= (episode_no - batch_size) % agent.report_frequency ) and eval_qa_acc > best_qa_acc_so_far: best_qa_acc_so_far = eval_qa_acc agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") # save model elif agent.save_frequency > 0 and ( episode_no % agent.report_frequency <= (episode_no - batch_size) % agent.report_frequency): if running_avg_qa_acc.get_avg() > best_qa_acc_so_far: best_qa_acc_so_far = running_avg_qa_acc.get_avg() agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") # plot using visdom viz_avg_correct_state_acc.append( running_avg_correct_state_acc.get_avg()) viz_avg_qa_acc.append(running_avg_qa_acc.get_avg()) viz_avg_correct_state_q_value.append( running_avg_correct_state_q_value.get_avg()) viz_eval_correct_state_acc.append(eval_correct_state_acc) viz_eval_qa_acc.append(eval_qa_acc) viz_eval_steps.append(eval_used_steps) viz_avg_steps.append(running_avg_steps.get_avg()) viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist() if plt_win is None: plt_win = viz.line(X=viz_x, Y=viz_avg_correct_state_acc, opts=dict(title=agent.experiment_tag + "_train"), name="sufficient info") viz.line(X=viz_x, Y=viz_avg_qa_acc, opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="qa") else: viz.line(X=[len(viz_avg_correct_state_acc) - 1], Y=[viz_avg_correct_state_acc[-1]], opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="sufficient info") viz.line(X=[len(viz_avg_qa_acc) - 1], Y=[viz_avg_qa_acc[-1]], opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="qa") if plt_q_value_win is None: plt_q_value_win = viz.line(X=viz_x, Y=viz_avg_correct_state_q_value, opts=dict(title=agent.experiment_tag + "_train_q_value"), name="sufficient info") else: viz.line(X=[len(viz_avg_correct_state_q_value) - 1], Y=[viz_avg_correct_state_q_value[-1]], opts=dict(title=agent.experiment_tag + "_train_q_value"), win=plt_q_value_win, update='append', name="sufficient info") if plt_steps_win is None: plt_steps_win = viz.line(X=viz_x, Y=viz_avg_steps, opts=dict(title=agent.experiment_tag + "_train_step"), name="used steps") else: viz.line(X=[len(viz_avg_steps) - 1], Y=[viz_avg_steps[-1]], opts=dict(title=agent.experiment_tag + "_train_step"), win=plt_steps_win, update='append', name="used steps") if eval_plt_win is None: eval_plt_win = viz.line(X=viz_x, Y=viz_eval_correct_state_acc, opts=dict(title=agent.experiment_tag + "_eval"), name="sufficient info") viz.line(X=viz_x, Y=viz_eval_qa_acc, opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="qa") else: viz.line(X=[len(viz_eval_correct_state_acc) - 1], Y=[viz_eval_correct_state_acc[-1]], opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="sufficient info") viz.line(X=[len(viz_eval_qa_acc) - 1], Y=[viz_eval_qa_acc[-1]], opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="qa") if 
eval_plt_steps_win is None: eval_plt_steps_win = viz.line( X=viz_x, Y=viz_eval_steps, opts=dict(title=agent.experiment_tag + "_eval_step"), name="used steps") else: viz.line(X=[len(viz_avg_steps) - 1], Y=[viz_eval_steps[-1]], opts=dict(title=agent.experiment_tag + "_eval_step"), win=eval_plt_steps_win, update='append', name="used steps") # write accucacies down into file _s = json.dumps({ "time spent": str(time_2 - time_1).rsplit(".")[0], "sufficient info": str(running_avg_correct_state_acc.get_avg()), "qa": str(running_avg_qa_acc.get_avg()), "sufficient qvalue": str(running_avg_correct_state_q_value.get_avg()), "eval sufficient info": str(eval_correct_state_acc), "eval qa": str(eval_qa_acc), "eval steps": str(eval_used_steps), "used steps": str(running_avg_steps.get_avg()) }) with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile: outfile.write(_s + '\n') outfile.flush()
def train(): time_1 = datetime.datetime.now() config = generic.load_config() env = DGIData(config) env.split_reset("train") agent = Agent(config) agent.zero_noise() ave_train_loss = generic.HistoryScoreCache(capacity=500) # visdom if config["general"]["visdom"]: import visdom viz = visdom.Visdom() loss_win = None eval_acc_win = None viz_loss, viz_eval_loss, viz_eval_acc = [], [], [] episode_no = 0 batch_no = 0 output_dir = "." data_dir = "." json_file_name = agent.experiment_tag.replace(" ", "_") # load model from checkpoint if agent.load_pretrained: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False) elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"): agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt", load_partial_graph=False) best_eval_acc, best_training_loss_so_far = 0.0, 10000.0 try: while (True): if episode_no > agent.max_episode: break agent.train() triplets = env.get_batch() curr_batch_size = len(triplets) loss, _, _, _ = agent.get_deep_graph_infomax_logits(triplets) # Update Model agent.online_net.zero_grad() agent.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(agent.online_net.parameters(), agent.clip_grad_norm) agent.optimizer.step() loss = generic.to_np(loss) ave_train_loss.push(loss) # lr schedule if batch_no < agent.learning_rate_warmup_until: cr = agent.init_learning_rate / math.log2( agent.learning_rate_warmup_until) learning_rate = cr * math.log2(batch_no + 1) else: learning_rate = agent.init_learning_rate for param_group in agent.optimizer.param_groups: param_group['lr'] = learning_rate episode_no += curr_batch_size batch_no += 1 if agent.report_frequency == 0 or ( episode_no % agent.report_frequency > (episode_no - curr_batch_size) % agent.report_frequency): continue eval_acc, eval_loss = 0.0, 0.0 if episode_no % agent.report_frequency <= ( episode_no - curr_batch_size) % agent.report_frequency: if agent.run_eval: eval_loss, eval_acc = evaluate.evaluate_deep_graph_infomax( env, agent, "valid") if eval_acc > best_eval_acc: best_eval_acc = eval_acc agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") print( "Saving best model so far! 
with Eval acc : {:2.3f}" .format(best_eval_acc)) env.split_reset("train") else: if loss < best_training_loss_so_far: best_training_loss_so_far = loss agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") time_2 = datetime.datetime.now() print( "Episode: {:3d} | time spent: {:s} | sliding window loss: {:2.3f} | Eval Acc: {:2.3f} | Eval Loss: {:2.3f}" .format(episode_no, str(time_2 - time_1).rsplit(".")[0], ave_train_loss.get_avg(), eval_acc, eval_loss)) # plot using visdom if config["general"]["visdom"]: viz_loss.append(ave_train_loss.get_avg()) viz_eval_acc.append(eval_acc) viz_eval_loss.append(eval_loss) viz_x = np.arange(len(viz_loss)).tolist() viz_eval_x = np.arange(len(viz_eval_acc)).tolist() if loss_win is None: loss_win = viz.line(X=viz_x, Y=viz_loss, opts=dict(title=agent.experiment_tag + "_loss"), name="training loss") viz.line(X=viz_eval_x, Y=viz_eval_loss, opts=dict(title=agent.experiment_tag + "_eval_loss"), win=loss_win, update='append', name="eval loss") else: viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]], opts=dict(title=agent.experiment_tag + "_loss"), win=loss_win, update='append', name="training loss") viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]], opts=dict(title=agent.experiment_tag + "_eval_loss"), win=loss_win, update='append', name="eval loss") if eval_acc_win is None: eval_acc_win = viz.line( X=viz_eval_x, Y=viz_eval_acc, opts=dict(title=agent.experiment_tag + "_eval_acc"), name="eval accuracy") else: viz.line(X=[len(viz_eval_acc) - 1], Y=[viz_eval_acc[-1]], opts=dict(title=agent.experiment_tag + "_eval_acc"), win=eval_acc_win, update='append', name="eval accuracy") # write accuracies down into file _s = json.dumps({ "time spent": str(time_2 - time_1).rsplit(".")[0], "loss": str(ave_train_loss.get_avg()), "eval loss": str(eval_loss), "eval accuracy": str(eval_acc) }) with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile: outfile.write(_s + '\n') outfile.flush() # At any point you can hit Ctrl + C to break out of training early. except KeyboardInterrupt: print('--------------------------------------------') print('Exiting from training early...') if agent.run_eval: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): print('Evaluating on test set and saving log...') agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False) _, _ = evaluate.evaluate_deep_graph_infomax(env, agent, "test", verbose=True)
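# Worked example of the learning-rate warmup schedule used above (assuming
# init_learning_rate = 1e-3 and learning_rate_warmup_until = 1000): the scale is
# cr = 1e-3 / log2(1000) ≈ 1.0e-4, so at batch_no = 0 the lr is 0, at batch_no = 31
# it is cr * log2(32) ≈ 5.0e-4, and from batch_no = 999 onward it stays at 1e-3.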
def train(): time_1 = datetime.datetime.now() config = generic.load_config() env = ObservationGenerationData(config) env.split_reset("train") agent = Agent(config) agent.zero_noise() ave_train_loss = generic.HistoryScoreCache(capacity=500) # visdom if config["general"]["visdom"]: import visdom viz = visdom.Visdom() plt_win = None eval_plt_win = None viz_loss, viz_eval_loss, viz_eval_f1 = [], [], [] episode_no = 0 batch_no = 0 output_dir = "." data_dir = "." json_file_name = agent.experiment_tag.replace(" ", "_") best_eval_loss_so_far, best_training_loss_so_far = 10000.0, 10000.0 # load model from checkpoint if agent.load_pretrained: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False) elif os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"): agent.load_pretrained_model(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt", load_partial_graph=False) try: while(True): if episode_no > agent.max_episode: break agent.train() observation_strings, prev_action_strings = env.get_batch() curr_batch_size = len(observation_strings) lens = [len(elem) for elem in observation_strings] max_len = max(lens) padded_observation_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in observation_strings] padded_prev_action_strings = [elem + ["<pad>"]*(max_len - len(elem)) for elem in prev_action_strings] masks = torch.zeros((curr_batch_size, max_len), dtype=torch.float).cuda() if agent.use_cuda else torch.zeros((curr_batch_size, max_len), dtype=torch.float) for i in range(curr_batch_size): masks[i, :lens[i]] = 1 preds_last_batch = [] last_k_batches_loss = [] prev_h = None for i in range(max_len): batch_obs_string = [elem[i] for elem in padded_observation_strings] batch_prev_action_string = [elem[i] for elem in padded_prev_action_strings] loss, pred, prev_h = agent.observation_generation_teacher_force(batch_obs_string, batch_prev_action_string, masks[:, i], prev_h) last_k_batches_loss.append(loss) ave_train_loss.push(generic.to_np(loss)) preds_last_batch.append(pred[-1]) if ((i + 1) % agent.backprop_frequency == 0 or i == max_len - 1): # and i > 0: agent.optimizer.zero_grad() ave_k_loss = torch.mean(torch.stack(last_k_batches_loss)) ave_k_loss.backward() agent.optimizer.step() last_k_batches_loss = [] prev_h = prev_h.detach() k = 0 ep_string = [] while(masks[-1][k] > 0): step_string = [] regen_strings = preds_last_batch[k].argmax(-1) for l in range(len(regen_strings)): step_string.append(agent.word_vocab[regen_strings[l]]) ep_string.append((' '.join(step_string).split("<eos>")[0])) k += 1 if k == len(masks[-1]): break if len(ep_string) >= 3: print(' | '.join(ep_string[:3])) ##### # lr schedule # learning_rate = 1.0 * (generic.power(agent.model.block_hidden_dim, -0.5) * min(generic.power(batch_no, -0.5), batch_no * generic.power(agent.learning_rate_warmup_until, -1.5))) if batch_no < agent.learning_rate_warmup_until: cr = agent.init_learning_rate / math.log2(agent.learning_rate_warmup_until) learning_rate = cr * math.log2(batch_no + 1) else: learning_rate = agent.init_learning_rate for param_group in agent.optimizer.param_groups: param_group['lr'] = learning_rate episode_no += curr_batch_size batch_no += 1 time_2 = datetime.datetime.now() print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], ave_train_loss.get_avg())) if agent.report_frequency == 0 or (episode_no % 
agent.report_frequency > (episode_no - curr_batch_size) % agent.report_frequency): continue eval_loss, eval_f1 = 0.0, 0.0 if episode_no % agent.report_frequency <= (episode_no - curr_batch_size) % agent.report_frequency: if agent.run_eval: eval_loss = evaluate.evaluate_observation_generation_loss(env, agent, "valid") eval_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "valid") env.split_reset("train") # if run eval, then save model by eval accuracy if eval_loss < best_eval_loss_so_far: best_eval_loss_so_far = eval_loss agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") else: if loss < best_training_loss_so_far: best_training_loss_so_far = loss agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") time_2 = datetime.datetime.now() print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f} | valid loss: {:2.3f} | valid f1: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], loss, eval_loss, eval_f1)) # plot using visdom if config["general"]["visdom"]: viz_loss.append(ave_train_loss.get_avg()) viz_eval_loss.append(eval_loss) viz_eval_f1.append(eval_f1) viz_x = np.arange(len(viz_loss)).tolist() if plt_win is None: plt_win = viz.line(X=viz_x, Y=viz_loss, opts=dict(title=agent.experiment_tag + "_loss"), name="training loss") viz.line(X=viz_x, Y=viz_eval_loss, opts=dict(title=agent.experiment_tag + "_eval_loss"), win=plt_win, update='append', name="eval loss") else: viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]], opts=dict(title=agent.experiment_tag + "_loss"), win=plt_win, update='append', name="training loss") viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]], opts=dict(title=agent.experiment_tag + "_eval_loss"), win=plt_win, update='append', name="eval loss") if eval_plt_win is None: eval_plt_win = viz.line(X=viz_x, Y=viz_eval_f1, opts=dict(title=agent.experiment_tag + "_eval_f1"), name="eval f1") else: viz.line(X=[len(viz_eval_f1) - 1], Y=[viz_eval_f1[-1]], opts=dict(title=agent.experiment_tag + "_eval_f1"), win=eval_plt_win, update='append', name="eval f1") # write accuracies down into file _s = json.dumps({"time spent": str(time_2 - time_1).rsplit(".")[0], "loss": str(ave_train_loss.get_avg()), "eval loss": str(eval_loss), "eval f1": str(eval_f1)}) with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile: outfile.write(_s + '\n') outfile.flush() # At any point you can hit Ctrl + C to break out of training early. except KeyboardInterrupt: print('--------------------------------------------') print('Exiting from training early...') if agent.run_eval: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): print('Evaluating on test set and saving log...') agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False) test_loss = evaluate.evaluate_observation_generation_loss(env, agent, "test") test_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "test") print(test_loss, test_f1)
def train(data_path): time_1 = datetime.datetime.now() agent = Agent() # visdom viz = visdom.Visdom() plt_win = None eval_plt_win = None viz_avg_correct_state_acc, viz_avg_qa_acc = [], [] viz_eval_sufficient_info_reward, viz_eval_qa_reward = [], [] step_in_total = 0 running_avg_qa_reward = generic.HistoryScoreCache(capacity=500) running_avg_sufficient_info_reward = generic.HistoryScoreCache( capacity=500) running_avg_qa_loss = generic.HistoryScoreCache(capacity=500) running_avg_correct_state_loss = generic.HistoryScoreCache(capacity=500) output_dir, data_dir = ".", "." json_file_name = agent.experiment_tag.replace(" ", "_") best_sum_reward_so_far = 0.0 # load model from checkpoint if agent.load_pretrained: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt") agent.update_target_net() elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"): agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt") agent.update_target_net() else: print( "Failed to load pretrained model... couldn't find the checkpoint file..." ) # Create temporary folder for the generated games. games_dir = tempfile.TemporaryDirectory( prefix="tw_games" ) # This is not deleted upon error. It would be better to use a with statement. games_dir = pjoin(games_dir.name, "") # So path ends with '/'. # copy grammar files into tmp folder so that it works smoothly assert os.path.exists( "./textworld_data"), "Oh no! textworld_data folder is not there..." os.mkdir(games_dir) os.mkdir(pjoin(games_dir, "textworld_data")) copy_tree("textworld_data", games_dir + "textworld_data") if agent.run_eval: assert os.path.exists(pjoin( data_path, agent.testset_path)), "Oh no! test_set folder is not there..." 
os.mkdir(pjoin(games_dir, agent.testset_path)) copy_tree(pjoin(data_path, agent.testset_path), pjoin(games_dir, agent.testset_path)) if agent.train_data_size == -1: game_queue_size = agent.batch_size * 5 game_queue = [] episode_no = 0 if agent.train_data_size == -1: # endless mode game_generator_queue = game_generator.game_generator_queue( path=games_dir, random_map=agent.random_map, question_type=agent.question_type, max_q_size=agent.batch_size * 2, nb_worker=8) else: # generate the training set all_training_games = game_generator.game_generator( path=games_dir, random_map=agent.random_map, question_type=agent.question_type, train_data_size=agent.train_data_size) all_training_games.sort() all_env_ids = None while (True): if episode_no > agent.max_episode: break np.random.seed(episode_no) if agent.train_data_size == -1: # endless mode for _ in range(agent.batch_size): if not game_generator_queue.empty(): tmp_game = game_generator_queue.get() if os.path.exists(tmp_game): game_queue.append(tmp_game) if len(game_queue) == 0: time.sleep(0.1) continue can_delete_these = [] if len(game_queue) > game_queue_size: can_delete_these = game_queue[:-game_queue_size] game_queue = game_queue[-game_queue_size:] sampled_games = np.random.choice(game_queue, agent.batch_size).tolist() env_ids = [ register_game(gamefile, request_infos=request_infos) for gamefile in sampled_games ] else: if all_env_ids is None: all_env_ids = [ register_game(gamefile, request_infos=request_infos) for gamefile in all_training_games ] env_ids = np.random.choice(all_env_ids, agent.batch_size).tolist() if len(env_ids ) != agent.batch_size: # either less than or greater than env_ids = np.random.choice(env_ids, agent.batch_size).tolist() env_id = make_batch2(env_ids, parallel=True) env = gym.make(env_id) env.seed(episode_no) obs, infos = env.reset() batch_size = len(obs) # generate question-answer pairs here questions, answers, reward_helper_info = game_generator.generate_qa_pairs( infos, question_type=agent.question_type, seed=episode_no) print( "====================================================================================", episode_no) print(questions[0], answers[0]) agent.train() agent.init(obs, infos) commands, last_facts, init_facts = [], [], [] commands_per_step, game_facts_cache = [], [] for i in range(batch_size): commands.append("restart") last_facts.append(None) init_facts.append(None) game_facts_cache.append([]) commands_per_step.append(["restart"]) observation_strings, possible_words = agent.get_game_info_at_certain_step( obs, infos) observation_strings = [ a + " <|> " + item for a, item in zip(commands, observation_strings) ] input_quest, input_quest_char, _ = agent.get_agent_inputs(questions) transition_cache = [] print_cmds = [] counting_rewards_np = [] valid_command_rewards_np = [] act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode # push init state into counting reward dict state_strings = agent.get_state_strings(infos) _ = agent.get_binarized_count(state_strings, update=True) for step_no in range(agent.max_nb_steps_per_episode): # update answerer input for i in range(batch_size): if agent.not_finished_yet[i] == 1: agent.naozi.push_one(i, copy.copy(observation_strings[i])) if agent.prev_step_is_still_interacting[i] == 1: new_facts = process_facts(last_facts[i], infos["game"][i], infos["facts"][i], infos["last_action"][i], commands[i]) game_facts_cache[i].append( new_facts ) # info used in reward computing of existence question last_facts[i] = new_facts if step_no == 
0: init_facts[i] = copy.copy(new_facts) # generate commands if agent.noisy_net: agent.reset_noise() # Draw a new set of noisy weights observation_strings_w_history = agent.naozi.get() input_observation, input_observation_char, _ = agent.get_agent_inputs( observation_strings_w_history) commands, replay_info = agent.act(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words, random=act_randomly) for i in range(batch_size): commands_per_step[i].append(commands[i]) replay_info = [ observation_strings_w_history, questions, possible_words ] + replay_info admissible_commands = [ set(item) - set(["look", "wait", "inventory"]) for item in infos["admissible_commands"] ] vc_rewards = [ float(c in ac) for c, ac in zip(commands, admissible_commands) ] valid_command_rewards_np.append(np.array(vc_rewards)) # pass commands into env obs, _, _, infos = env.step(commands) # possible words no not depend on history, because one can only interact with what is currently accessible observation_strings, possible_words = agent.get_game_info_at_certain_step( obs, infos) observation_strings = [ a + " <|> " + item for a, item in zip(commands, observation_strings) ] # counting rewards state_strings = agent.get_state_strings(infos) c_rewards = agent.get_binarized_count(state_strings, update=True) counting_rewards_np.append(np.array(c_rewards)) if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0: agent.reset_noise() # Draw a new set of noisy weights if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0: interaction_loss = agent.update_interaction() if interaction_loss is not None: running_avg_correct_state_loss.push(interaction_loss) qa_loss = agent.update_qa() if qa_loss is not None: running_avg_qa_loss.push(qa_loss) print_cmds.append(commands[0] if agent. prev_step_is_still_interacting[0] else "--") # force stopping if step_no == agent.max_nb_steps_per_episode - 1: replay_info[-1] = torch.zeros_like(replay_info[-1]) transition_cache.append(replay_info) step_in_total += 1 if (step_no == agent.max_nb_steps_per_episode - 1) or (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0): break print(" / ".join(print_cmds)) # The agent has exhausted all steps, now answer question. 
answerer_input = agent.naozi.get() answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs( answerer_input) chosen_word_indices = agent.answer_question_act_greedy( answerer_input_observation, answerer_input_observation_char, answerer_observation_ids, input_quest, input_quest_char) # batch chosen_word_indices_np = generic.to_np(chosen_word_indices) chosen_answers = [ agent.word_vocab[item] for item in chosen_word_indices_np ] # rewards # qa reward qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers) # sufficient info rewards masks = [item[-1] for item in transition_cache] masks_np = [generic.to_np(item) for item in masks] # 1 1 0 0 0 --> 1 1 0 0 0 0 game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size, ))], 0) # game step+1 x batch size # 1 1 0 0 0 0 --> 0 1 0 0 0 game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[ 1:, :] # game step x batch size game_running_mask = np.stack(masks_np, 0) # game step x batch size if agent.question_type == "location": # sufficient info reward: location question reward_helper_info["observation_before_finish"] = answerer_input reward_helper_info["game_finishing_mask"] = game_finishing_mask sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location( reward_helper_info) elif agent.question_type == "existence": # sufficient info reward: existence question reward_helper_info["observation_before_finish"] = answerer_input reward_helper_info[ "game_facts_per_step"] = game_facts_cache # facts before issuing command (we want to stop at correct state) reward_helper_info["init_game_facts"] = init_facts reward_helper_info["full_facts"] = infos["facts"] reward_helper_info["answers"] = answers reward_helper_info["game_finishing_mask"] = game_finishing_mask sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence( reward_helper_info) elif agent.question_type == "attribute": # sufficient info reward: attribute question reward_helper_info["answers"] = answers reward_helper_info[ "game_facts_per_step"] = game_facts_cache # facts before and after issuing commands (we want to compare the differnce) reward_helper_info["init_game_facts"] = init_facts reward_helper_info["full_facts"] = infos["facts"] reward_helper_info[ "commands_per_step"] = commands_per_step # commands before and after issuing commands (we want to compare the differnce) reward_helper_info["game_finishing_mask"] = game_finishing_mask sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute( reward_helper_info) else: raise NotImplementedError # push qa experience into qa replay buffer for b in range(batch_size): # data points in batch # if the agent is not in the correct state, do not push it into replay buffer if np.sum(sufficient_info_reward_np[b]) == 0.0: continue agent.qa_replay_memory.push(False, qa_reward_np[b], answerer_input[b], questions[b], answers[b]) # assign sufficient info reward and counting reward to the corresponding steps counting_rewards_np = np.stack(counting_rewards_np, 1) # batch x game step valid_command_rewards_np = np.stack(valid_command_rewards_np, 1) # batch x game step command_rewards_np = sufficient_info_reward_np + counting_rewards_np * game_running_mask.T * agent.revisit_counting_lambda + valid_command_rewards_np * game_running_mask.T * agent.valid_command_bonus_lambda # batch x game step command_rewards = generic.to_pt(command_rewards_np, enable_cuda=agent.use_cuda, type="float") # batch x game step for i in 
range(command_rewards_np.shape[1]): transition_cache[i].append(command_rewards[:, i]) print(command_rewards_np[0]) # push command generation experience into replay buffer for b in range(batch_size): is_prior = np.sum(command_rewards_np[b], 0) > 0.0 for i in range(len(transition_cache)): batch_observation_strings, batch_question_strings, batch_possible_words, batch_chosen_indices, _, batch_rewards = transition_cache[ i] is_final = True if masks_np[i][b] != 0: is_final = False agent.command_generation_replay_memory.push( is_prior, batch_observation_strings[b], batch_question_strings[b], [item[b] for item in batch_possible_words], [item[b] for item in batch_chosen_indices], batch_rewards[b], is_final) if masks_np[i][b] == 0.0: break # for printing r_qa = np.mean(qa_reward_np) r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1)) running_avg_qa_reward.push(r_qa) running_avg_sufficient_info_reward.push(r_sufficient_info) print_rewards = np.mean(np.sum(command_rewards_np, -1)) obs_string = answerer_input[0] print(obs_string) # finish game agent.finish_of_episode(episode_no, batch_size) # close env env.close() if agent.train_data_size == -1: # when games are generated on the fly, # remove all files (including .json and .ni) that have been used files_to_delete = [] for gamefile in can_delete_these: if not gamefile.endswith(".ulx"): continue files_to_delete.append(gamefile) files_to_delete.append(gamefile.replace(".ulx", ".json")) files_to_delete.append(gamefile.replace(".ulx", ".ni")) # print("rm -f {}".format(" ".join(files_to_delete))) os.system("rm -f {}".format(" ".join(files_to_delete))) episode_no += batch_size time_2 = datetime.datetime.now() print( "Episode: {:3d} | time spent: {:s} | interaction loss: {:2.3f} | qa loss: {:2.3f} | rewards: {:2.3f} | qa acc: {:2.3f}/{:2.3f} | correct state: {:2.3f}/{:2.3f}" .format(episode_no, str(time_2 - time_1).rsplit(".")[0], running_avg_correct_state_loss.get_avg(), running_avg_qa_loss.get_avg(), print_rewards, r_qa, running_avg_qa_reward.get_avg(), r_sufficient_info, running_avg_sufficient_info_reward.get_avg())) if episode_no < agent.learn_start_from_this_episode: continue if episode_no == 0 or ( episode_no % agent.save_frequency > (episode_no - batch_size) % agent.save_frequency): continue eval_qa_reward, eval_sufficient_info_reward = 0.0, 0.0 # evaluate if agent.run_eval: eval_qa_reward, eval_sufficient_info_reward = evaluate.evaluate( data_dir, agent) # if run eval, then save model by eval accucacy if eval_qa_reward + eval_sufficient_info_reward > best_sum_reward_so_far: best_sum_reward_so_far = eval_qa_reward + eval_sufficient_info_reward agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") # save model elif agent.save_checkpoint: if running_avg_qa_reward.get_avg( ) + running_avg_sufficient_info_reward.get_avg( ) > best_sum_reward_so_far: best_sum_reward_so_far = running_avg_qa_reward.get_avg( ) + running_avg_sufficient_info_reward.get_avg() agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt") # plot using visdom viz_avg_correct_state_acc.append( running_avg_sufficient_info_reward.get_avg()) viz_avg_qa_acc.append(running_avg_qa_reward.get_avg()) viz_eval_sufficient_info_reward.append(eval_sufficient_info_reward) viz_eval_qa_reward.append(eval_qa_reward) viz_x = np.arange(len(viz_avg_correct_state_acc)).tolist() if plt_win is None: plt_win = viz.line(X=viz_x, Y=viz_avg_correct_state_acc, opts=dict(title=agent.experiment_tag + "_train"), name="correct state") viz.line(X=viz_x, 
Y=viz_avg_qa_acc, opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="qa") else: viz.line(X=[len(viz_avg_correct_state_acc) - 1], Y=[viz_avg_correct_state_acc[-1]], opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="correct state") viz.line(X=[len(viz_avg_qa_acc) - 1], Y=[viz_avg_qa_acc[-1]], opts=dict(title=agent.experiment_tag + "_train"), win=plt_win, update='append', name="qa") if eval_plt_win is None: eval_plt_win = viz.line(X=viz_x, Y=viz_eval_sufficient_info_reward, opts=dict(title=agent.experiment_tag + "_eval"), name="correct state") viz.line(X=viz_x, Y=viz_eval_qa_reward, opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="qa") else: viz.line(X=[len(viz_eval_sufficient_info_reward) - 1], Y=[viz_eval_sufficient_info_reward[-1]], opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="correct state") viz.line(X=[len(viz_eval_qa_reward) - 1], Y=[viz_eval_qa_reward[-1]], opts=dict(title=agent.experiment_tag + "_eval"), win=eval_plt_win, update='append', name="qa") # write accucacies down into file _s = json.dumps({ "time spent": str(time_2 - time_1).rsplit(".")[0], "sufficient info": running_avg_sufficient_info_reward.get_avg(), "qa": running_avg_qa_reward.get_avg(), "eval sufficient info": eval_sufficient_info_reward, "eval qa": eval_qa_reward }) with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile: outfile.write(_s + '\n') outfile.flush()
def avg_rewards(self):
    if len(self._storage) == 0:
        return 0.0
    rewards = [self._storage[i].reward for i in range(len(self._storage))]
    return to_np(torch.mean(torch.stack(rewards)))
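# Usage note: avg_rewards() serves as a baseline when pushing new data points.
# For example, the QA training loop earlier in this section marks a data point
# as "prior" only when its reward beats a threshold times this running average:
#   is_prior = qa_reward_np[b] > agent.qa_reward_prior_threshold * agent.qa_replay_memory.avg_rewards()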
def train(): time_1 = datetime.datetime.now() config = generic.load_config() agent = Agent(config) output_dir = "." data_dir = "." # make game environments requested_infos = agent.select_additional_infos_lite() requested_infos_eval = agent.select_additional_infos() games_dir = "./" # training game env env, _ = reinforcement_learning_dataset.get_training_game_env(games_dir + config['rl']['data_path'], config['rl']['difficulty_level'], config['rl']['training_size'], requested_infos, agent.max_nb_steps_per_episode, agent.batch_size) if agent.run_eval: # training game env eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(games_dir + config['rl']['data_path'], config['rl']['difficulty_level'], requested_infos_eval, agent.eval_max_nb_steps_per_episode, agent.eval_batch_size, valid_or_test="valid") else: eval_env, num_eval_game = None, None # visdom if config["general"]["visdom"]: import visdom viz = visdom.Visdom() reward_win, step_win = None, None dqn_loss_win = None eval_game_points_win, eval_step_win = None, None viz_game_rewards, viz_game_points, viz_game_points_normalized, viz_graph_rewards, viz_count_rewards, viz_step = [], [], [], [], [], [] viz_dqn_loss = [] viz_eval_game_points, viz_eval_game_points_normalized, viz_eval_step = [], [], [] step_in_total = 0 episode_no = 0 running_avg_game_points = HistoryScoreCache(capacity=500) running_avg_game_points_normalized = HistoryScoreCache(capacity=500) running_avg_graph_rewards = HistoryScoreCache(capacity=500) running_avg_count_rewards = HistoryScoreCache(capacity=500) running_avg_game_steps = HistoryScoreCache(capacity=500) running_avg_dqn_loss = HistoryScoreCache(capacity=500) running_avg_game_rewards = HistoryScoreCache(capacity=500) json_file_name = agent.experiment_tag.replace(" ", "_") best_train_performance_so_far, best_eval_performance_so_far = 0.0, 0.0 prev_performance = 0.0 if os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"): agent.load_pretrained_graph_generation_model(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt") else: print("No real-valued graph generation module detected... 
Please check ", data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt") # load model from checkpoint if agent.load_pretrained: if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"): # this experiment itself (in case the experiment crashes for unknown reasons on server) agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False) agent.update_target_net() elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"): # load from pre-trained graph encoder agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt") agent.update_target_net() i_am_patient = 0 perfect_training = 0 while(True): if episode_no > agent.max_episode: break np.random.seed(episode_no) env.seed(episode_no) obs, infos = env.reset() # filter look and examine actions for commands_ in infos["admissible_commands"]: for cmd_ in [cmd for cmd in commands_ if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]: commands_.remove(cmd_) batch_size = len(obs) agent.train() agent.init() game_name_list = [game.metadata["uuid"].split("-")[-1] for game in infos["game"]] game_max_score_list = [game.max_score for game in infos["game"]] chosen_actions = [] prev_step_dones, prev_rewards = [], [] prev_graph_hidden_state = torch.zeros(batch_size, agent.online_net.block_hidden_dim) if agent.use_cuda: prev_graph_hidden_state = prev_graph_hidden_state.cuda() for _ in range(batch_size): chosen_actions.append("restart") prev_step_dones.append(0.0) prev_rewards.append(0.0) prev_h, prev_c = None, None episodes_masks = 1 - torch.tensor(prev_step_dones) # inverse of `prev_step_dones` episodes_masks = episodes_masks.cuda() if agent.use_cuda else episodes_masks observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos) observation_for_counting = copy.copy(observation_strings) if agent.count_reward_lambda > 0: agent.reset_binarized_counter(batch_size) _ = agent.get_binarized_count(observation_for_counting) # it requires to store sequences of transitions into memory with order, # so we use a cache to keep what agents returns, and push them into memory # altogether in the end of game. 
transition_cache = [] still_running_mask = [] game_rewards, game_points, graph_rewards, count_rewards = [], [], [], [] print_actions = [] act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode for step_no in range(agent.max_nb_steps_per_episode): if agent.noisy_net: agent.reset_noise() # Draw a new set of noisy weights # generate adj_matrices new_adjacency_matrix, new_graph_hidden_state = agent.generate_adjacency_matrix_for_rl(observation_strings, chosen_actions, prev_graph_hidden_state) new_chosen_actions, chosen_indices, prev_h, prev_c = agent.act(observation_strings, new_adjacency_matrix, action_candidate_list, previous_h=prev_h, previous_c=prev_c, random=act_randomly) replay_info = [observation_strings, action_candidate_list, chosen_indices, generic.to_np(prev_graph_hidden_state), chosen_actions] transition_cache.append(replay_info) chosen_actions = new_chosen_actions chosen_actions_before_parsing = [item[idx] for item, idx in zip(infos["admissible_commands"], chosen_indices)] obs, scores, dones, infos = env.step(chosen_actions_before_parsing) # filter look and examine actions for commands_ in infos["admissible_commands"]: for cmd_ in [cmd for cmd in commands_ if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]: commands_.remove(cmd_) ## prev_triplets = current_triplets # commented for obs_gen prev_graph_hidden_state = new_graph_hidden_state observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos) observation_for_counting = copy.copy(observation_strings) if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0: agent.reset_noise() # Draw a new set of noisy weights if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0: dqn_loss, _ = agent.update_dqn(episode_no) if dqn_loss is not None: running_avg_dqn_loss.push(dqn_loss) if step_no == agent.max_nb_steps_per_episode - 1: # terminate the game because DQN requires one extra step dones = [True for _ in dones] step_in_total += 1 still_running = [1.0 - float(item) for item in prev_step_dones] # list of float prev_step_dones = dones step_rewards = [float(curr) - float(prev) for curr, prev in zip(scores, prev_rewards)] # list of float game_points.append(copy.copy(step_rewards)) if agent.use_negative_reward: step_rewards = [-1.0 if _lost else r for r, _lost in zip(step_rewards, infos["has_lost"])] # list of float step_rewards = [5.0 if _won else r for r, _won in zip(step_rewards, infos["has_won"])] # list of float prev_rewards = scores step_graph_rewards = [0.0 for _ in range(batch_size)] ## adding for obs_gen # counting bonus if agent.count_reward_lambda > 0: step_revisit_counting_rewards = agent.get_binarized_count(observation_for_counting, update=True) step_revisit_counting_rewards = [r * agent.count_reward_lambda for r in step_revisit_counting_rewards] else: step_revisit_counting_rewards = [0.0 for _ in range(batch_size)] still_running_mask.append(still_running) game_rewards.append(step_rewards) graph_rewards.append(step_graph_rewards) count_rewards.append(step_revisit_counting_rewards) print_actions.append(chosen_actions_before_parsing[0] if still_running[0] else "--") # if all ended, break if np.sum(still_running) == 0: break still_running_mask_np = np.array(still_running_mask) game_rewards_np = np.array(game_rewards) * still_running_mask_np # step x batch game_points_np = np.array(game_points) * still_running_mask_np # step x batch graph_rewards_np = 
        graph_rewards_np = np.array(graph_rewards) * still_running_mask_np  # step x batch
        count_rewards_np = np.array(count_rewards) * still_running_mask_np  # step x batch
        if agent.graph_reward_lambda > 0.0:
            graph_rewards_pt = generic.to_pt(graph_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            graph_rewards_pt = generic.to_pt(np.zeros_like(graph_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        if agent.count_reward_lambda > 0.0:
            count_rewards_pt = generic.to_pt(count_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            count_rewards_pt = generic.to_pt(np.zeros_like(count_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        command_rewards_pt = generic.to_pt(game_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch

        # push experience into replay buffer (dqn)
        avg_rewards_in_buffer = agent.dqn_memory.avg_rewards()
        for b in range(game_rewards_np.shape[1]):
            if still_running_mask_np.shape[0] == agent.max_nb_steps_per_episode and still_running_mask_np[-1][b] != 0:
                # need to pad one extra (terminal) transition
                _need_pad = True
                tmp_game_rewards = game_rewards_np[:, b].tolist() + [0.0]
            else:
                _need_pad = False
                tmp_game_rewards = game_rewards_np[:, b]
            if np.mean(tmp_game_rewards) < avg_rewards_in_buffer * agent.buffer_reward_threshold:
                continue
            for i in range(game_rewards_np.shape[0]):
                observation_strings, action_candidate_list, chosen_indices, graph_hidden_state, prev_action_strings = transition_cache[i]
                is_final = True
                if still_running_mask_np[i][b] != 0:
                    is_final = False
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b], action_candidate_list[b], chosen_indices[b], graph_hidden_state[b], command_rewards_pt[i][b], graph_rewards_pt[i][b], count_rewards_pt[i][b], is_final)
                if still_running_mask_np[i][b] == 0:
                    break
            if _need_pad:
                observation_strings, action_candidate_list, chosen_indices, graph_hidden_state, prev_action_strings = transition_cache[-1]
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b], action_candidate_list[b], chosen_indices[b], graph_hidden_state[b], command_rewards_pt[-1][b] * 0.0, graph_rewards_pt[-1][b] * 0.0, count_rewards_pt[-1][b] * 0.0, True)

        for b in range(batch_size):
            running_avg_game_points.push(np.sum(game_points_np, 0)[b])
            game_max_score_np = np.array(game_max_score_list, dtype="float32")
            running_avg_game_points_normalized.push((np.sum(game_points_np, 0) / game_max_score_np)[b])
            running_avg_game_steps.push(np.sum(still_running_mask_np, 0)[b])
            running_avg_game_rewards.push(np.sum(game_rewards_np, 0)[b])
            running_avg_graph_rewards.push(np.sum(graph_rewards_np, 0)[b])
            running_avg_count_rewards.push(np.sum(count_rewards_np, 0)[b])

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (episode_no % agent.report_frequency > (episode_no - batch_size) % agent.report_frequency):
            continue

        time_2 = datetime.datetime.now()
        print("Episode: {:3d} | time spent: {:s} | dqn loss: {:2.3f} | game points: {:2.3f} | normalized game points: {:2.3f} | game rewards: {:2.3f} | graph rewards: {:2.3f} | count rewards: {:2.3f} | used steps: {:2.3f}".format(episode_no, str(time_2 - time_1).rsplit(".")[0], running_avg_dqn_loss.get_avg(), running_avg_game_points.get_avg(), running_avg_game_points_normalized.get_avg(), running_avg_game_rewards.get_avg(), running_avg_graph_rewards.get_avg(), running_avg_count_rewards.get_avg(), running_avg_game_steps.get_avg()))
        print(game_name_list[0] + ": " + " | ".join(print_actions))

        # evaluate
        curr_train_performance = running_avg_game_points_normalized.get_avg()
        eval_game_points, eval_game_points_normalized, eval_game_step = 0.0, 0.0, 0.0
        if agent.run_eval:
            eval_game_points, eval_game_points_normalized, eval_game_step, detailed_scores = evaluate.evaluate_rl_with_real_graphs(eval_env, agent, num_eval_game)
            curr_eval_performance = eval_game_points_normalized
            curr_performance = curr_eval_performance
            if curr_eval_performance > best_eval_performance_so_far:
                best_eval_performance_so_far = curr_eval_performance
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
            elif curr_eval_performance == best_eval_performance_so_far:
                if curr_eval_performance > 0.0:
                    agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if curr_train_performance >= best_train_performance_so_far:
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        else:
            curr_eval_performance = 0.0
            detailed_scores = ""
            curr_performance = curr_train_performance
            if curr_train_performance >= best_train_performance_so_far:
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        # update best train performance
        if curr_train_performance >= best_train_performance_so_far:
            best_train_performance_so_far = curr_train_performance

        if prev_performance <= curr_performance:
            i_am_patient = 0
        else:
            i_am_patient += 1
        prev_performance = curr_performance

        # if we have waited `patience` reports without improvement, reload the saved checkpoint
        if agent.patience > 0 and i_am_patient >= agent.patience:
            if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
                print('reload from a good checkpoint...')
                agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt", load_partial_graph=False)
                agent.update_target_net()
            i_am_patient = 0

        if running_avg_game_points_normalized.get_avg() >= 0.95:
            perfect_training += 1
        else:
            perfect_training = 0

        # plot using visdom
        if config["general"]["visdom"]:
            viz_game_rewards.append(running_avg_game_rewards.get_avg())
            viz_game_points.append(running_avg_game_points.get_avg())
            viz_game_points_normalized.append(running_avg_game_points_normalized.get_avg())
            viz_graph_rewards.append(running_avg_graph_rewards.get_avg())
            viz_count_rewards.append(running_avg_count_rewards.get_avg())
            viz_step.append(running_avg_game_steps.get_avg())
            viz_dqn_loss.append(running_avg_dqn_loss.get_avg())
            viz_eval_game_points.append(eval_game_points)
            viz_eval_game_points_normalized.append(eval_game_points_normalized)
            viz_eval_step.append(eval_game_step)
            viz_x = np.arange(len(viz_game_rewards)).tolist()

            if reward_win is None:
                reward_win = viz.line(X=viz_x, Y=viz_game_rewards, opts=dict(title=agent.experiment_tag + "_game_rewards"), name="game_rewards")
                viz.line(X=viz_x, Y=viz_graph_rewards, opts=dict(title=agent.experiment_tag + "_graph_rewards"), win=reward_win, update='append', name="graph_rewards")
                viz.line(X=viz_x, Y=viz_count_rewards, opts=dict(title=agent.experiment_tag + "_count_rewards"), win=reward_win, update='append', name="count_rewards")
                viz.line(X=viz_x, Y=viz_game_points, opts=dict(title=agent.experiment_tag + "_game_points"), win=reward_win, update='append', name="game_points")
                viz.line(X=viz_x, Y=viz_game_points_normalized, opts=dict(title=agent.experiment_tag + "_game_points_normalized"), win=reward_win, update='append', name="game_points_normalized")
            else:
                viz.line(X=[len(viz_game_rewards) - 1], Y=[viz_game_rewards[-1]], opts=dict(title=agent.experiment_tag + "_game_rewards"), win=reward_win, update='append', name="game_rewards")
                viz.line(X=[len(viz_graph_rewards) - 1], Y=[viz_graph_rewards[-1]], opts=dict(title=agent.experiment_tag + "_graph_rewards"), win=reward_win, update='append', name="graph_rewards")
                viz.line(X=[len(viz_count_rewards) - 1], Y=[viz_count_rewards[-1]], opts=dict(title=agent.experiment_tag + "_count_rewards"), win=reward_win, update='append', name="count_rewards")
                viz.line(X=[len(viz_game_points) - 1], Y=[viz_game_points[-1]], opts=dict(title=agent.experiment_tag + "_game_points"), win=reward_win, update='append', name="game_points")
                viz.line(X=[len(viz_game_points_normalized) - 1], Y=[viz_game_points_normalized[-1]], opts=dict(title=agent.experiment_tag + "_game_points_normalized"), win=reward_win, update='append', name="game_points_normalized")

            if step_win is None:
                step_win = viz.line(X=viz_x, Y=viz_step, opts=dict(title=agent.experiment_tag + "_step"), name="step")
            else:
                viz.line(X=[len(viz_step) - 1], Y=[viz_step[-1]], opts=dict(title=agent.experiment_tag + "_step"), win=step_win, update='append', name="step")

            if dqn_loss_win is None:
                dqn_loss_win = viz.line(X=viz_x, Y=viz_dqn_loss, opts=dict(title=agent.experiment_tag + "_dqn_loss"), name="dqn loss")
            else:
                viz.line(X=[len(viz_dqn_loss) - 1], Y=[viz_dqn_loss[-1]], opts=dict(title=agent.experiment_tag + "_dqn_loss"), win=dqn_loss_win, update='append', name="dqn loss")

            if eval_game_points_win is None:
                eval_game_points_win = viz.line(X=viz_x, Y=viz_eval_game_points, opts=dict(title=agent.experiment_tag + "_eval_game_points"), name="eval game points")
                viz.line(X=viz_x, Y=viz_eval_game_points_normalized, opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"), win=eval_game_points_win, update='append', name="eval_game_points_normalized")
            else:
                # keep the same series name as above so visdom appends to the existing line
                viz.line(X=[len(viz_eval_game_points) - 1], Y=[viz_eval_game_points[-1]], opts=dict(title=agent.experiment_tag + "_eval_game_points"), win=eval_game_points_win, update='append', name="eval game points")
                viz.line(X=[len(viz_eval_game_points_normalized) - 1], Y=[viz_eval_game_points_normalized[-1]], opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"), win=eval_game_points_win, update='append', name="eval_game_points_normalized")

            if eval_step_win is None:
                eval_step_win = viz.line(X=viz_x, Y=viz_eval_step, opts=dict(title=agent.experiment_tag + "_eval_step"), name="eval step")
            else:
                viz.line(X=[len(viz_eval_step) - 1], Y=[viz_eval_step[-1]], opts=dict(title=agent.experiment_tag + "_eval_step"), win=eval_step_win, update='append', name="eval step")

        # write accuracies down into file
        _s = json.dumps({"time spent": str(time_2 - time_1).rsplit(".")[0],
                         "dqn loss": str(running_avg_dqn_loss.get_avg()),
                         "train game points": str(running_avg_game_points.get_avg()),
                         "train normalized game points": str(running_avg_game_points_normalized.get_avg()),
                         "train game rewards": str(running_avg_game_rewards.get_avg()),
                         "train graph rewards": str(running_avg_graph_rewards.get_avg()),
                         "train count rewards": str(running_avg_count_rewards.get_avg()),
                         "train steps": str(running_avg_game_steps.get_avg()),
                         "eval game points": str(eval_game_points),
                         "eval normalized game points": str(eval_game_points_normalized),
                         "eval steps": str(eval_game_step),
                         "detailed scores": detailed_scores})
        with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()

        if curr_performance == 1.0 and curr_train_performance >= 0.95:
            break
        if perfect_training >= 3:
            break
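
# --- Illustrative sketch (not part of the training loop above) ---
# A minimal numpy example of the "step x batch" reward masking used above:
# rewards logged after a game in the batch has finished are zeroed out, and the
# per-game episode length is just the column sum of the mask. The function name
# and the toy numbers are made up for illustration.
def _masking_sketch():
    import numpy as np
    # 4 steps, 2 games: game 0 runs all 4 steps, game 1 finishes after 2 steps
    still_running_mask = np.array([[1.0, 1.0],
                                   [1.0, 1.0],
                                   [1.0, 0.0],
                                   [1.0, 0.0]])
    game_rewards = np.array([[0.0, 1.0],
                             [1.0, 0.0],
                             [0.0, 3.0],   # game 1 already ended; this entry gets masked out
                             [2.0, 0.0]])
    masked = game_rewards * still_running_mask        # step x batch
    per_game_return = masked.sum(axis=0)              # -> [3.0, 1.0]
    per_game_steps = still_running_mask.sum(axis=0)   # -> [4.0, 2.0]
    return masked, per_game_return, per_game_steps
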
def evaluate(data_path, agent):
    eval_data_path = pjoin(data_path, agent.eval_data_path)
    with open(eval_data_path) as f:
        data = json.load(f)
    data = data[agent.question_type]
    data = data["random_map"] if agent.random_map else data["fixed_map"]

    correct_answers = []
    predicted_answers = []
    print_qa_reward, print_sufficient_info_reward = [], []

    for game_path in tqdm(data):
        game_file_path = pjoin(data_path, game_path)
        assert os.path.exists(game_file_path), "Oh no! game path %s does not exist!" % game_file_path
        env_id = register_games([game_file_path], request_infos=request_infos)
        env_id = make_batch(env_id, batch_size=agent.eval_batch_size, parallel=True)
        env = gym.make(env_id)

        data_questions = [item["question"] for item in data[game_path]]
        data_answers = [item["answer"] for item in data[game_path]]
        data_entities = [item["entity"] for item in data[game_path]]
        if agent.question_type == "attribute":
            data_attributes = [item["attribute"] for item in data[game_path]]

        for q_no in range(len(data_questions)):
            questions = data_questions[q_no:q_no + 1]
            answers = data_answers[q_no:q_no + 1]
            reward_helper_info = {"_entities": data_entities[q_no:q_no + 1],
                                  "_answers": data_answers[q_no:q_no + 1]}
            if agent.question_type == "attribute":
                reward_helper_info["_attributes"] = data_attributes[q_no:q_no + 1]

            obs, infos = env.reset()
            batch_size = len(obs)
            agent.eval()
            agent.init(obs, infos)

            # get inputs
            commands, last_facts, init_facts = [], [], []
            commands_per_step, game_facts_cache = [], []
            for i in range(batch_size):
                commands.append("restart")
                last_facts.append(None)
                init_facts.append(None)
                game_facts_cache.append([])
                commands_per_step.append(["restart"])

            observation_strings, possible_words = agent.get_game_info_at_certain_step(obs, infos)
            observation_strings = [a + " <|> " + item for a, item in zip(commands, observation_strings)]
            input_quest, input_quest_char, _ = agent.get_agent_inputs(questions)

            transition_cache = []

            for step_no in range(agent.eval_max_nb_steps_per_episode):
                # update answerer input
                for i in range(batch_size):
                    if agent.not_finished_yet[i] == 1:
                        agent.naozi.push_one(i, copy.copy(observation_strings[i]))
                    if agent.prev_step_is_still_interacting[i] == 1:
                        new_facts = process_facts(last_facts[i], infos["game"][i], infos["facts"][i], infos["last_action"][i], commands[i])
                        game_facts_cache[i].append(new_facts)  # info used in reward computation for existence questions
                        last_facts[i] = new_facts
                        if step_no == 0:
                            init_facts[i] = copy.copy(new_facts)

                observation_strings_w_history = agent.naozi.get()
                input_observation, input_observation_char, _ = agent.get_agent_inputs(observation_strings_w_history)
                commands, replay_info = agent.act(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words, random=False)
                for i in range(batch_size):
                    commands_per_step[i].append(commands[i])
                replay_info = [observation_strings_w_history, questions, possible_words] + replay_info
                transition_cache.append(replay_info)

                obs, _, _, infos = env.step(commands)
                # possible words do not depend on history, because one can only interact with what is currently accessible
                observation_strings, possible_words = agent.get_game_info_at_certain_step(obs, infos)
                observation_strings = [a + " <|> " + item for a, item in zip(commands, observation_strings)]

                if (step_no == agent.eval_max_nb_steps_per_episode - 1) or (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0):
                    break

            # The agent has exhausted all steps, now answer the question.
            answerer_input = agent.naozi.get()
            answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs(answerer_input)

            chosen_word_indices = agent.answer_question_act_greedy(answerer_input_observation, answerer_input_observation_char, answerer_observation_ids, input_quest, input_quest_char)  # batch
            chosen_word_indices_np = generic.to_np(chosen_word_indices)
            chosen_answers = [agent.word_vocab[item] for item in chosen_word_indices_np]
            correct_answers.extend(answers)
            predicted_answers.extend(chosen_answers)

            # rewards
            # qa reward
            qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers)
            # sufficient info rewards
            masks = [item[-1] for item in transition_cache]
            masks_np = [generic.to_np(item) for item in masks]
            # 1 1 0 0 0 --> 1 1 0 0 0 0
            game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size,))], 0)  # (game step + 1) x batch size
            # 1 1 0 0 0 0 --> 0 1 0 0 0
            game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[1:, :]  # game step x batch size

            if agent.question_type == "location":
                # sufficient info reward: location question
                reward_helper_info["observation_before_finish"] = answerer_input
                reward_helper_info["game_finishing_mask"] = game_finishing_mask
                sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location(reward_helper_info)
            elif agent.question_type == "existence":
                # sufficient info reward: existence question
                reward_helper_info["observation_before_finish"] = answerer_input
                reward_helper_info["game_facts_per_step"] = game_facts_cache  # facts before issuing the command (we want to stop at the correct state)
                reward_helper_info["init_game_facts"] = init_facts
                reward_helper_info["full_facts"] = infos["facts"]
                reward_helper_info["answers"] = answers
                reward_helper_info["game_finishing_mask"] = game_finishing_mask
                sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence(reward_helper_info)
            elif agent.question_type == "attribute":
                # sufficient info reward: attribute question
                reward_helper_info["answers"] = answers
                reward_helper_info["game_facts_per_step"] = game_facts_cache  # facts before and after issuing commands (we want to compare the difference)
                reward_helper_info["init_game_facts"] = init_facts
                reward_helper_info["full_facts"] = infos["facts"]
                reward_helper_info["commands_per_step"] = commands_per_step  # commands before and after issuing commands (we want to compare the difference)
                reward_helper_info["game_finishing_mask"] = game_finishing_mask
                sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute(reward_helper_info)
            else:
                raise NotImplementedError

            r_qa = np.mean(qa_reward_np)
            r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1))
            print_qa_reward.append(r_qa)
            print_sufficient_info_reward.append(r_sufficient_info)
        env.close()

    precision, recall, fscore, _ = precision_recall_fscore_support(correct_answers, predicted_answers, average='micro')
    print("\n\n---------- From evaluation --------\n")
    print("precision: %f, recall: %f, f1 score: %f" % (precision, recall, fscore))
    print("\n\n---------------------------------")
    print("===== Eval =====: qa acc: {:2.3f} | correct state: {:2.3f}".format(np.mean(print_qa_reward), np.mean(print_sufficient_info_reward)))
    return np.mean(print_qa_reward), np.mean(print_sufficient_info_reward)
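
# --- Illustrative sketch (not part of evaluate() above) ---
# The `game_finishing_mask` trick above turns the per-step "still interacting"
# flags (the last element of each cached replay_info) into a one-hot indicator of
# the step at which each game stopped: append a row of zeros, then difference
# adjacent rows. The function name and toy values are made up for illustration.
def _finishing_mask_sketch():
    import numpy as np
    batch_size = 2
    # step x batch: game 0 interacts for 2 steps, game 1 for 3 steps
    masks_np = [np.array([1.0, 1.0]), np.array([1.0, 1.0]), np.array([0.0, 1.0])]
    padded = np.stack(masks_np + [np.zeros((batch_size,))], 0)   # (step + 1) x batch
    finishing = padded[:-1, :] - padded[1:, :]                   # step x batch
    # finishing == [[0., 0.], [1., 0.], [0., 1.]]:
    # a single 1.0 per column marks each game's last interacting step
    return finishing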