def test():
    # Evaluate the current policy over 25 simulated dialogues and report the
    # average reward together with top-10 / top-5 diagnosis accuracy.
    test_env = UserSimulator(disease_symptom_path, disease_symptom_mapping_path)
    total_rewards = 0
    top_10_predictions = 0
    top_5_predictions = 0
    for i_episode in range(25):
        state = test_env.reset()
        for t in count():
            action = select_action(np.expand_dims(state, axis=0))
            next_state, reward, done, _ = test_env.step(action.item())
            state = next_state
            if done:
                total_rewards += reward
                with torch.no_grad():
                    q_values = policy_net(np.expand_dims(state, axis=0)).squeeze(0)
                # Q-values for the diagnosis actions follow the symptom actions.
                diagnosis_q_values = q_values[test_env.num_symptom:]
                top_10_disease, top_5_disease = test_env.get_top_diseases(diagnosis_q_values)
                if test_env.goal in top_10_disease:
                    top_10_predictions += 1
                if test_env.goal in top_5_disease:
                    top_5_predictions += 1
                break
    return total_rewards / 25., top_10_predictions / 25., top_5_predictions / 25.
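# A minimal sketch (an assumption, not the original UserSimulator code) of how
# get_top_diseases could rank diseases from the diagnosis Q-values with
# torch.topk; it presumes test_env.goal is comparable to a disease index.
def _get_top_diseases_sketch(diagnosis_q_values, k_large=10, k_small=5):
    # Indices of the k highest-scoring diseases, largest first.
    top_10 = torch.topk(diagnosis_q_values, k_large).indices.tolist()
    top_5 = torch.topk(diagnosis_q_values, k_small).indices.tolist()
    return top_10, top_5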
class DialogEnv(gym.Env):
    def __init__(
        self,
        user_goals: List[UserGoal],
        emc_params: Dict,
        max_round_num: int,
        database: Dict,
        slot2values: Dict[str, List[Any]],
    ) -> None:
        self.user = UserSimulator(user_goals, max_round_num)
        self.emc = ErrorModelController(slot2values, emc_params)
        self.state_tracker = StateTracker(database, max_round_num)
        self.action_space = gym.spaces.Discrete(len(AGENT_ACTIONS))
        self.observation_space = gym.spaces.multi_binary.MultiBinary(
            self.state_tracker.get_state_size())

    def step(self, agent_action_index: int):
        agent_action = map_index_to_action(agent_action_index)
        self.state_tracker.update_state_agent(agent_action)
        user_action, reward, done, success = self.user.step(agent_action)
        if not done:
            self.emc.infuse_error(user_action)
            self.state_tracker.update_state_user(user_action)
        next_state = self.state_tracker.get_state(done)
        return next_state, reward, done, success

    def reset(self):
        self.state_tracker.reset()
        init_user_action = self.user.reset()
        self.emc.infuse_error(init_user_action)
        self.state_tracker.update_state_user(init_user_action)
        return self.state_tracker.get_state()
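# Illustrative rollout only (assumes user_goals, emc_params, database, and
# slot2values are loaded elsewhere in the repo): drives DialogEnv with a
# random agent for one dialogue to show the reset()/step() contract.
def _random_rollout_sketch(env: DialogEnv):
    state = env.reset()
    done = False
    total_reward = 0
    success = False
    while not done:
        action_index = env.action_space.sample()  # random agent action
        state, reward, done, success = env.step(action_index)
        total_reward += reward
    return total_reward, success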
plt.ylabel('Prediction Accuracy')
plt.savefig(default_pred_path)
plt.close()

test_rewards_list = []
test_episode_list = []
test_top_5_pred_list = []
test_top_10_pred_list = []

for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    for t in count():
        # Select and perform an action
        action = select_action(np.expand_dims(state, axis=0))
        next_state, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)

        # Store the transition in memory
        memory.push(torch.tensor([state], device=device),
                    torch.tensor([[action.item()]], device=device),
                    torch.tensor([next_state], device=device),
                    reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            break

    # Update the target network, copying all weights and biases from the policy network
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
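    # A possible evaluation hook (a sketch, not part of the original script):
    # the test_*_list buffers above are declared but never filled in this
    # excerpt, so one natural use is to call test() every TARGET_UPDATE
    # episodes and record the returned metrics for plotting later.
    if i_episode % TARGET_UPDATE == 0:
        avg_reward, top_10_acc, top_5_acc = test()
        test_episode_list.append(i_episode)
        test_rewards_list.append(avg_reward)
        test_top_10_pred_list.append(top_10_acc)
        test_top_5_pred_list.append(top_5_acc)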