import math
import random

import numpy as np

from grid2op.Agent import AgentWithConverter
from grid2op.Converter import IdToAct

# D3QNet (the dueling double Q-network) and PrioritizedReplayBuffer are
# project-local classes and are assumed to be importable from this package.


class D3QNetAgent(AgentWithConverter):
    def __init__(self, observation_space, action_space,
                 num_frames=4, batch_size=32,
                 learning_rate=1e-5, learning_rate_decay_steps=10000,
                 learning_rate_decay_rate=0.95, discount_factor=0.95,
                 tau=1e-2, lam=1,
                 per_size=50000, per_alpha=0.6, per_beta=0.4,
                 per_anneal_rate=1.5e6,
                 epsilon=0.99, decay_epsilon=1024 * 32, final_epsilon=0.0001):
        # initializes AgentWithConverter to handle action conversions
        AgentWithConverter.__init__(self, action_space,
                                    action_space_converter=IdToAct)
        self.obs_space = observation_space
        self.act_space = action_space
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.lr = learning_rate
        self.lr_decay_steps = learning_rate_decay_steps
        self.lr_decay_rate = learning_rate_decay_rate
        self.gamma = discount_factor
        self.lam = lam
        self.tau = tau
        # epsilon is the degree of exploration
        self.initial_epsilon = epsilon
        # Adaptive epsilon decay constants
        self.decay_epsilon = decay_epsilon
        self.final_epsilon = final_epsilon
        # PER data
        self.buff_size = per_size
        self.alpha = per_alpha
        self.beta = per_beta
        self.anneal = per_anneal_rate

        self.observation_size = self.obs_space.size_obs()
        self.action_size = self.act_space.size()
        self.d3qn = D3QNet(self.action_size, self.observation_size,
                           self.num_frames, self.lr, self.lr_decay_steps,
                           self.lr_decay_rate, self.batch_size,
                           self.gamma, self.tau)

        # State variables
        self.obs = None
        self.done = None
        self.epsilon = self.initial_epsilon
        self.state = []
        self.frames = []
        self.next_frames = []
        self.replay_buffer = PrioritizedReplayBuffer(self.buff_size, self.alpha,
                                                     self.beta, self.anneal)
        return

    ## Helper Functions

    # Adds to the current frame buffer, enforces a length of num_frames
    def update_curr_frame(self, obs):
        self.frames.append(obs.copy())
        if (len(self.frames) > self.num_frames):
            self.frames.pop(0)
        return

    # Adds the next frame to the next frame buffer, enforces a length of num_frames
    def update_next_frame(self, next_obs):
        self.next_frames.append(next_obs.copy())
        if (len(self.next_frames) > self.num_frames):
            self.next_frames.pop(0)
        return

    # Adaptive epsilon decay: determines the next epsilon based on the number
    # of steps completed and the current epsilon
    def set_next_epsilon(self, current_step):
        ada_div = self.decay_epsilon / 10.0
        step_off = current_step + ada_div
        ada_eps = self.initial_epsilon * -math.log10(
            (step_off + 1) / (self.decay_epsilon + ada_div))
        ada_eps_up_clip = min(self.initial_epsilon, ada_eps)
        ada_eps_low_clip = max(self.final_epsilon, ada_eps_up_clip)
        self.epsilon = ada_eps_low_clip
        return

    ## Agent Interface

    # Adapted from l2rpn-baselines by RTE-France.
    # Vectorizes observations from the grid2op environment for neural network use
    def convert_obs(self, obs):
        li_vect = []
        for el in obs.attr_list_vect:
            v = obs._get_array_from_attr_name(el).astype(np.float32)
            v_fix = np.nan_to_num(v)
            v_norm = np.linalg.norm(v_fix)
            if v_norm > 1e6:
                v_res = (v_fix / v_norm) * 10.0
            else:
                v_res = v_fix
            li_vect.append(v_res)
        return np.concatenate(li_vect)

    # Converts an encoded action number to an action used to interact with the
    # grid2op environment
    def convert_act(self, encoded_act):
        return super().convert_act(encoded_act)

    # Required for agent evaluation.
    # Returns a random action or the best action as estimated by the Q network,
    # depending on the exploration parameter (epsilon)
    def my_act(self, state, reward, done):
        if (len(self.frames) < self.num_frames):
            return 0  # do nothing
        random_act = random.randint(0, self.action_size - 1)
        self.update_curr_frame(state)
        qnet_act, _ = self.d3qn.model_action(np.array(self.frames))
        if (np.random.rand(1) < self.epsilon):
            return random_act
        else:
            return qnet_act

    ## Training Loop
    def learn(self, env, num_epochs, num_steps,
              soft_update_freq=250, hard_update_freq=1000):
        # Pre-training to fill the replay buffer
        print("Starting Pretraining...\n")
        self.done = True
        # Plays random moves and saves the resulting (s, a, r, s', d) tuple to
        # the replay buffer. Resets the environment when done and continues.
        while (len(self.replay_buffer) < self.buff_size):
            if (self.done):
                # reset environment and state parameters
                new_env = env.reset()
                self.frames = []
                self.next_frames = []
                self.done = False
                self.obs = new_env
                self.state = self.convert_obs(self.obs)

            self.update_curr_frame(self.state)

            # action is random
            encoded_act = np.random.randint(0, self.action_size)
            act = self.convert_act(encoded_act)
            new_obs, reward, self.done, info = env.step(act)
            gplay_reward = info['rewards']['gameplay']
            adj_reward = self.lam * reward + (1 - self.lam) * gplay_reward
            new_state = self.convert_obs(new_obs)
            self.update_next_frame(new_state)

            # only add to the buffer once num_frames states have been seen
            if (len(self.frames) == self.num_frames
                    and len(self.next_frames) == self.num_frames):
                agg_state = np.array(self.frames)
                agg_next_state = np.array(self.next_frames)
                # (s, a, r, s', d) tuple
                self.replay_buffer.add(agg_state, encoded_act, adj_reward,
                                       agg_next_state, self.done)

            self.obs = new_obs
            self.state = new_state

        epoch = 0        # number of complete runs through the environment
        total_steps = 0  # total number of training steps across all epochs
        losses = []      # losses[i] is the loss from the dqn at total_step i
        avg_losses = []  # avg_losses[i] is the average loss during epoch i
        net_reward = []  # net_reward[i] is the total reward during epoch i
        alive = []       # alive[i] is the number of steps survived in epoch i

        print("Starting training...\n")
        # Trains until at least num_steps total steps and num_epochs epochs
        # have been completed
        while (total_steps < num_steps or epoch < num_epochs):
            total_reward = 0
            curr_steps = 0
            total_loss = []

            # Reset state parameters
            self.frames = []
            self.next_frames = []
            self.done = False
            self.obs = env.reset()
            self.state = self.convert_obs(self.obs)

            # continues until failure
            while (not self.done):
                self.update_curr_frame(self.state)

                # Determine action
                if (len(self.frames) < self.num_frames):
                    enc_act = 0  # do nothing
                elif (np.random.rand(1) < self.epsilon):
                    enc_act = np.random.randint(0, self.action_size)
                else:
                    model_input = np.array(self.frames)
                    enc_act, _ = self.d3qn.model_action(model_input)

                # converts the action and steps in the environment
                act = self.convert_act(enc_act)
                new_obs, reward, self.done, info = env.step(act)
                gplay_reward = info['rewards']['gameplay']
                adj_reward = self.lam * reward + (1 - self.lam) * gplay_reward
                new_state = self.convert_obs(new_obs)

                # updates the next_state frame
                self.update_next_frame(new_state)
                if (len(self.frames) == self.num_frames
                        and len(self.next_frames) == self.num_frames):
                    agg_state = np.array(self.frames)
                    agg_next_state = np.array(self.next_frames)
                    # Adds the (s, a, r, s', d) tuple to the replay buffer
                    self.replay_buffer.add(agg_state, enc_act, adj_reward,
                                           agg_next_state, self.done)

                # finds the next epsilon
                self.set_next_epsilon(total_steps)

                # samples batch_size experience samples from the replay buffer
                (s_batch, a_batch, r_batch, s_next_batch, d_batch,
                 w_batch, ind_batch) = self.replay_buffer.sample(
                    self.batch_size, total_steps)

                # updates network estimates based on the replayed batch
                loss = self.d3qn.train_on_minibatch(s_batch, a_batch, r_batch,
                                                    s_next_batch, d_batch,
                                                    w_batch)
                priorities = self.d3qn.prio
                self.replay_buffer.update_priorities(ind_batch, priorities)

                # periodically hard update the target network
                if (total_steps % hard_update_freq == 0):
                    self.d3qn.hard_update_target_network()
                # periodically soft update the target network
                elif (total_steps % soft_update_freq == 0):
                    self.d3qn.soft_update_target_network()

                # update state variables
                self.obs = new_obs
                self.state = new_state

                # increase steps, update metrics
                curr_steps += 1
                total_steps += 1
                total_reward += reward
                losses.append(loss)
                total_loss.append(loss)

            # updates metrics at the end of the epoch
            alive.append(curr_steps)
            net_reward.append(total_reward)
            avg_losses.append(np.average(np.array(total_loss)))
            epoch += 1

            # sanity check to ensure training is progressing
            if (epoch % 100 == 0):
                print("Completed epoch {}".format(epoch))
                print("Total steps: {}".format(total_steps))

        return (epoch, total_steps, losses, avg_losses, net_reward, alive)
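
# Illustrative usage sketch (not part of the original agent): one way this agent
# might be trained against a grid2op environment. The environment name and the
# "gameplay" entry in other_rewards are assumptions chosen to match the
# info['rewards']['gameplay'] lookup in learn(); adjust both to your setup.
if __name__ == "__main__":
    import grid2op
    from grid2op.Reward import GameplayReward

    env = grid2op.make("l2rpn_case14_sandbox",
                       other_rewards={"gameplay": GameplayReward})
    agent = D3QNetAgent(env.observation_space, env.action_space,
                        num_frames=4, batch_size=32, lam=0.9)
    epoch, steps, losses, avg_losses, net_reward, alive = agent.learn(
        env, num_epochs=100, num_steps=100000)
    print("Finished {} epochs / {} total steps".format(epoch, steps))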
import copy
import pickle
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Assumed to be provided by this project: the Agent base class, dialog_config,
# QNetwork, the torch-based PrioritizedReplayBuffer, and the BUFFER_SIZE,
# BATCH_SIZE, EXPERIENCES_PER_SAMPLING, and TAU constants.


class PrioritizedDQNAgent(Agent):
    """Interacts with and learns from the environment."""

    def __init__(self, movie_dict=None, act_set=None, slot_set=None,
                 params=None, seed=1, compute_weights=False):
        self.movie_dict = movie_dict
        self.act_set = act_set
        self.slot_set = slot_set
        self.act_cardinality = len(act_set.keys())
        self.slot_cardinality = len(slot_set.keys())
        self.seed = seed
        self.compute_weights = compute_weights
        self.feasible_actions = dialog_config.feasible_actions
        self.num_actions = len(self.feasible_actions)

        self.epsilon = params['epsilon']
        self.agent_run_mode = params['agent_run_mode']
        self.agent_act_level = params['agent_act_level']
        self.experience_replay_pool = []  # experience replay pool <s_t, a_t, r_t, s_t+1>

        # Replay memory
        self.memory = PrioritizedReplayBuffer(
            self.num_actions, BUFFER_SIZE, BATCH_SIZE,
            EXPERIENCES_PER_SAMPLING, seed, compute_weights)
        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.experience_replay_pool_size = params.get(
            'experience_replay_pool_size', 1000)
        self.hidden_size = params.get('dqn_hidden_size', 60)
        self.gamma = params.get('gamma', 0.9)
        self.predict_mode = params.get('predict_mode', False)
        self.warm_start = params.get('warm_start', 0)

        self.max_turn = params['max_turn'] + 4
        self.state_dimension = 2 * self.act_cardinality + \
            7 * self.slot_cardinality + 3 + self.max_turn

        self.qnetwork_local = QNetwork(
            self.state_dimension, self.num_actions, seed).to(self.device)
        self.qnetwork_target = QNetwork(
            self.state_dimension, self.num_actions, seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=5e-4)

        self.cur_bellman_err = 0

        # Prediction mode: load a trained DQN model
        if params['trained_model_path'] is not None:
            self.qnetwork_local = copy.deepcopy(
                self.load_trained_DQN(params['trained_model_path']))
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
            self.predict_mode = True
            self.warm_start = 2

    def initialize_episode(self):
        """ Initialize a new episode. This function is called every time a new episode is run. """
""" self.current_slot_id = 0 self.phase = 0 self.request_set = ['moviename', 'starttime', 'city', 'date', 'theater', 'numberofpeople'] def state_to_action(self, state): """ DQN: Input state, output action """ self.representation = self.prepare_state_representation(state) self.action = self.run_policy(self.representation) act_slot_response = copy.deepcopy(self.feasible_actions[self.action]) return {'act_slot_response': act_slot_response, 'act_slot_value_response': None} def prepare_state_representation(self, state): """ Create the representation for each state """ user_action = state['user_action'] current_slots = state['current_slots'] kb_results_dict = state['kb_results_dict'] agent_last = state['agent_action'] ######################################################################## # Create one-hot of acts to represent the current user action ######################################################################## user_act_rep = np.zeros((1, self.act_cardinality)) user_act_rep[0, self.act_set[user_action['diaact']]] = 1.0 ######################################################################## # Create bag of inform slots representation to represent the current user action ######################################################################## user_inform_slots_rep = np.zeros((1, self.slot_cardinality)) for slot in user_action['inform_slots'].keys(): user_inform_slots_rep[0, self.slot_set[slot]] = 1.0 ######################################################################## # Create bag of request slots representation to represent the current user action ######################################################################## user_request_slots_rep = np.zeros((1, self.slot_cardinality)) for slot in user_action['request_slots'].keys(): user_request_slots_rep[0, self.slot_set[slot]] = 1.0 ######################################################################## # Creat bag of filled_in slots based on the current_slots ######################################################################## current_slots_rep = np.zeros((1, self.slot_cardinality)) for slot in current_slots['inform_slots']: current_slots_rep[0, self.slot_set[slot]] = 1.0 ######################################################################## # Encode last agent act ######################################################################## agent_act_rep = np.zeros((1, self.act_cardinality)) if agent_last: agent_act_rep[0, self.act_set[agent_last['diaact']]] = 1.0 ######################################################################## # Encode last agent inform slots ######################################################################## agent_inform_slots_rep = np.zeros((1, self.slot_cardinality)) if agent_last: for slot in agent_last['inform_slots'].keys(): agent_inform_slots_rep[0, self.slot_set[slot]] = 1.0 ######################################################################## # Encode last agent request slots ######################################################################## agent_request_slots_rep = np.zeros((1, self.slot_cardinality)) if agent_last: for slot in agent_last['request_slots'].keys(): agent_request_slots_rep[0, self.slot_set[slot]] = 1.0 turn_rep = np.zeros((1, 1)) + state['turn'] / 10. ######################################################################## # One-hot representation of the turn count? 
        ########################################################################
        turn_onehot_rep = np.zeros((1, self.max_turn))
        turn_onehot_rep[0, state['turn']] = 1.0

        ########################################################################
        # Representation of KB results (scaled counts)
        ########################################################################
        kb_count_rep = np.zeros((1, self.slot_cardinality + 1)) + \
            kb_results_dict['matching_all_constraints'] / 100.
        for slot in kb_results_dict:
            if slot in self.slot_set:
                kb_count_rep[0, self.slot_set[slot]] = \
                    kb_results_dict[slot] / 100.

        ########################################################################
        # Representation of KB results (binary)
        ########################################################################
        kb_binary_rep = np.zeros((1, self.slot_cardinality + 1)) + \
            np.sum(kb_results_dict['matching_all_constraints'] > 0.)
        for slot in kb_results_dict:
            if slot in self.slot_set:
                kb_binary_rep[0, self.slot_set[slot]] = np.sum(
                    kb_results_dict[slot] > 0.)

        self.final_representation = np.hstack(
            [user_act_rep, user_inform_slots_rep, user_request_slots_rep,
             agent_act_rep, agent_inform_slots_rep, agent_request_slots_rep,
             current_slots_rep, turn_rep, turn_onehot_rep, kb_binary_rep,
             kb_count_rep])
        return self.final_representation

    def run_policy(self, state):
        """ Epsilon-greedy policy """
        if random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)
        else:
            if self.warm_start == 1:
                if len(self.experience_replay_pool) > self.experience_replay_pool_size:
                    self.warm_start = 2
                return self.rule_policy()
            else:
                state = torch.from_numpy(
                    state).float().unsqueeze(0).to(self.device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action_values = self.qnetwork_local(state)
                self.qnetwork_local.train()
                return np.argmax(action_values.cpu().data.numpy())

    def rule_policy(self):
        """ Rule policy """
        if self.current_slot_id < len(self.request_set):
            slot = self.request_set[self.current_slot_id]
            self.current_slot_id += 1
            act_slot_response = {}
            act_slot_response['diaact'] = "request"
            act_slot_response['inform_slots'] = {}
            act_slot_response['request_slots'] = {slot: "UNK"}
        elif self.phase == 0:
            act_slot_response = {'diaact': "inform",
                                 'inform_slots': {'taskcomplete': "PLACEHOLDER"},
                                 'request_slots': {}}
            self.phase += 1
        elif self.phase == 1:
            act_slot_response = {'diaact': "thanks",
                                 'inform_slots': {}, 'request_slots': {}}
        return self.action_index(act_slot_response)

    def action_index(self, act_slot_response):
        """ Return the index of an action """
        for (i, action) in enumerate(self.feasible_actions):
            if act_slot_response == action:
                return i
        print(act_slot_response)
        raise Exception("action index not found")

    def register_experience_replay_tuple(self, s_t, a_t, reward, s_tplus1,
                                         episode_over):
        """ Register feedback from the environment, to be stored as future training data """
        state = self.prepare_state_representation(s_t)
        action = self.action
        next_state = self.prepare_state_representation(s_tplus1)
        done = episode_over

        if self.predict_mode == False:  # Training Mode
            if self.warm_start == 1:
                self.memory.add(state, action, reward, next_state, done)
        else:  # Prediction Mode
            self.memory.add(state, action, reward, next_state, done)

    def train(self, batch_size=16, num_batches=100, gamma=0.99):
        """ Train the DQN with prioritized experience replay """
        for iter_batch in range(num_batches):
            self.cur_bellman_err = 0
            self.memory.update_memory_sampling()
            self.memory.update_parameters()
            for _ in range(int(EXPERIENCES_PER_SAMPLING / batch_size)):
                experiences = self.memory.sample()
                self.learn(experiences, gamma)

    def learn(self, sampling, gamma):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, weights, indices) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, indices = sampling

        # Compute the TD targets and minimize the loss
        q_target = self.qnetwork_target(
            next_states).detach().max(1)[0].unsqueeze(1)
        expected_values = rewards + gamma * q_target * (1 - dones)
        output = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(output, expected_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities ------------------- #
        delta = (expected_values - output.detach()).abs().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data)

    def reinitialize_memory(self):
        self.memory = PrioritizedReplayBuffer(
            self.num_actions, BUFFER_SIZE, BATCH_SIZE,
            EXPERIENCES_PER_SAMPLING, self.seed, self.compute_weights)

    ############################################################################
    # Debug Functions
    ############################################################################
    def save_experience_replay_to_file(self, path):
        """ Save the experience replay pool to a file """
        try:
            with open(path, "wb") as f:
                pickle.dump(self.experience_replay_pool, f)
            print('saved model in %s' % (path,))
        except Exception as e:
            print('Error: Writing model fails: %s' % (path,))
            print(e)
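
# Sketch (an assumption, not the original implementation): when the buffer is
# built with compute_weights=True, the importance-sampling weights returned by
# sample() would typically scale each sample's squared TD error before averaging;
# the plain F.mse_loss in learn() above corresponds to all weights equal to 1.
def weighted_mse_loss(output, expected_values, weights):
    """Importance-sampling-weighted MSE over a batch of TD errors.

    output, expected_values, weights: torch tensors of shape (batch_size, 1).
    """
    td_error = expected_values - output
    return (weights * td_error.pow(2)).mean()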