class SACAgent:
    def __init__(self, action_size, state_size, config):
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        self.config = config
        self.env_name = config["env_name"]
        self.env = gym.make(self.env_name)
        self.env = FrameStack(self.env, config)
        self.env.seed(self.seed)
        self.action_size = action_size
        self.state_size = state_size
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.history_length = config["history_length"]
        self.size = config["size"]
        if not torch.cuda.is_available():
            config["device"] = "cpu"  # fall back to CPU when no GPU is present
        self.device = config["device"]
        self.eval = config["eval"]
        self.vid_path = config["vid_path"]
        print("action size ", action_size)
        self.critic = QNetwork(state_size, action_size,
                               config["fc1_units"], config["fc2_units"]).to(self.device)
        self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
        self.target_critic = QNetwork(state_size, action_size,
                                      config["fc1_units"], config["fc2_units"]).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        # The entropy temperature alpha is learned through its log for numerical stability.
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
        self.policy = SACActor(state_size, action_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
        self.episodes = config["episodes"]
        self.memory = ReplayBuffer((self.history_length, self.size, self.size), (1,),
                                   config["buffer_size"], config["image_pad"],
                                   self.seed, self.device)
        pathname = config["seed"]
        tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        # Common heuristic for discrete SAC: a fraction of the maximum entropy log|A|.
        self.target_entropy = -np.log(1.0 / action_size) * 0.98

    def act(self, state, evaluate=False):
        with torch.no_grad():
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
            state = state.type(torch.float32).div_(255)
            self.encoder.eval()
            state = self.encoder.create_vector(state)
            self.encoder.train()
            if evaluate is False:
                action = self.policy.sample(state)
            else:
                # Greedy action for evaluation: take the mode of the policy.
                action_prob, _ = self.policy(state)
                action = torch.argmax(action_prob)
            action = action.cpu().numpy()
        return action

    def train_agent(self):
        scores_window = deque(maxlen=100)
        t0 = time.time()
        for i_episode in range(1, self.episodes):
            episode_reward = 0
            state = self.env.reset()
            t = 0
            while True:
                t += 1
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                # Warm-up: start learning once a few episodes of data are collected.
                if i_episode > 10:
                    self.learn()
                self.memory.add(state, reward, action, next_state, done)
                state = next_state
                if done:
                    scores_window.append(episode_reward)
                    break
            if i_episode % self.eval == 0:
                self.eval_policy()
            ave_reward = np.mean(scores_window)
            print("Episode {} Steps {} Reward {} Average reward {:.2f} Time {}".format(
                i_episode, t, episode_reward, ave_reward,
                time_format(time.time() - t0)))
            self.writer.add_scalar('Aver_reward', ave_reward, self.steps)

    def learn(self):
        self.steps += 1
        states, rewards, actions, next_states, dones = self.memory.sample(self.batch_size)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        states_detached = states.detach()
        qf1, qf2 = self.critic(states)
        q_value1 = qf1.gather(1, actions)
        q_value2 = qf2.gather(1, actions)
        with torch.no_grad():
            next_states = next_states.type(torch.float32).div_(255)
            next_states = self.encoder.create_vector(next_states)
            q1_target, q2_target = self.target_critic(next_states)
            min_q_target = torch.min(q1_target, q2_target)
            next_action_prob, next_action_log_prob = self.policy(next_states)
            # Discrete SAC: the soft value is an expectation over all actions.
            next_q_target = (next_action_prob *
                             (min_q_target - self.alpha * next_action_log_prob)).sum(dim=1, keepdim=True)
            next_q_value = rewards + (1 - dones) * self.gamma * next_q_target
        # -------------------------- update q ------------------------------------
        loss = F.mse_loss(q_value1, next_q_value) + F.mse_loss(q_value2, next_q_value)
        self.q_optim.zero_grad()
        self.encoder_optimizer.zero_grad()
        loss.backward()
        self.q_optim.step()
        self.encoder_optimizer.step()  # the encoder is trained through the critic loss only
        self.writer.add_scalar('loss/q', loss, self.steps)
        # -------------------------- update policy -------------------------------
        # The encoder output is detached so the policy loss does not reach the encoder.
        action_prob, log_action_prob = self.policy(states_detached)
        with torch.no_grad():
            q_pi1, q_pi2 = self.critic(states_detached)
            min_q_values = torch.min(q_pi1, q_pi2)
        policy_loss = (action_prob *
                       ((self.alpha * log_action_prob) - min_q_values)).sum(dim=1).mean()
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        self.writer.add_scalar('loss/policy', policy_loss, self.steps)
        # -------------------------- update alpha --------------------------------
        alpha_loss = (action_prob.detach() *
                      (-self.log_alpha * (log_action_prob + self.target_entropy).detach())).sum(dim=1).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.writer.add_scalar('loss/alpha', alpha_loss, self.steps)
        self.soft_update(self.critic, self.target_critic)
        self.alpha = self.log_alpha.exp()

    def soft_update(self, online, target):
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def eval_policy(self, eval_episodes=4):
        env = gym.make(self.env_name)
        env = wrappers.Monitor(env, str(self.vid_path) + "/{}".format(self.steps),
                               video_callable=lambda episode_id: True, force=True)
        # Wrap with FrameStack so evaluation observations match the training input.
        env = FrameStack(env, self.config)
        scores_window = deque(maxlen=100)
        for i_episode in range(eval_episodes):
            print("Eval Episode {} of {} ".format(i_episode, eval_episodes))
            episode_reward = 0
            state = env.reset()
            while True:
                action = self.act(state, evaluate=True)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    break
            scores_window.append(episode_reward)
        average_reward = np.mean(scores_window)
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
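

# A minimal usage sketch for SACAgent. The config keys mirror the ones read in
# __init__ above; the concrete values and the env name are illustrative
# assumptions, not settings taken from this repo.
if __name__ == "__main__":
    config = {
        "seed": 0, "env_name": "PongNoFrameskip-v4", "tau": 0.005, "gamma": 0.99,
        "batch_size": 128, "lr": 1e-3, "history_length": 4, "size": 84,
        "device": "cuda", "eval": 10, "vid_path": "vid", "fc1_units": 256,
        "fc2_units": 256, "lr_critic": 3e-4, "lr_alpha": 3e-4, "lr_policy": 3e-4,
        "episodes": 1000, "buffer_size": 100000, "image_pad": 4, "res_path": "results",
    }
    # state_size must match the encoder's output dimension; 200 is a placeholder.
    agent = SACAgent(action_size=6, state_size=200, config=config)
    agent.train_agent()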
class Agent:
    def __init__(self, state_size, action_size, config):
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        env = gym.make(config["env_name"])
        self.env = FrameStack(env, config)
        self.env.seed(self.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.clip = config["clip"]
        self.device = config["device"]
        self.double_dqn = config["DDQN"]
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        self.gamma = 0.99
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
        self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1)
        self.steps = 0
        # Classifier that predicts the expert's action frequencies per state.
        self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(
            self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.vid_path = str(config["locexp"]) + '/vid'
        self.writer = SummaryWriter(tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        # Pre-build one tensor per discrete action for the reward update loop.
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.full((1,), a, dtype=torch.float32)
            self.all_actions.append(action.to(self.device))

    def learn(self, memory_ex):
        logging.debug("--------------------------New update--------------------------")
        self.steps += 1
        states, next_states, actions, dones = memory_ex.expert_policy(self.batch_size)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        next_states = next_states.type(torch.float32).div_(255)
        next_states = self.encoder.create_vector(next_states)
        # Train the action-frequency classifier on the expert actions.
        self.state_action_frq(states, actions)
        # The remaining updates use uniformly sampled actions, not the expert ones.
        actions = torch.randint(0, self.action_size, (self.batch_size, 1),
                                dtype=torch.int64, device=self.device)
        self.compute_shift_function(states.detach(), next_states, actions, dones)
        self.compute_r_function(states.detach(), actions)
        self.compute_q_function(states.detach(), next_states, actions, dones)
        self.soft_update(self.R_local, self.R_target, self.tau)
        self.soft_update(self.q_shift_local, self.q_shift_target, self.tau)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)
        return

    def compute_q_function(self, states, next_states, actions, dones):
        """Update Q-network parameters using the given batch of experience tuples."""
        actions = actions.type(torch.int64)
        # Get max predicted Q values (for next states) from the target model.
        if self.double_dqn:
            q_values = self.qnetwork_local(next_states).detach()
            _, best_action = q_values.max(1)
            best_action = best_action.unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach()
            Q_targets_next = Q_targets_next.gather(1, best_action)
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Bootstrap with the learned reward instead of an environment reward;
        # terminal transitions do not bootstrap.
        rewards = self.R_target(states).detach().gather(1, actions.detach())
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets.detach())
        self.writer.add_scalar('Q_loss', loss, self.steps)
        # Minimize the loss.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Apply the encoder gradients accumulated by the classifier update;
        # the states passed in here are already detached from the encoder.
        self.encoder_optimizer.step()

    def compute_shift_function(self, states, next_states, actions, dones):
        """Update Q-shift parameters using the given batch of experience tuples."""
        actions = actions.type(torch.int64)
        with torch.no_grad():
            # Get max predicted Q values (for next states) from the target model.
            if self.double_dqn:
                q_shift = self.q_shift_local(next_states)
                max_q, max_actions = q_shift.max(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1))
            else:
                Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
            # The shift target is the discounted next-state value only.
            Q_targets = self.gamma * Q_targets_next
        # Get expected Q values from the local model and minimize the loss.
        Q_expected = self.q_shift_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets.detach())
        self.optimizer_shift.zero_grad()
        loss.backward()
        self.writer.add_scalar('Shift_loss', loss, self.steps)
        self.optimizer_shift.step()

    def compute_r_function(self, states, actions, debug=False, log=False):
        """Compute the reward estimate for each state-action pair."""
        actions = actions.type(torch.int64)
        size = states.shape[0]
        idx = 0
        all_zeros = [1 for i in range(actions.shape[0])]
        zeros = False
        y_shift = self.q_shift_target(states).gather(1, actions).detach()
        log_a = self.get_action_prob(states, actions).detach()
        y_r_part1 = log_a - y_shift
        y_r_part2 = torch.empty((size, 1), dtype=torch.float32).to(self.device)
        # Sum the contribution of all other actions b != a.
        for a, s in zip(actions, states):
            y_h = 0
            taken_actions = 0
            for b in self.all_actions:
                b = b.type(torch.int64).unsqueeze(1)
                n_b = self.get_action_prob(s.unsqueeze(0), b)
                if torch.eq(a, b) or n_b is None:
                    continue
                taken_actions += 1
                y_s = self.q_shift_target(s.unsqueeze(0)).detach().gather(1, b).item()
                n_b = n_b.data.item() - y_s
                r_hat = self.R_target(s.unsqueeze(0)).gather(1, b).item()
                y_h += (r_hat - n_b)
                if log:
                    text = "a {} r_hat {:.2f} - n_b {:.2f} | sh {:.2f} ".format(
                        b.item(), r_hat, n_b, y_s)
                    logging.debug(text)
            if taken_actions == 0:
                all_zeros[idx] = 0
                zeros = True
                y_r_part2[idx] = 0.0
            else:
                y_r_part2[idx] = (1. / taken_actions) * y_h
            idx += 1
        y_r = y_r_part1 + y_r_part2
        # If some tuples produced no valid target, mask them out of the update.
        if zeros:
            mask = torch.BoolTensor(all_zeros).to(self.device)
            states = states[mask]
            actions = actions[mask]
            y_r = y_r[mask]
        y = self.R_local(states).gather(1, actions)
        if log:
            text = "Action {:.2f} r target {:.2f} = n_a {:.2f} + n_b {:.2f} y {:.2f}".format(
                actions[0].item(), y_r[0].item(), y_r_part1[0].item(),
                y_r_part2[0].item(), y[0].item())
            logging.debug(text)
        r_loss = F.mse_loss(y, y_r.detach())
        # Minimize the loss.
        self.optimizer_r.zero_grad()
        r_loss.backward()
        self.optimizer_r.step()
        self.writer.add_scalar('Reward_loss', r_loss, self.steps)

    def get_action_prob(self, states, actions):
        """Compute the clipped log-probability for the given state-action pairs."""
        actions = actions.type(torch.long)
        output = self.predicter(states)
        output = F.softmax(output, dim=1)
        action_prob = output.gather(1, actions)
        action_prob = action_prob + torch.finfo(torch.float32).eps
        # For a single pair, skip actions whose probability is effectively zero.
        if action_prob.shape[0] == 1:
            if action_prob.cpu().detach().numpy()[0][0] < 1e-4:
                return None
        action_prob = torch.log(action_prob)
        action_prob = torch.clamp(action_prob, min=self.clip, max=0)
        return action_prob

    def state_action_frq(self, states, action):
        """Train the classifier that estimates state-action frequencies."""
        self.predicter.train()
        output = self.predicter(states)
        output = output.squeeze(0)
        y = action.type(torch.long).squeeze(1)
        loss = nn.CrossEntropyLoss()(output, y)
        self.optimizer_pre.zero_grad()
        self.encoder_optimizer.zero_grad()
        loss.backward()
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)
        self.predicter.eval()

    def test_predicter(self, memory):
        """Evaluate the classifier's accuracy on the stored transitions."""
        self.predicter.eval()
        same_state_prediction = 0
        for i in range(memory.idx):
            states = memory.obses[i]
            actions = memory.actions[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            states = states.type(torch.float32).div_(255)
            states = self.encoder.create_vector(states)
            actions = torch.as_tensor(actions, device=self.device)
            output = self.predicter(states)
            output = F.softmax(output, dim=1)
            y = actions.type(torch.long).item()
            p = torch.argmax(output.data).item()
            if y == p:
                same_state_prediction += 1
        text = "Same prediction {} of {} ".format(same_state_prediction, memory.idx)
        print(text)
        logging.debug(text)

    def soft_update(self, local_model, target_model, tau=1.0):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load(self, filename):
        self.predicter.load_state_dict(torch.load(filename + "_predicter.pth"))
        self.optimizer_pre.load_state_dict(torch.load(filename + "_predicter_optimizer.pth"))
        self.R_local.load_state_dict(torch.load(filename + "_r_net.pth"))
        self.qnetwork_local.load_state_dict(torch.load(filename + "_q_net.pth"))
        print("Loaded models from {}".format(filename))

    def save(self, filename):
        """Save all networks and optimizers under the given filename prefix."""
        mkdir("", filename)
        torch.save(self.predicter.state_dict(), filename + "_predicter.pth")
        torch.save(self.optimizer_pre.state_dict(), filename + "_predicter_optimizer.pth")
        torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth")
        torch.save(self.optimizer.state_dict(), filename + "_q_net_optimizer.pth")
        torch.save(self.R_local.state_dict(), filename + "_r_net.pth")
        torch.save(self.q_shift_local.state_dict(), filename + "_q_shift_net.pth")
        print("Saved models to {}".format(filename))

    def test_q_value(self, memory):
        test_elements = min(memory.idx, 100)
        used_elements_r = 0
        used_elements_q = 0
        r_error = 0
        q_error = 0
        for i in range(test_elements):
            states = memory.obses[i]
            actions = memory.actions[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            states = states.type(torch.float32).div_(255)
            states = self.encoder.create_vector(states)
            actions = torch.as_tensor(actions, device=self.device)
            one_hot = torch.zeros(self.action_size)
            one_hot[actions.item()] = 1
            with torch.no_grad():
                r_values = self.R_local(states.detach()).detach()
                q_values = self.qnetwork_local(states.detach()).detach()
                soft_r = F.softmax(r_values, dim=1).to("cpu")
                soft_q = F.softmax(q_values, dim=1).to("cpu")
                kl_q = F.kl_div(soft_q.log(), one_hot, reduction='sum')
                kl_r = F.kl_div(soft_r.log(), one_hot, reduction='sum')
            # Skip infinite divergences so they do not poison the averages.
            if kl_r != float("inf"):
                r_error += kl_r
                used_elements_r += 1
            if kl_q != float("inf"):
                q_error += kl_q
                used_elements_q += 1
        average_q_kl = q_error / used_elements_q
        average_r_kl = r_error / used_elements_r
        text = "KL div of reward {} over {} elements".format(average_r_kl, used_elements_r)
        print(text)
        text = "KL div of Q-values {} over {} elements".format(average_q_kl, used_elements_q)
        print(text)
        self.writer.add_scalar('KL_reward', average_r_kl, self.steps)
        self.writer.add_scalar('KL_q_values', average_q_kl, self.steps)

    def act(self, states):
        states = torch.as_tensor(states, device=self.device).unsqueeze(0)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        q_values = self.qnetwork_local(states.detach()).detach()
        action = torch.argmax(q_values).item()
        return action

    def eval_policy(self, record=False, eval_episodes=2):
        if record:
            env = wrappers.Monitor(self.env,
                                   str(self.vid_path) + "/{}".format(self.steps),
                                   video_callable=lambda episode_id: True,
                                   force=True)
        else:
            env = self.env
        scores_window = deque(maxlen=100)
        for i_episode in range(eval_episodes):
            episode_reward = 0
            state = env.reset()
            while True:
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    break
            scores_window.append(episode_reward)
        if record:
            return
        average_reward = np.mean(scores_window)
        print("Eval over {} episodes: average reward {}".format(eval_episodes, average_reward))
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
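

# A minimal training-loop sketch for Agent. `memory_ex` stands for an expert
# replay buffer exposing expert_policy(batch_size) -> (states, next_states,
# actions, dones), matching the call in learn() above, and (by assumption) the
# idx/obses/actions attributes that test_predicter expects. How the buffer is
# filled with expert data is outside this sketch.
def train_from_expert(agent, memory_ex, updates=10000, eval_every=500):
    for step in range(1, updates + 1):
        agent.learn(memory_ex)                  # one update of predicter, Q-shift, R, and Q
        if step % eval_every == 0:
            agent.eval_policy(eval_episodes=2)  # roll out the greedy policy
            agent.test_predicter(memory_ex)     # classifier accuracy on stored data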
class DQNAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, config):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (dict): hyperparameters, including the random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = config["seed"]
        random.seed(self.seed)
        self.gamma = 0.99
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.device = config["device"]
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
        # Initialize time step (for updating every UPDATE_EVERY steps).
        self.t_step = 0

    def step(self, memory, writer):
        self.t_step += 1
        # Learn every fourth step, once enough samples are available.
        if len(memory) > self.batch_size:
            if self.t_step % 4 == 0:
                experiences = memory.sample(self.batch_size)
                self.learn(experiences, writer)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        state = state.type(torch.float32).div_(255)
        self.qnetwork_local.eval()
        self.encoder.eval()
        with torch.no_grad():
            state = self.encoder.create_vector(state)
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        self.encoder.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, writer):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            writer: TensorBoard summary writer
        """
        states, actions, rewards, next_states, dones = experiences
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        next_states = next_states.type(torch.float32).div_(255)
        next_states = self.encoder.create_vector(next_states)
        actions = actions.type(torch.int64)
        # Get max predicted Q values (for next states) from the target model.
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states; terminal states do not bootstrap.
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model.
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute and minimize the loss; encoder and Q-network are trained jointly.
        loss = F.mse_loss(Q_expected, Q_targets)
        writer.add_scalar('Q_loss', loss, self.t_step)
        self.optimizer.zero_grad()
        self.encoder_optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.encoder_optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Save Q-network and encoder weights together with their optimizers."""
        mkdir("", filename)
        torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth")
        torch.save(self.optimizer.state_dict(), filename + "_q_net_optimizer.pth")
        torch.save(self.encoder.state_dict(), filename + "_encoder.pth")
        torch.save(self.encoder_optimizer.state_dict(), filename + "_encoder_optimizer.pth")
        print("Saved models to {}".format(filename))
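

# A minimal interaction-loop sketch for DQNAgent with per-episode epsilon decay.
# `env` is assumed to be the FrameStack-wrapped environment and `memory` a replay
# buffer whose add(...) argument order matches the (s, a, r, s', done) unpacking
# in learn() above; the epsilon schedule values are illustrative assumptions.
def train_dqn(agent, env, memory, writer, episodes=500,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            memory.add(state, action, reward, next_state, done)
            agent.step(memory, writer)         # learns every fourth call
            state = next_state
        eps = max(eps_end, eps_decay * eps)    # decay exploration per episode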