def _init_network(self): """Initialize networks and optimizers.""" self.head_cfg.actor.configs.state_size = ( self.head_cfg.critic.configs.state_size) = self.state_dim self.head_cfg.actor.configs.output_size = self.action_dim # create actor self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device) self.critic = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device) # create optimizer self.actor_optim = optim.Adam( self.actor.parameters(), lr=self.optim_cfg.lr_actor, weight_decay=self.optim_cfg.weight_decay, ) self.critic_optim = optim.Adam( self.critic.parameters(), lr=self.optim_cfg.lr_critic, weight_decay=self.optim_cfg.weight_decay, ) # load model parameters if self.args.load_from is not None: self.load_params(self.args.load_from)
def __call__(
    self,
    model: BaseNetwork,
    target_model: BaseNetwork,
    experiences: Tuple[torch.Tensor, ...],
    gamma: float,
    head_cfg: ConfigDict,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return element-wise C51 loss and Q-values."""
    states, actions, rewards, next_states, dones = experiences[:5]
    batch_size = states.shape[0]

    support = torch.linspace(
        head_cfg.configs.v_min, head_cfg.configs.v_max, head_cfg.configs.atom_size
    ).to(device)
    delta_z = float(head_cfg.configs.v_max - head_cfg.configs.v_min) / (
        head_cfg.configs.atom_size - 1
    )

    with torch.no_grad():
        # According to noisynet paper, it resamples noisynet parameters on the
        # online network when using double q, but we don't because there is no
        # remarkable difference in performance.
        next_actions = model.forward_(next_states)[1].argmax(1)
        next_dist = target_model.forward_(next_states)[0]
        next_dist = next_dist[range(batch_size), next_actions]

        t_z = rewards + (1 - dones) * gamma * support
        t_z = t_z.clamp(min=head_cfg.configs.v_min, max=head_cfg.configs.v_max)
        b = (t_z - head_cfg.configs.v_min) / delta_z
        l = b.floor().long()  # noqa: E741
        u = b.ceil().long()

        offset = (
            torch.linspace(
                0, (batch_size - 1) * head_cfg.configs.atom_size, batch_size
            )
            .long()
            .unsqueeze(1)
            .expand(batch_size, head_cfg.configs.atom_size)
            .to(device)
        )

        proj_dist = torch.zeros(next_dist.size(), device=device)
        proj_dist.view(-1).index_add_(
            0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)
        )
        proj_dist.view(-1).index_add_(
            0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)
        )

    dist, q_values = model.forward_(states)
    log_p = torch.log(dist[range(batch_size), actions.long()])
    dq_loss_element_wise = -(proj_dist * log_p).sum(1)

    return dq_loss_element_wise, q_values
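# A minimal, self-contained sketch (toy values, plain torch; not part of the agent code)
# of the categorical projection used above: the next-state distribution over atoms is
# shifted by r + gamma * z, clipped to [v_min, v_max], and its probability mass is split
# between the two nearest atoms via the floor/ceil indices l and u. The toy numbers are
# chosen so no shifted atom lands exactly on a support point.
import torch

v_min, v_max, atom_size = 0.0, 10.0, 6
support = torch.linspace(v_min, v_max, atom_size)            # atoms z_i
delta_z = (v_max - v_min) / (atom_size - 1)

next_dist = torch.tensor([0.1, 0.2, 0.4, 0.2, 0.05, 0.05])   # p(z_i | s', a*)
reward, gamma, done = 1.0, 0.8, 0.0

t_z = (reward + (1 - done) * gamma * support).clamp(v_min, v_max)
b = (t_z - v_min) / delta_z
l, u = b.floor().long(), b.ceil().long()

proj_dist = torch.zeros(atom_size)
proj_dist.index_add_(0, l, next_dist * (u.float() - b))
proj_dist.index_add_(0, u, next_dist * (b - l.float()))
print(proj_dist, proj_dist.sum())  # projected distribution still sums to 1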
def _init_network(self): """Initialize networks and optimizers.""" self.head_cfg.actor.configs.state_size = ( self.head_cfg.critic_vf.configs.state_size) = self.state_dim self.head_cfg.critic_qf.configs.state_size = (self.state_dim[0] + self.action_dim, ) self.head_cfg.actor.configs.output_size = self.action_dim # create actor self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device) # create v_critic self.vf = BaseNetwork(self.backbone_cfg.critic_vf, self.head_cfg.critic_vf).to(device) self.vf_target = BaseNetwork(self.backbone_cfg.critic_vf, self.head_cfg.critic_vf).to(device) self.vf_target.load_state_dict(self.vf.state_dict()) # create q_critic self.qf_1 = BaseNetwork(self.backbone_cfg.critic_qf, self.head_cfg.critic_qf).to(device) self.qf_2 = BaseNetwork(self.backbone_cfg.critic_qf, self.head_cfg.critic_qf).to(device) # create optimizers self.actor_optim = optim.Adam( self.actor.parameters(), lr=self.optim_cfg.lr_actor, weight_decay=self.optim_cfg.weight_decay, ) self.vf_optim = optim.Adam( self.vf.parameters(), lr=self.optim_cfg.lr_vf, weight_decay=self.optim_cfg.weight_decay, ) self.qf_1_optim = optim.Adam( self.qf_1.parameters(), lr=self.optim_cfg.lr_qf1, weight_decay=self.optim_cfg.weight_decay, ) self.qf_2_optim = optim.Adam( self.qf_2.parameters(), lr=self.optim_cfg.lr_qf2, weight_decay=self.optim_cfg.weight_decay, ) # load the optimizer and model parameters if self.args.load_from is not None: self.load_params(self.args.load_from)
def test_base_network():
    """Test whether BaseNetwork makes an fc layer based on the backbone's output size."""
    head_cfg.configs.state_size = test_state_dim
    head_cfg.configs.output_size = 8
    try:
        _ = BaseNetwork(resnet_cfg, head_cfg)
    except Exception as e:
        raise e
def _init_network(self): """Initialize networks and optimizers.""" self.head_cfg.configs.state_size = self.state_dim self.head_cfg.configs.output_size = self.action_dim self.dqn = BaseNetwork(self.backbone_cfg, self.head_cfg).to(device) self.dqn_target = BaseNetwork(self.backbone_cfg, self.head_cfg).to(device) self.loss_fn = build_loss(self.hyper_params.loss_type) self.dqn_target.load_state_dict(self.dqn.state_dict()) # create optimizer self.dqn_optim = optim.Adam( self.dqn.parameters(), lr=self.optim_cfg.lr_dqn, weight_decay=self.optim_cfg.weight_decay, eps=self.optim_cfg.adam_eps, ) # load the optimizer and model parameters if self.args.load_from is not None: self.load_params(self.args.load_from)
def _init_network(self): """Initialize networks and optimizers.""" self.head_cfg.actor.configs.state_size = self.state_dim # ddpg critic gets state & action as input, # and make the type to tuple to conform the gym action_space type. self.head_cfg.critic.configs.state_size = (self.state_dim[0] + self.action_dim,) self.head_cfg.actor.configs.output_size = self.action_dim # create actor self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to( device ) self.actor_target = BaseNetwork( self.backbone_cfg.actor, self.head_cfg.actor ).to(device) self.actor_target.load_state_dict(self.actor.state_dict()) # create critic self.critic = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to( device ) self.critic_target = BaseNetwork( self.backbone_cfg.critic, self.head_cfg.critic ).to(device) self.critic_target.load_state_dict(self.critic.state_dict()) # create optimizer self.actor_optim = optim.Adam( self.actor.parameters(), lr=self.optim_cfg.lr_actor, weight_decay=self.optim_cfg.weight_decay, ) self.critic_optim = optim.Adam( self.critic.parameters(), lr=self.optim_cfg.lr_critic, weight_decay=self.optim_cfg.weight_decay, ) # load the optimizer and model parameters if self.args.load_from is not None: self.load_params(self.args.load_from)
class TD3Agent(Agent):
    """TD3 agent interacting with environment.

    Attributes:
        env (gym.Env): openAI Gym environment
        args (argparse.Namespace): arguments including hyperparameters and training settings
        hyper_params (ConfigDict): hyper-parameters
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        noise_cfg (ConfigDict): config of noise
        state_dim (int): state size of env
        action_dim (int): action size of env
        memory (ReplayBuffer): replay memory
        exploration_noise (GaussianNoise): random noise for exploration
        target_policy_noise (GaussianNoise): random noise for target values
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to select actions
        critic1 (nn.Module): critic model to predict state values
        critic2 (nn.Module): critic model to predict state values
        critic_target1 (nn.Module): target critic model to predict state values
        critic_target2 (nn.Module): target critic model to predict state values
        critic_optim (Optimizer): optimizer for training critic
        actor_optim (Optimizer): optimizer for training actor
        curr_state (np.ndarray): temporary storage of the current state
        total_step (int): total step number
        episode_step (int): step number of the current episode
        i_episode (int): current episode number
    """

    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        log_cfg: ConfigDict,
        hyper_params: ConfigDict,
        backbone: ConfigDict,
        head: ConfigDict,
        optim_cfg: ConfigDict,
        noise_cfg: ConfigDict,
    ):
        """Initialize.

        Args:
            env (gym.Env): openAI Gym environment
            args (argparse.Namespace): arguments including hyperparameters and training settings
        """
        Agent.__init__(self, env, args, log_cfg)

        self.curr_state = np.zeros((1,))
        self.total_step = 0
        self.episode_step = 0
        self.update_step = 0
        self.i_episode = 0

        self.hyper_params = hyper_params
        self.noise_cfg = noise_cfg
        self.backbone_cfg = backbone
        self.head_cfg = head
        self.optim_cfg = optim_cfg

        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape[0]

        # noise instances to make randomness of action
        self.exploration_noise = GaussianNoise(
            self.action_dim, noise_cfg.exploration_noise, noise_cfg.exploration_noise
        )
        self.target_policy_noise = GaussianNoise(
            self.action_dim,
            noise_cfg.target_policy_noise,
            noise_cfg.target_policy_noise,
        )

        if not self.args.test:
            # replay memory
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size, self.hyper_params.batch_size
            )

        self._init_network()

    def _init_network(self):
        """Initialize networks and optimizers."""
        self.head_cfg.actor.configs.state_size = self.state_dim
        self.head_cfg.critic.configs.state_size = (self.state_dim[0] + self.action_dim,)
        self.head_cfg.actor.configs.output_size = self.action_dim

        # create actor
        self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)
        self.actor_target = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # create q_critic
        self.critic1 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)
        self.critic2 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)
        self.critic_target1 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)
        self.critic_target2 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)
        self.critic_target1.load_state_dict(self.critic1.state_dict())
        self.critic_target2.load_state_dict(self.critic2.state_dict())

        # concat critic parameters to use one optim
        critic_parameters = list(self.critic1.parameters()) + list(
            self.critic2.parameters()
        )

        # create optimizers
        self.actor_optim = optim.Adam(
            self.actor.parameters(),
            lr=self.optim_cfg.lr_actor,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.critic_optim = optim.Adam(
            critic_parameters,
            lr=self.optim_cfg.lr_critic,
            weight_decay=self.optim_cfg.weight_decay,
        )

        # load the optimizer and model parameters
        if self.args.load_from is not None:
            self.load_params(self.args.load_from)

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state

        # during the initial training steps, try random actions for exploration
        if (
            self.total_step < self.hyper_params.initial_random_action
            and not self.args.test
        ):
            return np.array(self.env.action_space.sample())

        state = torch.FloatTensor(state).to(device)
        selected_action = self.actor(state).detach().cpu().numpy()

        if not self.args.test:
            noise = self.exploration_noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)

        if not self.args.test:
            # if the last state is not a terminal state in the episode, done is false
            done_bool = (
                False if self.episode_step == self.args.max_episode_steps else done
            )
            self.memory.add((self.curr_state, action, reward, next_state, done_bool))

        return next_state, reward, done, info

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Train the model after each episode."""
        self.update_step += 1

        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        masks = 1 - dones

        # get actions with noise
        noise = torch.FloatTensor(self.target_policy_noise.sample()).to(device)
        clipped_noise = torch.clamp(
            noise,
            -self.noise_cfg.target_policy_noise_clip,
            self.noise_cfg.target_policy_noise_clip,
        )
        next_actions = (self.actor_target(next_states) + clipped_noise).clamp(-1.0, 1.0)

        # min (Q_1', Q_2')
        next_states_actions = torch.cat((next_states, next_actions), dim=-1)
        next_values1 = self.critic_target1(next_states_actions)
        next_values2 = self.critic_target2(next_states_actions)
        next_values = torch.min(next_values1, next_values2)

        # G_t = r + gamma * v(s_{t+1}) if state != Terminal
        #     = r                      otherwise
        curr_returns = rewards + self.hyper_params.gamma * next_values * masks
        curr_returns = curr_returns.detach()

        # critic loss
        state_actions = torch.cat((states, actions), dim=-1)
        values1 = self.critic1(state_actions)
        values2 = self.critic2(state_actions)
        critic1_loss = F.mse_loss(values1, curr_returns)
        critic2_loss = F.mse_loss(values2, curr_returns)

        # train critic
        critic_loss = critic1_loss + critic2_loss
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        if self.update_step % self.hyper_params.policy_update_freq == 0:
            # policy loss
            actions = self.actor(states)
            state_actions = torch.cat((states, actions), dim=-1)
            actor_loss = -self.critic1(state_actions).mean()

            # train actor
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            # update target networks
            tau = self.hyper_params.tau
            common_utils.soft_update(self.critic1, self.critic_target1, tau)
            common_utils.soft_update(self.critic2, self.critic_target2, tau)
            common_utils.soft_update(self.actor, self.actor_target, tau)
        else:
            actor_loss = torch.zeros(1)

        return actor_loss.item(), critic1_loss.item(), critic2_loss.item()

    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        Agent.load_params(self, path)

        params = torch.load(path)
        self.critic1.load_state_dict(params["critic1"])
        self.critic2.load_state_dict(params["critic2"])
        self.critic_target1.load_state_dict(params["critic_target1"])
        self.critic_target2.load_state_dict(params["critic_target2"])
        self.critic_optim.load_state_dict(params["critic_optim"])
        self.actor.load_state_dict(params["actor"])
        self.actor_target.load_state_dict(params["actor_target"])
        self.actor_optim.load_state_dict(params["actor_optim"])
        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):  # type: ignore
        """Save model and optimizer parameters."""
        params = {
            "actor": self.actor.state_dict(),
            "actor_target": self.actor_target.state_dict(),
            "actor_optim": self.actor_optim.state_dict(),
            "critic1": self.critic1.state_dict(),
            "critic2": self.critic2.state_dict(),
            "critic_target1": self.critic_target1.state_dict(),
            "critic_target2": self.critic_target2.state_dict(),
            "critic_optim": self.critic_optim.state_dict(),
        }
        Agent.save_params(self, params, n_episode)

    def write_log(self, log_value: tuple):
        """Write log about loss and score."""
        i, loss, score, policy_update_freq, avg_time_cost = log_value
        total_loss = loss.sum()

        print(
            "[INFO] episode %d, episode_step: %d, total_step: %d, total score: %d\n"
            "total loss: %f actor_loss: %.3f critic1_loss: %.3f critic2_loss: %.3f "
            "(spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                total_loss,
                loss[0] * policy_update_freq,  # actor loss
                loss[1],  # critic1 loss
                loss[2],  # critic2 loss
                avg_time_cost,
            )
        )

        if self.args.log:
            wandb.log(
                {
                    "score": score,
                    "total loss": total_loss,
                    "actor loss": loss[0] * policy_update_freq,
                    "critic1 loss": loss[1],
                    "critic2 loss": loss[2],
                    "time per each step": avg_time_cost,
                }
            )

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.actor, self.critic1, self.critic2], log="parameters")

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0
            loss_episode = list()
            self.episode_step = 0

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                state = next_state
                score += reward

                if len(self.memory) >= self.hyper_params.batch_size:
                    loss = self.update_model()
                    loss_episode.append(loss)  # for logging

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            # logging
            if loss_episode:
                avg_loss = np.vstack(loss_episode).mean(axis=0)
                log_value = (
                    self.i_episode,
                    avg_loss,
                    score,
                    self.hyper_params.policy_update_freq,
                    avg_time_cost,
                )
                self.write_log(log_value)

            if self.i_episode % self.args.save_period == 0:
                self.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.save_params(self.i_episode)
        self.interim_test()
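# A minimal sketch (toy tensors, plain torch; not the agent's own modules) of the two
# TD3 ingredients used in update_model above: target policy smoothing (clipped Gaussian
# noise added to the target action) and the clipped double-Q target min(Q1', Q2').
import torch

torch.manual_seed(0)
batch, action_dim = 4, 2
gamma, noise_std, noise_clip = 0.99, 0.2, 0.5

target_action = torch.rand(batch, action_dim) * 2 - 1         # pi'(s')
noise = (torch.randn(batch, action_dim) * noise_std).clamp(-noise_clip, noise_clip)
smoothed_action = (target_action + noise).clamp(-1.0, 1.0)    # target policy smoothing

q1_next = torch.rand(batch, 1)                                # Q1'(s', a')
q2_next = torch.rand(batch, 1)                                # Q2'(s', a')
rewards = torch.rand(batch, 1)
masks = torch.ones(batch, 1)                                  # 0 where the episode ended

target = rewards + gamma * torch.min(q1_next, q2_next) * masks
print(smoothed_action.shape, target.shape)  # (4, 2), (4, 1)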
def _init_network(self): """Initialize networks and optimizers.""" self.head_cfg.actor.configs.state_size = self.state_dim self.head_cfg.critic.configs.state_size = (self.state_dim[0] + self.action_dim, ) self.head_cfg.actor.configs.output_size = self.action_dim # create actor self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device) self.actor_target = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device) self.actor_target.load_state_dict(self.actor.state_dict()) # create q_critic self.critic1 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device) self.critic2 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device) self.critic_target1 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device) self.critic_target2 = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device) self.critic_target1.load_state_dict(self.critic1.state_dict()) self.critic_target2.load_state_dict(self.critic2.state_dict()) # concat critic parameters to use one optim critic_parameters = list(self.critic1.parameters()) + list( self.critic2.parameters()) # create optimizers self.actor_optim = optim.Adam( self.actor.parameters(), lr=self.optim_cfg.lr_actor, weight_decay=self.optim_cfg.weight_decay, ) self.critic_optim = optim.Adam( critic_parameters, lr=self.optim_cfg.lr_critic, weight_decay=self.optim_cfg.weight_decay, ) # load the optimizer and model parameters if self.args.load_from is not None: self.load_params(self.args.load_from)
class PPOAgent(Agent):
    """PPO Agent.

    Attributes:
        env (gym.Env): openAI Gym environment
        args (argparse.Namespace): arguments including hyperparameters and training settings
        hyper_params (ConfigDict): hyper-parameters
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        state_dim (int): state size of env
        action_dim (int): action size of env
        actor (nn.Module): policy gradient model to select actions
        critic (nn.Module): policy gradient model to predict values
        actor_optim (Optimizer): optimizer for training actor
        critic_optim (Optimizer): optimizer for training critic
        episode_steps (np.ndarray): step numbers of the current episode
        states (list): memory for experienced states
        actions (list): memory for experienced actions
        rewards (list): memory for experienced rewards
        values (list): memory for experienced values
        masks (list): memory for masks
        log_probs (list): memory for log_probs
        i_episode (int): current episode number
        epsilon (float): value for clipping loss
    """

    def __init__(
        self,
        env: gym.Env,  # for testing
        args: argparse.Namespace,
        log_cfg: ConfigDict,
        hyper_params: ConfigDict,
        backbone: ConfigDict,
        head: ConfigDict,
        optim_cfg: ConfigDict,
    ):
        """Initialize.

        Args:
            env (gym.Env): openAI Gym environment
            args (argparse.Namespace): arguments including hyperparameters and training settings
        """
        env_gen = env_generator(env.spec.id, args)
        env_multi = make_envs(env_gen, n_envs=hyper_params.n_workers)

        Agent.__init__(self, env, args, log_cfg)

        self.episode_steps = np.zeros(hyper_params.n_workers, dtype=np.int)
        self.states: list = []
        self.actions: list = []
        self.rewards: list = []
        self.values: list = []
        self.masks: list = []
        self.log_probs: list = []
        self.i_episode = 0
        self.next_state = np.zeros((1,))

        self.hyper_params = hyper_params
        self.backbone_cfg = backbone
        self.head_cfg = head
        self.optim_cfg = optim_cfg

        if not self.args.test:
            self.env = env_multi

        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape[0]
        self.epsilon = hyper_params.max_epsilon

        self._init_network()

    def _init_network(self):
        """Initialize networks and optimizers."""
        self.head_cfg.actor.configs.state_size = (
            self.head_cfg.critic.configs.state_size
        ) = self.state_dim
        self.head_cfg.actor.configs.output_size = self.action_dim

        # create actor and critic
        self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)
        self.critic = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)

        # create optimizer
        self.actor_optim = optim.Adam(
            self.actor.parameters(),
            lr=self.optim_cfg.lr_actor,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=self.optim_cfg.lr_critic,
            weight_decay=self.optim_cfg.weight_decay,
        )

        # load model parameters
        if self.args.load_from is not None:
            self.load_params(self.args.load_from)

    def select_action(self, state: np.ndarray) -> torch.Tensor:
        """Select an action from the input space."""
        state = torch.FloatTensor(state).to(device)
        selected_action, dist = self.actor(state)

        if self.args.test and not self.is_discrete:
            selected_action = dist.mean

        if not self.args.test:
            value = self.critic(state)
            self.states.append(state)
            self.actions.append(selected_action)
            self.values.append(value)
            self.log_probs.append(dist.log_prob(selected_action))

        return selected_action

    def step(self, action: torch.Tensor) -> Tuple[np.ndarray, np.float64, bool, dict]:
        next_state, reward, done, info = self.env.step(action.detach().cpu().numpy())

        if not self.args.test:
            # if the last state is not a terminal state, store done as false
            done_bool = done.copy()
            done_bool[np.where(self.episode_steps == self.args.max_episode_steps)] = False

            self.rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            self.masks.append(torch.FloatTensor(1 - done_bool).unsqueeze(1).to(device))

        return next_state, reward, done, info

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Train the model after every N episodes."""
        next_state = torch.FloatTensor(self.next_state).to(device)
        next_value = self.critic(next_state)

        returns = ppo_utils.compute_gae(
            next_value,
            self.rewards,
            self.masks,
            self.values,
            self.hyper_params.gamma,
            self.hyper_params.tau,
        )

        states = torch.cat(self.states)
        actions = torch.cat(self.actions)
        returns = torch.cat(returns).detach()
        values = torch.cat(self.values).detach()
        log_probs = torch.cat(self.log_probs).detach()
        advantages = returns - values

        if self.is_discrete:
            actions = actions.unsqueeze(1)
            log_probs = log_probs.unsqueeze(1)

        if self.hyper_params.standardize_advantage:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-7)

        actor_losses, critic_losses, total_losses = [], [], []

        for state, action, old_value, old_log_prob, return_, adv in ppo_utils.ppo_iter(
            self.hyper_params.epoch,
            self.hyper_params.batch_size,
            states,
            actions,
            values,
            log_probs,
            returns,
            advantages,
        ):
            # calculate ratios
            _, dist = self.actor(state)
            log_prob = dist.log_prob(action)
            ratio = (log_prob - old_log_prob).exp()

            # actor_loss
            surr_loss = ratio * adv
            clipped_surr_loss = (
                torch.clamp(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * adv
            )
            actor_loss = -torch.min(surr_loss, clipped_surr_loss).mean()

            # critic_loss
            value = self.critic(state)
            if self.hyper_params.use_clipped_value_loss:
                value_pred_clipped = old_value + torch.clamp(
                    (value - old_value), -self.epsilon, self.epsilon
                )
                value_loss_clipped = (return_ - value_pred_clipped).pow(2)
                value_loss = (return_ - value).pow(2)
                critic_loss = 0.5 * torch.max(value_loss, value_loss_clipped).mean()
            else:
                critic_loss = 0.5 * (return_ - value).pow(2).mean()

            # entropy
            entropy = dist.entropy().mean()

            # total_loss
            w_value = self.hyper_params.w_value
            w_entropy = self.hyper_params.w_entropy
            total_loss = actor_loss + w_value * critic_loss - w_entropy * entropy

            # train critic
            gradient_clip_ac = self.hyper_params.gradient_clip_ac
            gradient_clip_cr = self.hyper_params.gradient_clip_cr

            self.critic_optim.zero_grad()
            total_loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
            self.critic_optim.step()

            # train actor
            self.actor_optim.zero_grad()
            total_loss.backward()
            nn.utils.clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
            self.actor_optim.step()

            actor_losses.append(actor_loss.item())
            critic_losses.append(critic_loss.item())
            total_losses.append(total_loss.item())

        self.states, self.actions, self.rewards = [], [], []
        self.values, self.masks, self.log_probs = [], [], []

        actor_loss = sum(actor_losses) / len(actor_losses)
        critic_loss = sum(critic_losses) / len(critic_losses)
        total_loss = sum(total_losses) / len(total_losses)

        return actor_loss, critic_loss, total_loss

    def decay_epsilon(self, t: int = 0):
        """Decay epsilon until reaching the minimum value."""
        max_epsilon = self.hyper_params.max_epsilon
        min_epsilon = self.hyper_params.min_epsilon
        epsilon_decay_period = self.hyper_params.epsilon_decay_period

        self.epsilon = self.epsilon - (max_epsilon - min_epsilon) * min(
            1.0, t / (epsilon_decay_period + 1e-7)
        )

    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        Agent.load_params(self, path)

        params = torch.load(path)
        self.actor.load_state_dict(params["actor_state_dict"])
        self.critic.load_state_dict(params["critic_state_dict"])
        self.actor_optim.load_state_dict(params["actor_optim_state_dict"])
        self.critic_optim.load_state_dict(params["critic_optim_state_dict"])
        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):  # type: ignore
        """Save model and optimizer parameters."""
        params = {
            "actor_state_dict": self.actor.state_dict(),
            "critic_state_dict": self.critic.state_dict(),
            "actor_optim_state_dict": self.actor_optim.state_dict(),
            "critic_optim_state_dict": self.critic_optim.state_dict(),
        }
        Agent.save_params(self, params, n_episode)

    def write_log(self, log_value: tuple):
        i_episode, n_step, score, actor_loss, critic_loss, total_loss = log_value
        print(
            "[INFO] episode %d\tepisode steps: %d\ttotal score: %d\n"
            "total loss: %f\tActor loss: %f\tCritic loss: %f\n"
            % (i_episode, n_step, score, total_loss, actor_loss, critic_loss)
        )

        if self.args.log:
            wandb.log(
                {
                    "total loss": total_loss,
                    "actor loss": actor_loss,
                    "critic loss": critic_loss,
                    "score": score,
                }
            )

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.actor, self.critic], log="parameters")

        score = 0
        i_episode_prev = 0
        loss = [0.0, 0.0, 0.0]
        state = self.env.reset()

        while self.i_episode <= self.args.episode_num:
            for _ in range(self.hyper_params.rollout_len):
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.episode_steps += 1

                state = next_state
                score += reward[0]

                i_episode_prev = self.i_episode
                self.i_episode += done.sum()

                if (self.i_episode // self.args.save_period) != (
                    i_episode_prev // self.args.save_period
                ):
                    self.save_params(self.i_episode)

                if done[0]:
                    n_step = self.episode_steps[0]
                    log_value = (
                        self.i_episode,
                        n_step,
                        score,
                        loss[0],
                        loss[1],
                        loss[2],
                    )
                    self.write_log(log_value)
                    score = 0

                self.episode_steps[np.where(done)] = 0

            self.next_state = next_state
            loss = self.update_model()
            self.decay_epsilon(self.i_episode)

        # termination
        self.env.close()
        self.save_params(self.i_episode)
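# A minimal sketch (toy tensors, plain torch; hypothetical values) of PPO's clipped
# surrogate objective used in update_model above: the probability ratio is clipped to
# [1 - epsilon, 1 + epsilon] and the pessimistic (elementwise minimum) term is maximized.
import torch

epsilon = 0.2
log_prob = torch.tensor([-0.9, -1.2, -0.3])      # log pi_new(a|s)
old_log_prob = torch.tensor([-1.0, -1.0, -1.0])  # log pi_old(a|s)
adv = torch.tensor([1.0, -0.5, 2.0])             # advantages

ratio = (log_prob - old_log_prob).exp()
surr = ratio * adv
clipped_surr = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
actor_loss = -torch.min(surr, clipped_surr).mean()
print(ratio, actor_loss)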
class DDPGAgent(Agent):
    """DDPG agent interacting with environment.

    Attributes:
        env (gym.Env): openAI Gym environment
        args (argparse.Namespace): arguments including hyperparameters and training settings
        hyper_params (ConfigDict): hyper-parameters
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        state_dim (int): state size of env
        action_dim (int): action size of env
        memory (ReplayBuffer): replay memory
        noise (OUNoise): random noise for exploration
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to select actions
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        actor_optim (Optimizer): optimizer for training actor
        critic_optim (Optimizer): optimizer for training critic
        curr_state (np.ndarray): temporary storage of the current state
        total_step (int): total step number
        episode_step (int): step number of the current episode
        i_episode (int): current episode number
    """

    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        log_cfg: ConfigDict,
        hyper_params: ConfigDict,
        backbone: ConfigDict,
        head: ConfigDict,
        optim_cfg: ConfigDict,
        noise_cfg: ConfigDict,
    ):
        """Initialize."""
        Agent.__init__(self, env, args, log_cfg)

        self.curr_state = np.zeros((1,))
        self.total_step = 0
        self.episode_step = 0
        self.i_episode = 0

        self.hyper_params = hyper_params
        self.backbone_cfg = backbone
        self.head_cfg = head
        self.optim_cfg = optim_cfg

        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape[0]

        # set noise
        self.noise = OUNoise(
            self.action_dim,
            theta=noise_cfg.ou_noise_theta,
            sigma=noise_cfg.ou_noise_sigma,
        )

        self._initialize()
        self._init_network()

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:
            # replay memory
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size, self.hyper_params.batch_size
            )

    # pylint: disable=attribute-defined-outside-init
    def _init_network(self):
        """Initialize networks and optimizers."""
        self.head_cfg.actor.configs.state_size = self.state_dim
        # the ddpg critic takes the state and action as input, so make the size
        # a tuple to conform to the gym action_space type.
        self.head_cfg.critic.configs.state_size = (self.state_dim[0] + self.action_dim,)
        self.head_cfg.actor.configs.output_size = self.action_dim

        # create actor
        self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)
        self.actor_target = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # create critic
        self.critic = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)
        self.critic_target = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # create optimizer
        self.actor_optim = optim.Adam(
            self.actor.parameters(),
            lr=self.optim_cfg.lr_actor,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=self.optim_cfg.lr_critic,
            weight_decay=self.optim_cfg.weight_decay,
        )

        # load the optimizer and model parameters
        if self.args.load_from is not None:
            self.load_params(self.args.load_from)

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state
        state = self._preprocess_state(state)

        # if initial random action should be conducted
        if (
            self.total_step < self.hyper_params.initial_random_action
            and not self.args.test
        ):
            return np.array(self.env.action_space.sample())

        selected_action = self.actor(state).detach().cpu().numpy()

        if not self.args.test:
            noise = self.noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        return selected_action

    # pylint: disable=no-self-use
    def _preprocess_state(self, state: np.ndarray) -> torch.Tensor:
        """Preprocess state so that actor selects an action."""
        state = torch.FloatTensor(state).to(device)
        return state

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)

        if not self.args.test:
            # if the last state is not a terminal state, store done as false
            done_bool = (
                False if self.episode_step == self.args.max_episode_steps else done
            )
            transition = (self.curr_state, action, reward, next_state, done_bool)
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        self.memory.add(transition)

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Train the model after each episode."""
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences

        # G_t = r + gamma * v(s_{t+1}) if state != Terminal
        #     = r                      otherwise
        masks = 1 - dones
        next_actions = self.actor_target(next_states)
        next_values = self.critic_target(torch.cat((next_states, next_actions), dim=-1))
        curr_returns = rewards + self.hyper_params.gamma * next_values * masks
        curr_returns = curr_returns.to(device)

        # train critic
        gradient_clip_ac = self.hyper_params.gradient_clip_ac
        gradient_clip_cr = self.hyper_params.gradient_clip_cr

        values = self.critic(torch.cat((states, actions), dim=-1))
        critic_loss = F.mse_loss(values, curr_returns)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
        self.critic_optim.step()

        # train actor
        actions = self.actor(states)
        actor_loss = -self.critic(torch.cat((states, actions), dim=-1)).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
        self.actor_optim.step()

        # update target networks
        common_utils.soft_update(self.actor, self.actor_target, self.hyper_params.tau)
        common_utils.soft_update(self.critic, self.critic_target, self.hyper_params.tau)

        return actor_loss.item(), critic_loss.item()

    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        Agent.load_params(self, path)

        params = torch.load(path)
        self.actor.load_state_dict(params["actor_state_dict"])
        self.actor_target.load_state_dict(params["actor_target_state_dict"])
        self.critic.load_state_dict(params["critic_state_dict"])
        self.critic_target.load_state_dict(params["critic_target_state_dict"])
        self.actor_optim.load_state_dict(params["actor_optim_state_dict"])
        self.critic_optim.load_state_dict(params["critic_optim_state_dict"])
        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):
        """Save model and optimizer parameters."""
        params = {
            "actor_state_dict": self.actor.state_dict(),
            "actor_target_state_dict": self.actor_target.state_dict(),
            "critic_state_dict": self.critic.state_dict(),
            "critic_target_state_dict": self.critic_target.state_dict(),
            "actor_optim_state_dict": self.actor_optim.state_dict(),
            "critic_optim_state_dict": self.critic_optim.state_dict(),
        }
        Agent._save_params(self, params, n_episode)

    def write_log(self, log_value: tuple):
        """Write log about loss and score."""
        i, loss, score, avg_time_cost = log_value
        total_loss = loss.sum()

        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %d\n"
            "total loss: %f actor_loss: %.3f critic_loss: %.3f (spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                total_loss,
                loss[0],  # actor loss
                loss[1],  # critic loss
                avg_time_cost,
            )
        )

        if self.args.log:
            wandb.log(
                {
                    "score": score,
                    "total loss": total_loss,
                    "actor loss": loss[0],
                    "critic loss": loss[1],
                    "time per each step": avg_time_cost,
                }
            )

    # pylint: disable=no-self-use, unnecessary-pass
    def pretrain(self):
        """Pretraining steps."""
        pass

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.actor, self.critic], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0
            self.episode_step = 0
            losses = list()

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                if len(self.memory) >= self.hyper_params.batch_size:
                    for _ in range(self.hyper_params.multiple_update):
                        loss = self.update_model()
                        losses.append(loss)  # for logging

                state = next_state
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            # logging
            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)
                losses.clear()

            if self.i_episode % self.args.save_period == 0:
                self.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.save_params(self.i_episode)
        self.interim_test()
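# A minimal sketch (toy modules, plain torch) of the Polyak averaging that
# common_utils.soft_update is expected to perform in the agents above:
# target_param <- tau * param + (1 - tau) * target_param.
import torch
import torch.nn as nn


def soft_update_sketch(net: nn.Module, target_net: nn.Module, tau: float):
    """Blend online parameters into the target network in place."""
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
target_net.load_state_dict(net.state_dict())
soft_update_sketch(net, target_net, tau=0.005)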
class SACAgent(Agent):
    """SAC agent interacting with environment.

    Attributes:
        env (gym.Env): openAI Gym environment
        args (argparse.Namespace): arguments including hyperparameters and training settings
        hyper_params (ConfigDict): hyper-parameters
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        state_dim (int): state size of env
        action_dim (int): action size of env
        memory (ReplayBuffer): replay memory
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to select actions
        actor_optim (Optimizer): optimizer for training actor
        critic_1 (nn.Module): critic model to predict state values
        critic_2 (nn.Module): critic model to predict state values
        critic_target1 (nn.Module): target critic model to predict state values
        critic_target2 (nn.Module): target critic model to predict state values
        critic_optim1 (Optimizer): optimizer for training critic_1
        critic_optim2 (Optimizer): optimizer for training critic_2
        curr_state (np.ndarray): temporary storage of the current state
        total_step (int): total step number
        episode_step (int): step number of the current episode
        update_step (int): step number of updates
        i_episode (int): current episode number
        target_entropy (int): desired entropy used for the inequality constraint
        log_alpha (torch.Tensor): weight for entropy
        alpha_optim (Optimizer): optimizer for alpha
    """

    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        log_cfg: ConfigDict,
        hyper_params: ConfigDict,
        backbone: ConfigDict,
        head: ConfigDict,
        optim_cfg: ConfigDict,
    ):
        """Initialize.

        Args:
            env (gym.Env): openAI Gym environment
            args (argparse.Namespace): arguments including hyperparameters and training settings
        """
        Agent.__init__(self, env, args, log_cfg)

        self.curr_state = np.zeros((1,))
        self.total_step = 0
        self.episode_step = 0
        self.update_step = 0
        self.i_episode = 0

        self.hyper_params = hyper_params
        self.backbone_cfg = backbone
        self.head_cfg = head
        self.optim_cfg = optim_cfg

        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape[0]

        # target entropy
        target_entropy = -np.prod((self.action_dim,)).item()  # heuristic

        # automatic entropy tuning
        if hyper_params.auto_entropy_tuning:
            self.target_entropy = target_entropy
            self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
            self.alpha_optim = optim.Adam([self.log_alpha], lr=optim_cfg.lr_entropy)

        self._initialize()
        self._init_network()

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:
            # replay memory
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size, self.hyper_params.batch_size
            )

    # pylint: disable=attribute-defined-outside-init
    def _init_network(self):
        """Initialize networks and optimizers."""
        self.head_cfg.actor.configs.state_size = (
            self.head_cfg.critic_vf.configs.state_size
        ) = self.state_dim
        self.head_cfg.critic_qf.configs.state_size = (
            self.state_dim[0] + self.action_dim,
        )
        self.head_cfg.actor.configs.output_size = self.action_dim

        # create actor
        self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)

        # create v_critic
        self.vf = BaseNetwork(self.backbone_cfg.critic_vf, self.head_cfg.critic_vf).to(device)
        self.vf_target = BaseNetwork(self.backbone_cfg.critic_vf, self.head_cfg.critic_vf).to(device)
        self.vf_target.load_state_dict(self.vf.state_dict())

        # create q_critic
        self.qf_1 = BaseNetwork(self.backbone_cfg.critic_qf, self.head_cfg.critic_qf).to(device)
        self.qf_2 = BaseNetwork(self.backbone_cfg.critic_qf, self.head_cfg.critic_qf).to(device)

        # create optimizers
        self.actor_optim = optim.Adam(
            self.actor.parameters(),
            lr=self.optim_cfg.lr_actor,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.vf_optim = optim.Adam(
            self.vf.parameters(),
            lr=self.optim_cfg.lr_vf,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.qf_1_optim = optim.Adam(
            self.qf_1.parameters(),
            lr=self.optim_cfg.lr_qf1,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.qf_2_optim = optim.Adam(
            self.qf_2.parameters(),
            lr=self.optim_cfg.lr_qf2,
            weight_decay=self.optim_cfg.weight_decay,
        )

        # load the optimizer and model parameters
        if self.args.load_from is not None:
            self.load_params(self.args.load_from)

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state
        state = self._preprocess_state(state)

        # if initial random action should be conducted
        if (
            self.total_step < self.hyper_params.initial_random_action
            and not self.args.test
        ):
            return np.array(self.env.action_space.sample())

        if self.args.test:
            _, _, _, selected_action, _ = self.actor(state)
        else:
            selected_action, _, _, _, _ = self.actor(state)

        return selected_action.detach().cpu().numpy()

    # pylint: disable=no-self-use
    def _preprocess_state(self, state: np.ndarray) -> torch.Tensor:
        """Preprocess state so that actor selects an action."""
        state = torch.FloatTensor(state).to(device)
        return state

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)

        if not self.args.test:
            # if the last state is not a terminal state, store done as false
            done_bool = (
                False if self.episode_step == self.args.max_episode_steps else done
            )
            transition = (self.curr_state, action, reward, next_state, done_bool)
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        self.memory.add(transition)

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Train the model after each episode."""
        self.update_step += 1

        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states)

        # train alpha
        if self.hyper_params.auto_entropy_tuning:
            alpha_loss = (
                -self.log_alpha * (log_prob + self.target_entropy).detach()
            ).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.zeros(1)
            alpha = self.hyper_params.w_entropy

        # Q function loss
        masks = 1 - dones
        states_actions = torch.cat((states, actions), dim=-1)
        q_1_pred = self.qf_1(states_actions)
        q_2_pred = self.qf_2(states_actions)
        v_target = self.vf_target(next_states)
        q_target = rewards + self.hyper_params.gamma * v_target * masks
        qf_1_loss = F.mse_loss(q_1_pred, q_target.detach())
        qf_2_loss = F.mse_loss(q_2_pred, q_target.detach())

        # V function loss
        states_actions = torch.cat((states, new_actions), dim=-1)
        v_pred = self.vf(states)
        q_pred = torch.min(self.qf_1(states_actions), self.qf_2(states_actions))
        v_target = q_pred - alpha * log_prob
        vf_loss = F.mse_loss(v_pred, v_target.detach())

        # train Q functions
        self.qf_1_optim.zero_grad()
        qf_1_loss.backward()
        self.qf_1_optim.step()

        self.qf_2_optim.zero_grad()
        qf_2_loss.backward()
        self.qf_2_optim.step()

        # train V function
        self.vf_optim.zero_grad()
        vf_loss.backward()
        self.vf_optim.step()

        if self.update_step % self.hyper_params.policy_update_freq == 0:
            # actor loss
            advantage = q_pred - v_pred.detach()
            actor_loss = (alpha * log_prob - advantage).mean()

            # regularization
            mean_reg = self.hyper_params.w_mean_reg * mu.pow(2).mean()
            std_reg = self.hyper_params.w_std_reg * std.pow(2).mean()
            pre_activation_reg = self.hyper_params.w_pre_activation_reg * (
                pre_tanh_value.pow(2).sum(dim=-1).mean()
            )
            actor_reg = mean_reg + std_reg + pre_activation_reg

            # actor loss + regularization
            actor_loss += actor_reg

            # train actor
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            # update target networks
            common_utils.soft_update(self.vf, self.vf_target, self.hyper_params.tau)
        else:
            actor_loss = torch.zeros(1)

        return (
            actor_loss.item(),
            qf_1_loss.item(),
            qf_2_loss.item(),
            vf_loss.item(),
            alpha_loss.item(),
        )

    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        Agent.load_params(self, path)

        params = torch.load(path)
        self.actor.load_state_dict(params["actor"])
        self.qf_1.load_state_dict(params["qf_1"])
        self.qf_2.load_state_dict(params["qf_2"])
        self.vf.load_state_dict(params["vf"])
        self.vf_target.load_state_dict(params["vf_target"])
        self.actor_optim.load_state_dict(params["actor_optim"])
        self.qf_1_optim.load_state_dict(params["qf_1_optim"])
        self.qf_2_optim.load_state_dict(params["qf_2_optim"])
        self.vf_optim.load_state_dict(params["vf_optim"])

        if self.hyper_params.auto_entropy_tuning:
            self.alpha_optim.load_state_dict(params["alpha_optim"])

        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):  # type: ignore
        """Save model and optimizer parameters."""
        params = {
            "actor": self.actor.state_dict(),
            "qf_1": self.qf_1.state_dict(),
            "qf_2": self.qf_2.state_dict(),
            "vf": self.vf.state_dict(),
            "vf_target": self.vf_target.state_dict(),
            "actor_optim": self.actor_optim.state_dict(),
            "qf_1_optim": self.qf_1_optim.state_dict(),
            "qf_2_optim": self.qf_2_optim.state_dict(),
            "vf_optim": self.vf_optim.state_dict(),
        }
        if self.hyper_params.auto_entropy_tuning:
            params["alpha_optim"] = self.alpha_optim.state_dict()

        Agent.save_params(self, params, n_episode)

    def write_log(self, log_value: tuple):
        """Write log about loss and score."""
        i, loss, score, policy_update_freq, avg_time_cost = log_value
        total_loss = loss.sum()

        print(
            "[INFO] episode %d, episode_step %d, total step %d, total score: %d\n"
            "total loss: %.3f actor_loss: %.3f qf_1_loss: %.3f qf_2_loss: %.3f "
            "vf_loss: %.3f alpha_loss: %.3f (spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                total_loss,
                loss[0] * policy_update_freq,  # actor loss
                loss[1],  # qf_1 loss
                loss[2],  # qf_2 loss
                loss[3],  # vf loss
                loss[4],  # alpha loss
                avg_time_cost,
            )
        )

        if self.args.log:
            wandb.log(
                {
                    "score": score,
                    "total loss": total_loss,
                    "actor loss": loss[0] * policy_update_freq,
                    "qf_1 loss": loss[1],
                    "qf_2 loss": loss[2],
                    "vf loss": loss[3],
                    "alpha loss": loss[4],
                    "time per each step": avg_time_cost,
                }
            )

    # pylint: disable=no-self-use, unnecessary-pass
    def pretrain(self):
        """Pretraining steps."""
        pass

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.actor, self.vf, self.qf_1, self.qf_2], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0
            self.episode_step = 0
            loss_episode = list()

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                state = next_state
                score += reward

                # training
                if len(self.memory) >= self.hyper_params.batch_size:
                    for _ in range(self.hyper_params.multiple_update):
                        loss = self.update_model()
                        loss_episode.append(loss)  # for logging

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            # logging
            if loss_episode:
                avg_loss = np.vstack(loss_episode).mean(axis=0)
                log_value = (
                    self.i_episode,
                    avg_loss,
                    score,
                    self.hyper_params.policy_update_freq,
                    avg_time_cost,
                )
                self.write_log(log_value)

            if self.i_episode % self.args.save_period == 0:
                self.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.save_params(self.i_episode)
        self.interim_test()
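# A minimal sketch (toy tensors, plain torch; hypothetical values) of the automatic
# entropy-temperature update used in update_model above: log_alpha is trained so that
# the policy entropy is pushed toward the target entropy (the -|A| heuristic).
import torch
from torch import optim

action_dim = 2
target_entropy = -float(action_dim)                  # heuristic from the agent
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = optim.Adam([log_alpha], lr=3e-4)

log_prob = torch.tensor([[-1.3], [-0.7], [-2.1]])    # log pi(a|s) for a toy batch

alpha_loss = (-log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()                              # temperature used in the losses
print(alpha_loss.item(), alpha.item())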
class A2CAgent(Agent):
    """1-Step Advantage Actor-Critic interacting with environment.

    Attributes:
        env (gym.Env): openAI Gym environment
        args (argparse.Namespace): arguments including hyperparameters and training settings
        hyper_params (ConfigDict): hyper-parameters
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        state_dim (int): state size of env
        action_dim (int): action size of env
        actor (nn.Module): policy model to select actions
        critic (nn.Module): critic model to evaluate states
        actor_optim (Optimizer): optimizer for actor
        critic_optim (Optimizer): optimizer for critic
        episode_step (int): step number of the current episode
        i_episode (int): current episode number
        transition (list): recent transition information
    """

    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        log_cfg: ConfigDict,
        hyper_params: ConfigDict,
        backbone: ConfigDict,
        head: ConfigDict,
        optim_cfg: ConfigDict,
    ):
        """Initialize."""
        Agent.__init__(self, env, args, log_cfg)

        self.transition: list = list()
        self.episode_step = 0
        self.i_episode = 0

        self.hyper_params = hyper_params
        self.backbone_cfg = backbone
        self.head_cfg = head
        self.optim_cfg = optim_cfg

        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape[0]

        self._init_network()

    def _init_network(self):
        # create models
        self.head_cfg.actor.configs.state_size = (
            self.head_cfg.critic.configs.state_size
        ) = self.state_dim
        self.head_cfg.actor.configs.output_size = self.action_dim

        self.actor = BaseNetwork(self.backbone_cfg.actor, self.head_cfg.actor).to(device)
        self.critic = BaseNetwork(self.backbone_cfg.critic, self.head_cfg.critic).to(device)

        # create optimizer
        self.actor_optim = optim.Adam(
            self.actor.parameters(),
            lr=self.optim_cfg.lr_actor,
            weight_decay=self.optim_cfg.weight_decay,
        )
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=self.optim_cfg.lr_critic,
            weight_decay=self.optim_cfg.weight_decay,
        )

        if self.args.load_from is not None:
            self.load_params(self.args.load_from)

    def select_action(self, state: np.ndarray) -> torch.Tensor:
        """Select an action from the input space."""
        state = torch.FloatTensor(state).to(device)
        selected_action, dist = self.actor(state)

        if self.args.test:
            selected_action = dist.mean
        else:
            predicted_value = self.critic(state)
            log_prob = dist.log_prob(selected_action).sum(dim=-1)
            self.transition = []
            self.transition.extend([log_prob, predicted_value])

        return selected_action

    def step(self, action: torch.Tensor) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        action = action.detach().cpu().numpy()
        next_state, reward, done, info = self.env.step(action)

        if not self.args.test:
            done_bool = done
            if self.episode_step == self.args.max_episode_steps:
                done_bool = False
            self.transition.extend([next_state, reward, done_bool])

        return next_state, reward, done, info

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        log_prob, pred_value, next_state, reward, done = self.transition
        next_state = torch.FloatTensor(next_state).to(device)

        # Q_t = r + gamma * V(s_{t+1}) if state != Terminal
        #     = r                      otherwise
        mask = 1 - done
        next_value = self.critic(next_state).detach()
        q_value = reward + self.hyper_params.gamma * next_value * mask
        q_value = q_value.to(device)

        # advantage = Q_t - V(s_t)
        advantage = q_value - pred_value

        # calculate loss at the current step
        policy_loss = -advantage.detach() * log_prob  # adv. is not backpropagated
        policy_loss += self.hyper_params.w_entropy * -log_prob  # entropy
        value_loss = F.smooth_l1_loss(pred_value, q_value.detach())

        # train
        gradient_clip_ac = self.hyper_params.gradient_clip_ac
        gradient_clip_cr = self.hyper_params.gradient_clip_cr

        self.actor_optim.zero_grad()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
        self.actor_optim.step()

        self.critic_optim.zero_grad()
        value_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
        self.critic_optim.step()

        return policy_loss.item(), value_loss.item()

    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        Agent.load_params(self, path)

        params = torch.load(path)
        self.actor.load_state_dict(params["actor_state_dict"])
        self.critic.load_state_dict(params["critic_state_dict"])
        self.actor_optim.load_state_dict(params["actor_optim_state_dict"])
        self.critic_optim.load_state_dict(params["critic_optim_state_dict"])
        print("[INFO] Loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):
        """Save model and optimizer parameters."""
        params = {
            "actor_state_dict": self.actor.state_dict(),
            "critic_state_dict": self.critic.state_dict(),
            "actor_optim_state_dict": self.actor_optim.state_dict(),
            "critic_optim_state_dict": self.critic_optim.state_dict(),
        }
        Agent._save_params(self, params, n_episode)

    def write_log(self, log_value: tuple):
        i, score, policy_loss, value_loss = log_value
        total_loss = policy_loss + value_loss

        print(
            "[INFO] episode %d\tepisode step: %d\ttotal score: %d\n"
            "total loss: %.4f\tpolicy loss: %.4f\tvalue loss: %.4f\n"
            % (i, self.episode_step, score, total_loss, policy_loss, value_loss)
        )

        if self.args.log:
            wandb.log(
                {
                    "total loss": total_loss,
                    "policy loss": policy_loss,
                    "value loss": value_loss,
                    "score": score,
                }
            )

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.actor, self.critic], log="parameters")

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0
            policy_loss_episode = list()
            value_loss_episode = list()
            self.episode_step = 0

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.episode_step += 1

                policy_loss, value_loss = self.update_model()
                policy_loss_episode.append(policy_loss)
                value_loss_episode.append(value_loss)

                state = next_state
                score += reward

            # logging
            policy_loss = np.array(policy_loss_episode).mean()
            value_loss = np.array(value_loss_episode).mean()
            log_value = (self.i_episode, score, policy_loss, value_loss)
            self.write_log(log_value)

            if self.i_episode % self.args.save_period == 0:
                self.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.save_params(self.i_episode)
        self.interim_test()
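# A minimal sketch (toy numbers, plain torch; hypothetical values) of the 1-step
# advantage computed in update_model above: Q_t = r + gamma * V(s_{t+1}) * mask and
# advantage = Q_t - V(s_t). The actor is pushed along log_prob weighted by the detached
# advantage; the critic regresses V(s_t) toward Q_t.
import torch

gamma = 0.99
reward, done = 1.0, 0.0
pred_value = torch.tensor([0.5])   # V(s_t)
next_value = torch.tensor([0.8])   # V(s_{t+1}), detached in the agent
log_prob = torch.tensor([-1.2])    # log pi(a_t|s_t)

mask = 1 - done
q_value = reward + gamma * next_value * mask
advantage = q_value - pred_value
policy_loss = -advantage.detach() * log_prob
print(q_value.item(), advantage.item(), policy_loss.item())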
def calculate_iqn_loss(
    model: BaseNetwork,
    target_model: BaseNetwork,
    experiences: Tuple[torch.Tensor, ...],
    gamma: float,
    batch_size: int,
    n_tau_samples: int,
    n_tau_prime_samples: int,
    kappa: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return element-wise IQN loss and Q-values.

    Reference: https://github.com/google/dopamine
    """
    states, actions, rewards, next_states, dones = experiences[:5]

    # size of rewards: (n_tau_prime_samples x batch_size) x 1.
    rewards = rewards.repeat(n_tau_prime_samples, 1)

    # size of gamma_with_terminal: (n_tau_prime_samples x batch_size) x 1.
    masks = 1 - dones
    gamma_with_terminal = masks * gamma
    gamma_with_terminal = gamma_with_terminal.repeat(n_tau_prime_samples, 1)

    # Get the indices of the maximum Q-value across the action dimension.
    # Shape of replay_next_qt_argmax: (n_tau_prime_samples x batch_size) x 1.
    next_actions = model(next_states).argmax(dim=1)  # double Q
    next_actions = next_actions[:, None]
    next_actions = next_actions.repeat(n_tau_prime_samples, 1)

    # Shape of next_target_values: (n_tau_prime_samples x batch_size) x 1.
    target_quantile_values, _ = target_model.forward_(next_states, n_tau_prime_samples)
    target_quantile_values = target_quantile_values.gather(1, next_actions)
    target_quantile_values = rewards + gamma_with_terminal * target_quantile_values
    target_quantile_values = target_quantile_values.detach()

    # Reshape to n_tau_prime_samples x batch_size x 1 since this is
    # the manner in which the target_quantile_values are tiled.
    target_quantile_values = target_quantile_values.view(
        n_tau_prime_samples, batch_size, 1
    )

    # Transpose dimensions so that the dimensionality is batch_size x
    # n_tau_prime_samples x 1 to prepare for computation of Bellman errors.
    target_quantile_values = torch.transpose(target_quantile_values, 0, 1)

    # Get quantile values: (n_tau_samples x batch_size) x action_dim.
    quantile_values, quantiles = model.forward_(states, n_tau_samples)
    reshaped_actions = actions[:, None].repeat(n_tau_samples, 1)
    chosen_action_quantile_values = quantile_values.gather(1, reshaped_actions.long())
    chosen_action_quantile_values = chosen_action_quantile_values.view(
        n_tau_samples, batch_size, 1
    )

    # Transpose dimensions so that the dimensionality is batch_size x
    # n_tau_samples x 1 to prepare for computation of Bellman errors.
    chosen_action_quantile_values = torch.transpose(chosen_action_quantile_values, 0, 1)

    # Shape of bellman_errors and huber_loss:
    # batch_size x n_tau_prime_samples x n_tau_samples x 1.
    bellman_errors = (
        target_quantile_values[:, :, None, :]
        - chosen_action_quantile_values[:, None, :, :]
    )

    # The huber loss (introduced in QR-DQN) is defined via two cases:
    #   case_one: |bellman_errors| <= kappa
    #   case_two: |bellman_errors| > kappa
    huber_loss_case_one = (
        (torch.abs(bellman_errors) <= kappa).float() * 0.5 * bellman_errors ** 2
    )
    huber_loss_case_two = (
        (torch.abs(bellman_errors) > kappa).float()
        * kappa
        * (torch.abs(bellman_errors) - 0.5 * kappa)
    )
    huber_loss = huber_loss_case_one + huber_loss_case_two

    # Reshape quantiles to batch_size x n_tau_samples x 1.
    quantiles = quantiles.view(n_tau_samples, batch_size, 1)
    quantiles = torch.transpose(quantiles, 0, 1)

    # Tile by n_tau_prime_samples along a new dimension. Shape is now
    # batch_size x n_tau_prime_samples x n_tau_samples x 1.
    # These quantiles will be used for computation of the quantile huber loss
    # below (see section 2.3 of the paper).
    quantiles = quantiles[:, None, :, :].repeat(1, n_tau_prime_samples, 1, 1)
    quantiles = quantiles.to(device)

    # Shape: batch_size x n_tau_prime_samples x n_tau_samples x 1.
    quantile_huber_loss = (
        torch.abs(quantiles - (bellman_errors < 0).float().detach())
        * huber_loss
        / kappa
    )

    # Sum over current quantile value (n_tau_samples) dimension,
    # average over target quantile value (n_tau_prime_samples) dimension.
    # Shape: batch_size x n_tau_prime_samples x 1.
    loss = torch.sum(quantile_huber_loss, dim=2)

    # Shape: batch_size x 1.
    iqn_loss_element_wise = torch.mean(loss, dim=1)

    # Q-values for regularization.
    q_values = model(states)

    return iqn_loss_element_wise, q_values
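# --- Illustrative sketch (not part of the original loss module) ---
# The tensorized computation above implements the quantile Huber loss
# rho_tau(u) = |tau - 1{u < 0}| * L_kappa(u) / kappa, summed over the current
# quantile samples and averaged over the target quantile samples. The helper
# below writes that formula out for a single (tau, bellman_error) pair with
# hypothetical numbers, which can be handy for sanity-checking the formula
# without the batch and tiling dimensions.
import torch


def quantile_huber(u: torch.Tensor, tau: float, kappa: float = 1.0) -> torch.Tensor:
    """Quantile Huber loss rho_tau(u) for a single Bellman error u."""
    abs_u = u.abs()
    # Huber part L_kappa(u): quadratic inside kappa, linear outside.
    huber = torch.where(abs_u <= kappa, 0.5 * u ** 2, kappa * (abs_u - 0.5 * kappa))
    # Asymmetric quantile weight |tau - 1{u < 0}|.
    return (tau - (u < 0).float()).abs() * huber / kappa


# At a low quantile (tau = 0.25), a negative error is weighted three times more
# heavily than a positive error of the same magnitude.
print(quantile_huber(torch.tensor(0.3), tau=0.25))   # ~0.0113
print(quantile_huber(torch.tensor(-0.3), tau=0.25))  # ~0.0338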
class DQNAgent(Agent):
    """DQN interacting with environment.

    Attributes:
        env (gym.Env): openAI Gym environment
        args (argparse.Namespace): arguments including hyperparameters and training settings
        hyper_params (ConfigDict): hyper-parameters
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        state_dim (int): state size of env
        action_dim (int): action size of env
        memory (PrioritizedReplayBuffer): replay memory
        dqn (nn.Module): actor model to select actions
        dqn_target (nn.Module): target actor model to select actions
        dqn_optim (Optimizer): optimizer for training actor
        curr_state (np.ndarray): temporary storage of the current state
        total_step (int): total step number
        episode_step (int): step number of the current episode
        i_episode (int): current episode number
        epsilon (float): parameter for epsilon greedy policy
        n_step_buffer (deque): n-size buffer to calculate n-step returns
        per_beta (float): beta parameter for prioritized replay buffer
        use_conv (bool): whether or not to use convolution layer
        use_n_step (bool): whether or not to use n-step returns
    """

    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        log_cfg: ConfigDict,
        hyper_params: ConfigDict,
        backbone: ConfigDict,
        head: ConfigDict,
        optim_cfg: ConfigDict,
    ):
        """Initialize."""
        Agent.__init__(self, env, args, log_cfg)

        self.curr_state = np.zeros(1)
        self.episode_step = 0
        self.total_step = 0
        self.i_episode = 0

        self.hyper_params = hyper_params
        self.optim_cfg = optim_cfg
        self.backbone_cfg = backbone
        self.head_cfg = head

        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.n

        self.per_beta = hyper_params.per_beta
        self.use_conv = len(self.state_dim) > 1
        self.use_n_step = hyper_params.n_step > 1

        if head.configs.use_noisy_net:
            self.max_epsilon = 0.0
            self.min_epsilon = 0.0
            self.epsilon = 0.0
        else:
            self.max_epsilon = hyper_params.max_epsilon
            self.min_epsilon = hyper_params.min_epsilon
            self.epsilon = hyper_params.max_epsilon

        self._initialize()
        self._init_network()

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:
            # replay memory for a single step
            self.memory = PrioritizedReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                alpha=self.hyper_params.per_alpha,
            )

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = ReplayBuffer(
                    self.hyper_params.buffer_size,
                    self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                )

    # pylint: disable=attribute-defined-outside-init
    def _init_network(self):
        """Initialize networks and optimizers."""
        self.head_cfg.configs.state_size = self.state_dim
        self.head_cfg.configs.output_size = self.action_dim

        self.dqn = BaseNetwork(self.backbone_cfg, self.head_cfg).to(device)
        self.dqn_target = BaseNetwork(self.backbone_cfg, self.head_cfg).to(device)
        self.loss_fn = build_loss(self.hyper_params.loss_type)
        self.dqn_target.load_state_dict(self.dqn.state_dict())

        # create optimizer
        self.dqn_optim = optim.Adam(
            self.dqn.parameters(),
            lr=self.optim_cfg.lr_dqn,
            weight_decay=self.optim_cfg.weight_decay,
            eps=self.optim_cfg.adam_eps,
        )

        # load the optimizer and model parameters
        if self.args.load_from is not None:
            self.load_params(self.args.load_from)

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state

        # epsilon greedy policy
        # pylint: disable=comparison-with-callable
        if not self.args.test and self.epsilon > np.random.random():
            selected_action = np.array(self.env.action_space.sample())
        else:
            state = self._preprocess_state(state)
            selected_action = self.dqn(state).argmax()
            selected_action = selected_action.detach().cpu().numpy()
        return selected_action

    # pylint: disable=no-self-use
    def _preprocess_state(self, state: np.ndarray) -> torch.Tensor:
        """Preprocess state so that actor selects an action."""
        state = torch.FloatTensor(state).to(device)
        return state

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)

        if not self.args.test:
            # if the last state is not a terminal state, store done as false
            done_bool = (
                False if self.episode_step == self.args.max_episode_steps else done
            )

            transition = (self.curr_state, action, reward, next_state, done_bool)
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # add n-step transition
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # add a single step transition
        # if transition is not an empty tuple
        if transition:
            self.memory.add(transition)

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Train the model after each episode."""
        # 1 step loss
        experiences_1 = self.memory.sample(self.per_beta)
        weights, indices = experiences_1[-3:-1]
        gamma = self.hyper_params.gamma
        dq_loss_element_wise, q_values = self.loss_fn(
            self.dqn, self.dqn_target, experiences_1, gamma, self.head_cfg
        )
        dq_loss = torch.mean(dq_loss_element_wise * weights)

        # n step loss
        if self.use_n_step:
            experiences_n = self.memory_n.sample(indices)
            gamma = self.hyper_params.gamma ** self.hyper_params.n_step
            dq_loss_n_element_wise, q_values_n = self.loss_fn(
                self.dqn, self.dqn_target, experiences_n, gamma, self.head_cfg
            )

            # to update loss and priorities
            q_values = 0.5 * (q_values + q_values_n)
            dq_loss_element_wise += dq_loss_n_element_wise * self.hyper_params.w_n_step
            dq_loss = torch.mean(dq_loss_element_wise * weights)

        # q_value regularization
        q_regular = torch.norm(q_values, 2).mean() * self.hyper_params.w_q_reg

        # total loss
        loss = dq_loss + q_regular

        self.dqn_optim.zero_grad()
        loss.backward()
        clip_grad_norm_(self.dqn.parameters(), self.hyper_params.gradient_clip)
        self.dqn_optim.step()

        # update target networks
        common_utils.soft_update(self.dqn, self.dqn_target, self.hyper_params.tau)

        # update priorities in PER
        loss_for_prior = dq_loss_element_wise.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.hyper_params.per_eps
        self.memory.update_priorities(indices, new_priorities)

        # increase beta
        fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
        self.per_beta = self.per_beta + fraction * (1.0 - self.per_beta)

        if self.head_cfg.configs.use_noisy_net:
            self.dqn.head.reset_noise()
            self.dqn_target.head.reset_noise()

        return loss.item(), q_values.mean().item()

    def load_params(self, path: str):
        """Load model and optimizer parameters."""
        Agent.load_params(self, path)

        params = torch.load(path)
        self.dqn.load_state_dict(params["dqn_state_dict"])
        self.dqn_target.load_state_dict(params["dqn_target_state_dict"])
        self.dqn_optim.load_state_dict(params["dqn_optim_state_dict"])
        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):
        """Save model and optimizer parameters."""
        params = {
            "dqn_state_dict": self.dqn.state_dict(),
            "dqn_target_state_dict": self.dqn_target.state_dict(),
            "dqn_optim_state_dict": self.dqn_optim.state_dict(),
        }
        Agent._save_params(self, params, n_episode)

    def write_log(self, log_value: tuple):
        """Write log about loss and score."""
        i, loss, score, avg_time_cost = log_value
        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
            "epsilon: %f, loss: %f, avg q-value: %f (spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                self.epsilon,
                loss[0],
                loss[1],
                avg_time_cost,
            )
        )

        if self.args.log:
            wandb.log(
                {
                    "score": score,
                    "epsilon": self.epsilon,
                    "dqn loss": loss[0],
                    "avg q values": loss[1],
                    "time per each step": avg_time_cost,
                }
            )

    # pylint: disable=no-self-use, unnecessary-pass
    def pretrain(self):
        """Pretraining steps."""
        pass

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.dqn], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            self.episode_step = 0
            losses = list()
            done = False
            score = 0

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                if len(self.memory) >= self.hyper_params.update_starts_from:
                    if self.total_step % self.hyper_params.train_freq == 0:
                        for _ in range(self.hyper_params.multiple_update):
                            loss = self.update_model()
                            losses.append(loss)  # for logging

                    # decrease epsilon
                    self.epsilon = max(
                        self.epsilon
                        - (self.max_epsilon - self.min_epsilon)
                        * self.hyper_params.epsilon_decay,
                        self.min_epsilon,
                    )

                state = next_state
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)

            if self.i_episode % self.args.save_period == 0:
                self.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.save_params(self.i_episode)
        self.interim_test()
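# --- Illustrative sketch (not part of the original agent code) ---
# update_model above refreshes the target network with
# common_utils.soft_update(self.dqn, self.dqn_target, tau). The helper's actual
# implementation is not shown in this file; the sketch below is the standard
# Polyak form such a helper typically takes:
# target <- tau * online + (1 - tau) * target.
import torch
import torch.nn as nn


def soft_update_sketch(online: nn.Module, target: nn.Module, tau: float):
    """Blend online parameters into target parameters in place (hypothetical helper)."""
    with torch.no_grad():
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)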