def create_model(self) -> None: """ Initialize the model Initializes optimizer and replay buffers as well. """ state_dim, action_dim, discrete, _ = get_env_properties(self.env) if discrete: raise Exception( "Discrete Environments not supported for {}.".format( __class__.__name__)) if self.noise is not None: self.noise = self.noise(np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)) self.ac = get_model("ac", self.network_type)(state_dim, action_dim, self.layers, "Qsa", False).to(self.device) self.ac_target = deepcopy(self.ac).to(self.device) # freeze target network params for param in self.ac_target.parameters(): param.requires_grad = False self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_p) self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)
def create_model(self) -> None: """ Initialize the model Initializes optimizer and replay buffers as well. """ state_dim, action_dim, discrete, _ = get_env_properties(self.env) self.q1 = (get_model("v", self.network_type)( state_dim, action_dim, "Qsa", self.layers).to(self.device).float()) self.q2 = (get_model("v", self.network_type)( state_dim, action_dim, "Qsa", self.layers).to(self.device).float()) self.policy = (get_model( "p", self.network_type)(state_dim, action_dim, self.layers, discrete, False, sac=True).to(self.device).float()) self.q1_targ = deepcopy(self.q1).to(self.device).float() self.q2_targ = deepcopy(self.q2).to(self.device).float() # freeze target parameters for param in self.q1_targ.parameters(): param.requires_grad = False for param in self.q2_targ.parameters(): param.requires_grad = False # optimizers self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr) self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr) self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr) if self.entropy_tuning: self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape).to( self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr) self.replay_buffer = ReplayBuffer(self.replay_size, self.env) # set action scales if self.env.action_space is None: self.action_scale = torch.tensor(1.0).to(self.device) self.action_bias = torch.tensor(0.0).to(self.device) else: self.action_scale = torch.FloatTensor( (self.env.action_space.high - self.env.action_space.low) / 2.0).to(self.device) self.action_bias = torch.FloatTensor( (self.env.action_space.high + self.env.action_space.low) / 2.0).to(self.device)
def create_model(self):
    state_dim, action_dim, disc = self.get_env_properties()

    if self.noise is not None:
        self.noise = self.noise(
            np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
        )

    self.ac = get_model("ac", self.network_type)(
        state_dim, action_dim, self.layers, "Qsa", False
    ).to(self.device)
    self.ac.qf1 = self.ac.critic
    self.ac.qf2 = get_model("v", self.network_type)(
        state_dim, action_dim, hidden=self.layers, val_type="Qsa"
    )
    self.ac.qf1.to(self.device)
    self.ac.qf2.to(self.device)

    if self.pretrained is not None:
        self.load(self)
        self.ac.actor.load_state_dict(self.checkpoint["policy_weights"])
        self.ac.qf1.load_state_dict(self.checkpoint["q1_weights"])
        self.ac.qf2.load_state_dict(self.checkpoint["q2_weights"])
        for key, item in self.checkpoint.items():
            if key not in ["weights", "save_model"]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.ac_target = deepcopy(self.ac).to(self.device)

    # freeze target network params
    for param in self.ac_target.parameters():
        param.requires_grad = False

    self.replay_buffer = ReplayBuffer(self.replay_size)
    self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters())
    self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)
    self.optimizer_policy = torch.optim.Adam(self.ac.actor.parameters(), lr=self.lr_p)
def create_model(self) -> None: """ Initialize the model and target model for various variants of DQN. Initializes optimizer and replay buffers as well. """ state_dim, action_dim, _, _ = get_env_properties(self.env) if self.network_type == "mlp": if self.dueling_dqn: self.model = DuelingDQNValueMlp(state_dim, action_dim) elif self.categorical_dqn: self.model = CategoricalDQNValue(state_dim, action_dim, self.num_atoms) elif self.noisy_dqn: self.model = NoisyDQNValue(state_dim, action_dim) else: self.model = get_model("v", self.network_type)(state_dim, action_dim, "Qs") elif self.network_type == "cnn": self.framestack = self.env.framestack if self.dueling_dqn: self.model = DuelingDQNValueCNN(action_dim, self.framestack) elif self.noisy_dqn: self.model = NoisyDQNValueCNN(action_dim, self.framestack) elif self.categorical_dqn: self.model = CategoricalDQNValueCNN(action_dim, self.num_atoms, self.framestack) else: self.model = get_model("v", self.network_type)(action_dim, self.framestack, "Qs") self.target_model = deepcopy(self.model) if self.prioritized_replay: self.replay_buffer = PrioritizedBuffer( self.replay_size, self.prioritized_replay_alpha) else: self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)
class TD3: """ Twin Delayed DDPG Paper: https://arxiv.org/abs/1509.02971 :param network_type: (str) The deep neural network layer types ['mlp'] :param env: (Gym environment) The environment to learn from :param gamma: (float) discount factor :param replay_size: (int) Replay memory size :param batch_size: (int) Update batch size :param lr_p: (float) Policy network learning rate :param lr_q: (float) Q network learning rate :param polyak: (float) Polyak averaging weight to update target network :param policy_frequency: (int) Update actor and target networks every policy_frequency steps :param epochs: (int) Number of epochs :param start_steps: (int) Number of exploratory steps at start :param steps_per_epoch: (int) Number of steps per epoch :param noise_std: (float) Standard deviation for action noise :param max_ep_len: (int) Maximum steps per episode :param start_update: (int) Number of steps before first parameter update :param update_interval: (int) Number of steps between parameter updates :param layers: (tuple or list) Number of neurons in hidden layers :param seed (int): seed for torch and gym :param render (boolean): if environment is to be rendered :param device (str): device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu :type network_type: str :type env: Gym environment :type gamma: float :type replay_size: int :type batch_size: int :type lr_p: float :type lr_q: float :type polyak: float :type policy_frequency: int :type epochs: int :type start_steps: int :type steps_per_epoch: int :type noise_std: float :type max_ep_len: int :type start_update: int :type update_interval: int :type layers: tuple or list :type seed: int :type render: boolean :type device: str """ def __init__( self, network_type: str, env: Union[gym.Env, VecEnv], gamma: float = 0.99, replay_size: int = 1000, batch_size: int = 100, lr_p: float = 0.001, lr_q: float = 0.001, polyak: float = 0.995, policy_frequency: int = 2, epochs: int = 100, start_steps: int = 10000, steps_per_epoch: int = 4000, noise: Optional[Any] = None, noise_std: float = 0.1, max_ep_len: int = 1000, start_update: int = 1000, update_interval: int = 50, layers: Tuple = (256, 256), seed: Optional[int] = None, render: bool = False, device: Union[torch.device, str] = "cpu", ): self.network_type = network_type self.env = env self.gamma = gamma self.replay_size = replay_size self.batch_size = batch_size self.lr_p = lr_p self.lr_q = lr_q self.polyak = polyak self.policy_frequency = policy_frequency self.epochs = epochs self.start_steps = start_steps self.steps_per_epoch = steps_per_epoch self.noise = noise self.noise_std = noise_std self.max_ep_len = max_ep_len self.start_update = start_update self.update_interval = update_interval self.layers = layers self.seed = seed self.render = render # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) self.empty_logs() self.create_model() def create_model(self) -> None: state_dim, action_dim, discrete, _ = get_env_properties(self.env) if discrete: raise Exception( "Discrete Environments not supported for {}.".format(__class__.__name__) ) if self.noise is not None: self.noise = self.noise( np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) self.ac = get_model("ac", self.network_type)( state_dim, action_dim, self.layers, "Qsa", False ).to(self.device) self.ac.qf1 = self.ac.critic self.ac.qf2 = get_model("v", self.network_type)( 
state_dim, action_dim, hidden=self.layers, val_type="Qsa" ) self.ac.qf1.to(self.device) self.ac.qf2.to(self.device) self.ac_target = deepcopy(self.ac).to(self.device) # freeze target network params for param in self.ac_target.parameters(): param.requires_grad = False self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters()) self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q) self.optimizer_policy = torch.optim.Adam( self.ac.actor.parameters(), lr=self.lr_p ) def update_params_before_select_action(self, timestep: int) -> None: """ Update any parameters before selecting action like epsilon for decaying epsilon greedy :param timestep: Timestep in the training process :type timestep: int """ pass def select_action( self, state: np.ndarray, deterministic: bool = False ) -> np.ndarray: with torch.no_grad(): action = self.ac_target.get_action( torch.as_tensor(state, dtype=torch.float32, device=self.device), deterministic=deterministic, )[0].numpy() # add noise to output from policy network if self.noise is not None: action += self.noise() return np.clip( action, -self.env.action_space.high[0], self.env.action_space.high[0] ) def get_q_loss( self, state: np.ndarray, action: np.ndarray, reward: np.ndarray, next_state: np.ndarray, done: np.ndarray, ) -> torch.Tensor: q1 = self.ac.qf1.get_value(torch.cat([state, action], dim=-1)) q2 = self.ac.qf2.get_value(torch.cat([state, action], dim=-1)) with torch.no_grad(): target_q1 = self.ac_target.qf1.get_value( torch.cat( [ next_state, self.ac_target.get_action(next_state, deterministic=True)[0], ], dim=-1, ) ) target_q2 = self.ac_target.qf2.get_value( torch.cat( [ next_state, self.ac_target.get_action(next_state, deterministic=True)[0], ], dim=-1, ) ) target_q = torch.min(target_q1, target_q2).unsqueeze(1) target = reward.squeeze(1) + self.gamma * (1 - done) * target_q.squeeze(1) l1 = nn.MSELoss()(q1, target) l2 = nn.MSELoss()(q2, target) return l1 + l2 def get_p_loss(self, state: np.array) -> torch.Tensor: q_pi = self.ac.get_value( torch.cat([state, self.ac.get_action(state, deterministic=True)[0]], dim=-1) ) return -torch.mean(q_pi) def update_params(self, update_interval: int) -> None: for timestep in range(update_interval): batch = self.replay_buffer.sample(self.batch_size) state, action, reward, next_state, done = (x.to(self.device) for x in batch) self.optimizer_q.zero_grad() # print(state.shape, action.shape, reward.shape, next_state.shape, done.shape) loss_q = self.get_q_loss(state, action, reward, next_state, done) loss_q.backward() self.optimizer_q.step() # Delayed Update if timestep % self.policy_frequency == 0: # freeze critic params for policy update for param in self.q_params: param.requires_grad = False self.optimizer_policy.zero_grad() loss_p = self.get_p_loss(state) loss_p.backward() self.optimizer_policy.step() # unfreeze critic params for param in self.ac.critic.parameters(): param.requires_grad = True # update target network with torch.no_grad(): for param, param_target in zip( self.ac.parameters(), self.ac_target.parameters() ): param_target.data.mul_(self.polyak) param_target.data.add_((1 - self.polyak) * param.data) self.logs["policy_loss"].append(loss_p.item()) self.logs["value_loss"].append(loss_q.item()) def learn(self) -> None: # pragma: no cover state, episode_reward, episode_len, episode = ( self.env.reset(), np.zeros(self.env.n_envs), np.zeros(self.env.n_envs), np.zeros(self.env.n_envs), ) total_steps = self.steps_per_epoch * 
self.epochs * self.env.n_envs if self.noise is not None: self.noise.reset() for timestep in range(0, total_steps, self.env.n_envs): # execute single transition if timestep > self.start_steps: action = self.select_action(state) else: action = self.env.sample() next_state, reward, done, _ = self.env.step(action) if self.render: self.env.render() episode_reward += reward episode_len += 1 # dont set d to True if max_ep_len reached # done = self.env.n_envs*[False] if np.any(episode_len == self.max_ep_len) else done done = np.array( [ False if episode_len[i] == self.max_ep_len else done[i] for i, ep_len in enumerate(episode_len) ] ) self.replay_buffer.extend(zip(state, action, reward, next_state, done)) state = next_state if np.any(done) or np.any(episode_len == self.max_ep_len): if sum(episode) % 20 == 0: print( "Ep: {}, reward: {}, t: {}".format( sum(episode), np.mean(episode_reward), timestep ) ) for i, di in enumerate(done): # print(d) if di or episode_len[i] == self.max_ep_len: episode_reward[i] = 0 episode_len[i] = 0 episode += 1 if self.noise is not None: self.noise.reset() state, episode_reward, episode_len = ( self.env.reset(), np.zeros(self.env.n_envs), np.zeros(self.env.n_envs), ) episode += 1 # update params if timestep >= self.start_update and timestep % self.update_interval == 0: self.update_params(self.update_interval) self.env.close() def get_hyperparams(self) -> Dict[str, Any]: hyperparams = { "network_type": self.network_type, "gamma": self.gamma, "lr_p": self.lr_p, "lr_q": self.lr_q, "polyak": self.polyak, "policy_frequency": self.policy_frequency, "noise_std": self.noise_std, "q1_weights": self.ac.qf1.state_dict(), "q2_weights": self.ac.qf2.state_dict(), "actor_weights": self.ac.actor.state_dict(), } return hyperparams def load_weights(self, weights) -> None: """ Load weights for the agent from pretrained model """ self.ac.actor.load_state_dict(weights["actor_weights"]) self.ac.qf1.load_state_dict(weights["q1_weights"]) self.ac.qf2.load_state_dict(weights["q2_weights"]) def get_logging_params(self) -> Dict[str, Any]: """ :returns: Logging parameters for monitoring training :rtype: dict """ logs = { "policy_loss": safe_mean(self.logs["policy_loss"]), "value_loss": safe_mean(self.logs["value_loss"]), } self.empty_logs() return logs def empty_logs(self): """ Empties logs """ self.logs = {} self.logs["policy_loss"] = [] self.logs["value_loss"] = []
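# The clipped double-Q target used in TD3.get_q_loss above takes the minimum of
# the two target critics before bootstrapping. A tiny standalone illustration of
# that target computation with dummy tensors (same formula, toy numbers):
import torch

gamma = 0.99
reward = torch.tensor([1.0, 0.5])
done = torch.tensor([0.0, 1.0])
target_q1 = torch.tensor([10.0, 8.0])   # Q1'(s', pi'(s'))
target_q2 = torch.tensor([9.0, 12.0])   # Q2'(s', pi'(s'))

target_q = torch.min(target_q1, target_q2)          # clipped double-Q estimate
td_target = reward + gamma * (1 - done) * target_q  # -> tensor([9.9100, 0.5000])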
class SAC: """ Soft Actor Critic algorithm (SAC) Paper: https://arxiv.org/abs/1812.05905 :param network_type: (str) The deep neural network layer types ['mlp'] :param env: (Gym environment) The environment to learn from :param gamma: (float) discount factor :param replay_size: (int) Replay memory size :param batch_size: (int) Update batch size :param lr: (float) network learning rate :param alpha: (float) entropy weight :param polyak: (float) Polyak averaging weight to update target network :param epochs: (int) Number of epochs :param start_steps: (int) Number of exploratory steps at start :param steps_per_epoch: (int) Number of steps per epoch :param max_ep_len: (int) Maximum steps per episode :param start_update: (int) Number of steps before first parameter update :param update_interval: (int) Number of steps between parameter updates :param save_interval: (int) Number of steps between saves of models :param layers: (tuple or list) Number of neurons in hidden layers :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param seed (int): seed for torch and gym :param render (boolean): if environment is to be rendered :param device (str): device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu :param run_num: (boolean) if model has already been trained :param save_name: (str) model save name (if None, model hasn't been pretrained) :param save_version: (int) model save version (if None, model hasn't been pretrained) """ def __init__( self, network_type, env, gamma=0.99, replay_size=1000000, batch_size=256, lr=3e-4, alpha=0.01, polyak=0.995, entropy_tuning=True, epochs=1000, start_steps=0, steps_per_epoch=1000, max_ep_len=1000, start_update=256, update_interval=1, layers=(256, 256), pretrained=None, tensorboard_log=None, seed=None, render=False, device="cpu", run_num=None, save_model=None, save_interval=5000, ): self.network_type = network_type self.env = env self.gamma = gamma self.replay_size = replay_size self.batch_size = batch_size self.lr = lr self.alpha = alpha self.polyak = polyak self.entropy_tuning = entropy_tuning self.epochs = epochs self.start_steps = start_steps self.steps_per_epoch = steps_per_epoch self.max_ep_len = max_ep_len self.start_update = start_update self.update_interval = update_interval self.save_interval = save_interval self.layers = layers self.pretrained = pretrained self.tensorboard_log = tensorboard_log self.seed = seed self.render = render self.run_num = run_num self.save_model = save_model self.save = save_params self.load = load_params self.evaluate = evaluate # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) # Setup tensorboard writer self.writer = None if self.tensorboard_log is not None: # pragma: no cover from torch.utils.tensorboard import SummaryWriter self.writer = SummaryWriter(log_dir=self.tensorboard_log) self.create_model() def create_model(self): state_dim = self.env.observation_space.shape[0] # initialize models if isinstance(self.env.action_space, gym.spaces.Discrete): action_dim = self.env.action_space.n disc = True elif isinstance(self.env.action_space, gym.spaces.Box): action_dim = self.env.action_space.shape[0] disc = False else: raise NotImplementedError self.q1 = get_model("v", self.network_type)(state_dim, action_dim, "Qsa", self.layers).to(self.device) self.q2 = get_model("v", self.network_type)(state_dim, action_dim, "Qsa", 
self.layers).to(self.device) self.policy = get_model("p", self.network_type)(state_dim, action_dim, self.layers, disc, False, sac=True).to(self.device) if self.pretrained is not None: self.load(self) self.q1.load_state_dict(self.checkpoint["q1_weights"]) self.q2.load_state_dict(self.checkpoint["q2_weights"]) self.policy.load_state_dict(self.checkpoint["policy_weights"]) for key, item in self.checkpoint.items(): if key not in ["weights", "save_model"]: setattr(self, key, item) print("Loaded pretrained model") self.q1_targ = deepcopy(self.q1).to(self.device) self.q2_targ = deepcopy(self.q2).to(self.device) # freeze target parameters for p in self.q1_targ.parameters(): p.requires_grad = False for p in self.q2_targ.parameters(): p.requires_grad = False # optimizers self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr) self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr) self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr) if self.entropy_tuning: self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape).to( self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr) self.replay_buffer = ReplayBuffer(self.replay_size) # set action scales if self.env.action_space is None: self.action_scale = torch.tensor(1.0).to(self.device) self.action_bias = torch.tensor(0.0).to(self.device) else: self.action_scale = torch.FloatTensor( (self.env.action_space.high - self.env.action_space.low) / 2.0).to(self.device) self.action_bias = torch.FloatTensor( (self.env.action_space.high + self.env.action_space.low) / 2.0).to(self.device) def sample_action(self, state): mean, log_std = self.policy.forward(state) std = log_std.exp() # reparameterization trick distribution = Normal(mean, std) xi = distribution.rsample() yi = torch.tanh(xi) action = yi * self.action_scale + self.action_bias log_pi = distribution.log_prob(xi) # enforcing action bound (appendix of paper) log_pi -= torch.log(self.action_scale * (1 - yi.pow(2)) + np.finfo(np.float32).eps) log_pi = log_pi.sum(1, keepdim=True) mean = torch.tanh(mean) * self.action_scale + self.action_bias return action, log_pi, mean def select_action(self, state): state = torch.FloatTensor(state).to(self.device).unsqueeze(0) action, _, _ = self.sample_action(state) return action.detach().cpu().numpy()[0] def update_params(self, state, action, reward, next_state, done): reward = reward.unsqueeze(1) done = done.unsqueeze(1) # compute targets with torch.no_grad(): next_action, next_log_pi, _ = self.sample_action(next_state) next_q1_targ = self.q1_targ( torch.cat([next_state, next_action], dim=-1)) next_q2_targ = self.q2_targ( torch.cat([next_state, next_action], dim=-1)) next_q_targ = (torch.min(next_q1_targ, next_q2_targ) - self.alpha * next_log_pi) next_q = reward + self.gamma * (1 - done) * next_q_targ # compute losses q1 = self.q1(torch.cat([state, action], dim=-1)) q2 = self.q2(torch.cat([state, action], dim=-1)) q1_loss = nn.MSELoss()(q1, next_q) q2_loss = nn.MSELoss()(q2, next_q) pi, log_pi, _ = self.sample_action(state) q1_pi = self.q1(torch.cat([state, pi], dim=-1)) q2_pi = self.q2(torch.cat([state, pi], dim=-1)) min_q_pi = torch.min(q1_pi, q2_pi) policy_loss = ((self.alpha * log_pi) - min_q_pi).mean() # gradient step self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() 
self.policy_optimizer.step() # alpha loss if self.entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() else: alpha_loss = torch.tensor(0.0).to(self.device) # soft update target params for target_param, param in zip(self.q1_targ.parameters(), self.q1.parameters()): target_param.data.copy_(target_param.data * self.polyak + param.data * (1 - self.polyak)) for target_param, param in zip(self.q2_targ.parameters(), self.q2.parameters()): target_param.data.copy_(target_param.data * self.polyak + param.data * (1 - self.polyak)) return (q1_loss.item(), q2_loss.item(), policy_loss.item(), alpha_loss.item()) def learn(self): # pragma: no cover if self.tensorboard_log: writer = SummaryWriter(self.tensorboard_log) timestep = 0 episode = 1 total_steps = self.steps_per_epoch * self.epochs while episode >= 1: episode_reward = 0 state = env.reset() done = False j = 0 while not done: # sample action if timestep > self.start_steps: action = self.select_action(state) else: action = self.env.action_space.sample() if (timestep >= self.start_update and timestep % self.update_interval == 0 and self.replay_buffer.get_len() > self.batch_size): # get losses batch = self.replay_buffer.sample(self.batch_size) states, actions, next_states, rewards, dones = (x.to( self.device) for x in batch) (q1_loss, q2_loss, policy_loss, alpha_loss) = self.update_params(states, actions, next_states, rewards, dones) # write loss logs to tensorboard if self.tensorboard_log: writer.add_scalar("loss/q1_loss", q1_loss, timestep) writer.add_scalar("loss/q2_loss", q2_loss, timestep) writer.add_scalar("loss/policy_loss", policy_loss, timestep) writer.add_scalar("loss/alpha_loss", alpha_loss, timestep) if self.save_model is not None: if (timestep >= self.start_update and timestep % self.save_interval == 0): self.checkpoint = self.get_hyperparams() self.save(self, timestep) print("Saved current model") # prepare transition for replay memory push next_state, reward, done, _ = self.env.step(action) if self.render: self.env.render() timestep += 1 j += 1 episode_reward += reward ndone = 1 if j == self.max_ep_len else float(not done) self.replay_buffer.push( (state, action, reward, next_state, 1 - ndone)) state = next_state if timestep > total_steps: break # write episode reward to tensorboard logs if self.tensorboard_log: writer.add_scalar("reward/episode_reward", episode_reward, timestep) if episode % 5 == 0: print("Episode: {}, Total Timesteps: {}, Reward: {}".format( episode, timestep, episode_reward)) episode += 1 self.env.close() if self.tensorboard_log: self.writer.close() def get_hyperparams(self): hyperparams = { "network_type": self.network_type, "gamma": self.gamma, "lr": self.lr, "replay_size": self.replay_size, "entropy_tuning": self.entropy_tuning, "alpha": self.alpha, "polyak": self.polyak, "q1_weights": self.q1.state_dict(), "q2_weights": self.q2.state_dict(), "policy_weights": self.policy.state_dict(), } return hyperparams
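# SAC.sample_action above draws from a Normal via rsample(), squashes the sample
# with tanh, and corrects the log-probability for the change of variables
# (appendix of the SAC paper). A standalone numerical sketch of that correction
# for a 1-D action with unit action scale and zero bias:
import numpy as np
import torch
from torch.distributions import Normal

mean, log_std = torch.tensor([0.0]), torch.tensor([-1.0])
dist = Normal(mean, log_std.exp())

xi = dist.rsample()      # reparameterized pre-squash sample
yi = torch.tanh(xi)      # squashed into (-1, 1)
action = yi              # action_scale = 1, action_bias = 0

log_pi = dist.log_prob(xi)
# change-of-variables correction for the tanh squashing
log_pi -= torch.log(1 - yi.pow(2) + np.finfo(np.float32).eps)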
class DQN: """ Deep Q Networks Paper (DQN) https://arxiv.org/pdf/1312.5602.pdf Paper (Double DQN) https://arxiv.org/abs/1509.06461 :param network_type: The deep neural network layer types ['mlp', 'cnn'] :param env: The environment to learn from :param double_dqn: For training Double DQN :param dueling_dqn: For training Dueling DQN :param noisy_dqn: For using Noisy Q :param categorical_dqn: For using Distributional DQN :param parameterized_replay: For using a prioritized buffer :param epochs: Number of epochs :param max_iterations_per_epoch: Number of iterations per epoch :param max_ep_len: Maximum steps per episode :param gamma: discount factor :param lr: learing rate for the optimizer :param batch_size: Update batch size :param replay_size: Replay memory size :param tensorboard_log: the log location for tensorboard :param seed: seed for torch and gym :param render: if environment is to be rendered :param device: device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu :type network_type: string :type env: Gym environment :type double_dqn: bool :type dueling_dqn: bool :type noisy_dqn: bool :type categorical_dqn: bool :type parameterized_replay: bool :type epochs: int :type max_iterations_per_epoch: int :type max_ep_len: int :type gamma: float :type lr: float :type batch_size: int :type replay_size: int :type tensorboard_log: string :type seed: int :type render: bool :type device: string """ def __init__( self, network_type, env, double_dqn=False, dueling_dqn=False, noisy_dqn=False, categorical_dqn=False, prioritized_replay=False, epochs=100, max_iterations_per_epoch=100, max_ep_len=1000, gamma=0.99, lr=0.001, batch_size=32, replay_size=100, prioritized_replay_alpha=0.6, max_epsilon=1.0, min_epsilon=0.01, epsilon_decay=1000, num_atoms=51, Vmin=-10, Vmax=10, tensorboard_log=None, seed=None, render=False, device="cpu", save_interval=5000, pretrained=None, run_num=None, save_model=None, transform=None, ): self.env = env self.double_dqn = double_dqn self.dueling_dqn = dueling_dqn self.noisy_dqn = noisy_dqn self.categorical_dqn = categorical_dqn self.prioritized_replay = prioritized_replay self.max_epochs = epochs self.max_iterations_per_epoch = max_iterations_per_epoch self.max_ep_len = max_ep_len self.replay_size = replay_size self.prioritized_replay_alpha = prioritized_replay_alpha self.lr = lr self.gamma = gamma self.batch_size = batch_size self.num_atoms = num_atoms self.Vmin = Vmin self.Vmax = Vmax self.tensorboard_log = tensorboard_log self.render = render self.loss_hist = [] self.reward_hist = [] self.max_epsilon = max_epsilon self.min_epsilon = min_epsilon self.epsilon_decay = epsilon_decay self.evaluate = evaluate self.run_num = run_num self.save_model = save_model self.save_interval = save_interval self.save = save_params self.load = load_params self.pretrained = pretrained self.network_type = network_type self.history_length = None self.transform = transform # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) # Setup tensorboard writer self.writer = None if self.tensorboard_log is not None: # pragma: no cover from torch.utils.tensorboard import SummaryWriter self.writer = SummaryWriter(log_dir=self.tensorboard_log) self.create_model() def create_model(self): ''' Initialize the model and target model for various variants of DQN. Initializes optimizer and replay buffers as well. 
''' state_dim, action_dim, disc = self.get_env_properties() if self.network_type == "mlp": if self.dueling_dqn: self.model = DuelingDQNValueMlp(state_dim, action_dim) elif self.categorical_dqn: self.model = CategoricalDQNValue( state_dim, action_dim, self.num_atoms, ) elif self.noisy_dqn: self.model = NoisyDQNValue(state_dim, action_dim) else: self.model = get_model("v", self.network_type)(state_dim, action_dim, "Qs") elif self.network_type == "cnn": if self.history_length is None: self.history_length = 4 if self.transform is None: self.transform = transforms.Compose([ transforms.ToPILImage(), transforms.Grayscale(), transforms.Resize((110, 84)), transforms.CenterCrop(84), transforms.ToTensor() ]) self.state_history = deque([ self.transform(self.env.observation_space.sample()).reshape( -1, 84, 84) for _ in range(self.history_length) ], maxlen=self.history_length) if self.dueling_dqn: self.model = DuelingDQNValueCNN(self.env.action_space.n, self.history_length) elif self.noisy_dqn: self.model = NoisyDQNValueCNN(self.env.action_space.n, self.history_length) elif self.categorical_dqn: self.model = CategoricalDQNValueCNN(self.env.action_space.n, self.num_atoms, self.history_length) else: self.model = get_model("v", self.network_type)( self.env.action_space.n, self.history_length, "Qs") # load paramaters if already trained if self.pretrained is not None: self.load(self) self.model.load_state_dict(self.checkpoint["weights"]) for key, item in self.checkpoint.items(): if key not in ["weights", "save_model"]: setattr(self, key, item) print("Loaded pretrained model") self.target_model = deepcopy(self.model) if self.prioritized_replay: self.replay_buffer = PrioritizedBuffer( self.replay_size, self.prioritized_replay_alpha) else: self.replay_buffer = ReplayBuffer(self.replay_size) self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr) def get_env_properties(self): ''' Helper function to extract the observation and action space :returns: Observation space, Action Space and whether the action space is discrete or not :rtype: int, float, ... ; int, float, ... ; bool ''' state_dim = self.env.observation_space.shape[0] if isinstance(self.env.action_space, gym.spaces.Discrete): action_dim = self.env.action_space.n disc = True elif isinstance(self.env.action_space, gym.spaces.Box): action_dim = self.env.action_space.shape[0] disc = False else: raise NotImplementedError return state_dim, action_dim, disc def update_target_model(self): ''' Copy the target model weights with the model ''' self.target_model.load_state_dict(self.model.state_dict()) def select_action(self, state): ''' Epsilon Greedy selection of action :param state: Observation state :type state: int, float, ... :returns: Action based on the state and epsilon value :rtype: int, float, ... 
''' if np.random.rand() > self.epsilon: if self.categorical_dqn: with torch.no_grad(): state = Variable(torch.FloatTensor(state)) dist = self.model(state).data.cpu() dist = ( dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)) action = dist.sum(2).max(1)[1].numpy()[0] else: state = Variable(torch.FloatTensor(state)) q_value = self.model(state) action = np.argmax(q_value.detach().numpy()) else: action = self.env.action_space.sample() return action def get_td_loss(self): ''' Computes loss for various variants :returns: the TD loss depending upon the variant :rtype: float ''' if self.prioritized_replay: ( state, action, reward, next_state, done, indices, weights, ) = self.replay_buffer.sample(self.batch_size) weights = Variable(torch.FloatTensor(weights)) else: (state, action, reward, next_state, done) = self.replay_buffer.sample(self.batch_size) state = Variable(torch.FloatTensor(np.float32(state))) next_state = Variable(torch.FloatTensor(np.float32(next_state))) action = Variable(torch.LongTensor(action.long())) reward = Variable(torch.FloatTensor(reward)) done = Variable(torch.FloatTensor(done)) if self.network_type == "cnn": state = state.view(-1, 4, 84, 84) next_state = next_state.view(-1, 4, 84, 84) if self.categorical_dqn: projection_dist = self.projection_distribution( next_state, reward, done) dist = self.model(state) action = (action.unsqueeze(1).unsqueeze(1).expand( self.batch_size, 1, self.num_atoms)) dist = dist.gather(1, action).squeeze(1) dist.data.clamp_(0.01, 0.99) elif self.double_dqn: q_values = self.model(state) q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) q_next_state_values = self.model(next_state) action_next = q_next_state_values.max(1)[1] q_target_next_state_values = self.target_model(next_state) q_target_s_a_prime = q_target_next_state_values.gather( 1, action_next.unsqueeze(1)).squeeze(1) expected_q_value = (reward + self.gamma * q_target_s_a_prime * (1 - done)) else: q_values = self.model(state) q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) q_next_state_values = self.target_model(next_state) q_s_a_prime = q_next_state_values.max(1)[0] expected_q_value = reward + self.gamma * q_s_a_prime * (1 - done) if self.categorical_dqn: loss = -(Variable(projection_dist) * dist.log()).sum(1).mean() else: if self.prioritized_replay: loss = (q_value - expected_q_value.detach()).pow(2) * weights priorities = loss + 1e-5 loss = loss.mean() self.replay_buffer.update_priorities( indices, priorities.data.cpu().numpy()) else: loss = (q_value - expected_q_value.detach()).pow(2).mean() self.loss_hist.append(loss) return loss def update_params(self): ''' Takes the step for optimizer. This internally call get_td_loss(), so no need to call the function explicitly. ''' loss = self.get_td_loss() self.optimizer.zero_grad() loss.backward() self.optimizer.step() if self.noisy_dqn or self.categorical_dqn: self.model.reset_noise() self.target_model.reset_noise() def calculate_epsilon_by_frame(self, frame_idx): ''' A helper function to calculate the value of epsilon after every step. :param frame_idx: Current step :type frame_idx: int :returns: epsilon value for the step :rtype: float ''' return (self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-1.0 * frame_idx / self.epsilon_decay)) def projection_distribution(self, next_state, rewards, dones): ''' A helper function used for categorical DQN :param next_state: next observation state :param rewards: rewards collected :param dones: dones :type next_state: int, float, ... 
:type rewards: list :type dones: list :returns: projection distribution :rtype: float ''' batch_size = next_state.size(0) delta_z = float(self.Vmax - self.Vmin) / (self.num_atoms - 1) support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms) next_dist = self.target_model(next_state).data.cpu() * support next_action = next_dist.sum(2).max(1)[1] next_action = (next_action.unsqueeze(1).unsqueeze(1).expand( next_dist.size(0), 1, next_dist.size(2))) next_dist = next_dist.gather(1, next_action).squeeze(1) rewards = rewards.unsqueeze(1).expand_as(next_dist) dones = dones.unsqueeze(1).expand_as(next_dist) support = support.unsqueeze(0).expand_as(next_dist) Tz = rewards + (1 - dones) * 0.99 * support Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) b = (Tz - self.Vmin) / delta_z lower = b.floor().long() upper = b.ceil().long() offset = torch.linspace(0, (batch_size - 1) * self.num_atoms, batch_size).long().unsqueeze(1).expand( self.batch_size, self.num_atoms) projection_dist = torch.zeros(next_dist.size()) projection_dist.view(-1).index_add_(0, (lower + offset).view(-1), (next_dist * (upper.float() - b)).view(-1)) projection_dist.view(-1).index_add_(0, (upper + offset).view(-1), (next_dist * (b - lower.float())).view(-1)) return projection_dist def learn(self): # pragma: no cover total_steps = self.max_epochs * self.max_iterations_per_epoch state, episode_reward, episode, episode_len = self.env.reset(), 0, 0, 0 if self.network_type == "cnn": self.state_history.append(self.transform(state)) phi_state = torch.stack(list(self.state_history), dim=1) if self.double_dqn: self.update_target_model() for frame_idx in range(1, total_steps + 1): self.epsilon = self.calculate_epsilon_by_frame(frame_idx) if self.network_type == "mlp": action = self.select_action(state) elif self.network_type == "cnn": action = self.select_action(phi_state) next_state, reward, done, _ = self.env.step(action) if self.render: self.env.render() if self.network_type == "cnn": self.state_history.append(self.transform(next_state)) phi_next_state = torch.stack(list(self.state_history), dim=1) self.replay_buffer.push( (phi_state, action, reward, phi_next_state, done)) phi_state = phi_next_state else: self.replay_buffer.push( (state, action, reward, next_state, done)) state = next_state episode_reward += reward episode_len += 1 done = False if episode_len == self.max_ep_len else done if done or (episode_len == self.max_ep_len): if episode % 2 == 0: print("Episode: {}, Reward: {}, Frame Index: {}".format( episode, episode_reward, frame_idx)) if self.tensorboard_log: self.writer.add_scalar("episode_reward", episode_reward, frame_idx) self.reward_hist.append(episode_reward) state, episode_reward, episode_len = self.env.reset(), 0, 0 episode += 1 if self.replay_buffer.get_len() > self.batch_size: self.update_params() if self.save_model is not None: if frame_idx % self.save_interval == 0: self.checkpoint = self.get_hyperparams() self.save(self, frame_idx) print("Saved current model") if frame_idx % 100 == 0: self.update_target_model() self.env.close() if self.tensorboard_log: self.writer.close() def get_hyperparams(self): hyperparams = { "gamma": self.gamma, "batch_size": self.batch_size, "lr": self.lr, "replay_size": self.replay_size, "double_dqn": self.double_dqn, "dueling_dqn": self.dueling_dqn, "noisy_dqn": self.noisy_dqn, "categorical_dqn": self.categorical_dqn, "prioritized_replay": self.prioritized_replay, "prioritized_replay_alpha": self.prioritized_replay_alpha, "weights": self.model.state_dict(), } return hyperparams
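# DQN.calculate_epsilon_by_frame above anneals exploration exponentially from
# max_epsilon towards min_epsilon. The same schedule as a standalone function,
# evaluated at a few steps to show the decay (values rounded):
import numpy as np


def epsilon_by_frame(frame_idx, max_eps=1.0, min_eps=0.01, decay=1000):
    return min_eps + (max_eps - min_eps) * np.exp(-1.0 * frame_idx / decay)


# epsilon_by_frame(0)      -> 1.00   (fully random at the start)
# epsilon_by_frame(1000)   -> ~0.37  (one decay constant later)
# epsilon_by_frame(10000)  -> ~0.01  (essentially greedy)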
class DDPG: """ Deep Deterministic Policy Gradient algorithm (DDPG) Paper: https://arxiv.org/abs/1509.02971 :param network_type: (str) The deep neural network layer types ['mlp'] :param env: (Gym environment) The environment to learn from :param gamma: (float) discount factor :param replay_size: (int) Replay memory size :param batch_size: (int) Update batch size :param lr_p: (float) Policy network learning rate :param lr_q: (float) Q network learning rate :param polyak: (float) Polyak averaging weight to update target network :param epochs: (int) Number of epochs :param start_steps: (int) Number of exploratory steps at start :param steps_per_epoch: (int) Number of steps per epoch :param noise_std: (float) Standard deviation for action noise :param max_ep_len: (int) Maximum steps per episode :param start_update: (int) Number of steps before first parameter update :param update_interval: (int) Number of steps between parameter updates :param save_interval: (int) Number of steps between saves of models :param layers: (tuple or list) Number of neurons in hidden layers :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param seed (int): seed for torch and gym :param render (boolean): if environment is to be rendered :param device (str): device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu :param run_num: (int) model run number if it has already been trained, (if None, don't load from past model) :param save_model: (string) directory the user wants to save models to """ def __init__( self, network_type, env, gamma=0.99, replay_size=1000000, batch_size=100, lr_p=0.0001, lr_q=0.001, polyak=0.995, epochs=100, start_steps=10000, steps_per_epoch=4000, noise=None, noise_std=0.1, max_ep_len=1000, start_update=1000, update_interval=50, layers=(32, 32), pretrained=None, tensorboard_log=None, seed=None, render=False, device="cpu", run_num=None, save_model=None, save_interval=5000, ): self.network_type = network_type self.env = env self.gamma = gamma self.replay_size = replay_size self.batch_size = batch_size self.lr_p = lr_p self.lr_q = lr_q self.polyak = polyak self.epochs = epochs self.start_steps = start_steps self.steps_per_epoch = steps_per_epoch self.noise = noise self.noise_std = noise_std self.max_ep_len = max_ep_len self.start_update = start_update self.update_interval = update_interval self.save_interval = save_interval self.pretrained = pretrained self.layers = layers self.tensorboard_log = tensorboard_log self.seed = seed self.render = render self.evaluate = evaluate self.run_num = run_num self.save_model = save_model self.save = save_params self.load = load_params # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) # Setup tensorboard writer self.writer = None if self.tensorboard_log is not None: # pragma: no cover from torch.utils.tensorboard import SummaryWriter self.writer = SummaryWriter(log_dir=self.tensorboard_log) self.create_model() def create_model(self): state_dim = self.env.observation_space.shape[0] action_dim = self.env.action_space.shape[0] if self.noise is not None: self.noise = self.noise(np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)) self.ac = get_model("ac", self.network_type)(state_dim, action_dim, self.layers, "Qsa", False).to(self.device) # load paramaters if already trained if self.pretrained is not None: self.load(self) 
self.ac.load_state_dict(self.checkpoint["weights"]) for key, item in self.checkpoint.items(): if key not in ["weights", "save_model"]: setattr(self, key, item) print("Loaded pretrained model") self.ac_target = deepcopy(self.ac).to(self.device) # freeze target network params for param in self.ac_target.parameters(): param.requires_grad = False self.replay_buffer = ReplayBuffer(self.replay_size) self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_p) self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q) def select_action(self, state, deterministic=True): with torch.no_grad(): action, _ = self.ac.get_action(torch.as_tensor( state, dtype=torch.float32).to(self.device), deterministic=deterministic) action = action.detach().cpu().numpy() # add noise to output from policy network if self.noise is not None: action += self.noise() return np.clip(action, self.env.action_space.low[0], self.env.action_space.high[0]) def get_q_loss(self, state, action, reward, next_state, done): q = self.ac.critic.get_value(torch.cat([state, action], dim=-1)) with torch.no_grad(): q_pi_target = self.ac_target.get_value( torch.cat([ next_state, self.ac_target.get_action(next_state, True)[0] ], dim=-1)) target = reward + self.gamma * (1 - done) * q_pi_target return nn.MSELoss()(q, target) def get_p_loss(self, state): q_pi = self.ac.get_value( torch.cat([state, self.ac.get_action(state, True)[0]], dim=-1)) return -torch.mean(q_pi) def update_params(self, state, action, reward, next_state, done): self.optimizer_q.zero_grad() loss_q = self.get_q_loss(state, action, reward, next_state, done) loss_q.backward() self.optimizer_q.step() # freeze critic params for policy update for param in self.ac.critic.parameters(): param.requires_grad = False self.optimizer_policy.zero_grad() loss_p = self.get_p_loss(state) loss_p.backward() self.optimizer_policy.step() # unfreeze critic params for param in self.ac.critic.parameters(): param.requires_grad = True # update target network with torch.no_grad(): for param, param_target in zip(self.ac.parameters(), self.ac_target.parameters()): param_target.data.mul_(self.polyak) param_target.data.add_((1 - self.polyak) * param.data) def learn(self): # pragma: no cover state, episode_reward, episode_len, episode = self.env.reset(), 0, 0, 0 total_steps = self.steps_per_epoch * self.epochs if self.noise is not None: self.noise.reset() for t in range(total_steps): # execute single transition if t > self.start_steps: action = self.select_action(state, deterministic=True) else: action = self.env.action_space.sample() next_state, reward, done, _ = self.env.step(action) if self.render: self.env.render() episode_reward += reward episode_len += 1 # don't set done to True if max_ep_len reached done = False if episode_len == self.max_ep_len else done self.replay_buffer.push((state, action, reward, next_state, done)) state = next_state if done or (episode_len == self.max_ep_len): if self.noise is not None: self.noise.reset() if episode % 20 == 0: print("Episode: {}, Reward: {}, Timestep: {}".format( episode, episode_reward, t)) if self.tensorboard_log: self.writer.add_scalar("episode_reward", episode_reward, t) state, episode_reward, episode_len = self.env.reset(), 0, 0 episode += 1 # update params if t >= self.start_update and t % self.update_interval == 0: for _ in range(self.update_interval): batch = self.replay_buffer.sample(self.batch_size) states, actions, next_states, rewards, dones = (x.to( self.device) for x in batch) self.update_params(states, actions, next_states, 
rewards, dones) if self.save_model is not None: if t >= self.start_update and t % self.save_interval == 0: self.checkpoint = self.get_hyperparams() self.save(self, t) print("Saved current model") self.env.close() if self.tensorboard_log: self.writer.close() def get_hyperparams(self): hyperparams = { "network_type": self.network_type, "gamma": self.gamma, "batch_size": self.batch_size, "replay_size": self.replay_size, "polyak": self.polyak, "noise_std": self.noise_std, "lr_policy": self.lr_p, "lr_value": self.lr_q, "weights": self.ac.state_dict(), } return hyperparams
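# DDPG.update_params above performs the Polyak (soft) target update
# theta_target <- polyak * theta_target + (1 - polyak) * theta.
# A standalone illustration of the same update on two tiny linear layers:
import torch
import torch.nn as nn

polyak = 0.995
net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)

with torch.no_grad():
    for param, param_target in zip(net.parameters(), target_net.parameters()):
        param_target.data.mul_(polyak)                 # keep most of the old target
        param_target.data.add_((1 - polyak) * param.data)  # blend in the online weights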
class SAC: """ Soft Actor Critic algorithm (SAC) Paper: https://arxiv.org/abs/1812.05905 :param network_type: The deep neural network layer types ['mlp', 'cnn'] :param env: The environment to learn from :param gamma: discount factor :param replay_size: Replay memory size :param batch_size: Update batch size :param lr: learning rate for optimizers :param alpha: entropy coefficient :param polyak: polyak averaging weight for target network update :param entropy_tuning: if alpha should be a learned parameter :param epochs: Number of epochs to train on :param start_steps: Number of initial exploratory steps :param steps_per_epoch: Number of parameter updates per epoch :param max_ep_len: Maximum number of steps per episode :param start_update: Number of steps before first parameter update :param update_interval: Number of step between updates :param layers: Neural network layer dimensions :param seed: seed for torch and gym :param render: if environment is to be rendered :param device: device to use for tensor operations; ['cpu','cuda'] :type network_type: string :type env: Gym environment :type gamma: float :type replay_size: int :type batch_size: int :type lr: float :type alpha: float :type polyak: float :type entropy_tuning: bool :type epochs: int :type start_steps: int :type steps_per_epoch: int :type max_ep_len: int :type start_update: int :type update_interval: int :type layers: tuple :type seed: int :type render: bool :type device: string """ def __init__( self, network_type: str, env: Union[gym.Env, VecEnv], gamma: float = 0.99, replay_size: int = 1000000, batch_size: int = 256, lr: float = 3e-4, alpha: float = 0.01, polyak: float = 0.995, entropy_tuning: bool = True, epochs: int = 1000, start_steps: int = 0, steps_per_epoch: int = 1000, max_ep_len: int = 1000, start_update: int = 256, update_interval: int = 1, layers: Tuple = (256, 256), seed: Optional[int] = None, render: bool = False, device: Union[torch.device, str] = "cpu", ): self.network_type = network_type self.env = env self.gamma = gamma self.replay_size = replay_size self.batch_size = batch_size self.lr = lr self.alpha = alpha self.polyak = polyak self.entropy_tuning = entropy_tuning self.epochs = epochs self.start_steps = start_steps self.steps_per_epoch = steps_per_epoch self.max_ep_len = max_ep_len self.start_update = start_update self.update_interval = update_interval self.layers = layers self.seed = seed self.render = render # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) # Setup tensorboard writer self.writer = None self.empty_logs() self.create_model() def create_model(self) -> None: """ Initialize the model Initializes optimizer and replay buffers as well. 
""" state_dim, action_dim, discrete, _ = get_env_properties(self.env) self.q1 = (get_model("v", self.network_type)( state_dim, action_dim, "Qsa", self.layers).to(self.device).float()) self.q2 = (get_model("v", self.network_type)( state_dim, action_dim, "Qsa", self.layers).to(self.device).float()) self.policy = (get_model( "p", self.network_type)(state_dim, action_dim, self.layers, discrete, False, sac=True).to(self.device).float()) self.q1_targ = deepcopy(self.q1).to(self.device).float() self.q2_targ = deepcopy(self.q2).to(self.device).float() # freeze target parameters for param in self.q1_targ.parameters(): param.requires_grad = False for param in self.q2_targ.parameters(): param.requires_grad = False # optimizers self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr) self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr) self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr) if self.entropy_tuning: self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape).to( self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr) self.replay_buffer = ReplayBuffer(self.replay_size, self.env) # set action scales if self.env.action_space is None: self.action_scale = torch.tensor(1.0).to(self.device) self.action_bias = torch.tensor(0.0).to(self.device) else: self.action_scale = torch.FloatTensor( (self.env.action_space.high - self.env.action_space.low) / 2.0).to(self.device) self.action_bias = torch.FloatTensor( (self.env.action_space.high + self.env.action_space.low) / 2.0).to(self.device) def sample_action(self, state: np.ndarray, deterministic: bool = False) -> np.ndarray: """ sample action normal distribution parameterized by policy network :param state: Observation state :param deterministic: Is the greedy action being chosen? :type state: int, float, ... :type deterministic: bool :returns: action :returns: log likelihood of policy :returns: scaled mean of normal distribution :rtype: int, float, ... :rtype: float :rtype: float """ mean, log_std = self.policy.forward(state) std = log_std.exp() # reparameterization trick distribution = Normal(mean, std) xi = distribution.rsample() yi = torch.tanh(xi) action = yi * self.action_scale + self.action_bias log_pi = distribution.log_prob(xi) # enforcing action bound (appendix of paper) log_pi -= torch.log(self.action_scale * (1 - yi.pow(2)) + np.finfo(np.float32).eps) log_pi = log_pi.sum(1, keepdim=True) mean = torch.tanh(mean) * self.action_scale + self.action_bias return action.float(), log_pi, mean def update_params_before_select_action(self, timestep: int) -> None: """ Update any parameters before selecting action like epsilon for decaying epsilon greedy :param timestep: Timestep in the training process :type timestep: int """ pass def select_action(self, state, deterministic=False): """ select action given a state :param state: Observation state :param deterministic: Is the greedy action being chosen? :type state: int, float, ... 
:type deterministic: bool """ state = torch.FloatTensor(state).to(self.device) action, _, _ = self.sample_action(state, deterministic) return action.detach().cpu().numpy() def update_params(self, update_interval: int) -> (Tuple[float]): """ Computes loss and takes optimizer step :param timestep: timestep :type timestep: int :returns: policy loss :rtype: float :returns: entropy coefficient loss :rtype: float """ for timestep in range(update_interval): batch = self.replay_buffer.sample(self.batch_size) state, action, reward, next_state, done = (x.to(self.device) for x in batch) # compute targets if self.env.n_envs == 1: state, action, next_state = ( state.squeeze().float(), action.squeeze(1).float(), next_state.squeeze().float(), ) else: state, action, next_state = ( state.reshape(-1, *self.env.obs_shape).float(), action.reshape(-1, *self.env.action_shape).float(), next_state.reshape(-1, *self.env.obs_shape).float(), ) reward, done = reward.reshape(-1, 1), done.reshape(-1, 1) with torch.no_grad(): next_action, next_log_pi, _ = self.sample_action(next_state) next_q1_targ = self.q1_targ( torch.cat([next_state, next_action], dim=-1)) next_q2_targ = self.q2_targ( torch.cat([next_state, next_action], dim=-1)) next_q_targ = (torch.min(next_q1_targ, next_q2_targ) - self.alpha * next_log_pi) next_q = reward + self.gamma * (1 - done) * next_q_targ # compute losses q1 = self.q1(torch.cat([state, action], dim=-1)) q2 = self.q2(torch.cat([state, action], dim=-1)) q1_loss = nn.MSELoss()(q1, next_q) q2_loss = nn.MSELoss()(q2, next_q) pi, log_pi, _ = self.sample_action(state) q1_pi = self.q1(torch.cat([state, pi.float()], dim=-1).float()) q2_pi = self.q2(torch.cat([state, pi.float()], dim=-1).float()) min_q_pi = torch.min(q1_pi, q2_pi) policy_loss = ((self.alpha * log_pi) - min_q_pi).mean() # gradient step self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() # alpha loss alpha_loss = torch.tensor(0.0).to(self.device) if self.entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() # soft update target params for target_param, param in zip(self.q1_targ.parameters(), self.q1.parameters()): target_param.data.copy_(target_param.data * self.polyak + param.data * (1 - self.polyak)) for target_param, param in zip(self.q2_targ.parameters(), self.q2.parameters()): target_param.data.copy_(target_param.data * self.polyak + param.data * (1 - self.polyak)) self.logs["q1_loss"].append(q1_loss.item()) self.logs["q2_loss"].append(q2_loss.item()) self.logs["policy_loss"].append(policy_loss.item()) self.logs["alpha_loss"].append(alpha_loss.item()) def learn(self) -> None: # pragma: no cover total_steps = self.steps_per_epoch * self.epochs * self.env.n_envs episode_reward, episode_len = ( np.zeros(self.env.n_envs), np.zeros(self.env.n_envs), ) state = self.env.reset() for i in range(0, total_steps, self.env.n_envs): # done = [False] * self.env.n_envs # while not done: # sample action if i > self.start_steps: action = self.select_action(state) else: action = self.env.sample() if (i >= self.start_update and i % self.update_interval == 0 and self.replay_buffer.pos > self.batch_size): self.update_params(self.update_interval) # prepare transition for replay memory push next_state, reward, done, _ = 
                self.env.step(action)

            if self.render:
                self.env.render()

            episode_reward += reward
            episode_len += 1

            # dont set done to True if max_ep_len reached
            done = [
                False if ep_len == self.max_ep_len else d
                for ep_len, d in zip(episode_len, done)
            ]

            if np.any(done) or np.any(episode_len == self.max_ep_len):
                for j, di in enumerate(done):
                    if di:
                        episode_reward[j] = 0
                        episode_len[j] = 0

            self.replay_buffer.extend(
                zip(state, action, reward, next_state, done))
            state = next_state

            if i > total_steps:
                break

            if sum(episode_len) % (
                    5 * self.env.n_envs) == 0 and sum(episode_len) != 0:
                print("Episode: {}, total numsteps: {}, reward: {}".format(
                    sum(episode_len), i, episode_reward))
                # ep += 1

        self.env.close()

    def get_hyperparams(self) -> Dict[str, Any]:
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "lr": self.lr,
            "replay_size": self.replay_size,
            "entropy_tuning": self.entropy_tuning,
            "alpha": self.alpha,
            "polyak": self.polyak,
            "q1_weights": self.q1.state_dict(),
            "q2_weights": self.q2.state_dict(),
            "policy_weights": self.policy.state_dict(),
        }
        return hyperparams

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from pretrained model
        """
        self.q1.load_state_dict(weights["q1_weights"])
        self.q2.load_state_dict(weights["q2_weights"])
        self.policy.load_state_dict(weights["policy_weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": safe_mean(self.logs["policy_loss"]),
            "q1_loss": safe_mean(self.logs["q1_loss"]),
            "q2_loss": safe_mean(self.logs["q2_loss"]),
            "alpha_loss": safe_mean(self.logs["alpha_loss"]),
        }
        self.empty_logs()
        return logs

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["q1_loss"] = []
        self.logs["q2_loss"] = []
        self.logs["policy_loss"] = []
        self.logs["alpha_loss"] = []
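# Illustrative sketch (not part of the library): a standalone version of the
# tanh-squashed Gaussian sampling used in SAC.sample_action above. It shows the
# reparameterization trick, the log-probability correction for the tanh change
# of variables (appendix of the SAC paper), and rescaling to the action bounds.
# The inputs `mean`, `log_std`, `action_scale` and `action_bias` are assumed
# example values, not library objects.
import numpy as np
import torch
from torch.distributions import Normal


def squashed_gaussian_sample(mean, log_std, action_scale, action_bias):
    """Sample a bounded action and its log-probability."""
    std = log_std.exp()
    dist = Normal(mean, std)
    xi = dist.rsample()                       # differentiable sample
    yi = torch.tanh(xi)                       # squash to (-1, 1)
    action = yi * action_scale + action_bias  # rescale to env bounds
    # log pi(a|s) = log N(xi) - sum_i log(scale_i * (1 - tanh(xi_i)^2))
    log_pi = dist.log_prob(xi)
    log_pi -= torch.log(action_scale * (1 - yi.pow(2)) + np.finfo(np.float32).eps)
    return action, log_pi.sum(-1, keepdim=True)


# Example: a 2-dimensional action space bounded in [-2, 2]
if __name__ == "__main__":
    mean, log_std = torch.zeros(1, 2), torch.zeros(1, 2)
    scale, bias = torch.full((2,), 2.0), torch.zeros(2)
    a, logp = squashed_gaussian_sample(mean, log_std, scale, bias)
    print(a.shape, logp.shape)  # torch.Size([1, 2]) torch.Size([1, 1])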
class DDPG: """ Deep Deterministic Policy Gradient algorithm (DDPG) Paper: https://arxiv.org/abs/1509.02971 :param network_type: The deep neural network layer types ['mlp', 'cnn'] :param env: The environment to learn from :param gamma: discount factor :param replay_size: Replay memory size :param batch_size: Update batch size :param lr_p: learning rate for policy optimizer :param lr_q: learning rate for value fn optimizer :param polyak: polyak averaging weight for target network update :param epochs: Number of epochs :param start_steps: Number of exploratory steps at start :param steps_per_epoch: Number of steps per epoch :param noise_std: Standard deviation for action noise :param max_ep_len: Maximum steps per episode :param start_update: Number of steps before first parameter update :param update_interval: Number of steps between parameter updates :param layers: Number of neurons in hidden layers :param seed: seed for torch and gym :param render: if environment is to be rendered :param device: device to use for tensor operations; ['cpu','cuda'] :type network_type: string :type env: Gym environment :type gamma: float :type replay_size: int :type batch_size: int :type lr_p: float :type lr_q: float :type polyak: float :type epochs: int :type start_steps: int :type steps_per_epoch: int :type noise_std: float :type max_ep_len: int :type start_update: int :type update_interval: int :type layers: tuple :type seed: int :type render: bool :type device: string """ def __init__( self, network_type: str, env: Union[gym.Env, VecEnv], gamma: float = 0.99, replay_size: int = 1000000, batch_size: int = 100, lr_p: float = 0.0001, lr_q: float = 0.001, polyak: float = 0.995, epochs: int = 100, start_steps: int = 10000, steps_per_epoch: int = 4000, noise: Optional[Any] = None, noise_std: float = 0.1, max_ep_len: int = 1000, start_update: int = 1000, update_interval: int = 50, layers: Tuple = (32, 32), seed: Optional[int] = None, render: bool = False, device: Union[torch.device, str] = "cpu", ): self.network_type = network_type self.env = env self.gamma = gamma self.replay_size = replay_size self.batch_size = batch_size self.lr_p = lr_p self.lr_q = lr_q self.polyak = polyak self.epochs = epochs self.start_steps = start_steps self.steps_per_epoch = steps_per_epoch self.noise = noise self.noise_std = noise_std self.max_ep_len = max_ep_len self.start_update = start_update self.update_interval = update_interval self.layers = layers self.seed = seed self.render = render # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) # Setup tensorboard writer self.writer = None self.empty_logs() self.create_model() def create_model(self) -> None: """ Initialize the model Initializes optimizer and replay buffers as well. 
""" state_dim, action_dim, discrete, _ = get_env_properties(self.env) if discrete: raise Exception( "Discrete Environments not supported for {}.".format( __class__.__name__)) if self.noise is not None: self.noise = self.noise(np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)) self.ac = get_model("ac", self.network_type)(state_dim, action_dim, self.layers, "Qsa", False).to(self.device) self.ac_target = deepcopy(self.ac).to(self.device) # freeze target network params for param in self.ac_target.parameters(): param.requires_grad = False self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_p) self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q) def update_params_before_select_action(self, timestep: int) -> None: """ Update any parameters before selecting action like epsilon for decaying epsilon greedy :param timestep: Timestep in the training process :type timestep: int """ pass def select_action(self, state: np.ndarray, deterministic: bool = False) -> np.ndarray: """ Selection of action :param state: Observation state :param deterministic: Action selection type :type state: int, float, ... :type deterministic: bool :returns: Action based on the state and epsilon value :rtype: int, float, ... """ with torch.no_grad(): action, _ = self.ac.get_action( torch.as_tensor(state, dtype=torch.float32).to(self.device), deterministic=deterministic, ) action = action.detach().cpu().numpy() # add noise to output from policy network if self.noise is not None: action += self.noise() return np.clip(action, self.env.action_space.low[0], self.env.action_space.high[0]) def get_q_loss( self, state: np.ndarray, action: np.ndarray, reward: float, next_state: np.ndarray, done: bool, ) -> torch.Tensor: """ Computes loss for Q-Network :param state: environment observation :param action: agent action :param: reward: environment reward :param next_state: environment next observation :param done: if episode is over :type state: int, float, ... :type action: float :type: reward: float :type next_state: int, float, ... :type done: bool :returns: the Q loss value :rtype: float """ quality = self.ac.critic.get_value(torch.cat([state, action], dim=-1)) with torch.no_grad(): q_pi_target = self.ac_target.get_value( torch.cat([ next_state, self.ac_target.get_action(next_state, True)[0] ], dim=-1)) target = reward + self.gamma * (1 - done) * q_pi_target value_loss = F.mse_loss(quality, target) self.logs["value_loss"].append(value_loss.item()) return value_loss def get_p_loss(self, state: np.ndarray) -> torch.Tensor: """ Computes policy loss :param state: Environment observation :type state: int, float, ... :returns: Policy loss :rtype: float """ q_pi = self.ac.get_value( torch.cat([state, self.ac.get_action(state, True)[0]], dim=-1)) policy_loss = torch.mean(q_pi) self.logs["policy_loss"].append(policy_loss.item()) return -policy_loss def update_params(self, update_interval: int) -> None: """ Takes the step for optimizer. 
        :param update_interval: Number of gradient steps to take
        :type update_interval: int
        """
        for timestep in range(update_interval):
            batch = self.replay_buffer.sample(self.batch_size)
            state, action, reward, next_state, done = (x.to(self.device)
                                                       for x in batch)

            self.optimizer_q.zero_grad()
            loss_q = self.get_q_loss(state, action, reward, next_state, done)
            loss_q.backward()
            self.optimizer_q.step()

            # freeze critic params for policy update
            for param in self.ac.critic.parameters():
                param.requires_grad = False

            self.optimizer_policy.zero_grad()
            loss_p = self.get_p_loss(state)
            loss_p.backward()
            self.optimizer_policy.step()

            # unfreeze critic params
            for param in self.ac.critic.parameters():
                param.requires_grad = True

            # update target network
            with torch.no_grad():
                for param, param_target in zip(self.ac.parameters(),
                                               self.ac_target.parameters()):
                    param_target.data.mul_(self.polyak)
                    param_target.data.add_((1 - self.polyak) * param.data)

    def learn(self):  # pragma: no cover
        state, episode_reward, episode_len, episode = (
            self.env.reset(),
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
        )
        total_steps = self.steps_per_epoch * self.epochs * self.env.n_envs

        if self.noise is not None:
            self.noise.reset()

        for timestep in range(0, total_steps, self.env.n_envs):
            # execute single transition
            if timestep > self.start_steps:
                action = self.select_action(state)
            else:
                action = self.env.sample()

            next_state, reward, done, _ = self.env.step(action)
            if self.render:
                self.env.render()

            episode_reward += reward
            episode_len += 1

            # dont set done to True if max_ep_len reached
            done = [
                False if ep_len == self.max_ep_len else d
                for ep_len, d in zip(episode_len, done)
            ]

            self.replay_buffer.extend(
                zip(state, action, reward, next_state, done))
            state = next_state

            if np.any(done) or np.any(episode_len == self.max_ep_len):
                if self.noise is not None:
                    self.noise.reset()

                if sum(episode) % 20 == 0:
                    print("Ep: {}, reward: {}, t: {}".format(
                        sum(episode), np.mean(episode_reward), timestep))

                for i, di in enumerate(done):
                    if di:
                        episode_reward[i] = 0
                        episode_len[i] = 0
                        episode += 1

            # update params
            if timestep >= self.start_update and timestep % self.update_interval == 0:
                self.update_params(self.update_interval)

        self.env.close()

    def get_hyperparams(self) -> Dict[str, Any]:
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "batch_size": self.batch_size,
            "replay_size": self.replay_size,
            "polyak": self.polyak,
            "noise_std": self.noise_std,
            "lr_policy": self.lr_p,
            "lr_value": self.lr_q,
            "weights": self.ac.state_dict(),
        }
        return hyperparams

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from pretrained model
        """
        self.ac.load_state_dict(weights["weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": safe_mean(self.logs["policy_loss"]),
            "value_loss": safe_mean(self.logs["value_loss"]),
        }
        self.empty_logs()
        return logs

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["policy_loss"] = []
        self.logs["value_loss"] = []
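# Illustrative sketch (not part of the library): the polyak (soft) target-network
# update performed at the end of DDPG.update_params above, written as a
# standalone helper. With polyak close to 1 the target network tracks the online
# network slowly, which stabilizes the bootstrapped Q targets. The small MLPs
# and the 0.995 coefficient below are assumed example values.
import torch
import torch.nn as nn


def soft_update(online: nn.Module, target: nn.Module, polyak: float = 0.995) -> None:
    """In-place soft update: target <- polyak * target + (1 - polyak) * online."""
    with torch.no_grad():
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.mul_(polyak)
            target_param.data.add_((1 - polyak) * param.data)


# Example usage with two small networks standing in for the actor-critic and its target
if __name__ == "__main__":
    from copy import deepcopy

    online_net = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 1))
    target_net = deepcopy(online_net)
    for p in target_net.parameters():
        p.requires_grad = False
    soft_update(online_net, target_net, polyak=0.995)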
class DQN: """ Deep Q Networks Paper (DQN) https://arxiv.org/pdf/1312.5602.pdf Paper (Double DQN) https://arxiv.org/abs/1509.06461 :param network_type: The deep neural network layer types ['mlp', 'cnn'] :param env: The environment to learn from :param double_dqn: For training Double DQN :param dueling_dqn: For training Dueling DQN :param noisy_dqn: For using Noisy Q :param categorical_dqn: For using Distributional DQN :param parameterized_replay: For using a prioritized buffer :param epochs: Number of epochs :param max_iterations_per_epoch: Number of iterations per epoch :param max_ep_len: Maximum steps per episode :param gamma: discount factor :param lr: learing rate for the optimizer :param batch_size: Update batch size :param replay_size: Replay memory size :param seed: seed for torch and gym :param render: if environment is to be rendered :param device: device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu :type network_type: string :type env: Gym environment :type double_dqn: bool :type dueling_dqn: bool :type noisy_dqn: bool :type categorical_dqn: bool :type parameterized_replay: bool :type epochs: int :type max_iterations_per_epoch: int :type max_ep_len: int :type gamma: float :type lr: float :type batch_size: int :type replay_size: int :type seed: int :type render: bool :type device: string """ def __init__( self, network_type: str, env: Union[gym.Env, VecEnv], double_dqn: bool = False, dueling_dqn: bool = False, noisy_dqn: bool = False, categorical_dqn: bool = False, prioritized_replay: bool = False, epochs: int = 100, max_iterations_per_epoch: int = 100, max_ep_len: int = 1000, gamma: float = 0.99, lr: float = 0.001, batch_size: int = 32, replay_size: int = 100, prioritized_replay_alpha: float = 0.6, max_epsilon: float = 1.0, min_epsilon: float = 0.01, epsilon_decay: int = 1000, num_atoms: int = 51, vmin: int = -10, vmax: int = 10, seed: Optional[int] = None, render: bool = False, device: Union[torch.device, str] = "cpu", ): self.env = env self.double_dqn = double_dqn self.dueling_dqn = dueling_dqn self.noisy_dqn = noisy_dqn self.categorical_dqn = categorical_dqn self.prioritized_replay = prioritized_replay self.max_epochs = epochs self.max_iterations_per_epoch = max_iterations_per_epoch self.max_ep_len = max_ep_len self.replay_size = replay_size self.prioritized_replay_alpha = prioritized_replay_alpha self.lr = lr self.gamma = gamma self.batch_size = batch_size self.num_atoms = num_atoms self.Vmin = vmin self.Vmax = vmax self.render = render self.reward_hist = [] self.max_epsilon = max_epsilon self.min_epsilon = min_epsilon self.epsilon_decay = epsilon_decay self.network_type = network_type # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) # Setup tensorboard writer self.writer = None self.empty_logs() self.create_model() def create_model(self) -> None: """ Initialize the model and target model for various variants of DQN. Initializes optimizer and replay buffers as well. 
""" state_dim, action_dim, _, _ = get_env_properties(self.env) if self.network_type == "mlp": if self.dueling_dqn: self.model = DuelingDQNValueMlp(state_dim, action_dim) elif self.categorical_dqn: self.model = CategoricalDQNValue(state_dim, action_dim, self.num_atoms) elif self.noisy_dqn: self.model = NoisyDQNValue(state_dim, action_dim) else: self.model = get_model("v", self.network_type)(state_dim, action_dim, "Qs") elif self.network_type == "cnn": self.framestack = self.env.framestack if self.dueling_dqn: self.model = DuelingDQNValueCNN(action_dim, self.framestack) elif self.noisy_dqn: self.model = NoisyDQNValueCNN(action_dim, self.framestack) elif self.categorical_dqn: self.model = CategoricalDQNValueCNN(action_dim, self.num_atoms, self.framestack) else: self.model = get_model("v", self.network_type)(action_dim, self.framestack, "Qs") self.target_model = deepcopy(self.model) if self.prioritized_replay: self.replay_buffer = PrioritizedBuffer( self.replay_size, self.prioritized_replay_alpha) else: self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr) def update_target_model(self) -> None: """ Copy the target model weights with the model """ self.target_model.load_state_dict(self.model.state_dict()) def update_params_before_select_action(self, timestep: int) -> None: """ Update any parameters before selecting action like epsilon for decaying epsilon greedy :param timestep: Timestep in the training process :type timestep: int """ self.timestep = timestep self.epsilon = self.calculate_epsilon_by_frame() def select_action(self, state: np.ndarray, deterministic: bool = False) -> np.ndarray: """ Epsilon Greedy selection of action :param state: Observation state :param deterministic: Whether greedy action should be taken always :type state: int, float, ... :type deterministic: bool :returns: Action based on the state and epsilon value :rtype: int, float, ... 
""" if not deterministic: if np.random.rand() < self.epsilon: return np.asarray(self.env.sample()) if self.categorical_dqn: state = Variable(torch.FloatTensor(state)) dist = self.model(state).data.cpu() dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms) action = dist.sum(2).max(1)[1].numpy() # [0] else: state = Variable(torch.FloatTensor(state)) q_value = self.model(state) action = np.argmax(q_value.detach().numpy(), axis=-1) return action def get_td_loss(self) -> torch.Tensor: """ Computes loss for various variants :returns: the TD loss depending upon the variant :rtype: float """ if self.prioritized_replay: ( state, action, reward, next_state, done, indices, weights, ) = self.replay_buffer.sample(self.batch_size) weights = Variable(torch.FloatTensor(weights)) else: (state, action, reward, next_state, done) = self.replay_buffer.sample(self.batch_size) state = state.reshape(self.batch_size * self.env.n_envs, *self.env.obs_shape) action = action.reshape(self.batch_size * self.env.n_envs, *self.env.action_shape) reward = reward.reshape(-1, 1) done = done.reshape(-1, 1) next_state = next_state.reshape(self.batch_size * self.env.n_envs, *self.env.obs_shape) state = Variable(torch.FloatTensor(np.float32(state))) next_state = Variable(torch.FloatTensor(np.float32(next_state))) action = Variable(torch.LongTensor(action.long())) reward = Variable(torch.FloatTensor(reward)) done = Variable(torch.FloatTensor(done)) if self.network_type == "cnn": state = state.view( -1, self.framestack, self.env.screen_size, self.env.screen_size, ) next_state = next_state.view( -1, self.framestack, self.env.screen_size, self.env.screen_size, ) if self.categorical_dqn: projection_dist = self.projection_distribution( next_state, reward, done) dist = self.model(state) action = action.unsqueeze(1).expand( self.batch_size * self.env.n_envs, 1, self.num_atoms) dist = dist.gather(1, action).squeeze(1) dist.data.clamp_(0.01, 0.99) elif self.double_dqn: q_values = self.model(state) q_value = q_values.gather(1, action).squeeze(1) q_next_state_values = self.model(next_state) action_next = q_next_state_values.max(1)[1] q_target_next_state_values = self.target_model(next_state) q_target_s_a_prime = q_target_next_state_values.gather( 1, action_next.unsqueeze(1)).squeeze(1) expected_q_value = reward + self.gamma * q_target_s_a_prime.reshape( -1, 1) * (1 - done) else: q_values = self.model(state) q_value = q_values.gather(1, action).squeeze(1) q_next_state_values = self.target_model(next_state) q_s_a_prime = q_next_state_values.max(1)[0] expected_q_value = reward + self.gamma * q_s_a_prime.reshape( -1, 1) * (1 - done) if self.categorical_dqn: loss = -(Variable(projection_dist) * dist.log()).sum(1).mean() else: if self.prioritized_replay: loss = (q_value - expected_q_value.detach()).pow(2) * weights priorities = loss + 1e-5 loss = loss.mean() self.replay_buffer.update_priorities( indices, priorities.data.cpu().numpy()) else: loss = (q_value - expected_q_value.detach()).pow(2).mean() self.logs["value_loss"].append(loss.item()) return loss def update_params(self, update_interval: int) -> None: """ (Takes the step for optimizer. This internally call get_td_loss(), so no need to call the function explicitly.) 
""" for timestep in range(update_interval): loss = self.get_td_loss() self.optimizer.zero_grad() loss.backward() self.optimizer.step() if self.noisy_dqn or self.categorical_dqn: self.model.reset_noise() self.target_model.reset_noise() if timestep % update_interval == 0: self.update_target_model() def calculate_epsilon_by_frame(self) -> float: """ A helper function to calculate the value of epsilon after every step. :returns: epsilon value for the step :rtype: float """ return self.min_epsilon + ( self.max_epsilon - self.min_epsilon) * np.exp( -1.0 * self.timestep / self.epsilon_decay) def projection_distribution(self, next_state: np.ndarray, rewards: List[float], dones: List[bool]): """ A helper function used for categorical DQN :param next_state: next observation state :param rewards: rewards collected :param dones: dones :type next_state: int, float, ... :type rewards: list :type dones: list :returns: projection distribution :rtype: float """ batch_size = next_state.size(0) delta_z = float(self.Vmax - self.Vmin) / (self.num_atoms - 1) support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms) next_dist = self.target_model(next_state).data.cpu() * support next_action = next_dist.sum(2).max(1)[1] next_action = (next_action.unsqueeze(1).unsqueeze(1).expand( next_dist.size(0), 1, next_dist.size(2))) next_dist = next_dist.gather(1, next_action).squeeze(1) rewards = rewards.expand_as(next_dist) dones = dones.expand_as(next_dist) support = support.unsqueeze(0).expand_as(next_dist) tz = rewards + (1 - dones) * 0.99 * support tz = tz.clamp(min=self.Vmin, max=self.Vmax) bz = (tz - self.Vmin) / delta_z lower = bz.floor().long() upper = bz.ceil().long() offset = (torch.linspace(0, (batch_size - 1) * self.num_atoms, batch_size).long().unsqueeze(1).expand( self.batch_size * self.env.n_envs, self.num_atoms)) projection_dist = torch.zeros(next_dist.size()) projection_dist.view(-1).index_add_(0, (lower + offset).view(-1), (next_dist * (upper.float() - bz)).view(-1)) projection_dist.view(-1).index_add_(0, (upper + offset).view(-1), (next_dist * (bz - lower.float())).view(-1)) return projection_dist def learn(self) -> None: # pragma: no cover total_steps = self.max_epochs * self.max_iterations_per_epoch state, episode_reward, episode, episode_len = self.env.reset(), 0, 0, 0 if self.double_dqn: self.update_target_model() for frame_idx in range(1, total_steps + 1): self.timestep = frame_idx self.epsilon = self.calculate_epsilon_by_frame() action = self.select_action(state) next_state, reward, done, _ = self.env.step(action) if self.render: self.env.render() self.replay_buffer.push((state, action, reward, next_state, done)) state = next_state episode_reward += reward episode_len += 1 done = False if episode_len == self.max_ep_len else done if done or (episode_len == self.max_ep_len): if episode % 20 == 0: print("Episode: {}, Reward: {}, Frame Index: {}".format( episode, episode_reward, frame_idx)) self.reward_hist.append(episode_reward) state, episode_reward, episode_len = self.env.reset(), 0, 0 episode += 1 if frame_idx >= self.start_update and frame_idx % self.update_interval == 0: self.agent.update_params(self.update_interval) if frame_idx % 100 == 0: self.update_target_model() self.env.close() def get_hyperparams(self) -> Dict[str, Any]: hyperparams = { "gamma": self.gamma, "batch_size": self.batch_size, "lr": self.lr, "replay_size": self.replay_size, "double_dqn": self.double_dqn, "dueling_dqn": self.dueling_dqn, "noisy_dqn": self.noisy_dqn, "categorical_dqn": self.categorical_dqn, 
"prioritized_replay": self.prioritized_replay, "prioritized_replay_alpha": self.prioritized_replay_alpha, "weights": self.model.state_dict(), "timestep": self.timestep, } return hyperparams def load_weights(self, weights) -> None: """ Load weights for the agent from pretrained model """ self.model.load_state_dict(weights["weights"]) def get_logging_params(self) -> Dict[str, Any]: """ :returns: Logging parameters for monitoring training :rtype: dict """ logs = { "value_loss": safe_mean(self.logs["value_loss"]), } self.empty_logs() return logs def empty_logs(self): """ Empties logs """ self.logs = {} self.logs["value_loss"] = []