def __init__(
        self,
        observation_shape,
        hidden_sizes,
        action_size,
        n_tile=20,
        ):
    super().__init__()
    self._obs_ndim = 1
    self._n_tile = n_tile
    input_dim = int(np.sum(observation_shape))
    self._action_size = action_size
    self.mlp_loc = MlpModel(
        input_size=input_dim,
        hidden_sizes=hidden_sizes,
        output_size=4,
    )
    self.mlp_delta = MlpModel(
        input_size=input_dim + 4 * n_tile,
        hidden_sizes=hidden_sizes,
        output_size=3 * 2,
    )
    self.delta_distribution = Gaussian(
        dim=3,
        squash=True,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
    self.cat_distribution = Categorical(4)
    self._counter = 0
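# A minimal sketch (not the repo's actual forward method) of how the two heads
# above are presumably wired: the 4-way location output is tiled `n_tile`
# times and concatenated with the flattened observation before the delta MLP,
# which matches input_size = input_dim + 4 * n_tile. The names `obs`, `model`,
# and the softmax/tiling choices are assumptions for illustration.
import torch
import torch.nn.functional as F

def forward_sketch(model, obs, n_tile=20):
    obs_flat = obs.view(obs.shape[0], -1)
    loc_logits = model.mlp_loc(obs_flat)                  # [B, 4]
    loc_probs = F.softmax(loc_logits, dim=-1)
    delta_input = torch.cat(
        [obs_flat, loc_probs.repeat(1, n_tile)], dim=-1)  # [B, D + 4 * n_tile]
    delta_out = model.mlp_delta(delta_input)              # [B, 6]
    delta_mu, delta_log_std = delta_out.chunk(2, dim=-1)  # 3-dim mean / log_std
    return loc_logits, delta_mu, delta_log_std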
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    _initial_model_state_dict = self.initial_model_state_dict
    self.initial_model_state_dict = None  # Don't let base agent try to load.
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    self.initial_model_state_dict = _initial_model_state_dict
    self.q_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
    self.target_q_model = self.QModelCls(**self.env_model_kwargs,
        **self.q_model_kwargs)
    self.target_q_model.load_state_dict(self.q_model.state_dict())
    if (self.initial_model_state_dict is not None
            and not self.load_model_after_min_steps):
        self.load_state_dict(self.initial_model_state_dict)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        squash=self.action_squash,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
    # Tie weights (need to make sure False if not using encoder).
    if self.tie_weights:
        self.model.encoder.copy_conv_weights_from(self.q_model.encoder)
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    """Instantiates mu and q, and target_mu and target_q models."""
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    self.q_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
    if self.initial_q_model_state_dict is not None:
        self.q_model.load_state_dict(self.initial_q_model_state_dict)
    self.target_model = self.ModelCls(**self.env_model_kwargs, **self.model_kwargs)
    self.target_q_model = self.QModelCls(**self.env_model_kwargs,
        **self.q_model_kwargs)
    self.target_q_model.load_state_dict(self.q_model.state_dict())
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        std=self.action_std,
        noise_clip=self.action_noise_clip,
        clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
    )
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    _initial_model_state_dict = self.initial_model_state_dict
    self.initial_model_state_dict = None  # Don't let base agent try to load.
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    self.initial_model_state_dict = _initial_model_state_dict
    self.q1_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
    self.q2_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
    self.target_q1_model = self.QModelCls(**self.env_model_kwargs,
        **self.q_model_kwargs)
    self.target_q2_model = self.QModelCls(**self.env_model_kwargs,
        **self.q_model_kwargs)
    self.target_q1_model.load_state_dict(self.q1_model.state_dict())
    self.target_q2_model.load_state_dict(self.q2_model.state_dict())
    if self.initial_model_state_dict is not None:
        self.load_state_dict(self.initial_model_state_dict)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        squash=self.action_squash,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
def initialize(self, env_spaces, share_memory=False):
    env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
    self.mu_model = self.MuModelCls(**env_model_kwargs, **self.mu_model_kwargs)
    self.q_model = self.QModelCls(**env_model_kwargs, **self.q_model_kwargs)
    if share_memory:
        self.mu_model.share_memory()
        # self.q_model.share_memory()  # Not needed for sampling.
        self.shared_mu_model = self.mu_model
        # self.shared_q_model = self.q_model
    if self.initial_mu_model_state_dict is not None:
        self.mu_model.load_state_dict(self.initial_mu_model_state_dict)
    if self.initial_q_model_state_dict is not None:
        self.q_model.load_state_dict(self.initial_q_model_state_dict)
    self.target_mu_model = self.MuModelCls(**env_model_kwargs,
        **self.mu_model_kwargs)
    self.target_mu_model.load_state_dict(self.mu_model.state_dict())
    self.target_q_model = self.QModelCls(**env_model_kwargs, **self.q_model_kwargs)
    self.target_q_model.load_state_dict(self.q_model.state_dict())
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        std=self.action_std,
        noise_clip=self.action_noise_clip,
        clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
    )
    self.env_spaces = env_spaces
    self.env_model_kwargs = env_model_kwargs
def initialize(self, env_spaces, share_memory=False):
    env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
    self.q1_model = self.QModelCls(**env_model_kwargs, **self.q_model_kwargs)
    self.q2_model = self.QModelCls(**env_model_kwargs, **self.q_model_kwargs)
    self.v_model = self.VModelCls(**env_model_kwargs, **self.v_model_kwargs)
    self.pi_model = self.PiModelCls(**env_model_kwargs, **self.pi_model_kwargs)
    if share_memory:
        self.pi_model.share_memory()  # Only one needed for sampling.
        self.shared_pi_model = self.pi_model
    if self.initial_q1_model_state_dict is not None:
        self.q1_model.load_state_dict(self.initial_q1_model_state_dict)
    if self.initial_q2_model_state_dict is not None:
        self.q2_model.load_state_dict(self.initial_q2_model_state_dict)
    if self.initial_v_model_state_dict is not None:
        self.v_model.load_state_dict(self.initial_v_model_state_dict)
    if self.initial_pi_model_state_dict is not None:
        self.pi_model.load_state_dict(self.initial_pi_model_state_dict)
    self.target_v_model = self.VModelCls(**env_model_kwargs, **self.v_model_kwargs)
    self.target_v_model.load_state_dict(self.v_model.state_dict())
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        squash=self.action_squash,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
    self.env_spaces = env_spaces
    self.env_model_kwargs = env_model_kwargs
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    _initial_model_state_dict = self.initial_model_state_dict
    self.initial_model_state_dict = None  # Don't let base agent try to load.
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    self.initial_model_state_dict = _initial_model_state_dict
    self.q_models = [self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
        for _ in range(self.n_qs)]
    self.target_q_models = [self.QModelCls(**self.env_model_kwargs,
        **self.q_model_kwargs) for _ in range(self.n_qs)]
    for target_q, q in zip(self.target_q_models, self.q_models):
        target_q.load_state_dict(q.state_dict())
    self.log_alpha = nn.Parameter(torch.tensor(0.0, dtype=torch.float32))
    if self.initial_model_state_dict is not None:
        self.load_state_dict(self.initial_model_state_dict)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        squash=self.action_squash,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
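# A torch-only sketch (an assumption, not code from this agent) of one common
# way an ensemble of critics like `q_models` above is used: evaluate every
# model on the same inputs and take the element-wise minimum as a conservative
# value estimate.
import torch

def min_over_ensemble(q_models, *model_inputs):
    qs = torch.stack([q(*model_inputs) for q in q_models], dim=0)  # [n_qs, B]
    return qs.min(dim=0).values                                    # [B]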
def optim_initialize(self, rank=0):
    """Called by async runner."""
    self.rank = rank
    self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
        lr=self.learning_rate, **self.optim_kwargs)
    self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(),
        lr=self.learning_rate, **self.optim_kwargs)
    self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(),
        lr=self.learning_rate, **self.optim_kwargs)
    self._log_alpha = torch.zeros(1, requires_grad=True)
    self._alpha = torch.exp(self._log_alpha.detach())
    self.alpha_optimizer = self.OptimCls((self._log_alpha,),
        lr=self.learning_rate, **self.optim_kwargs)
    if self.target_entropy == "auto":
        self.target_entropy = -np.prod(self.agent.env_spaces.action.shape)
    if self.initial_optim_state_dict is not None:
        self.load_optim_state_dict(self.initial_optim_state_dict)
    if self.action_prior == "gaussian":
        self.action_prior_distribution = Gaussian(
            dim=np.prod(self.agent.env_spaces.action.shape), std=1.)
def optim_initialize(self, rank=0): """Called in initilize or by async runner after forking sampler.""" self.rank = rank self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(), lr=self.learning_rate, **self.optim_kwargs) if self.fixed_alpha is None: self.target_entropy = -np.log( (1.0 / self.agent.env_spaces.action.n)) * 0.98 self._log_alpha = torch.zeros(1, requires_grad=True) self._alpha = self._log_alpha.exp() self.alpha_optimizer = self.OptimCls((self._log_alpha, ), lr=self.learning_rate, **self.optim_kwargs) else: self._log_alpha = torch.tensor([np.log(self.fixed_alpha)]) self._alpha = torch.tensor([self.fixed_alpha]) self.alpha_optimizer = None if self.target_entropy == "auto": self.target_entropy = -np.prod(self.agent.env_spaces.action.n) if self.initial_optim_state_dict is not None: self.load_optim_state_dict(self.initial_optim_state_dict) if self.action_prior == "gaussian": self.action_prior_distribution = Gaussian(dim=np.prod( self.agent.env_spaces.action.shape), std=1.)
def __init__(
        self,
        observation_shape,
        hidden_sizes,
        action_size,
        all_corners=False,
        ):
    super().__init__()
    self._obs_ndim = 1
    self._all_corners = all_corners
    input_dim = int(np.sum(observation_shape))
    print('all corners', self._all_corners)
    delta_dim = 12 if all_corners else 3
    self._delta_dim = delta_dim
    self.mlp = MlpModel(
        input_size=input_dim,
        hidden_sizes=hidden_sizes,
        output_size=2 * delta_dim + 4,  # Mean and std per delta dim, plus 4 location probs.
    )
    self.delta_distribution = Gaussian(
        dim=delta_dim,
        squash=True,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
    self.cat_distribution = Categorical(4)
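# Sketch (assumed, not the repo's forward method) of how a single head of size
# 2 * delta_dim + 4 is typically split into the pieces named in the comment
# above: delta mean, delta log_std, and 4 location logits for Categorical(4).
def split_head(mlp_out, delta_dim=3):
    mu = mlp_out[..., :delta_dim]
    log_std = mlp_out[..., delta_dim:2 * delta_dim]
    loc_logits = mlp_out[..., 2 * delta_dim:]  # Last 4 values.
    return mu, log_std, loc_logits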
def initialize(self, env_spaces, share_memory=False):
    super().initialize(env_spaces, share_memory)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        # min_std=MIN_STD,
        # clip=env_spaces.action.high[0],  # Probably +1?
    )
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    super().initialize(env_spaces, share_memory)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        # min_std=MIN_STD,
        clip=env_spaces.action.high[0],  # Probably +1?
    )
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    assert len(env_spaces.action.shape) == 1
    # assert len(np.unique(env_spaces.action.high)) == 1
    # assert np.all(env_spaces.action.low == -env_spaces.action.high)
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        # min_std=MIN_STD,
        # clip=env_spaces.action.high[0],  # Probably +1?
    )
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    super().initialize(env_spaces, share_memory)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        # min_std=MIN_STD,
        # clip=env_spaces.action.high[0],  # Probably +1?
    )
    self.distribution_omega = Categorical(
        dim=self.model_kwargs["option_size"])
def optim_initialize(self, rank=0):
    """Called by async runner."""
    self.rank = rank
    self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
        lr=self.learning_rate, **self.optim_kwargs)
    self.q_optimizers = [self.OptimCls(q_param)
        for q_param in self.agent.q_parameters()]
    self.alpha_optimizer = self.OptimCls([self.agent.log_alpha],
        lr=self.learning_rate, **self.optim_kwargs)
    if self.initial_optim_state_dict is not None:
        self.pi_optimizer.load_state_dict(self.initial_optim_state_dict)
    if self.action_prior == "gaussian":
        self.action_prior_distribution = Gaussian(
            dim=self.agent.env_spaces.action.size, std=1.)
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    super().initialize(env_spaces, share_memory, global_B, env_ranks)
    self.q2_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
    if self.initial_q2_model_state_dict is not None:
        self.q2_model.load_state_dict(self.initial_q2_model_state_dict)
    self.target_q2_model = self.QModelCls(
        **self.env_model_kwargs, **self.q_model_kwargs
    )
    self.target_q2_model.load_state_dict(self.q2_model.state_dict())
    self.target_distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        std=self.target_noise_std,
        noise_clip=self.target_noise_clip,
        clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
    )
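# Torch-only sketch of the TD3-style target-policy smoothing that a target
# distribution configured with std, noise_clip, and clip (as above) is
# commonly used for: add clipped Gaussian noise to the target policy's action,
# then clamp to the (assumed symmetric) action bounds. This illustrates the
# intent; it is not the rlpyt Gaussian implementation itself.
import torch

def smoothed_target_action(target_mu, std, noise_clip, act_limit):
    noise = (torch.randn_like(target_mu) * std).clamp(-noise_clip, noise_clip)
    return (target_mu + noise).clamp(-act_limit, act_limit)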
class GaussianPgAgent(BaseAgent):
    """
    Agent for policy gradient algorithm using Gaussian action distribution.
    """

    def __call__(self, observation, prev_action, prev_reward, device='cpu'):
        """Performs forward pass on training data, for algorithm."""
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        return buffer_to((DistInfoStd(mean=mu, log_std=log_std), value),
            device=device)

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        """Extends base method to build Gaussian distribution."""
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        assert len(env_spaces.action.shape) == 1
        assert len(np.unique(env_spaces.action.high)) == 1
        assert np.all(env_spaces.action.low == -env_spaces.action.high)
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute policy's action distribution from inputs, and sample an
        action. Calls the model to produce mean, log_std, and value estimate.
        Moves inputs to device and returns outputs back to CPU, for the
        sampler.  (no grad)
        """
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device=device)
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute the value estimate for the environment state, e.g. for the
        bootstrap value, V(s_{T+1}), in the sampler.  (no grad)
        """
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        _mu, _log_std, value = self.model(*model_inputs)
        return value.to(device)
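# Torch-only sketch of the diagonal-Gaussian math that Gaussian.sample() and
# its log-likelihood presumably wrap (DistInfoStd carries mean and log_std);
# illustrative only, not the rlpyt implementation.
import math
import torch

def sample_and_log_prob(mean, log_std):
    std = log_std.exp()
    noise = torch.randn_like(mean)
    action = mean + std * noise                     # Reparameterized sample.
    log_prob = (-0.5 * noise ** 2 - log_std
        - 0.5 * math.log(2 * math.pi)).sum(dim=-1)  # Sum over action dims.
    return action, log_prob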
def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples):
    if agent.recurrent:
        raise NotImplementedError
    self.agent = agent
    self.n_itr = n_itr
    self.mid_batch_reset = mid_batch_reset
    self.optimizer = self.OptimCls(agent.parameters(),
        lr=self.learning_rate, **self.optim_kwargs)
    if self.initial_optim_state_dict is not None:
        self.optimizer.load_state_dict(self.initial_optim_state_dict)
    sample_bs = batch_spec.size
    train_bs = self.batch_size
    assert (self.training_ratio * sample_bs) % train_bs == 0
    self.updates_per_optimize = int(
        (self.training_ratio * sample_bs) // train_bs)
    logger.log(
        f"From sampler batch size {sample_bs}, training "
        f"batch size {train_bs}, and training ratio "
        f"{self.training_ratio}, computed {self.updates_per_optimize} "
        f"updates per iteration.")
    self.min_itr_learn = self.min_steps_learn // sample_bs
    self.agent.give_min_itr_learn(self.min_itr_learn)
    example_to_buffer = SamplesToBuffer(
        observation=examples["observation"],
        action=examples["action"],
        reward=examples["reward"],
        done=examples["done"],
    )
    replay_kwargs = dict(
        example=example_to_buffer,
        size=self.replay_size,
        B=batch_spec.B,
        n_step_return=self.n_step_return,
    )
    self.replay_buffer = UniformReplayBuffer(**replay_kwargs)
    if self.action_prior == "gaussian":
        self.action_prior_distribution = Gaussian(
            dim=agent.env_spaces.action.size, std=1.)
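# Worked example (hypothetical numbers) of the update-count arithmetic above:
# training_ratio * sampler-batch-size transitions are consumed per iteration,
# split into training minibatches of size train_bs.
sample_bs = 1000        # e.g. batch_spec.size from the sampler
train_bs = 256          # self.batch_size
training_ratio = 256    # data consumption / data generation
assert (training_ratio * sample_bs) % train_bs == 0
updates_per_optimize = (training_ratio * sample_bs) // train_bs  # -> 1000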
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    """Extends base method to build Gaussian distribution."""
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    assert len(env_spaces.action.shape) == 1
    assert len(np.unique(env_spaces.action.high)) == 1
    assert np.all(env_spaces.action.low == -env_spaces.action.high)
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        # min_std=MIN_STD,
        # clip=env_spaces.action.high[0],  # Probably +1?
    )
    self.distribution_omega = Categorical(
        dim=self.model_kwargs["option_size"])
class RecurrentGaussianPgAgentBase(BaseAgent): def __call__(self, observation, prev_action, prev_reward, init_rnn_state): # Assume init_rnn_state already shaped: [N,B,H] model_inputs = buffer_to( (observation, prev_action, prev_reward, init_rnn_state), device=self.device) mu, log_std, value, next_rnn_state = self.model(*model_inputs) dist_info, value = buffer_to( (DistInfoStd(mean=mu, log_std=log_std), value), device="cpu") return dist_info, value, next_rnn_state # Leave rnn_state on device. def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None): super().initialize(env_spaces, share_memory) assert len(env_spaces.action.shape) == 1 self.distribution = Gaussian( dim=env_spaces.action.shape[0], # min_std=MIN_STD, # clip=env_spaces.action.high[0], # Probably +1? ) @torch.no_grad() def step(self, observation, prev_action, prev_reward): agent_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu, log_std, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state) dist_info = DistInfoStd(mean=mu, log_std=log_std) action = self.distribution.sample(dist_info) # Model handles None, but Buffer does not, make zeros if needed: prev_rnn_state = self.prev_rnn_state or buffer_func( rnn_state, torch.zeros_like) # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage. # (Special case: model should always leave B dimension in.) prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1) agent_info = AgentInfoRnn(dist_info=dist_info, value=value, prev_rnn_state=prev_rnn_state) action, agent_info = buffer_to((action, agent_info), device="cpu") self.advance_rnn_state(rnn_state) # Keep on device. return AgentStep(action=action, agent_info=agent_info) @torch.no_grad() def value(self, observation, prev_action, prev_reward): agent_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) _mu, _log_std, value, _rnn_state = self.model(*agent_inputs, self.prev_rnn_state) return value.to("cpu")
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    super(SacAgent, self).initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    self.target_model = self.ModelCls(**self.env_model_kwargs, **self.model_kwargs)
    self.target_model.load_state_dict(self.model.state_dict())
    if self.initial_model_state_dict is not None:
        self.load_state_dict(self.initial_model_state_dict)
    assert len(env_spaces.action.shape) == 1
    self.distribution = Gaussian(
        dim=env_spaces.action.shape[0],
        squash=self.action_squash,
        min_std=np.exp(MIN_LOG_STD),
        max_std=np.exp(MAX_LOG_STD),
    )
def optim_initialize(self, rank=0): """Called in initilize or by async runner after forking sampler.""" self.rank = rank # Be very explicit about which parameters are optimized where. self.pi_optimizer = self.OptimCls( chain( self.agent.pi_fc1.parameters(), # No conv. self.agent.pi_mlp.parameters(), ), lr=self.pi_lr, betas=(self.pi_beta, 0.999), ) self.q_optimizer = self.OptimCls( chain( () if self.stop_conv_grad else self.agent.conv.parameters(), self.agent.q_fc1.parameters(), self.agent.q_mlps.parameters(), ), lr=self.q_lr, betas=(self.q_beta, 0.999), ) self._log_alpha = torch.tensor(np.log(self.alpha_init), requires_grad=True) self._alpha = torch.exp(self._log_alpha.detach()) self.alpha_optimizer = self.OptimCls( (self._log_alpha,), lr=self.alpha_lr, betas=(self.alpha_beta, 0.999) ) if self.target_entropy == "auto": self.target_entropy = -np.prod(self.agent.env_spaces.action.shape) if self.initial_optim_state_dict is not None: self.load_optim_state_dict(self.initial_optim_state_dict) if self.action_prior == "gaussian": self.action_prior_distribution = Gaussian( dim=np.prod(self.agent.env_spaces.action.shape), std=1.0 )
class MultiAgentGaussianPgAgent(BaseAgent):

    def __call__(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        samples = (DistInfoStd(mean=mu, log_std=log_std), value)
        return buffer_to(samples, device="cpu")

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        for _a_space in env_spaces.action.space:
            assert len(_a_space.shape) == 1
            # assert len(np.unique(_a_space.high)) == 1
            assert np.all(_a_space.low == -_a_space.high)
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[-1],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        _mu, _log_std, value = self.model(*model_inputs)
        return value.to("cpu")
def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
    """Extends base method to build Gaussian distribution."""
    if not ((env_spaces.action.high == 1).all()
            and (env_spaces.action.low == -1).all()):
        raise ValueError("The space for all actions should be [-1, 1].")
    super().initialize(env_spaces, share_memory,
        global_B=global_B, env_ranks=env_ranks)
    self.distribution = Gaussian(dim=env_spaces.action.shape[0],
        min_std=1e-6, max_std=1)
class GaussianPgAgent(BasePgAgent):

    def __call__(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        return buffer_to((DistInfoStd(mean=mu, log_std=log_std), value),
            device="cpu")

    def initialize(self, env_spaces, share_memory=False):
        super().initialize(env_spaces, share_memory)
        assert len(env_spaces.action.shape) == 1
        assert len(np.unique(env_spaces.action.high)) == 1
        assert np.all(env_spaces.action.low == -env_spaces.action.high)
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        _mu, _log_std, value = self.model(*model_inputs)
        return value.to("cpu")
class SACDiscrete(RlAlgorithm): """Soft actor critic algorithm, training from a replay buffer.""" opt_info_fields = tuple(f for f in OptInfo._fields) # copy def __init__( self, discount=0.99, batch_size=256, min_steps_learn=int(1e4), replay_size=int(1e6), replay_ratio=256, # data_consumption / data_generation target_update_tau=0.005, # tau=1 for hard update. target_update_interval=1, # 1000 for hard update, 1 for soft. learning_rate=3e-4, fixed_alpha=None, # None for adaptive alpha, float for any fixed value OptimCls=torch.optim.Adam, optim_kwargs=None, initial_optim_state_dict=None, # for all of them. action_prior="uniform", # or "gaussian" reward_scale=1, target_entropy="auto", # "auto", float, or None reparameterize=True, clip_grad_norm=1e9, # policy_output_regularization=0.001, n_step_return=1, updates_per_sync=1, # For async mode only. bootstrap_timelimit=False, ReplayBufferCls=None, # Leave None to select by above options. ): """Save input arguments.""" if optim_kwargs is None: optim_kwargs = dict() assert action_prior in ["uniform", "gaussian"] self._batch_size = batch_size del batch_size # Property. save__init__args(locals()) def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples, world_size=1, rank=0): """Stores input arguments and initializes replay buffer and optimizer. Use in non-async runners. Computes number of gradient updates per optimization iteration as `(replay_ratio * sampler-batch-size / training-batch_size)`.""" self.agent = agent self.n_itr = n_itr self.mid_batch_reset = mid_batch_reset self.sampler_bs = sampler_bs = batch_spec.size self.updates_per_optimize = int(self.replay_ratio * sampler_bs / self.batch_size) logger.log( f"From sampler batch size {sampler_bs}, training " f"batch size {self.batch_size}, and replay ratio " f"{self.replay_ratio}, computed {self.updates_per_optimize} " f"updates per iteration.") self.min_itr_learn = self.min_steps_learn // sampler_bs agent.give_min_itr_learn(self.min_itr_learn) self.initialize_replay_buffer(examples, batch_spec) self.optim_initialize(rank) def async_initialize(self, agent, sampler_n_itr, batch_spec, mid_batch_reset, examples, world_size=1): """Used in async runner only; returns replay buffer allocated in shared memory, does not instantiate optimizer. 
""" self.agent = agent self.n_itr = sampler_n_itr self.initialize_replay_buffer(examples, batch_spec, async_=True) self.mid_batch_reset = mid_batch_reset self.sampler_bs = sampler_bs = batch_spec.size self.updates_per_optimize = self.updates_per_sync self.min_itr_learn = int(self.min_steps_learn // sampler_bs) agent.give_min_itr_learn(self.min_itr_learn) return self.replay_buffer def optim_initialize(self, rank=0): """Called in initilize or by async runner after forking sampler.""" self.rank = rank self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(), lr=self.learning_rate, **self.optim_kwargs) if self.fixed_alpha is None: self.target_entropy = -np.log( (1.0 / self.agent.env_spaces.action.n)) * 0.98 self._log_alpha = torch.zeros(1, requires_grad=True) self._alpha = self._log_alpha.exp() self.alpha_optimizer = self.OptimCls((self._log_alpha, ), lr=self.learning_rate, **self.optim_kwargs) else: self._log_alpha = torch.tensor([np.log(self.fixed_alpha)]) self._alpha = torch.tensor([self.fixed_alpha]) self.alpha_optimizer = None if self.target_entropy == "auto": self.target_entropy = -np.prod(self.agent.env_spaces.action.n) if self.initial_optim_state_dict is not None: self.load_optim_state_dict(self.initial_optim_state_dict) if self.action_prior == "gaussian": self.action_prior_distribution = Gaussian(dim=np.prod( self.agent.env_spaces.action.shape), std=1.) def initialize_replay_buffer(self, examples, batch_spec, async_=False): """ Allocates replay buffer using examples and with the fields in `SamplesToBuffer` namedarraytuple. """ example_to_buffer = SamplesToBuffer( observation=examples["observation"], action=examples["action"], reward=examples["reward"], done=examples["done"], ) if not self.bootstrap_timelimit: ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer else: example_to_buffer = SamplesToBufferTl( *example_to_buffer, timeout=examples["env_info"].timeout) ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer replay_kwargs = dict( example=example_to_buffer, size=self.replay_size, B=batch_spec.B, n_step_return=self.n_step_return, ) if self.ReplayBufferCls is not None: ReplayCls = self.ReplayBufferCls logger.log( f"WARNING: ignoring internal selection logic and using" f" input replay buffer class: {ReplayCls} -- compatibility not" " guaranteed.") self.replay_buffer = ReplayCls(**replay_kwargs) def optimize_agent(self, itr, samples=None, sampler_itr=None): """ Extracts the needed fields from input samples and stores them in the replay buffer. Then samples from the replay buffer to train the agent by gradient updates (with the number of updates determined by replay ratio, sampler batch size, and training batch size). """ itr = itr if sampler_itr is None else sampler_itr # Async uses sampler_itr. 
if samples is not None: samples_to_buffer = self.samples_to_buffer(samples) self.replay_buffer.append_samples(samples_to_buffer) opt_info = OptInfo(*([] for _ in range(len(OptInfo._fields)))) if itr < self.min_itr_learn: return opt_info for _ in range(self.updates_per_optimize): samples_from_replay = self.replay_buffer.sample_batch( self.batch_size) losses, values = self.loss(samples_from_replay) q1_loss, q2_loss, pi_loss, alpha_loss = losses if alpha_loss is not None: self.alpha_optimizer.zero_grad() alpha_loss.backward() self.alpha_optimizer.step() self._alpha = torch.exp(self._log_alpha.detach()) self.pi_optimizer.zero_grad() pi_loss.backward() pi_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.pi_parameters(), self.clip_grad_norm) self.pi_optimizer.step() # Step Q's last because pi_loss.backward() uses them? self.q1_optimizer.zero_grad() q1_loss.backward() q1_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.q1_parameters(), self.clip_grad_norm) self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() q2_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.q2_parameters(), self.clip_grad_norm) self.q2_optimizer.step() grad_norms = (q1_grad_norm, q2_grad_norm, pi_grad_norm) self.append_opt_info_(opt_info, losses, grad_norms, values) self.update_counter += 1 if self.update_counter % self.target_update_interval == 0: self.agent.update_target(self.target_update_tau) return opt_info def samples_to_buffer(self, samples): """Defines how to add data from sampler into the replay buffer. Called in optimize_agent() if samples are provided to that method.""" samples_to_buffer = SamplesToBuffer( observation=samples.env.observation, action=samples.agent.action, reward=samples.env.reward, done=samples.env.done, ) if self.bootstrap_timelimit: samples_to_buffer = SamplesToBufferTl( *samples_to_buffer, timeout=samples.env.env_info.timeout) return samples_to_buffer def loss(self, samples): """ Computes losses for twin Q-values against the min of twin target Q-values and an entropy term. Computes reparameterized policy loss, and loss for tuning entropy weighting, alpha. Input samples have leading batch dimension [B,..] (but not time). """ agent_inputs, target_inputs, action = buffer_to( (samples.agent_inputs, samples.target_inputs, samples.action)) if self.mid_batch_reset and not self.agent.recurrent: valid = torch.ones_like(samples.done, dtype=torch.float) # or None else: valid = valid_from_done(samples.done) if self.bootstrap_timelimit: # To avoid non-use of bootstrap when environment is 'done' due to # time-limit, turn off training on these samples. 
valid *= (1 - samples.timeout_n.float()) with torch.no_grad(): target_action, target_action_probs, target_log_pi, _ = self.agent.pi( *target_inputs) target_q1, target_q2 = self.agent.target_q(*target_inputs, target_action) min_target_q = torch.min(target_q1, target_q2) target_value = target_action_probs * (min_target_q - self._alpha * target_log_pi) target_value = target_value.sum(dim=1).unsqueeze(-1) disc = self.discount**self.n_step_return y = self.reward_scale * samples.return_ + ( 1 - samples.done_n.float()) * disc * target_value q1, q2 = self.agent.q(*agent_inputs, action) q1 = torch.gather(q1, 1, action.unsqueeze(1).long()) q2 = torch.gather(q2, 1, action.unsqueeze(1).long()) q1_loss = 0.5 * valid_mean((y - q1)**2, valid) q2_loss = 0.5 * valid_mean((y - q2)**2, valid) action, action_probs, log_pi, _ = self.agent.pi(*agent_inputs) q1_pi, q2_pi = self.agent.q(*agent_inputs, action) min_pi_target = torch.min(q1_pi, q2_pi) inside_term = self._alpha * log_pi - min_pi_target policy_loss = (action_probs * inside_term).sum(dim=1).mean() log_pi = torch.sum(log_pi * action_probs, dim=1) # if self.policy_output_regularization > 0: # pi_losses += self.policy_output_regularization * torch.mean( # 0.5 * pi_mean ** 2 + 0.5 * pi_log_std ** 2, dim=-1) pi_loss = valid_mean(policy_loss, valid) if self.target_entropy is not None and self.fixed_alpha is None: alpha_losses = -self._log_alpha * (log_pi.detach() + self.target_entropy) alpha_loss = valid_mean(alpha_losses, valid) else: alpha_loss = None losses = (q1_loss, q2_loss, pi_loss, alpha_loss) values = tuple(val.detach() for val in (q1, q2, action_probs)) return losses, values def get_action_prior(self, action): if self.action_prior == "uniform": prior_log_pi = 0.0 elif self.action_prior == "gaussian": prior_log_pi = self.action_prior_distribution.log_likelihood( action, GaussianDistInfo(mean=torch.zeros_like(action))) return prior_log_pi def append_opt_info_(self, opt_info, losses, grad_norms, values): """In-place.""" q1_loss, q2_loss, pi_loss, alpha_loss = losses q1_grad_norm, q2_grad_norm, pi_grad_norm = grad_norms q1, q2, action_probs = values opt_info.q1Loss.append(q1_loss.item()) opt_info.q2Loss.append(q2_loss.item()) opt_info.piLoss.append(pi_loss.item()) opt_info.q1GradNorm.append( torch.tensor(q1_grad_norm).item()) # backwards compatible opt_info.q2GradNorm.append( torch.tensor(q2_grad_norm).item()) # backwards compatible opt_info.piGradNorm.append( torch.tensor(pi_grad_norm).item()) # backwards compatible opt_info.q1.extend(q1[::10].numpy()) # Downsample for stats. opt_info.q2.extend(q2[::10].numpy()) opt_info.qMeanDiff.append(torch.mean(abs(q1 - q2)).item()) opt_info.alpha.append(self._alpha.item()) def optim_state_dict(self): return dict( pi_optimizer=self.pi_optimizer.state_dict(), q1_optimizer=self.q1_optimizer.state_dict(), q2_optimizer=self.q2_optimizer.state_dict(), alpha_optimizer=self.alpha_optimizer.state_dict() if self.alpha_optimizer else None, log_alpha=self._log_alpha.detach().item(), ) def load_optim_state_dict(self, state_dict): self.pi_optimizer.load_state_dict(state_dict["pi_optimizer"]) self.q1_optimizer.load_state_dict(state_dict["q1_optimizer"]) self.q2_optimizer.load_state_dict(state_dict["q2_optimizer"]) if self.alpha_optimizer is not None and state_dict[ "alpha_optimizer"] is not None: self.alpha_optimizer.load_state_dict(state_dict["alpha_optimizer"]) with torch.no_grad(): self._log_alpha[:] = state_dict["log_alpha"] self._alpha = torch.exp(self._log_alpha.detach())
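# Torch-only sketch of the discrete-SAC backup computed in loss() above: the
# target value is the policy-weighted expectation, over all actions, of the
# min of the twin target Q-values minus the entropy term (alpha * log pi).
# Shapes and names here are illustrative.
import torch

def discrete_sac_target_value(target_q1, target_q2, action_probs, log_pi, alpha):
    # target_q1, target_q2, action_probs, log_pi: [B, n_actions]
    min_target_q = torch.min(target_q1, target_q2)
    target_value = action_probs * (min_target_q - alpha * log_pi)
    return target_value.sum(dim=1, keepdim=True)  # [B, 1]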
class RecurrentGaussianPgAgentBase(BaseAgent): def __call__(self, observation, prev_action, prev_reward, init_rnn_state, device="cpu"): """Performs forward pass on training data, for algorithm (requires recurrent state input).""" # Assume init_rnn_state already shaped: [N,B,H] model_inputs = buffer_to( (observation, prev_action, prev_reward, init_rnn_state), device=self.device) mu, log_std, value, next_rnn_state = self.model(*model_inputs) dist_info, value = buffer_to( (DistInfoStd(mean=mu, log_std=log_std), value), device=device) return dist_info, value, next_rnn_state # Leave rnn_state on device. def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None): super().initialize(env_spaces, share_memory) assert len(env_spaces.action.shape) == 1 self.distribution = Gaussian( dim=env_spaces.action.shape[0], # min_std=MIN_STD, # clip=env_spaces.action.high[0], # Probably +1? ) @torch.no_grad() def step(self, observation, prev_action, prev_reward, device="cpu"): """ Compute policy's action distribution from inputs, and sample an action. Calls the model to produce mean, log_std, value estimate, and next recurrent state. Moves inputs to device and returns outputs back to CPU, for the sampler. Advances the recurrent state of the agent. (no grad) """ agent_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu, log_std, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state) dist_info = DistInfoStd(mean=mu, log_std=log_std) action = self.distribution.sample(dist_info) # Model handles None, but Buffer does not, make zeros if needed: prev_rnn_state = self.prev_rnn_state if self.prev_rnn_state is not None else buffer_func( rnn_state, torch.zeros_like) # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage. # (Special case: model should always leave B dimension in.) prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1) agent_info = AgentInfoRnn(dist_info=dist_info, value=value, prev_rnn_state=prev_rnn_state) action, agent_info = buffer_to((action, agent_info), device=device) self.advance_rnn_state(rnn_state) # Keep on device. return AgentStep(action=action, agent_info=agent_info) @torch.no_grad() def value(self, observation, prev_action, prev_reward, device="cpu"): """ Compute the value estimate for the environment state using the currently held recurrent state, without advancing the recurrent state, e.g. for the bootstrap value V(s_{T+1}), in the sampler. (no grad) """ agent_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) _mu, _log_std, value, _rnn_state = self.model(*agent_inputs, self.prev_rnn_state) return value.to(device)
class SAC_V(RlAlgorithm): """TO BE DEPRECATED.""" opt_info_fields = tuple(f for f in OptInfo._fields) # copy def __init__( self, discount=0.99, batch_size=256, min_steps_learn=int(1e4), replay_size=int(1e6), replay_ratio=256, # data_consumption / data_generation target_update_tau=0.005, # tau=1 for hard update. target_update_interval=1, # 1000 for hard update, 1 for soft. learning_rate=3e-4, OptimCls=torch.optim.Adam, optim_kwargs=None, initial_optim_state_dict=None, # for all of them. action_prior="uniform", # or "gaussian" reward_scale=1, reparameterize=True, clip_grad_norm=1e9, policy_output_regularization=0.001, n_step_return=1, updates_per_sync=1, # For async mode only. bootstrap_timelimit=True, ReplayBufferCls=None, # Leave None to select by above options. ): if optim_kwargs is None: optim_kwargs = dict() assert action_prior in ["uniform", "gaussian"] self._batch_size = batch_size del batch_size # Property. save__init__args(locals()) def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples, world_size=1, rank=0): """Used in basic or synchronous multi-GPU runners, not async.""" self.agent = agent self.n_itr = n_itr self.mid_batch_reset = mid_batch_reset self.sampler_bs = sampler_bs = batch_spec.size self.updates_per_optimize = int(self.replay_ratio * sampler_bs / self.batch_size) logger.log( f"From sampler batch size {sampler_bs}, training " f"batch size {self.batch_size}, and replay ratio " f"{self.replay_ratio}, computed {self.updates_per_optimize} " f"updates per iteration.") self.min_itr_learn = self.min_steps_learn // sampler_bs agent.give_min_itr_learn(self.min_itr_learn) self.initialize_replay_buffer(examples, batch_spec) self.optim_initialize(rank) def async_initialize(self, agent, sampler_n_itr, batch_spec, mid_batch_reset, examples, world_size=1): """Used in async runner only.""" self.agent = agent self.n_itr = sampler_n_itr self.initialize_replay_buffer(examples, batch_spec, async_=True) self.mid_batch_reset = mid_batch_reset self.sampler_bs = sampler_bs = batch_spec.size self.updates_per_optimize = self.updates_per_sync self.min_itr_learn = int(self.min_steps_learn // sampler_bs) agent.give_min_itr_learn(self.min_itr_learn) return self.replay_buffer def optim_initialize(self, rank=0): """Called by async runner.""" self.rank = rank self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(), lr=self.learning_rate, **self.optim_kwargs) self.v_optimizer = self.OptimCls(self.agent.v_parameters(), lr=self.learning_rate, **self.optim_kwargs) if self.initial_optim_state_dict is not None: self.load_optim_state_dict(self.initial_optim_state_dict) if self.action_prior == "gaussian": self.action_prior_distribution = Gaussian( dim=self.agent.env_spaces.action.size, std=1.) 
def initialize_replay_buffer(self, examples, batch_spec, async_=False): example_to_buffer = self.examples_to_buffer(examples) replay_kwargs = dict( example=example_to_buffer, size=self.replay_size, B=batch_spec.B, n_step_return=self.n_step_return, ) if not self.bootstrap_timelimit: ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer else: ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer if self.ReplayBufferCls is not None: ReplayCls = self.ReplayBufferCls logger.log( f"WARNING: ignoring internal selection logic and using" f" input replay buffer class: {ReplayCls} -- compatibility not" " guaranteed.") self.replay_buffer = ReplayCls(**replay_kwargs) def optimize_agent(self, itr, samples=None, sampler_itr=None): itr = itr if sampler_itr is None else sampler_itr # Async uses sampler_itr. if samples is not None: samples_to_buffer = self.samples_to_buffer(samples) self.replay_buffer.append_samples(samples_to_buffer) opt_info = OptInfo(*([] for _ in range(len(OptInfo._fields)))) if itr < self.min_itr_learn: return opt_info for _ in range(self.updates_per_optimize): samples_from_replay = self.replay_buffer.sample_batch( self.batch_size) losses, values = self.loss(samples_from_replay) q1_loss, q2_loss, v_loss, pi_loss = losses self.v_optimizer.zero_grad() v_loss.backward() v_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.v_parameters(), self.clip_grad_norm) self.v_optimizer.step() self.pi_optimizer.zero_grad() pi_loss.backward() pi_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.pi_parameters(), self.clip_grad_norm) self.pi_optimizer.step() # Step Q's last because pi_loss.backward() uses them? self.q1_optimizer.zero_grad() q1_loss.backward() q1_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.q1_parameters(), self.clip_grad_norm) self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() q2_grad_norm = torch.nn.utils.clip_grad_norm_( self.agent.q2_parameters(), self.clip_grad_norm) self.q2_optimizer.step() grad_norms = (q1_grad_norm, q2_grad_norm, v_grad_norm, pi_grad_norm) self.append_opt_info_(opt_info, losses, grad_norms, values) self.update_counter += 1 if self.update_counter % self.target_update_interval == 0: self.agent.update_target(self.target_update_tau) return opt_info def samples_to_buffer(self, samples): return SamplesToBuffer( observation=samples.env.observation, action=samples.agent.action, reward=samples.env.reward, done=samples.env.done, timeout=getattr(samples.env.env_info, "timeout", None), ) def examples_to_buffer(self, examples): """Defines how to initialize the replay buffer from examples. Called in initialize_replay_buffer(). """ return SamplesToBuffer( observation=examples["observation"], action=examples["action"], reward=examples["reward"], done=examples["done"], timeout=getattr(examples["env_info"], "timeout", None), ) def loss(self, samples): """Samples have leading batch dimension [B,..] 
(but not time).""" agent_inputs, target_inputs, action = buffer_to( (samples.agent_inputs, samples.target_inputs, samples.action)) q1, q2 = self.agent.q(*agent_inputs, action) with torch.no_grad(): target_v = self.agent.target_v(*target_inputs) disc = self.discount**self.n_step_return y = (self.reward_scale * samples.return_ + (1 - samples.done_n.float()) * disc * target_v) if self.mid_batch_reset and not self.agent.recurrent: valid = torch.ones_like(samples.done, dtype=torch.float) else: valid = valid_from_done(samples.done) if self.bootstrap_timelimit: # To avoid non-use of bootstrap when environment is 'done' due to # time-limit, turn off training on these samples. valid *= (1 - samples.timeout_n.float()) q1_loss = 0.5 * valid_mean((y - q1)**2, valid) q2_loss = 0.5 * valid_mean((y - q2)**2, valid) v = self.agent.v(*agent_inputs) new_action, log_pi, (pi_mean, pi_log_std) = self.agent.pi(*agent_inputs) if not self.reparameterize: new_action = new_action.detach() # No grad. log_target1, log_target2 = self.agent.q(*agent_inputs, new_action) min_log_target = torch.min(log_target1, log_target2) prior_log_pi = self.get_action_prior(new_action.cpu()) v_target = (min_log_target - log_pi + prior_log_pi).detach() # No grad. v_loss = 0.5 * valid_mean((v - v_target)**2, valid) if self.reparameterize: pi_losses = log_pi - min_log_target else: pi_factor = (v - v_target).detach() pi_losses = log_pi * pi_factor if self.policy_output_regularization > 0: pi_losses += self.policy_output_regularization * torch.mean( 0.5 * pi_mean**2 + 0.5 * pi_log_std**2, dim=-1) pi_loss = valid_mean(pi_losses, valid) losses = (q1_loss, q2_loss, v_loss, pi_loss) values = tuple(val.detach() for val in (q1, q2, v, pi_mean, pi_log_std)) return losses, values # def q_loss(self, samples): # """Samples have leading batch dimension [B,..] (but not time).""" # agent_inputs, target_inputs, action = buffer_to( # (samples.agent_inputs, samples.target_inputs, samples.action), # device=self.agent.device) # Move to device once, re-use. # q1, q2 = self.agent.q(*agent_inputs, action) # with torch.no_grad(): # target_v = self.agent.target_v(*target_inputs) # disc = self.discount ** self.n_step_return # y = (self.reward_scale * samples.return_ + # (1 - samples.done_n.float()) * disc * target_v) # if self.mid_batch_reset and not self.agent.recurrent: # valid = None # OR: torch.ones_like(samples.done, dtype=torch.float) # else: # valid = valid_from_done(samples.done) # q1_loss = 0.5 * valid_mean((y - q1) ** 2, valid) # q2_loss = 0.5 * valid_mean((y - q2) ** 2, valid) # losses = (q1_loss, q2_loss) # values = tuple(val.detach() for val in (q1, q2)) # return losses, values, agent_inputs, valid # def pi_v_loss(self, agent_inputs, valid): # v = self.agent.v(*agent_inputs) # new_action, log_pi, (pi_mean, pi_log_std) = self.agent.pi(*agent_inputs) # if not self.reparameterize: # new_action = new_action.detach() # No grad. # log_target1, log_target2 = self.agent.q(*agent_inputs, new_action) # min_log_target = torch.min(log_target1, log_target2) # prior_log_pi = self.get_action_prior(new_action.cpu()) # v_target = (min_log_target - log_pi + prior_log_pi).detach() # No grad. # v_loss = 0.5 * valid_mean((v - v_target) ** 2, valid) # if self.reparameterize: # pi_losses = log_pi - min_log_target # log_target1 # min_log_target # else: # pi_factor = (v - v_target).detach() # No grad. 
# pi_losses = log_pi * pi_factor # if self.policy_output_regularization > 0: # pi_losses += self.policy_output_regularization * torch.sum( # 0.5 * pi_mean ** 2 + 0.5 * pi_log_std ** 2, dim=-1) # pi_loss = valid_mean(pi_losses, valid) # losses = (v_loss, pi_loss) # values = tuple(val.detach() for val in (v, pi_mean, pi_log_std)) # return losses, values # def loss(self, samples): # """Samples have leading batch dimension [B,..] (but not time).""" # agent_inputs, target_inputs, action = buffer_to( # (samples.agent_inputs, samples.target_inputs, samples.action), # device=self.agent.device) # Move to device once, re-use. # q1, q2 = self.agent.q(*agent_inputs, action) # with torch.no_grad(): # target_v = self.agent.target_v(*target_inputs) # disc = self.discount ** self.n_step_return # y = (self.reward_scale * samples.return_ + # (1 - samples.done_n.float()) * disc * target_v) # if self.mid_batch_reset and not self.agent.recurrent: # valid = None # OR: torch.ones_like(samples.done, dtype=torch.float) # else: # valid = valid_from_done(samples.done) # q1_loss = 0.5 * valid_mean((y - q1) ** 2, valid) # q2_loss = 0.5 * valid_mean((y - q2) ** 2, valid) # v = self.agent.v(*agent_inputs) # new_action, log_pi, (pi_mean, pi_log_std) = self.agent.pi(*agent_inputs) # if not self.reparameterize: # new_action = new_action.detach() # No grad. # log_target1, log_target2 = self.agent.q(*agent_inputs, new_action) # min_log_target = torch.min(log_target1, log_target2) # prior_log_pi = self.get_action_prior(new_action.cpu()) # v_target = (min_log_target - log_pi + prior_log_pi).detach() # No grad. # v_loss = 0.5 * valid_mean((v - v_target) ** 2, valid) # if self.reparameterize: # pi_losses = log_pi - min_log_target # log_target1 # else: # pi_factor = (v - v_target).detach() # No grad. # pi_losses = log_pi * pi_factor # if self.policy_output_regularization > 0: # pi_losses += torch.sum(self.policy_output_regularization * 0.5 * # pi_mean ** 2 + pi_log_std ** 2, dim=-1) # pi_loss = valid_mean(pi_losses, valid) # losses = (q1_loss, q2_loss, v_loss, pi_loss) # values = tuple(val.detach() for val in (q1, q2, v, pi_mean, pi_log_std)) # return losses, values def get_action_prior(self, action): if self.action_prior == "uniform": prior_log_pi = 0.0 elif self.action_prior == "gaussian": prior_log_pi = self.action_prior_distribution.log_likelihood( action, GaussianDistInfo(mean=torch.zeros_like(action))) return prior_log_pi def append_opt_info_(self, opt_info, losses, grad_norms, values): """In-place.""" q1_loss, q2_loss, v_loss, pi_loss = losses q1_grad_norm, q2_grad_norm, v_grad_norm, pi_grad_norm = grad_norms q1, q2, v, pi_mean, pi_log_std = values opt_info.q1Loss.append(q1_loss.item()) opt_info.q2Loss.append(q2_loss.item()) opt_info.vLoss.append(v_loss.item()) opt_info.piLoss.append(pi_loss.item()) opt_info.q1GradNorm.append( torch.tensor(q1_grad_norm).item()) # backwards compatible opt_info.q2GradNorm.append( torch.tensor(q2_grad_norm).item()) # backwards compatible opt_info.vGradNorm.append( torch.tensor(v_grad_norm).item()) # backwards compatible opt_info.piGradNorm.append( torch.tensor(pi_grad_norm).item()) # backwards compatible opt_info.q1.extend(q1[::10].numpy()) # Downsample for stats. 
opt_info.q2.extend(q2[::10].numpy()) opt_info.v.extend(v[::10].numpy()) opt_info.piMu.extend(pi_mean[::10].numpy()) opt_info.piLogStd.extend(pi_log_std[::10].numpy()) opt_info.qMeanDiff.append(torch.mean(abs(q1 - q2)).item()) def optim_state_dict(self): return dict( pi_optimizer=self.pi_optimizer.state_dict(), q1_optimizer=self.q1_optimizer.state_dict(), q2_optimizer=self.q2_optimizer.state_dict(), v_optimizer=self.v_optimizer.state_dict(), ) def load_optim_state_dict(self, state_dict): self.pi_optimizer.load_state_dict(state_dict["pi_optimizer"]) self.q1_optimizer.load_state_dict(state_dict["q1_optimizer"]) self.q2_optimizer.load_state_dict(state_dict["q2_optimizer"]) self.v_optimizer.load_state_dict(state_dict["v_optimizer"])
class SacAgent(BaseAgent): """TO BE DEPRECATED.""" def __init__( self, ModelCls=PiMlpModel, # Pi model. QModelCls=QofMuMlpModel, VModelCls=VMlpModel, model_kwargs=None, # Pi model. q_model_kwargs=None, v_model_kwargs=None, initial_model_state_dict=None, # All models. action_squash=1., # Max magnitude (or None). pretrain_std=0.75, # With squash 0.75 is near uniform. ): if model_kwargs is None: model_kwargs = dict(hidden_sizes=[256, 256]) if q_model_kwargs is None: q_model_kwargs = dict(hidden_sizes=[256, 256]) if v_model_kwargs is None: v_model_kwargs = dict(hidden_sizes=[256, 256]) super().__init__(ModelCls=ModelCls, model_kwargs=model_kwargs, initial_model_state_dict=initial_model_state_dict) save__init__args(locals()) self.min_itr_learn = 0 # Get from algo. def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None): _initial_model_state_dict = self.initial_model_state_dict self.initial_model_state_dict = None # Don't let base agent try to load. super().initialize(env_spaces, share_memory, global_B=global_B, env_ranks=env_ranks) self.initial_model_state_dict = _initial_model_state_dict self.q1_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs) self.q2_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs) self.v_model = self.VModelCls(**self.env_model_kwargs, **self.v_model_kwargs) self.target_v_model = self.VModelCls(**self.env_model_kwargs, **self.v_model_kwargs) self.target_v_model.load_state_dict(self.v_model.state_dict()) if self.initial_model_state_dict is not None: self.load_state_dict(self.initial_model_state_dict) assert len(env_spaces.action.shape) == 1 self.distribution = Gaussian( dim=env_spaces.action.shape[0], squash=self.action_squash, min_std=np.exp(MIN_LOG_STD), max_std=np.exp(MAX_LOG_STD), ) def to_device(self, cuda_idx=None): super().to_device(cuda_idx) self.q1_model.to(self.device) self.q2_model.to(self.device) self.v_model.to(self.device) self.target_v_model.to(self.device) def data_parallel(self): super().data_parallel DDP_WRAP = DDPC if self.device.type == "cpu" else DDP self.q1_model = DDP_WRAP(self.q1_model) self.q2_model = DDP_WRAP(self.q2_model) self.v_model = DDP_WRAP(self.v_model) def give_min_itr_learn(self, min_itr_learn): self.min_itr_learn = min_itr_learn # From algo. def make_env_to_model_kwargs(self, env_spaces): assert len(env_spaces.action.shape) == 1 return dict( observation_shape=env_spaces.observation.shape, action_size=env_spaces.action.shape[0], ) def q(self, observation, prev_action, prev_reward, action): model_inputs = buffer_to((observation, prev_action, prev_reward, action), device=self.device) q1 = self.q1_model(*model_inputs) q2 = self.q2_model(*model_inputs) return q1.cpu(), q2.cpu() def v(self, observation, prev_action, prev_reward): model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) v = self.v_model(*model_inputs) return v.cpu() def pi(self, observation, prev_action, prev_reward): model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mean, log_std = self.model(*model_inputs) dist_info = DistInfoStd(mean=mean, log_std=log_std) action, log_pi = self.distribution.sample_loglikelihood(dist_info) # action = self.distribution.sample(dist_info) # log_pi = self.distribution.log_likelihood(action, dist_info) log_pi, dist_info = buffer_to((log_pi, dist_info), device="cpu") return action, log_pi, dist_info # Action stays on device for q models. 
def target_v(self, observation, prev_action, prev_reward): model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) target_v = self.target_v_model(*model_inputs) return target_v.cpu() @torch.no_grad() def step(self, observation, prev_action, prev_reward): model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mean, log_std = self.model(*model_inputs) dist_info = DistInfoStd(mean=mean, log_std=log_std) action = self.distribution.sample(dist_info) agent_info = AgentInfo(dist_info=dist_info) action, agent_info = buffer_to((action, agent_info), device="cpu") return AgentStep(action=action, agent_info=agent_info) def update_target(self, tau=1): update_state_dict(self.target_v_model, self.v_model.state_dict(), tau) @property def models(self): return Models(pi=self.model, q1=self.q1_model, q2=self.q2_model, v=self.v_model) def pi_parameters(self): return self.model.parameters() def q1_parameters(self): return self.q1_model.parameters() def q2_parameters(self): return self.q2_model.parameters() def v_parameters(self): return self.v_model.parameters() def train_mode(self, itr): super().train_mode(itr) self.q1_model.train() self.q2_model.train() self.v_model.train() def sample_mode(self, itr): super().sample_mode(itr) self.q1_model.eval() self.q2_model.eval() self.v_model.eval() if itr == 0: logger.log(f"Agent at itr {itr}, sample std: {self.pretrain_std}") if itr == self.min_itr_learn: logger.log(f"Agent at itr {itr}, sample std: learned.") std = None if itr >= self.min_itr_learn else self.pretrain_std self.distribution.set_std(std) # If None: std from policy dist_info. def eval_mode(self, itr): super().eval_mode(itr) self.q1_model.eval() self.q2_model.eval() self.v_model.eval() self.distribution.set_std(0.) # Deterministic (dist_info std ignored). def state_dict(self): return dict( model=self.model.state_dict(), # Pi model. q1_model=self.q1_model.state_dict(), q2_model=self.q2_model.state_dict(), v_model=self.v_model.state_dict(), target_v_model=self.target_v_model.state_dict(), ) def load_state_dict(self, state_dict): self.model.load_state_dict(state_dict["model"]) self.q1_model.load_state_dict(state_dict["q1_model"]) self.q2_model.load_state_dict(state_dict["q2_model"]) self.v_model.load_state_dict(state_dict["v_model"]) self.target_v_model.load_state_dict(state_dict["target_v_model"])
class DdpgAgent(BaseAgent): """Agent for deep deterministic policy gradient algorithm.""" shared_mu_model = None def __init__( self, ModelCls=MuMlpModel, # Mu model. QModelCls=QofMuMlpModel, model_kwargs=None, # Mu model. q_model_kwargs=None, initial_model_state_dict=None, # Mu model. initial_q_model_state_dict=None, action_std=0.1, action_noise_clip=None, ): """Saves input arguments; default network sizes saved here.""" if model_kwargs is None: model_kwargs = dict(hidden_sizes=[400, 300]) if q_model_kwargs is None: q_model_kwargs = dict(hidden_sizes=[400, 300]) save__init__args(locals()) super().__init__() # For async setup. def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None): """Instantiates mu and q, and target_mu and target_q models.""" super().initialize(env_spaces, share_memory, global_B=global_B, env_ranks=env_ranks) self.q_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs) if self.initial_q_model_state_dict is not None: self.q_model.load_state_dict(self.initial_q_model_state_dict) self.target_model = self.ModelCls(**self.env_model_kwargs, **self.model_kwargs) self.target_q_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs) self.target_q_model.load_state_dict(self.q_model.state_dict()) assert len(env_spaces.action.shape) == 1 self.distribution = Gaussian( dim=env_spaces.action.shape[0], std=self.action_std, noise_clip=self.action_noise_clip, clip=env_spaces.action.high[0], # Assume symmetric low=-high. ) def to_device(self, cuda_idx=None): super().to_device(cuda_idx) # Takes care of self.model. self.target_model.to(self.device) self.q_model.to(self.device) self.target_q_model.to(self.device) def data_parallel(self): device_id = super().data_parallel() # Takes care of self.model. self.q_model = DDP( self.q_model, device_ids=None if device_id is None else [device_id], # 1 GPU. 
output_device=device_id, ) return device_id def make_env_to_model_kwargs(self, env_spaces): assert len(env_spaces.action.shape) == 1 return dict( observation_shape=env_spaces.observation.shape, action_size=env_spaces.action.shape[0], ) def q(self, observation, prev_action, prev_reward, action): """Compute Q-value for input state/observation and action (with grad).""" model_inputs = buffer_to( (observation, prev_action, prev_reward, action), device=self.device) q = self.q_model(*model_inputs) return q.cpu() def q_at_mu(self, observation, prev_action, prev_reward): """Compute Q-value for input state/observation, through the mu_model (with grad).""" model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu = self.model(*model_inputs) q = self.q_model(*model_inputs, mu) return q.cpu() def target_q_at_mu(self, observation, prev_action, prev_reward): """Compute target Q-value for input state/observation, through the target mu_model.""" model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) target_mu = self.target_model(*model_inputs) target_q_at_mu = self.target_q_model(*model_inputs, target_mu) return target_q_at_mu.cpu() @torch.no_grad() def step(self, observation, prev_action, prev_reward): """Computes distribution parameters (mu) for state/observation, returns (gaussian) sampled action.""" model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) mu = self.model(*model_inputs) action = self.distribution.sample(DistInfo(mean=mu)) agent_info = AgentInfo(mu=mu) action, agent_info = buffer_to((action, agent_info), device="cpu") return AgentStep(action=action, agent_info=agent_info) def update_target(self, tau=1): update_state_dict(self.target_model, self.model.state_dict(), tau) update_state_dict(self.target_q_model, self.q_model.state_dict(), tau) def q_parameters(self): return self.q_model.parameters() def mu_parameters(self): return self.model.parameters() def train_mode(self, itr): super().train_mode(itr) self.q_model.train() def sample_mode(self, itr): super().sample_mode(itr) self.q_model.eval() self.distribution.set_std(self.action_std) def eval_mode(self, itr): super().eval_mode(itr) self.q_model.eval() self.distribution.set_std(0.) # Deterministic. def state_dict(self): return dict( model=self.model.state_dict(), q_model=self.q_model.state_dict(), target_model=self.target_model.state_dict(), target_q_model=self.target_q_model.state_dict(), ) def load_state_dict(self, state_dict): self.model.load_state_dict(state_dict["model"]) self.q_model.load_state_dict(state_dict["q_model"]) self.target_model.load_state_dict(state_dict["target_model"]) self.target_q_model.load_state_dict(state_dict["target_q_model"])
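# Sketch of the soft (Polyak) target update that update_state_dict(...) with a
# tau argument presumably performs in update_target() above; tau=1 recovers a
# hard copy, matching the "tau=1 for hard update" comments. Illustrative,
# torch-only, assuming floating-point state-dict entries.
import torch

def soft_update(target_model, source_state_dict, tau=0.005):
    target_state = target_model.state_dict()
    for k in target_state:
        target_state[k] = tau * source_state_dict[k] + (1 - tau) * target_state[k]
    target_model.load_state_dict(target_state)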