def ddpg_per(self, train_config, device, dtype):
    """Build a DDPGPer agent on the requested device/dtype for API tests.

    Creates online and target actor/critic networks wrapped with ``smw``
    (static model wrapper), then assembles a :class:`DDPGPer` agent using
    Adam and a sum-reduced MSE loss, with the replay buffer kept on CPU.
    """
    cfg = train_config

    def make_actor():
        # Online and target actors share the same architecture.
        net = Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range)
        return smw(net.type(dtype).to(device), device, device)

    def make_critic():
        net = Critic(cfg.observe_dim, cfg.action_dim)
        return smw(net.type(dtype).to(device), device, device)

    agent = DDPGPer(
        make_actor(),
        make_actor(),
        make_critic(),
        make_critic(),
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=cfg.replay_size,
    )
    return agent
def ddpg_per_vis(self, train_config, device, dtype, tmpdir):
    """Build a DDPGPer agent with visualization enabled.

    Not used for training — only exercises the visualization-related APIs.
    Graph output is written to a fresh numbered directory under ``tmpdir``.
    """
    cfg = train_config
    vis_dir = tmpdir.make_numbered_dir()

    def make_actor():
        net = Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range)
        return smw(net.type(dtype).to(device), device, device)

    def make_critic():
        net = Critic(cfg.observe_dim, cfg.action_dim)
        return smw(net.type(dtype).to(device), device, device)

    agent = DDPGPer(
        make_actor(),
        make_actor(),
        make_critic(),
        make_critic(),
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=cfg.replay_size,
        visualize=True,
        visualize_dir=str(vis_dir),
    )
    return agent
def test_criterion(self, train_config, device, dtype):
    """DDPGPer must reject a criterion without a ``reduction`` attribute.

    Prioritized replay needs per-sample losses, so the constructor is
    expected to raise ``RuntimeError`` when handed a plain callable.
    """
    cfg = train_config

    def make_actor():
        net = Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range)
        return smw(net.type(dtype).to(device), device, device)

    def make_critic():
        net = Critic(cfg.observe_dim, cfg.action_dim)
        return smw(net.type(dtype).to(device), device, device)

    actor, actor_t = make_actor(), make_actor()
    critic, critic_t = make_critic(), make_critic()

    # A bare function has no 'reduction' property, so construction must fail.
    def criterion(a, b):
        return a - b

    with pytest.raises(
        RuntimeError, match="Criterion does not have the 'reduction' property"
    ):
        _ = DDPGPer(
            actor,
            actor_t,
            critic,
            critic_t,
            t.optim.Adam,
            criterion,
            replay_device="cpu",
            replay_size=cfg.replay_size,
        )
def ddpg_per_train(self, train_config):
    """Build a CPU-only DDPGPer agent for the full-training test.

    CPU is deliberately used here because it is faster for the small
    networks involved in full training tests.
    """
    cfg = train_config

    def wrap(net):
        return smw(net, "cpu", "cpu")

    agent = DDPGPer(
        wrap(Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range)),
        wrap(Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range)),
        wrap(Critic(cfg.observe_dim, cfg.action_dim)),
        wrap(Critic(cfg.observe_dim, cfg.action_dim)),
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=cfg.replay_size,
    )
    return agent
def ddpg_per(self, train_config):
    """Build a DDPGPer agent on the device named in ``train_config``.

    Both the networks and the replay buffer live on ``train_config.device``.
    """
    cfg = train_config
    dev = cfg.device

    def make_actor():
        net = Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range)
        return smw(net.to(dev), dev, dev)

    def make_critic():
        net = Critic(cfg.observe_dim, cfg.action_dim)
        return smw(net.to(dev), dev, dev)

    agent = DDPGPer(
        make_actor(),
        make_actor(),
        make_critic(),
        make_critic(),
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device=dev,
        replay_size=cfg.replay_size,
    )
    return agent
def _build_model(self):
    """Construct the DDPG-with-prioritized-replay agent for this instance.

    Builds fresh online and target actor/critic networks via the sibling
    ``_build_actor`` / ``_build_critic`` helpers, then assembles a
    :class:`DDPGPer` agent stored on ``self.ddpg_per``.

    Side effects:
        Sets ``self.ddpg_per``.
    """
    actor = self._build_actor()
    actor_target = self._build_actor()
    critic = self._build_critic()
    critic_target = self._build_critic()

    # PEP 8 (E731): use a def instead of assigning a lambda to a name.
    # DDPGPer calls this factory with (params, lr) for each network's
    # optimizer; weight decay applies L2 regularization.
    def optimizer(params, lr):
        return torch.optim.Adam(params, lr=lr, weight_decay=self.l2_reg)

    # Sum reduction is required: DDPGPer needs a criterion exposing a
    # 'reduction' property so it can compute per-sample priorities.
    criterion = nn.MSELoss(reduction='sum')

    # DDPG with prioritized replay
    self.ddpg_per = DDPGPer(
        actor,
        actor_target,
        critic,
        critic_target,
        optimizer=optimizer,
        criterion=criterion,
        batch_size=self.batch_size,
        actor_learning_rate=self.actor_learning_rate,
        critic_learning_rate=self.critic_learning_rate,
        discount=self.gamma,
        replay_size=self.replay_capacity,
    )