def td3_lr(self, train_config, device, dtype):
    # not used for training, only used for testing apis
    c = train_config
    actor = smw(ActorDiscrete(c.observe_dim, c.action_dim)
                .type(dtype).to(device), device, device)
    actor_t = smw(ActorDiscrete(c.observe_dim, c.action_dim)
                  .type(dtype).to(device), device, device)
    critic = smw(Critic(c.observe_dim, c.action_dim)
                 .type(dtype).to(device), device, device)
    critic_t = smw(Critic(c.observe_dim, c.action_dim)
                   .type(dtype).to(device), device, device)
    critic2 = smw(Critic(c.observe_dim, c.action_dim)
                  .type(dtype).to(device), device, device)
    critic2_t = smw(Critic(c.observe_dim, c.action_dim)
                    .type(dtype).to(device), device, device)
    lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)],
                                     logger=logger)
    # omitting lr_scheduler_args must fail: LambdaLR requires lr_lambda
    with pytest.raises(TypeError, match="missing .+ positional argument"):
        _ = TD3(actor, actor_t,
                critic, critic_t,
                critic2, critic2_t,
                t.optim.Adam,
                nn.MSELoss(reduction='sum'),
                replay_device="cpu",
                replay_size=c.replay_size,
                lr_scheduler=LambdaLR)
    td3 = TD3(actor, actor_t,
              critic, critic_t,
              critic2, critic2_t,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device="cpu",
              replay_size=c.replay_size,
              lr_scheduler=LambdaLR,
              lr_scheduler_args=((lr_func,), (lr_func,), (lr_func,)))
    return td3
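# A minimal sketch (not one of the fixtures above) of how the
# scheduler-equipped instance might be exercised. It assumes machin's
# TD3 inherits update_lr_scheduler() from its framework base class to
# step each attached LambdaLR; the helper name is hypothetical.
def exercise_lr_scheduler(td3, steps=10):
    # with the lr_func above, the learning rate stays at 1e-3 until
    # scheduler step 200000, then drops to 3e-4
    for _ in range(steps):
        td3.update_lr_scheduler()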
def td3(self, train_config):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device, c.device)
    actor_t = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device, c.device)
    critic = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    critic2 = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    critic2_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    td3 = TD3(actor, actor_t,
              critic, critic_t,
              critic2, critic2_t,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
              replay_size=c.replay_size)
    return td3
def td3_vis(self, train_config, tmpdir):
    # not used for training, only used for testing apis
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device, c.device)
    actor_t = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device, c.device)
    critic = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    critic2 = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    critic2_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device),
        c.device, c.device)
    td3 = TD3(actor, actor_t,
              critic, critic_t,
              critic2, critic2_t,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
              replay_size=c.replay_size,
              visualize=True,
              visualize_dir=str(tmp_dir))
    return td3
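# A minimal sketch of how the visualization fixture might be checked,
# assuming that with visualize=True a call to update() renders the
# computation graph into visualize_dir. The helper name and the bare
# non-emptiness assertion are illustrative, not an API guarantee.
def check_visualize_output(td3_vis, tmp_dir):
    td3_vis.update()
    # at least one rendered graph file should appear in visualize_dir
    assert len(list(tmp_dir.listdir())) > 0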
def test_config_init(self, train_config):
    c = train_config
    config = TD3.generate_config({})
    config["frame_config"]["models"] = [
        "Actor", "Actor", "Critic", "Critic", "Critic", "Critic",
    ]
    # two actor kwargs dicts followed by four critic kwargs dicts,
    # matching the model order above
    config["frame_config"]["model_kwargs"] = [{
        "state_dim": c.observe_dim,
        "action_dim": c.action_dim,
        "action_range": c.action_range,
    }] * 2 + [{
        "state_dim": c.observe_dim,
        "action_dim": c.action_dim,
    }] * 4
    td3 = TD3.init_from_config(config)

    old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
    action = t.zeros([1, c.action_dim], dtype=t.float32)
    td3.store_episode([{
        "state": {"state": old_state},
        "action": {"action": action},
        "next_state": {"state": state},
        "reward": 0,
        "terminal": False,
    } for _ in range(3)])
    td3.update()
def td3_train(self, train_config):
    c = train_config
    # cpu is faster for testing full training.
    actor = smw(Actor(c.observe_dim, c.action_dim, c.action_range),
                "cpu", "cpu")
    actor_t = smw(Actor(c.observe_dim, c.action_dim, c.action_range),
                  "cpu", "cpu")
    critic = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic_t = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic2 = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic2_t = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    td3 = TD3(actor, actor_t,
              critic, critic_t,
              critic2, critic2_t,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device="cpu",
              replay_size=c.replay_size)
    return td3
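# A minimal sketch of a full-training loop built on the td3_train
# fixture. The environment name ("Pendulum-v0"), the noise parameters,
# and the episode count are illustrative assumptions; act_with_noise,
# store_transition, and update are assumed from machin's DDPG-family
# API, and the transition layout matches test_config_init above.
import gym

def run_full_training(td3, episodes=100):
    env = gym.make("Pendulum-v0")
    for _ in range(episodes):
        state = t.tensor(env.reset(), dtype=t.float32).view(1, -1)
        terminal = False
        while not terminal:
            with t.no_grad():
                # sample an exploratory action with gaussian noise
                action = td3.act_with_noise(
                    {"state": state}, noise_param=(0.0, 0.2),
                    mode="normal")
            raw_next, reward, terminal, _ = env.step(
                action.cpu().numpy().flatten())
            next_state = t.tensor(raw_next, dtype=t.float32).view(1, -1)
            td3.store_transition({
                "state": {"state": state},
                "action": {"action": action},
                "next_state": {"state": next_state},
                "reward": float(reward),
                "terminal": terminal,
            })
            state = next_state
        # one actor/critic update per finished episode
        td3.update()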
def td3(self, train_config, device, dtype):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range)
        .type(dtype).to(device),
        device,
        device,
    )
    actor_t = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range)
        .type(dtype).to(device),
        device,
        device,
    )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device),
        device, device)
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device),
        device, device)
    critic2 = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device),
        device, device)
    critic2_t = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device),
        device, device)
    td3 = TD3(
        actor,
        actor_t,
        critic,
        critic_t,
        critic2,
        critic2_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return td3