def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        clip_grad=self.hyperparameters["clip_grad"]
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return A2C(
        features,
        v,
        policy,
        n_envs=self.hyperparameters["n_envs"],
        n_steps=self.hyperparameters["n_steps"],
        discount_factor=self.hyperparameters["discount_factor"],
        entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
        writer=writer
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters["min_batch_size"]

    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return DeepmindAtariBody(
        VPG(
            features,
            v,
            policy,
            discount_factor=self.hyperparameters["discount_factor"],
            min_batch_size=self.hyperparameters["min_batch_size"]
        ),
    )
def _vpg(env, writer=DummyWriter()):
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)

    return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return VPG(
        features,
        v,
        policy,
        discount_factor=self.hyperparameters["discount_factor"],
        min_batch_size=self.hyperparameters["min_batch_size"]
    )
def _vac(envs, writer=DummyWriter()):
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(envs[0]).to(device)
    feature_model = feature_model_constructor().to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)

    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
    )
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)

    return DeepmindAtariBody(
        VAC(features, v, policy, discount_factor=discount_factor),
    )
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)

    return PPO(
        features,
        v,
        policy,
        epsilon=epsilon,
        epochs=epochs,
        lam=lam,
        minibatches=minibatches,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)

    return A2C(
        features,
        v,
        policy,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def _vpg(env, writer=DummyWriter()):
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )

    return VPG(features, v, policy, gamma=gamma, min_batch_size=min_batch_size)
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]

    # Update epochs * minibatches times per update,
    # but we only update once per n_steps,
    # with n_envs and 4 frames per step
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)

    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )

    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
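For a sense of scale, here is a quick worked example of the `final_anneal_step` arithmetic used above; the hyperparameter values are illustrative only, not the preset's defaults.

# Illustrative numbers only: with last_frame=10e6, epochs=4, minibatches=4,
# n_steps=128, n_envs=8, and 4 frames per step, the schedulers anneal over
# 10e6 * 4 * 4 / (128 * 8 * 4) = 39062.5 optimizer updates.
last_frame, epochs, minibatches, n_steps, n_envs = 10e6, 4, 4, 128, 8
final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
print(final_anneal_step)  # 39062.5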
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = conv_features().to(device)
    value_model = value_net().to(device)
    policy_model = policy_net(env).to(device)

    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr * feature_lr_scaling, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = ValueNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )

    return ParallelAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
        envs,
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches']
        / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    )

    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters["clip_initial"],
                self.hyperparameters["clip_final"],
                0,
                n_updates,
                name='clip',
                writer=writer
            ),
            epochs=self.hyperparameters["epochs"],
            minibatches=self.hyperparameters["minibatches"],
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            lam=self.hyperparameters["lam"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer,
        )
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    final_anneal_step = last_frame / (n_steps * n_envs)

    value_model = value_head().to(device)
    policy_model = policy_head(env).to(device)
    feature_model = conv_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )

    return FrameStack(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=LinearScheduler(
                entropy_loss_scaling, 0., 0, final_anneal_step,
                name="entropy_loss_scaling", writer=writer
            ),
            writer=writer
        ),
        size=4
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    final_anneal_step = last_frame / (n_steps * n_envs * 4)

    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        ),
    )
def _actor_critic(env, writer=DummyWriter()):
    value_model = fc_value(env).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_v)
    v = ValueNetwork(value_model, value_optimizer, writer=writer)

    policy_model = fc_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, env.action_space.n, writer=writer)

    return ActorCritic(v, policy)
def _vac(env, writer=DummyWriter()):
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor(env).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)

    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer)

    return VAC(features, v, policy, discount_factor=discount_factor)
def _vac(env, writer=DummyWriter()):
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_model = fc_relu_features(env).to(device)

    value_optimizer = RMSprop(value_model.parameters(), lr=lr_v, alpha=alpha, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)
    feature_optimizer = RMSprop(feature_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)

    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, env.action_space.n, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer)

    return VAC(features, v, policy, gamma=discount_factor)
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
    )
def _vpg_atari(env, writer=DummyWriter()):
    feature_model = nature_features().to(device)
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)

    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr * feature_lr_scaling, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, gamma=discount_factor, min_batch_size=min_batch_size),
    )
def _vpg_atari(env, writer=DummyWriter()):
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size),
        episodic_lives=True
    )
def test_agent(self):
    features = FeatureNetwork(copy.deepcopy(self.feature_model))
    policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
    return DeepmindAtariBody(VACTestAgent(features, policy))
class DiversityLearner:
    def __init__(
            self,
            model_fn,
            model_features,
            logger,
            device,
            num_targets,
            max_learn_steps,
            num_actions,
            obs_preproc,
            discount_factor=0.99,
            entropy_target=-2,
            lr_value=1e-3,
            lr_pi=1e-4,
            # Training settings
            polyak_rate=0.005,
            # Replay Buffer settings
            replay_start_size=5000,
            replay_buffer_size=1e6,
            # Exploration settings
            temperature_initial=0.1,
            lr_temperature=1e-5,
            entropy_target_scaling=1.,
    ):
        self.writer = writer = DummyWriter()
        eps = 1e-5
        self.discount_factor = discount_factor
        self.entropy_target = entropy_target
        self.temperature = temperature_initial
        self.lr_temperature = lr_temperature
        self.logger = logger
        self.device = device
        self.num_targets = num_targets
        self.max_learn_steps = max_learn_steps
        self.num_actions = num_actions
        final_anneal_step = max_learn_steps

        self.policy = DiversityPolicy(model_fn, model_features, num_actions,
                                      num_targets, obs_preproc, device)
        self.policy = self.policy.to(device)
        self.obs_preproc = obs_preproc
        policy_optimizer = Adam(self.policy.parameters(), lr=lr_pi, eps=eps)
        self.policy_learner = SoftmaxPolicy(
            self.policy,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
            writer=writer
        )

        value_feature_model = model_fn().to(device)
        q_models = [
            DuelingQValueLayer(model_features, num_targets, num_actions).to(device)
            for i in range(2)
        ]
        v_model = ValueLayer(model_features, num_targets, num_actions).to(device)
        feature_optimizer = Adam(value_feature_model.parameters(), lr=lr_value, eps=eps)
        q_optimizers = [
            Adam(q_models[i].parameters(), lr=lr_value, eps=eps)
            for i in range(2)
        ]
        v_optimizer = Adam(v_model.parameters(), lr=lr_value, eps=eps)

        self.features = FeatureNetwork(
            value_feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            # clip_grad=clip_grad,
            writer=writer
        )
        self.qs = [
            QContinuous(
                q_models[i],
                q_optimizers[i],
                scheduler=CosineAnnealingLR(q_optimizers[i], final_anneal_step),
                writer=writer,
                name=f'q_{i}'
            )
            for i in range(2)
        ]
        self.v = VNetwork(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

    def learn_step(self, idxs, transition_batch, weights):
        Otm1, targ_vec, old_action, env_rew, done, Ot = transition_batch
        batch_size = len(Ot)

        # convert the numpy batch to tensors on the training device
        obsm1 = self.obs_preproc(torch.tensor(Otm1, device=self.device))
        targ_vec = torch.tensor(targ_vec, device=self.device)
        actions = torch.tensor(old_action, device=self.device)
        rewards = torch.tensor(env_rew, device=self.device)
        done = torch.tensor(done, device=self.device).float().to(self.device)
        next_obs = self.obs_preproc(torch.tensor(Ot, device=self.device))
        weights = torch.tensor(weights, device=self.device)
        # assert (not (Otm1 == Ot).all())
        # print(self.device)

        states = StateArray(
            {
                'observation': obsm1,
                'reward': rewards,
                'done': done,
            },
            shape=(batch_size,)
        )
        # print(states['mask'])
        next_states = StateArray(
            {
                'observation': next_obs,
                'reward': torch.zeros(batch_size, device=self.device),
                'done': torch.zeros(batch_size, device=self.device),
                'mask': torch.ones(batch_size, device=self.device),
            },
            shape=(batch_size,)
        )
        # prediction_reward = self.predictor(Ot) * targ_vec

        with torch.no_grad():
            distribution = self.policy_learner(states)
            _log_probs = distribution.log_prob(actions).detach().squeeze()
            value_feature1 = self.features(states)
            value_feature2 = self.features(next_states)
            _actions = distribution.sample()  # torch.argmax(_log_probs, axis=-1)
            q_targets = rewards + self.discount_factor * self.v.target(value_feature2).detach()
            v_targets = torch.min(
                self.qs[0].target(value_feature1, _actions),
                self.qs[1].target(value_feature1, _actions),
            ) - self.temperature * _log_probs

        # update Q and V-functions
        # print(q_targets.min(), torch.min(
        #     self.qs[0].target(value_feature1, _actions),
        #     self.qs[1].target(value_feature1, _actions),
        # ))
        for i in range(2):
            self.qs[i].reinforce(
                mse_loss(self.qs[i](value_feature1, actions), q_targets))
        # print(self.v(value_feature1).shape)
        # print(v_targets.shape)
        self.v.reinforce(mse_loss(self.v(value_feature1), v_targets))

        # update policy
        distribution = self.policy_learner(states)
        _actions2 = distribution.sample()
        _log_probs2 = distribution.log_prob(_actions2).squeeze()
        loss = (-self.qs[0](value_feature1, _actions2).detach()
                + self.temperature * _log_probs2).mean()
        self.policy_learner.reinforce(loss)
        self.features.reinforce()
        self.qs[0].zero_grad()

        # adjust temperature
        temperature_grad = (_log_probs + self.entropy_target).mean()
        self.temperature += self.lr_temperature * temperature_grad.detach().cpu().numpy()
class TestSoftmax(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(
            nn.Linear(STATE_DIM, ACTIONS)
        )
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer)

    def test_run(self):
        state1 = State(torch.randn(1, STATE_DIM))
        dist1 = self.policy(state1)
        action1 = dist1.sample()
        log_prob1 = dist1.log_prob(action1)
        self.assertEqual(action1.item(), 0)

        state2 = State(torch.randn(1, STATE_DIM))
        dist2 = self.policy(state2)
        action2 = dist2.sample()
        log_prob2 = dist2.log_prob(action2)
        self.assertEqual(action2.item(), 2)

        loss = -(torch.tensor([-1, 1000000]) * torch.cat((log_prob1, log_prob2))).mean()
        self.policy.reinforce(loss)

        state3 = State(torch.randn(1, STATE_DIM))
        dist3 = self.policy(state3)
        action3 = dist3.sample()
        self.assertEqual(action3.item(), 2)

    def test_multi_action(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = self.policy(states).sample()
        tt.assert_equal(actions, torch.tensor([2, 2, 0]))

    def test_list(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        dist = self.policy(states)
        actions = dist.sample()
        log_probs = dist.log_prob(actions)
        tt.assert_equal(actions, torch.tensor([1, 2, 1]))
        loss = -(torch.tensor([[1, 2, 3]]) * log_probs).mean()
        self.policy.reinforce(loss)

    def test_reinforce(self):
        def loss(log_probs):
            return -log_probs.mean()

        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        actions = self.policy.eval(states).sample()

        # notice the values increase with each successive reinforce
        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.84, -0.62, -0.757]), decimal=3)
        self.policy.reinforce(loss(log_probs))

        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.811, -0.561, -0.701]), decimal=3)
        self.policy.reinforce(loss(log_probs))

        log_probs = self.policy(states).log_prob(actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.785, -0.51, -0.651]), decimal=3)
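For reference, a minimal self-contained sketch of the distribution-based SoftmaxPolicy usage the tests above exercise; the import paths (`all.core`, `all.policies`) and the STATE_DIM/ACTIONS values are assumptions and may need adjusting to the installed library version.

import torch
from torch import nn
from all.core import State               # assumed import path
from all.policies import SoftmaxPolicy   # assumed import path

STATE_DIM, ACTIONS = 4, 3  # illustrative sizes
model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
policy = SoftmaxPolicy(model, optimizer)

state = State(torch.randn(1, STATE_DIM))
dist = policy(state)                     # categorical distribution over actions
action = dist.sample()
loss = -dist.log_prob(action).mean()     # REINFORCE-style surrogate loss
policy.reinforce(loss)                   # backpropagate the loss and step the optimizer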
def test_agent(self):
    features = FeatureNetwork(copy.deepcopy(self.feature_model))
    policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
    return VPGTestAgent(features, policy)
class TestSoftmax(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer, ACTIONS)

    def test_run(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 0)

        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 2)

        self.policy.reinforce(torch.tensor([-1, 1000000]).float())
        action = self.policy(state)
        self.assertEqual(action.item(), 2)

    def test_multi_action(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([2, 2, 0]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_multi_batch_reinforce(self):
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
        self.policy.reinforce(torch.tensor([1, 2]).float())
        with self.assertRaises(Exception):
            self.policy.reinforce(torch.tensor([1, 2]).float())

    def test_list(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([1, 2, 1]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_action_prob(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        with torch.no_grad():
            actions = self.policy(states)
        probs = self.policy(states, action=actions)
        tt.assert_almost_equal(probs, torch.tensor([0.204, 0.333, 0.217]), decimal=3)
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale
        ),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale
        ),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale
        ),
    )

    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
        )
    )
class TestSoftmax(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, optimizer, ACTIONS)

    def test_run(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 0)

        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.item(), 2)

        self.policy.reinforce(torch.tensor([-1, 1000000]).float())
        action = self.policy(state)
        self.assertEqual(action.item(), 2)

    def test_multi_action(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([2, 2, 0]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_multi_batch_reinforce(self):
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy(State(torch.randn(2, STATE_DIM)))
        self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
        self.policy.reinforce(torch.tensor([1, 2]).float())
        with self.assertRaises(Exception):
            self.policy.reinforce(torch.tensor([1, 2]).float())

    def test_list(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        actions = self.policy(states)
        tt.assert_equal(actions, torch.tensor([1, 2, 1]))
        self.policy.reinforce(torch.tensor([[1, 2, 3]]).float())

    def test_action_prob(self):
        torch.manual_seed(1)
        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        with torch.no_grad():
            actions = self.policy(states)
        log_probs = self.policy(states, action=actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-1.59, -1.099, -1.528]), decimal=3)

    def test_custom_loss(self):
        def loss(log_probs):
            return -log_probs.mean()

        states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        actions = self.policy.eval(states)

        # notice the values increase with each successive reinforce
        log_probs = self.policy(states, actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.84, -0.62, -0.757]), decimal=3)
        self.policy.reinforce(loss)

        log_probs = self.policy(states, actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.811, -0.561, -0.701]), decimal=3)
        self.policy.reinforce(loss)

        log_probs = self.policy(states, actions)
        tt.assert_almost_equal(log_probs, torch.tensor([-0.785, -0.51, -0.651]), decimal=3)