def _vpg(env, writer=DummyWriter()):
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)
    return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
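The names device, lr, discount_factor, and min_batch_size above are free variables, so builders like this are normally returned from an outer preset factory that closes over the hyperparameters. A minimal sketch of that closure pattern; the factory name and default values here are illustrative, not the library's actual signature:

def vpg(device="cpu", lr=5e-3, discount_factor=0.99, min_batch_size=500):
    # Hypothetical outer factory: closes over the hyperparameters used by _vpg.
    def _vpg(env, writer=None):
        # body as in the snippet above, referencing device, lr,
        # discount_factor, and min_batch_size from the enclosing scope
        ...
    return _vpg

# usage sketch: build the closure once, then instantiate per environment
# make_agent = vpg(device="cuda")
# agent = make_agent(env)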
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters["min_batch_size"]

    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]),
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(self.feature_model, feature_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    v = VNetwork(self.value_model, value_optimizer, loss_scaling=self.hyperparameters["value_loss_scaling"], clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    policy = SoftmaxPolicy(self.policy_model, policy_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer)

    return VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"])
def _model_predictive_dqn(env, writer=None):
    # models
    feature_model = shared_feature_layers().to(device)
    value_model = value_head().to(device)
    reward_model = reward_head(env).to(device)
    generator_model = Generator(env).to(device)

    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    reward_optimizer = Adam(reward_model.parameters(), lr=lr, eps=eps)
    generator_optimizer = Adam(generator_model.parameters(), lr=lr, eps=eps)

    # approximators
    f = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    r = QNetwork(reward_model, reward_optimizer, name='reward', writer=writer)
    g = Approximation(generator_model, generator_optimizer, name='generator', writer=writer)

    # replay buffer
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # create agent
    agent = ModelPredictiveDQN(
        f, v, r, g, replay_buffer,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size
    )

    # apply agent wrappers for better atari performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)
    return PPO(
        features,
        v,
        policy,
        epsilon=epsilon,
        epochs=epochs,
        lam=lam,
        minibatches=minibatches,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def _vac(envs, writer=DummyWriter()):
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(envs[0]).to(device)
    feature_model = feature_model_constructor().to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    return DeepmindAtariBody(
        VAC(features, v, policy, discount_factor=discount_factor),
    )
def _vpg(env, writer=DummyWriter()):
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    return VPG(features, v, policy, gamma=gamma, min_batch_size=min_batch_size)
def agent(self, writer=DummyWriter(), train_steps=float('inf')): feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"]) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"]) features = FeatureNetwork(self.feature_model, feature_optimizer, clip_grad=self.hyperparameters["clip_grad"]) v = VNetwork(self.value_model, value_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer) policy = SoftmaxPolicy(self.policy_model, policy_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer) return A2C( features, v, policy, n_envs=self.hyperparameters["n_envs"], n_steps=self.hyperparameters["n_steps"], discount_factor=self.hyperparameters["discount_factor"], entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"], writer=writer)
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)
    return A2C(
        features,
        v,
        policy,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]

    # Update epoch * minibatches times per update,
    # but we only update once per n_steps,
    # with n_envs and 4 frames per step
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)

    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )

    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
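To make the anneal-step arithmetic in the comment above concrete, here is the same formula evaluated with hypothetical hyperparameter values; these numbers are illustrative only, not the preset's actual defaults:

# illustrative values only
last_frame, epochs, minibatches, n_steps, n_envs = 40_000_000, 4, 4, 128, 8
final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
print(final_anneal_step)  # 640_000_000 / 4096 = 156250.0 scheduler steps over the run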
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches']
        / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    )

    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters["clip_initial"],
                self.hyperparameters["clip_final"],
                0,
                n_updates,
                name='clip',
                writer=writer
            ),
            epochs=self.hyperparameters["epochs"],
            minibatches=self.hyperparameters["minibatches"],
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            lam=self.hyperparameters["lam"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer,
        )
    )
def _ppo(envs, writer=DummyWriter()):
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
    env = envs[0]

    feature_model, value_model, policy_model = fc_actor_critic(env)
    feature_model.to(device)
    value_model.to(device)
    policy_model.to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = GaussianPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )

    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    final_anneal_step = last_frame / (n_steps * n_envs)

    value_model = value_head().to(device)
    policy_model = policy_head(env).to(device)
    feature_model = conv_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )

    return FrameStack(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=LinearScheduler(entropy_loss_scaling, 0., 0, final_anneal_step, name="entropy_loss_scaling", writer=writer),
            writer=writer
        ),
        size=4
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    final_anneal_step = last_frame / (n_steps * n_envs * 4)

    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        ),
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches']
        / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    )

    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters['lr'], eps=self.hyperparameters['eps'])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters['lr'], eps=self.hyperparameters['eps'])

    features = Identity(self.device)
    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters['value_loss_scaling'],
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
    )
    policy = GaussianPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
    )

    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters['clip_initial'],
                self.hyperparameters['clip_final'],
                0,
                n_updates,
                name='clip',
                writer=writer
            ),
            epochs=self.hyperparameters['epochs'],
            minibatches=self.hyperparameters['minibatches'],
            n_envs=self.hyperparameters['n_envs'],
            n_steps=self.hyperparameters['n_steps'],
            discount_factor=self.hyperparameters['discount_factor'],
            lam=self.hyperparameters['lam'],
            entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
            writer=writer,
        )
    )
def _vac(env, writer=DummyWriter()):
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_model = fc_relu_features(env).to(device)
    value_optimizer = RMSprop(value_model.parameters(), lr=lr_v, alpha=alpha, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)
    feature_optimizer = RMSprop(feature_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, env.action_space.n, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer)
    return VAC(features, v, policy, gamma=discount_factor)
def _vac(env, writer=DummyWriter()):
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor(env).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer)
    return VAC(features, v, policy, discount_factor=discount_factor)
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
    )
def _online_cacla(env, writer=DummyWriter()):
    value_model = models.critic(env, hidden1=hidden1, hidden2=hidden2).to(device)
    policy_model = models.actor(env, hidden1=hidden1, hidden2=hidden2).to(device)
    # feature_model = models.features(env.state_space.shape[0]).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    # feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    # feature_optimizer = SGD(feature_model.parameters(), lr=lr_pi, momentum=0.9)

    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        quiet=not log,
        clip_grad=1.0,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        quiet=not log,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    features = None  # FeatureNetwork(feature_model, feature_optimizer, writer=writer, normalize_input=False)
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # TODO - reintroduce TimeFeature wrapper
    return OnlineCACLA(features, v, policy, replay_buffer, env.action_space, log=log, writer=writer, discount_factor=discount_factor)
def _vpg_atari(env, writer=DummyWriter()):
    feature_model = nature_features().to(device)
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)

    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr * feature_lr_scaling, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, gamma=discount_factor, min_batch_size=min_batch_size),
    )
def _sac(env, writer=DummyWriter()):
    q_1_model = fc_q(env).to(device)
    q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
    q_1 = QContinuous(q_1_model, q_1_optimizer, writer=writer, name='q_1')

    q_2_model = fc_q(env).to(device)
    q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
    q_2 = QContinuous(q_2_model, q_2_optimizer, writer=writer, name='q_2')

    v_model = fc_v(env).to(device)
    v_optimizer = Adam(v_model.parameters(), lr=lr_v)
    v = VNetwork(
        v_model,
        v_optimizer,
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )

    policy_model = fc_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = SoftDeterministicPolicy(policy_model, policy_optimizer, env.action_space, writer=writer)

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    return SAC(
        policy,
        q_1,
        q_2,
        v,
        replay_buffer,
        entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
        lr_temperature=lr_temperature,
        replay_start_size=replay_start_size,
        discount_factor=discount_factor,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size,
        writer=writer
    )
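The entropy_target expression above follows the usual SAC heuristic of targeting minus the action dimensionality, rescaled by entropy_target_scaling. A tiny illustration with made-up values (not the preset's defaults):

# illustrative only: a 6-dimensional action space with scaling 1.0
action_dim, entropy_target_scaling = 6, 1.0
entropy_target = -action_dim * entropy_target_scaling  # -6.0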
def _vpg_atari(env, writer=DummyWriter()):
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size),
        episodic_lives=True
    )
def _fac(env, writer=DummyWriter()):
    value_model = models.critic(env, hidden1=hidden1, hidden2=hidden2).to(device)
    policy_model = models.actor(env, hidden1=hidden1, hidden2=hidden2).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        quiet=not log,
        clip_grad=1.0,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        quiet=not log,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # TODO - reintroduce TimeFeature wrapper
    return ForwardAC(
        v,
        policy,
        replay_buffer,
        env.action_space,
        log=log,
        trace_decay=trace_decay,
        writer=writer,
        discount_factor=discount_factor
    )
def agent(self, writer=DummyWriter(), train_steps=float("inf")): # optimizers feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) reward_optimizer = Adam(self.reward_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) generator_optimizer = Adam(self.generator_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) # approximators f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer) v = VNetwork(self.value_model, value_optimizer, writer=writer) r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer) g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer) # replay buffer replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device) # create agent agent = ModelBasedDQN(f, v, r, g, replay_buffer, minibatch_size=self.hyperparameters["minibatch_size"], replay_start_size=self.hyperparameters["replay_start_size"] ) # apply atari wrappers for better performance return DeepmindAtariBody(agent, lazy_frames=True)
class NStepAdvantageBufferTest(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states, lengths):
        return (
            returns
            + (0.5 ** lengths) * self.v.eval(self.features.eval(next_states))
            - self.v.eval(self.features.eval(states))
        )

    def test_rollout(self):
        buffer = NStepAdvantageBuffer(self.v, self.features, 2, 3, discount_factor=0.5)

        actions = torch.ones((3))
        states = State(torch.arange(0, 12).unsqueeze(1))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        states, _, advantages = buffer.advantages(states[6:9])

        expected_states = State(torch.arange(0, 6).unsqueeze(1))
        expected_next_states = State(torch.cat((torch.arange(6, 9), torch.arange(6, 9))).unsqueeze(1))
        expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
        expected_lengths = torch.tensor([2., 2, 2, 1, 1, 1])

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states, expected_returns, expected_next_states, expected_lengths)
        )

    def test_rollout_with_nones(self):
        buffer = NStepAdvantageBuffer(self.v, self.features, 3, 3, discount_factor=0.5)

        done = torch.ones(12)
        done[5] = 0
        done[7] = 0
        done[9] = 0
        states = State(torch.arange(0, 12).unsqueeze(1), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        states, actions, advantages = buffer.advantages(states[9:12])

        expected_states = State(torch.arange(0, 9).unsqueeze(1), done[0:9])
        expected_next_done = torch.zeros(9)
        expected_next_done[5] = 1
        expected_next_done[7] = 1
        expected_next_done[8] = 1
        expected_next_states = State(torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]).unsqueeze(1), expected_next_done)
        expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2, 2]).float()
        expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states, expected_returns, expected_next_states, expected_lengths)
        )

    def test_multi_rollout(self):
        buffer = NStepAdvantageBuffer(self.v, self.features, 2, 2, discount_factor=0.5)

        raw_states = State(torch.arange(0, 12).unsqueeze(1))
        actions = torch.ones((2))

        buffer.store(raw_states[0:2], actions, torch.ones(2))
        buffer.store(raw_states[2:4], actions, torch.ones(2))
        states, actions, advantages = buffer.advantages(raw_states[4:6])

        expected_states = State(torch.arange(0, 4).unsqueeze(1))
        expected_returns = torch.tensor([1.5, 1.5, 1, 1])
        expected_next_states = State(torch.tensor([4, 5, 4, 5]).unsqueeze(1))
        expected_lengths = torch.tensor([2., 2, 1, 1])
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states, expected_returns, expected_next_states, expected_lengths)
        )

        buffer.store(raw_states[4:6], actions, torch.ones(2))
        buffer.store(raw_states[6:8], actions, torch.ones(2))
        states, actions, advantages = buffer.advantages(raw_states[8:10])

        expected_states = State(torch.arange(4, 8).unsqueeze(1))
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states,
                torch.tensor([1.5, 1.5, 1, 1]),
                State(torch.tensor([8, 9, 8, 9]).unsqueeze(1)),
                torch.tensor([2., 2, 1, 1])
            )
        )

    def assert_array_equal(self, actual, expected):
        for i, exp in enumerate(expected):
            self.assertEqual(actual[i], exp, msg=(("\nactual: %s\nexpected: %s") % (actual, expected)))

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
def setUp(self):
    torch.manual_seed(1)
    self.features = FeatureNetwork(nn.Linear(1, 2), None)
    self.v = VNetwork(nn.Linear(2, 1), None)
class GeneralizedAdvantageBufferTest(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states, lengths):
        return (
            returns
            + (0.5 ** lengths) * self.v.eval(self.features.eval(next_states))
            - self.v.eval(self.features.eval(states))
        )

    def test_simple(self):
        buffer = GeneralizedAdvantageBuffer(self.v, self.features, 2, 1, discount_factor=0.5, lam=0.5)
        actions = torch.ones((1))
        states = State(torch.arange(0, 3).unsqueeze(1))
        rewards = torch.tensor([1., 2, 4])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(states))
        tt.assert_almost_equal(values, torch.tensor([0.1826, -0.3476, -0.8777]), decimal=3)

        td_errors = torch.zeros(2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([0.6436, 1.909]), decimal=3)

        advantages = torch.zeros(2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([1.121, 1.909]), decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages)
        tt.assert_equal(_actions, torch.tensor([1, 1]))

    def test_parallel(self):
        buffer = GeneralizedAdvantageBuffer(self.v, self.features, 2, 2, discount_factor=0.5, lam=0.5)
        actions = torch.ones((2))
        states = [
            State(torch.tensor([[0], [3]])),
            State(torch.tensor([[1], [4]])),
            State(torch.tensor([[2], [5]])),
        ]
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        values = self.v.eval(self.features.eval(State.from_list(states))).view(3, -1)
        tt.assert_almost_equal(values, torch.tensor([[0.183, -1.408], [-0.348, -1.938], [-0.878, -2.468]]), decimal=3)

        td_errors = torch.zeros(2, 2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([[0.6436, 1.439], [1.909, 1.704]]), decimal=3)

        advantages = torch.zeros(2, 2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([[1.121, 1.865], [1.909, 1.704]]), decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages.view(-1))

    def assert_array_equal(self, actual, expected):
        for i, exp in enumerate(expected):
            self.assertEqual(actual[i], exp, msg=(("\nactual: %s\nexpected: %s") % (actual, expected)))

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
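The 0.25 factor asserted in test_simple and test_parallel is discount_factor * lam (0.5 * 0.5): the buffer is expected to produce a truncated generalized advantage estimate. A minimal standalone sketch of that recursion; the helper name and shapes here are illustrative, not taken from the library:

import torch

def gae(td_errors, discount_factor=0.5, lam=0.5):
    # Truncated GAE: A_t = delta_t + (discount_factor * lam) * A_{t+1}
    advantages = torch.zeros_like(td_errors)
    running = torch.zeros_like(td_errors[-1])
    for t in reversed(range(len(td_errors))):
        running = td_errors[t] + discount_factor * lam * running
        advantages[t] = running
    return advantages

# With two steps this reduces to exactly what the tests assert:
# advantages[0] = td_errors[0] + 0.25 * td_errors[1]
# advantages[1] = td_errors[1]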
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters["replay_start_size"]) / self.hyperparameters["update_frequency"]

    q_1_optimizer = Adam(self.q_1_model.parameters(), lr=self.hyperparameters["lr_q"])
    q_1 = QContinuous(
        self.q_1_model,
        q_1_optimizer,
        scheduler=CosineAnnealingLR(q_1_optimizer, n_updates),
        writer=writer,
        name='q_1'
    )

    q_2_optimizer = Adam(self.q_2_model.parameters(), lr=self.hyperparameters["lr_q"])
    q_2 = QContinuous(
        self.q_2_model,
        q_2_optimizer,
        scheduler=CosineAnnealingLR(q_2_optimizer, n_updates),
        writer=writer,
        name='q_2'
    )

    v_optimizer = Adam(self.v_model.parameters(), lr=self.hyperparameters["lr_v"])
    v = VNetwork(
        self.v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, n_updates),
        target=PolyakTarget(self.hyperparameters["polyak_rate"]),
        writer=writer,
        name='v',
    )

    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"])
    policy = SoftDeterministicPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        writer=writer
    )

    replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device)

    return TimeFeature(
        SAC(
            policy,
            q_1,
            q_2,
            v,
            replay_buffer,
            temperature_initial=self.hyperparameters["temperature_initial"],
            entropy_target=(-self.action_space.shape[0] * self.hyperparameters["entropy_target_scaling"]),
            lr_temperature=self.hyperparameters["lr_temperature"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            discount_factor=self.hyperparameters["discount_factor"],
            update_frequency=self.hyperparameters["update_frequency"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            writer=writer
        )
    )
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step, eta_min=lr * min_lr_scale),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step, eta_min=lr * min_lr_scale),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step, eta_min=lr * min_lr_scale),
    )

    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
        )
    )
def test_agent(self):
    f = FeatureNetwork(self.feature_model, None)
    v = VNetwork(self.value_model, None)
    r = QNetwork(self.reward_model, None)
    g = Approximation(self.generator_model, None)
    return DeepmindAtariBody(ModelBasedTestAgent(f, v, r, g, self.hyperparameters["discount_factor"]))