# Class-based preset builder: constructs a VPG agent from the preset's models and
# hyperparameter dict (no LR schedule, no Atari preprocessing).
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return VPG(
        features,
        v,
        policy,
        discount_factor=self.hyperparameters["discount_factor"],
        min_batch_size=self.hyperparameters["min_batch_size"]
    )
# Class-based Atari variant: identical construction, plus cosine LR annealing over the
# expected number of updates and DeepmindAtariBody preprocessing.
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters["min_batch_size"]

    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )

    return DeepmindAtariBody(
        VPG(
            features,
            v,
            policy,
            discount_factor=self.hyperparameters["discount_factor"],
            min_batch_size=self.hyperparameters["min_batch_size"]
        ),
    )
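# The two agent() builders above read everything from a self.hyperparameters dict. A
# minimal sketch of the keys they expect is below; the values are illustrative
# placeholders, not the presets' actual defaults.
hyperparameters = {
    "discount_factor": 0.99,     # return discount passed to VPG
    "lr_pi": 7e-4,               # Adam learning rate for the feature and policy networks
    "lr_v": 7e-4,                # Adam learning rate for the value head
    "eps": 1e-4,                 # Adam epsilon
    "clip_grad": 0.5,            # gradient-clipping threshold for all three networks
    "value_loss_scaling": 0.25,  # scaling applied to the value loss
    "min_batch_size": 500,       # minimum batch size before each VPG update
}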
# Functional preset: models come from injected constructor functions; device, lr,
# discount_factor, and min_batch_size are captured from the enclosing preset's scope.
def _vpg(env, writer=DummyWriter()):
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        writer=writer
    )

    return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
# Older functional preset for discrete control: fully-connected ReLU feature trunk with
# separate value and policy heads; the policy also receives the action-space size and an
# entropy-loss scaling term.
def _vpg(env, writer=DummyWriter()):
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )

    return VPG(features, v, policy, gamma=gamma, min_batch_size=min_batch_size)
# Older Atari preset: Nature-CNN models trained with RMSprop; the feature network's
# learning rate is scaled by feature_lr_scaling, and the agent is wrapped in
# DeepmindAtariBody for standard Atari preprocessing.
def _vpg_atari(env, writer=DummyWriter()):
    feature_model = nature_features().to(device)
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)

    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr * feature_lr_scaling, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, gamma=discount_factor, min_batch_size=min_batch_size),
    )
# Newer Atari preset: Nature-CNN models trained with Adam, cosine LR annealing until
# final_anneal_step, and episodic-lives handling in DeepmindAtariBody.
def _vpg_atari(env, writer=DummyWriter()):
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
        ),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )

    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size),
        episodic_lives=True
    )
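# Standalone sketch of the cosine-annealing horizon used in the scheduled presets above
# (hypothetical numbers, not the presets' defaults): T_max is the expected number of
# gradient updates, roughly train_steps / min_batch_size, and the network wrappers are
# assumed to advance their scheduler once per update.
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

train_steps = 10_000_000
min_batch_size = 1_000
n_updates = train_steps // min_batch_size  # one gradient update per collected batch

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = Adam(params, lr=7e-4)
scheduler = CosineAnnealingLR(optimizer, n_updates)  # LR decays toward 0 by the final update

for _ in range(n_updates):
    optimizer.step()    # stands in for the wrappers' real parameter update
    scheduler.step()    # one scheduler step per update
print(optimizer.param_groups[0]["lr"])  # ~0 after the final update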