Example #1
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad)
        v = VNetwork(value_model,
                     value_optimizer,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               clip_grad=clip_grad,
                               writer=writer)
        return PPO(features,
                   v,
                   policy,
                   epsilon=epsilon,
                   epochs=epochs,
                   lam=lam,
                   minibatches=minibatches,
                   n_envs=n_envs,
                   n_steps=n_steps,
                   discount_factor=discount_factor,
                   entropy_loss_scaling=entropy_loss_scaling,
                   writer=writer)
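
This first variant builds a PPO agent for classic-control environments from fully-connected heads. The snippet is an inner factory that relies on names captured from the enclosing preset function; below is a minimal sketch of the closure values it assumes (the names come from the snippet itself, the values are purely illustrative, not the preset's defaults):

    import torch

    # Illustrative defaults only; the real preset supplies its own values.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    lr = 3e-4                     # Adam learning rate
    clip_grad = 0.1               # gradient-norm clipping threshold
    epsilon = 0.2                 # PPO surrogate clipping parameter
    epochs = 4                    # optimization epochs per PPO update
    minibatches = 4               # minibatches per epoch
    lam = 0.95                    # GAE lambda
    n_envs = 8                    # number of parallel environments
    n_steps = 128                 # rollout length per environment
    discount_factor = 0.99
    entropy_loss_scaling = 0.01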
Example #2
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        # Each PPO update performs epochs * minibatches optimizer steps,
        # but an update only happens once every n_steps * n_envs environment
        # steps, and each environment step consumes 4 frames (frame skipping).
        final_anneal_step = last_frame * epochs * minibatches / (n_steps *
                                                                 n_envs * 4)

        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
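
The final_anneal_step arithmetic is worth making concrete. Under assumed Atari-style settings (these numbers are illustrative, not taken from the source), the cosine learning-rate schedules and the clip schedule all finish at the end of training:

    # Hypothetical values chosen only to make the arithmetic concrete.
    last_frame = 40_000_000       # total training frames
    epochs = 4
    minibatches = 4
    n_steps = 128
    n_envs = 8
    frames_per_step = 4           # the "* 4" above: Atari frame skipping

    # One PPO update occurs every n_steps * n_envs * frames_per_step frames,
    # and each update performs epochs * minibatches optimizer steps.
    final_anneal_step = last_frame * epochs * minibatches / (
        n_steps * n_envs * frames_per_step)
    print(final_anneal_step)      # 156250.0 optimizer steps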
Example #3
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps *
                     self.hyperparameters['epochs'] *
                     self.hyperparameters['minibatches'] /
                     (self.hyperparameters['n_steps'] *
                      self.hyperparameters['n_envs']))

        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"],
                                 eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"],
                               eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"],
                                eps=self.hyperparameters["eps"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, n_updates),
                                  clip_grad=self.hyperparameters["clip_grad"],
                                  writer=writer)

        v = VNetwork(self.value_model,
                     value_optimizer,
                     scheduler=CosineAnnealingLR(value_optimizer, n_updates),
                     loss_scaling=self.hyperparameters["value_loss_scaling"],
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               scheduler=CosineAnnealingLR(
                                   policy_optimizer, n_updates),
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(self.hyperparameters["clip_initial"],
                                        self.hyperparameters["clip_final"],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=self.hyperparameters["epochs"],
                minibatches=self.hyperparameters["minibatches"],
                n_envs=self.hyperparameters["n_envs"],
                n_steps=self.hyperparameters["n_steps"],
                discount_factor=self.hyperparameters["discount_factor"],
                lam=self.hyperparameters["lam"],
                entropy_loss_scaling=self.hyperparameters[
                    "entropy_loss_scaling"],
                writer=writer,
            ))
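
This builder-style variant reads every setting from self.hyperparameters rather than closing over module-level constants, and it scales n_updates by train_steps instead of last_frame. A sketch of the dictionary shape the method expects (keys taken from the snippet; the values are illustrative guesses, not the preset's defaults):

    hyperparameters = {
        'lr': 2.5e-4,
        'eps': 1e-5,                    # Adam epsilon
        'clip_grad': 0.5,
        'clip_initial': 0.2,            # initial PPO clipping epsilon
        'clip_final': 0.01,             # final PPO clipping epsilon
        'epochs': 4,
        'minibatches': 4,
        'n_envs': 8,
        'n_steps': 128,
        'discount_factor': 0.99,
        'lam': 0.95,
        'value_loss_scaling': 0.5,
        'entropy_loss_scaling': 0.01,
    }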
Example #4
    def _ppo(envs, writer=DummyWriter()):
        final_anneal_step = last_frame * epochs * minibatches / (n_steps *
                                                                 n_envs)
        env = envs[0]

        feature_model, value_model, policy_model = fc_actor_critic(env)
        feature_model.to(device)
        value_model.to(device)
        policy_model.to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = GaussianPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
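
Example #4 is the continuous-control counterpart: fc_actor_critic(env) returns a shared feature trunk plus value and policy heads, the policy is a GaussianPolicy built from env.action_space, the agent is wrapped in TimeFeature instead of DeepmindAtariBody, and final_anneal_step drops the frame-skip factor of 4. As a rough, hypothetical illustration of the kind of model triple such a constructor might return (this is a conceptual stand-in, not the library's fc_actor_critic):

    import torch.nn as nn

    def tiny_actor_critic(state_dim, action_dim, hidden=64):
        # Conceptual stand-in: a shared trunk, a scalar value head, and a
        # policy head sized for the mean and log-std of a Gaussian policy.
        feature_model = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU())
        value_model = nn.Linear(hidden, 1)
        policy_model = nn.Linear(hidden, 2 * action_dim)
        return feature_model, value_model, policy_model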
Example #5
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps *
                     self.hyperparameters['epochs'] *
                     self.hyperparameters['minibatches'] /
                     (self.hyperparameters['n_steps'] *
                      self.hyperparameters['n_envs']))

        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters['lr'],
                               eps=self.hyperparameters['eps'])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters['lr'],
                                eps=self.hyperparameters['eps'])

        features = Identity(self.device)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            loss_scaling=self.hyperparameters['value_loss_scaling'],
            clip_grad=self.hyperparameters['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        )

        policy = GaussianPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            clip_grad=self.hyperparameters['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(self.hyperparameters['clip_initial'],
                                        self.hyperparameters['clip_final'],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=self.hyperparameters['epochs'],
                minibatches=self.hyperparameters['minibatches'],
                n_envs=self.hyperparameters['n_envs'],
                n_steps=self.hyperparameters['n_steps'],
                discount_factor=self.hyperparameters['discount_factor'],
                lam=self.hyperparameters['lam'],
                entropy_loss_scaling=self.hyperparameters[
                    'entropy_loss_scaling'],
                writer=writer,
            ))
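
Here the shared feature stage is Identity(self.device), so the value and policy heads consume observations directly and there is no feature optimizer or schedule to manage. Conceptually the feature stage reduces to a pass-through module; a minimal sketch of the idea (not the library's Identity implementation):

    import torch.nn as nn

    class PassThrough(nn.Module):
        """Conceptual equivalent of an identity feature stage."""

        def forward(self, x):
            # Observations flow to the value and policy heads unchanged;
            # there are no feature parameters to optimize or anneal.
            return x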
Example #6
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(
            feature_model.parameters(), lr=lr, eps=eps
        )
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(
                value_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(
                policy_optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale
            ),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(
                    clip_initial,
                    clip_final,
                    0,
                    final_anneal_step,
                    name='clip',
                    writer=writer
                ),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
            )
        )
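
Unlike the earlier examples, this one passes eta_min=lr * min_lr_scale to each CosineAnnealingLR, so the learning rate decays to a floor instead of all the way to zero (PyTorch's default eta_min is 0). A small self-contained illustration with assumed values:

    import torch
    from torch.optim import Adam
    from torch.optim.lr_scheduler import CosineAnnealingLR

    # Illustrative values only.
    lr = 2.5e-4
    min_lr_scale = 0.01
    final_anneal_step = 1000

    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = Adam(params, lr=lr)
    scheduler = CosineAnnealingLR(optimizer, final_anneal_step,
                                  eta_min=lr * min_lr_scale)

    for _ in range(final_anneal_step):
        optimizer.step()
        scheduler.step()

    # After the full anneal, the learning rate sits at the floor
    # lr * min_lr_scale rather than at zero.
    print(optimizer.param_groups[0]['lr'])  # ~2.5e-06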