示例#1
0
    def _vpg(env, writer=DummyWriter()):
        """Assemble a vanilla policy gradient agent for ``env``."""
        # Build the three networks on the configured device (construction
        # order preserved so random weight initialization is unchanged).
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        # Wrap each model, plus a fresh Adam optimizer, in its approximation.
        features = FeatureNetwork(
            feature_model, Adam(feature_model.parameters(), lr=lr), writer=writer)
        v = VNetwork(
            value_model, Adam(value_model.parameters(), lr=lr), writer=writer)
        policy = SoftmaxPolicy(
            policy_model, Adam(policy_model.parameters(), lr=lr), writer=writer)

        return VPG(
            features,
            v,
            policy,
            discount_factor=discount_factor,
            min_batch_size=min_batch_size,
        )
示例#2
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        """Build an Atari VPG agent with cosine-annealed learning rates."""
        hp = self.hyperparameters
        # One scheduler step per batch update.
        n_updates = train_steps / hp["min_batch_size"]

        feature_optimizer = Adam(
            self.feature_model.parameters(), lr=hp["lr_pi"], eps=hp["eps"])
        value_optimizer = Adam(
            self.value_model.parameters(), lr=hp["lr_v"], eps=hp["eps"])
        policy_optimizer = Adam(
            self.policy_model.parameters(), lr=hp["lr_pi"], eps=hp["eps"])

        features = FeatureNetwork(
            self.feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
            clip_grad=hp["clip_grad"],
            writer=writer)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
            loss_scaling=hp["value_loss_scaling"],
            clip_grad=hp["clip_grad"],
            writer=writer)

        policy = SoftmaxPolicy(
            self.policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            clip_grad=hp["clip_grad"],
            writer=writer)

        vpg = VPG(
            features, v, policy,
            discount_factor=hp["discount_factor"],
            min_batch_size=hp["min_batch_size"])
        # Standard DeepMind Atari preprocessing wrappers.
        return DeepmindAtariBody(vpg)
示例#3
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        """Build a VPG agent from this preset's models and hyperparameters."""
        hp = self.hyperparameters

        def _adam(model, lr_key):
            # All optimizers share eps; the learning rate differs per network.
            return Adam(model.parameters(), lr=hp[lr_key], eps=hp["eps"])

        features = FeatureNetwork(
            self.feature_model,
            _adam(self.feature_model, "lr_pi"),
            clip_grad=hp["clip_grad"],
            writer=writer)

        v = VNetwork(
            self.value_model,
            _adam(self.value_model, "lr_v"),
            loss_scaling=hp["value_loss_scaling"],
            clip_grad=hp["clip_grad"],
            writer=writer)

        policy = SoftmaxPolicy(
            self.policy_model,
            _adam(self.policy_model, "lr_pi"),
            clip_grad=hp["clip_grad"],
            writer=writer)

        return VPG(
            features, v, policy,
            discount_factor=hp["discount_factor"],
            min_batch_size=hp["min_batch_size"])
示例#4
0
 def _model_predictive_dqn(env, writer=None):
     """Build a ModelPredictiveDQN agent wrapped for Atari."""
     # Models, built in this order so weight initialization is unchanged.
     feature_model = shared_feature_layers().to(device)
     value_model = value_head().to(device)
     reward_model = reward_head(env).to(device)
     generator_model = Generator(env).to(device)
     # Approximators, each owning a fresh Adam optimizer.
     f = FeatureNetwork(
         feature_model, Adam(feature_model.parameters(), lr=lr, eps=eps), writer=writer)
     v = VNetwork(
         value_model, Adam(value_model.parameters(), lr=lr, eps=eps), writer=writer)
     r = QNetwork(
         reward_model, Adam(reward_model.parameters(), lr=lr, eps=eps),
         name='reward', writer=writer)
     g = Approximation(
         generator_model, Adam(generator_model.parameters(), lr=lr, eps=eps),
         name='generator', writer=writer)
     # Experience replay.
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
     agent = ModelPredictiveDQN(
         f, v, r, g, replay_buffer,
         minibatch_size=minibatch_size,
         replay_start_size=replay_start_size,
     )
     # Apply agent wrappers for better Atari performance.
     return DeepmindAtariBody(agent, lazy_frames=True)
示例#5
0
    def _ppo(envs, writer=DummyWriter()):
        """Construct a PPO agent; ``envs`` is a list of parallel envs."""
        env = envs[0]
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        # NOTE: the feature network intentionally receives no writer here.
        features = FeatureNetwork(
            feature_model,
            Adam(feature_model.parameters(), lr=lr),
            clip_grad=clip_grad,
        )
        v = VNetwork(
            value_model,
            Adam(value_model.parameters(), lr=lr),
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            Adam(policy_model.parameters(), lr=lr),
            clip_grad=clip_grad,
            writer=writer,
        )
        return PPO(
            features, v, policy,
            epsilon=epsilon,
            epochs=epochs,
            lam=lam,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    def _vac(envs, writer=DummyWriter()):
        """Construct a VAC agent wrapped with DeepmindAtariBody."""
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(envs[0]).to(device)
        feature_model = feature_model_constructor().to(device)

        v = VNetwork(
            value_model,
            Adam(value_model.parameters(), lr=lr_v, eps=eps),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            Adam(policy_model.parameters(), lr=lr_pi, eps=eps),
            clip_grad=clip_grad,
            writer=writer,
        )
        # The shared feature torso trains at the policy learning rate.
        features = FeatureNetwork(
            feature_model,
            Adam(feature_model.parameters(), lr=lr_pi, eps=eps),
            clip_grad=clip_grad,
            writer=writer,
        )

        return DeepmindAtariBody(
            VAC(features, v, policy, discount_factor=discount_factor), )
示例#7
0
    def _vpg(env, writer=DummyWriter()):
        """Construct a vanilla policy gradient agent for ``env``."""
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        features = FeatureNetwork(
            feature_model,
            Adam(feature_model.parameters(), lr=lr),
            clip_grad=clip_grad,
            writer=writer,
        )
        v = VNetwork(
            value_model,
            Adam(value_model.parameters(), lr=lr),
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            Adam(policy_model.parameters(), lr=lr),
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )
        return VPG(
            features, v, policy,
            gamma=gamma,
            min_batch_size=min_batch_size)
示例#8
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        """Build an A2C agent from this preset's models and hyperparameters."""
        hp = self.hyperparameters

        # NOTE: the feature network intentionally receives no writer here.
        features = FeatureNetwork(
            self.feature_model,
            Adam(self.feature_model.parameters(), lr=hp["lr"]),
            clip_grad=hp["clip_grad"])

        v = VNetwork(
            self.value_model,
            Adam(self.value_model.parameters(), lr=hp["lr"]),
            clip_grad=hp["clip_grad"],
            writer=writer)

        policy = SoftmaxPolicy(
            self.policy_model,
            Adam(self.policy_model.parameters(), lr=hp["lr"]),
            clip_grad=hp["clip_grad"],
            writer=writer)

        return A2C(
            features, v, policy,
            n_envs=hp["n_envs"],
            n_steps=hp["n_steps"],
            discount_factor=hp["discount_factor"],
            entropy_loss_scaling=hp["entropy_loss_scaling"],
            writer=writer)
示例#9
0
    def _a2c(envs, writer=DummyWriter()):
        """Construct an A2C agent; ``envs`` is a list of parallel envs."""
        env = envs[0]
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        # NOTE: the feature network intentionally receives no writer here.
        features = FeatureNetwork(
            feature_model,
            Adam(feature_model.parameters(), lr=lr),
            clip_grad=clip_grad,
        )
        v = VNetwork(
            value_model,
            Adam(value_model.parameters(), lr=lr),
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            Adam(policy_model.parameters(), lr=lr),
            clip_grad=clip_grad,
            writer=writer,
        )
        return A2C(
            features, v, policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
示例#10
0
    def _ppo(envs, writer=DummyWriter()):
        """Construct an Atari PPO agent with annealed lr and clip schedules."""
        env = envs[0]

        # Each rollout of n_steps * n_envs steps (x4 frames per step) triggers
        # epochs * minibatches optimizer updates; anneal over last_frame frames.
        final_anneal_step = (
            last_frame * epochs * minibatches / (n_steps * n_envs * 4))

        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            writer=writer,
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
            writer=writer,
        )

        # PPO clip epsilon decays linearly from clip_initial to clip_final.
        clip_schedule = LinearScheduler(
            clip_initial, clip_final, 0, final_anneal_step,
            name='clip', writer=writer)

        return DeepmindAtariBody(
            PPO(
                features, v, policy,
                epsilon=clip_schedule,
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
示例#11
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        """Build an Atari PPO agent with annealed lr and clip schedules."""
        hp = self.hyperparameters
        # Each rollout of n_steps * n_envs steps produces
        # epochs * minibatches optimizer updates.
        n_updates = (train_steps * hp['epochs'] * hp['minibatches']
                     / (hp['n_steps'] * hp['n_envs']))

        feature_optimizer = Adam(
            self.feature_model.parameters(), lr=hp["lr"], eps=hp["eps"])
        value_optimizer = Adam(
            self.value_model.parameters(), lr=hp["lr"], eps=hp["eps"])
        policy_optimizer = Adam(
            self.policy_model.parameters(), lr=hp["lr"], eps=hp["eps"])

        features = FeatureNetwork(
            self.feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
            clip_grad=hp["clip_grad"],
            writer=writer)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
            loss_scaling=hp["value_loss_scaling"],
            clip_grad=hp["clip_grad"],
            writer=writer)

        policy = SoftmaxPolicy(
            self.policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            clip_grad=hp["clip_grad"],
            writer=writer)

        # Clip epsilon decays linearly over the course of training.
        clip_schedule = LinearScheduler(
            hp["clip_initial"], hp["clip_final"], 0, n_updates,
            name='clip', writer=writer)

        return DeepmindAtariBody(
            PPO(
                features, v, policy,
                epsilon=clip_schedule,
                epochs=hp["epochs"],
                minibatches=hp["minibatches"],
                n_envs=hp["n_envs"],
                n_steps=hp["n_steps"],
                discount_factor=hp["discount_factor"],
                lam=hp["lam"],
                entropy_loss_scaling=hp["entropy_loss_scaling"],
                writer=writer,
            ))
示例#12
0
    def _ppo(envs, writer=DummyWriter()):
        """Construct a continuous-control PPO agent with a TimeFeature wrapper."""
        # epochs * minibatches updates per rollout; one rollout every
        # n_steps * n_envs frames, annealed across last_frame frames.
        final_anneal_step = (
            last_frame * epochs * minibatches / (n_steps * n_envs))
        env = envs[0]

        feature_model, value_model, policy_model = fc_actor_critic(env)
        feature_model.to(device)
        value_model.to(device)
        policy_model.to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            writer=writer,
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
            writer=writer,
        )
        policy = GaussianPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
            writer=writer,
        )

        # Clip epsilon decays linearly from clip_initial to clip_final.
        clip_schedule = LinearScheduler(
            clip_initial, clip_final, 0, final_anneal_step,
            name='clip', writer=writer)

        return TimeFeature(
            PPO(
                features, v, policy,
                epsilon=clip_schedule,
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
示例#13
0
File: a2c.py  Project: cpnota/procgen-all
    def _a2c(envs, writer=DummyWriter()):
        """Construct an A2C agent with frame stacking and annealed schedules."""
        env = envs[0]
        # One scheduler step per rollout of n_steps * n_envs frames.
        final_anneal_step = last_frame / (n_steps * n_envs)

        value_model = value_head().to(device)
        policy_model = policy_head(env).to(device)
        feature_model = conv_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            clip_grad=clip_grad,
            writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer)
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
            clip_grad=clip_grad,
            writer=writer)

        # The entropy bonus is annealed to zero over training.
        entropy_schedule = LinearScheduler(
            entropy_loss_scaling, 0., 0, final_anneal_step,
            name="entropy_loss_scaling", writer=writer)

        return FrameStack(
            A2C(
                features, v, policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=entropy_schedule,
                writer=writer,
            ),
            size=4,
        )
示例#14
0
    def _a2c(envs, writer=DummyWriter()):
        """Construct an Atari A2C agent with cosine-annealed learning rates."""
        env = envs[0]
        # One scheduler step per rollout; 4 frames per step due to frame skip.
        final_anneal_step = last_frame / (n_steps * n_envs * 4)

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            clip_grad=clip_grad,
            writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer)
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
            clip_grad=clip_grad,
            writer=writer)

        return DeepmindAtariBody(
            A2C(
                features, v, policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ),
        )
示例#15
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        """Build a continuous-control PPO agent wrapped with TimeFeature."""
        hp = self.hyperparameters
        # Each rollout of n_steps * n_envs steps produces
        # epochs * minibatches optimizer updates.
        n_updates = (train_steps * hp['epochs'] * hp['minibatches']
                     / (hp['n_steps'] * hp['n_envs']))

        value_optimizer = Adam(self.value_model.parameters(),
                               lr=hp['lr'],
                               eps=hp['eps'])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=hp['lr'],
                                eps=hp['eps'])

        # States are passed through to the heads unchanged.
        features = Identity(self.device)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            loss_scaling=hp['value_loss_scaling'],
            clip_grad=hp['clip_grad'],
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
            writer=writer,
        )

        policy = GaussianPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            clip_grad=hp['clip_grad'],
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            writer=writer,
        )

        # Clip epsilon decays linearly over the course of training.
        clip_schedule = LinearScheduler(
            hp['clip_initial'], hp['clip_final'], 0, n_updates,
            name='clip', writer=writer)

        return TimeFeature(
            PPO(
                features, v, policy,
                epsilon=clip_schedule,
                epochs=hp['epochs'],
                minibatches=hp['minibatches'],
                n_envs=hp['n_envs'],
                n_steps=hp['n_steps'],
                discount_factor=hp['discount_factor'],
                lam=hp['lam'],
                entropy_loss_scaling=hp['entropy_loss_scaling'],
                writer=writer,
            ))
示例#16
0
    def _vac(env, writer=DummyWriter()):
        """Construct a vanilla actor-critic agent using RMSprop optimizers."""
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)
        feature_model = fc_relu_features(env).to(device)

        v = VNetwork(
            value_model,
            RMSprop(value_model.parameters(), lr=lr_v, alpha=alpha, eps=eps),
            writer=writer)
        policy = SoftmaxPolicy(
            policy_model,
            RMSprop(policy_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps),
            env.action_space.n,
            writer=writer)
        # Shared features train at the policy learning rate; no writer attached.
        features = FeatureNetwork(
            feature_model,
            RMSprop(feature_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps))

        return VAC(features, v, policy, gamma=discount_factor)
示例#17
0
    def _vac(env, writer=DummyWriter()):
        """Construct a vanilla actor-critic agent for ``env``."""
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor(env).to(device)

        v = VNetwork(
            value_model,
            Adam(value_model.parameters(), lr=lr_v, eps=eps),
            writer=writer)
        policy = SoftmaxPolicy(
            policy_model,
            Adam(policy_model.parameters(), lr=lr_pi, eps=eps),
            writer=writer)
        # Shared features train at the policy learning rate; no writer attached.
        features = FeatureNetwork(
            feature_model,
            Adam(feature_model.parameters(), lr=lr_pi, eps=eps))

        return VAC(features, v, policy, discount_factor=discount_factor)
    def _a2c(envs, writer=DummyWriter()):
        """Construct an Atari A2C agent using RMSprop optimizers."""
        env = envs[0]

        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        features = FeatureNetwork(
            feature_model,
            RMSprop(feature_model.parameters(), alpha=alpha, lr=lr, eps=eps),
            clip_grad=clip_grad,
            writer=writer,
        )
        v = VNetwork(
            value_model,
            RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps),
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )

        return DeepmindAtariBody(
            A2C(
                features, v, policy,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
            ),
        )
示例#19
0
    def _online_cacla(env, writer=DummyWriter()):
        """Construct an OnlineCACLA agent (no shared feature network)."""
        value_model = models.critic(
            env, hidden1=hidden1, hidden2=hidden2).to(device)
        policy_model = models.actor(
            env, hidden1=hidden1, hidden2=hidden2).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

        policy = DeterministicPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            quiet=not log,
            clip_grad=1.0,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )

        v = VNetwork(
            value_model,
            value_optimizer,
            quiet=not log,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )

        # The shared feature network is currently disabled.
        features = None
        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size, device=device)

        # TODO - reintroduce TimeFeature wrapper
        return OnlineCACLA(
            features,
            v,
            policy,
            replay_buffer,
            env.action_space,
            log=log,
            writer=writer,
            discount_factor=discount_factor,
        )
    def _vpg_atari(env, writer=DummyWriter()):
        """Construct an Atari VPG agent using RMSprop optimizers."""
        feature_model = nature_features().to(device)
        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)

        # The feature torso may learn at a scaled rate.
        feature_optimizer = RMSprop(
            feature_model.parameters(),
            alpha=alpha,
            lr=lr * feature_lr_scaling,
            eps=eps)
        value_optimizer = RMSprop(
            value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
        policy_optimizer = RMSprop(
            policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer)
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            env.action_space.n,
            entropy_loss_scaling=entropy_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )

        return DeepmindAtariBody(
            VPG(features, v, policy,
                gamma=discount_factor,
                min_batch_size=min_batch_size), )
示例#21
0
    def _sac(env, writer=DummyWriter()):
        """Construct a Soft Actor-Critic agent for ``env``."""

        def _q(name):
            # Twin Q networks, each with its own model and Adam optimizer.
            model = fc_q(env).to(device)
            return QContinuous(
                model, Adam(model.parameters(), lr=lr_q),
                writer=writer, name=name)

        q_1 = _q('q_1')
        q_2 = _q('q_2')

        v_model = fc_v(env).to(device)
        v = VNetwork(
            v_model,
            Adam(v_model.parameters(), lr=lr_v),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

        policy_model = fc_policy(env).to(device)
        policy = SoftDeterministicPolicy(
            policy_model,
            Adam(policy_model.parameters(), lr=lr_pi),
            env.action_space,
            writer=writer)

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size, device=device)

        # Target entropy scales with the action dimensionality.
        entropy_target = -env.action_space.shape[0] * entropy_target_scaling

        return SAC(
            policy, q_1, q_2, v, replay_buffer,
            entropy_target=entropy_target,
            lr_temperature=lr_temperature,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
            writer=writer)
示例#22
0
    def _vpg_atari(env, writer=DummyWriter()):
        """Build a VPG agent for Atari, wrapped in Deepmind preprocessing.

        Feature, value, and policy networks each get their own Adam
        optimizer with a cosine-annealed learning rate schedule.
        """
        # Model construction order is kept as-is so that seeded weight
        # initialization matches the original preset exactly.
        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(env).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        def _annealed(optimizer):
            # Each optimizer anneals its learning rate over the same horizon.
            return CosineAnnealingLR(optimizer, final_anneal_step)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            scheduler=_annealed(feature_optimizer),
            clip_grad=clip_grad,
            writer=writer,
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            scheduler=_annealed(value_optimizer),
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            scheduler=_annealed(policy_optimizer),
            clip_grad=clip_grad,
            writer=writer,
        )

        agent = VPG(
            features,
            v,
            policy,
            discount_factor=discount_factor,
            min_batch_size=min_batch_size,
        )
        return DeepmindAtariBody(agent, episodic_lives=True)
示例#23
0
    def _fac(env, writer=DummyWriter()):
        """Build a Forward Actor-Critic agent for the given environment.

        Both the critic and actor normalize inputs against the
        environment's state-space box; hyperparameters come from the
        enclosing preset scope.
        """
        value_model = models.critic(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        policy_model = models.actor(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

        # Deterministic actor; gradient clipping is fixed at 1.0 here.
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     quiet=not log,
                                     clip_grad=1.0,
                                     writer=writer,
                                     normalise_inputs=True,
                                     box=env.state_space)

        # Critic shares the same input-normalization configuration.
        v = VNetwork(value_model,
                     value_optimizer,
                     quiet=not log,
                     writer=writer,
                     normalise_inputs=True,
                     box=env.state_space)

        buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

        # TODO - reintroduce TimeFeature wrapper
        return ForwardAC(v, policy, buffer, env.action_space,
                         log=log,
                         trace_decay=trace_decay,
                         writer=writer,
                         discount_factor=discount_factor)
示例#24
0
    def agent(self, writer=DummyWriter(), train_steps=float("inf")):
        """Build the ModelBasedDQN agent wrapped in Atari preprocessing.

        ``train_steps`` is accepted for interface compatibility but is not
        used by this preset.
        """
        hp = self.hyperparameters

        def _adam(model):
            # Every network shares the same optimizer settings.
            return Adam(model.parameters(), lr=hp["lr"], eps=hp["eps"])

        # Function approximators.
        f = FeatureNetwork(self.feature_model, _adam(self.feature_model), writer=writer)
        v = VNetwork(self.value_model, _adam(self.value_model), writer=writer)
        r = QNetwork(self.reward_model, _adam(self.reward_model), name="reward", writer=writer)
        g = Approximation(self.generator_model, _adam(self.generator_model), name="generator", writer=writer)

        # Replay buffer.
        buffer = ExperienceReplayBuffer(hp["replay_buffer_size"], device=self.device)

        # Create the agent.
        agent = ModelBasedDQN(f, v, r, g, buffer,
            minibatch_size=hp["minibatch_size"],
            replay_start_size=hp["replay_start_size"]
        )

        # Apply atari wrappers for better performance.
        return DeepmindAtariBody(agent, lazy_frames=True)
示例#25
0
class NStepAdvantageBufferTest(unittest.TestCase):
    """Unit tests for NStepAdvantageBuffer's n-step advantage computation."""

    def setUp(self):
        """Seed torch and build tiny linear feature/value networks.

        Seeding makes the randomly initialized network outputs (and hence
        the advantage values) deterministic across test runs.
        """
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states,
                                     lengths):
        """Reference advantage: return + discounted bootstrap - baseline.

        The 0.5 base matches the discount_factor used by every test below;
        the bootstrap value is discounted by the number of steps actually
        rolled out for each state (``lengths``).
        """
        return (returns +
                (0.5**lengths) * self.v.eval(self.features.eval(next_states)) -
                self.v.eval(self.features.eval(states)))

    def test_rollout(self):
        """A plain 2-step rollout over 3 parallel environments."""
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      2,
                                      3,
                                      discount_factor=0.5)
        actions = torch.ones((3))
        states = State(torch.arange(0, 12).unsqueeze(1))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        states, _, advantages = buffer.advantages(states[6:9])

        # The first 3 states bootstrap 2 steps ahead, the next 3 only 1.
        expected_states = State(torch.arange(0, 6).unsqueeze(1))
        expected_next_states = State(
            torch.cat((torch.arange(6, 9), torch.arange(6, 9))).unsqueeze(1))
        expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
        expected_lengths = torch.tensor([2., 2, 2, 1, 1, 1])

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))

    def test_rollout_with_nones(self):
        """Rollouts are truncated at terminal states.

        NOTE(review): a mask value of 0 appears to mark episode
        termination — confirm against the State implementation.
        """
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      3,
                                      3,
                                      discount_factor=0.5)
        done = torch.ones(12)
        done[5] = 0
        done[7] = 0
        done[9] = 0
        states = State(torch.arange(0, 12).unsqueeze(1), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        states, actions, advantages = buffer.advantages(states[9:12])

        expected_states = State(torch.arange(0, 9).unsqueeze(1), done[0:9])
        expected_next_done = torch.zeros(9)
        expected_next_done[5] = 1
        expected_next_done[7] = 1
        expected_next_done[8] = 1
        expected_next_states = State(
            torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]).unsqueeze(1),
            expected_next_done)
        expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2, 2]).float()
        expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()

        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))

    def test_multi_rollout(self):
        """The buffer can be reused for consecutive rollouts."""
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      2,
                                      2,
                                      discount_factor=0.5)
        raw_states = State(torch.arange(0, 12).unsqueeze(1))
        actions = torch.ones((2))
        buffer.store(raw_states[0:2], actions, torch.ones(2))
        buffer.store(raw_states[2:4], actions, torch.ones(2))

        # First rollout: states 0-3.
        states, actions, advantages = buffer.advantages(raw_states[4:6])
        expected_states = State(torch.arange(0, 4).unsqueeze(1))
        expected_returns = torch.tensor([1.5, 1.5, 1, 1])
        expected_next_states = State(torch.tensor([4, 5, 4, 5]).unsqueeze(1))
        expected_lengths = torch.tensor([2., 2, 1, 1])
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))

        # Second rollout: states 4-7, same shape of expectations.
        buffer.store(raw_states[4:6], actions, torch.ones(2))
        buffer.store(raw_states[6:8], actions, torch.ones(2))

        states, actions, advantages = buffer.advantages(raw_states[8:10])
        expected_states = State(torch.arange(4, 8).unsqueeze(1))
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states, torch.tensor([1.5, 1.5, 1, 1]),
                State(torch.tensor([8, 9, 8, 9]).unsqueeze(1)),
                torch.tensor([2., 2, 1, 1])))

    def assert_array_equal(self, actual, expected):
        """Element-wise equality with a diagnostic message on failure."""
        for i, exp in enumerate(expected):
            self.assertEqual(actual[i],
                             exp,
                             msg=(("\nactual: %s\nexpected: %s") %
                                  (actual, expected)))

    def assert_states_equal(self, actual, expected):
        """Compare two State objects by raw tensor and mask."""
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
 def setUp(self):
     """Seed torch and build tiny linear feature/value networks.

     NOTE(review): this method sits at class-body indentation but its
     enclosing class header is not visible here — it appears to be a
     duplicated fragment; verify against the original file.
     """
     torch.manual_seed(1)
     self.features = FeatureNetwork(nn.Linear(1, 2), None)
     self.v = VNetwork(nn.Linear(2, 1), None)
class GeneralizedAdvantageBufferTest(unittest.TestCase):
    """Unit tests for GeneralizedAdvantageBuffer (GAE) computations."""

    def setUp(self):
        """Seed torch and build tiny linear feature/value networks.

        Seeding makes the randomly initialized network outputs — and
        therefore every hard-coded expected value below — deterministic.
        """
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states,
                                     lengths):
        """Reference advantage: return + discounted bootstrap - baseline.

        The 0.5 base matches the discount_factor used by the tests below.
        """
        return (returns +
                (0.5**lengths) * self.v.eval(self.features.eval(next_states)) -
                self.v.eval(self.features.eval(states)))

    def test_simple(self):
        """GAE over a single environment: advantages are lambda-weighted
        sums of one-step TD errors (discount 0.5, lam 0.5 => weight 0.25)."""
        buffer = GeneralizedAdvantageBuffer(self.v,
                                            self.features,
                                            2,
                                            1,
                                            discount_factor=0.5,
                                            lam=0.5)
        actions = torch.ones((1))
        states = State(torch.arange(0, 3).unsqueeze(1))
        rewards = torch.tensor([1., 2, 4])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        # Values produced by the seeded networks.
        values = self.v.eval(self.features.eval(states))
        tt.assert_almost_equal(values,
                               torch.tensor([0.1826, -0.3476, -0.8777]),
                               decimal=3)

        # One-step TD errors: r + gamma * V(s') - V(s).
        td_errors = torch.zeros(2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors,
                               torch.tensor([0.6436, 1.909]),
                               decimal=3)

        # GAE: advantage[t] = sum_k (gamma*lam)^k * td_error[t+k].
        advantages = torch.zeros(2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages,
                               torch.tensor([1.121, 1.909]),
                               decimal=3)

        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages)
        tt.assert_equal(_actions, torch.tensor([1, 1]))

    def test_parallel(self):
        """Same GAE computation across 2 parallel environments."""
        buffer = GeneralizedAdvantageBuffer(self.v,
                                            self.features,
                                            2,
                                            2,
                                            discount_factor=0.5,
                                            lam=0.5)
        actions = torch.ones((2))
        states = [
            State(torch.tensor([[0], [3]])),
            State(torch.tensor([[1], [4]])),
            State(torch.tensor([[2], [5]])),
        ]
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])

        # Values reshaped to (timestep, environment).
        values = self.v.eval(self.features.eval(State.from_list(states))).view(
            3, -1)
        tt.assert_almost_equal(values,
                               torch.tensor([[0.183, -1.408], [-0.348, -1.938],
                                             [-0.878, -2.468]]),
                               decimal=3)

        td_errors = torch.zeros(2, 2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors,
                               torch.tensor([[0.6436, 1.439], [1.909, 1.704]]),
                               decimal=3)

        advantages = torch.zeros(2, 2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages,
                               torch.tensor([[1.121, 1.865], [1.909, 1.704]]),
                               decimal=3)

        # The buffer returns advantages flattened across environments.
        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages.view(-1))

    def assert_array_equal(self, actual, expected):
        """Element-wise equality with a diagnostic message on failure."""
        for i, exp in enumerate(expected):
            self.assertEqual(actual[i],
                             exp,
                             msg=(("\nactual: %s\nexpected: %s") %
                                  (actual, expected)))

    def assert_states_equal(self, actual, expected):
        """Compare two State objects by raw tensor and mask."""
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
示例#28
0
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        """Assemble a SAC agent wrapped with a TimeFeature body.

        Learning rates for all networks are cosine-annealed over the
        number of optimizer updates implied by ``train_steps``.
        """
        hp = self.hyperparameters

        # Updates begin once the replay buffer holds replay_start_size
        # transitions and then occur every update_frequency steps.
        n_updates = (train_steps -
                     hp["replay_start_size"]) / hp["update_frequency"]

        def _build_q(model, name):
            # Each Q-head gets its own optimizer and annealing schedule.
            optimizer = Adam(model.parameters(), lr=hp["lr_q"])
            return QContinuous(
                model,
                optimizer,
                scheduler=CosineAnnealingLR(optimizer, n_updates),
                writer=writer,
                name=name,
            )

        q_1 = _build_q(self.q_1_model, 'q_1')
        q_2 = _build_q(self.q_2_model, 'q_2')

        # V-network with a Polyak-averaged target network.
        v_optimizer = Adam(self.v_model.parameters(), lr=hp["lr_v"])
        v = VNetwork(
            self.v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, n_updates),
            target=PolyakTarget(hp["polyak_rate"]),
            writer=writer,
            name='v',
        )

        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=hp["lr_pi"])
        policy = SoftDeterministicPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            writer=writer,
        )

        buffer = ExperienceReplayBuffer(hp["replay_buffer_size"],
                                        device=self.device)

        # The entropy target scales with the action dimensionality.
        agent = SAC(
            policy,
            q_1,
            q_2,
            v,
            buffer,
            temperature_initial=hp["temperature_initial"],
            entropy_target=(-self.action_space.shape[0] *
                            hp["entropy_target_scaling"]),
            lr_temperature=hp["lr_temperature"],
            replay_start_size=hp["replay_start_size"],
            discount_factor=hp["discount_factor"],
            update_frequency=hp["update_frequency"],
            minibatch_size=hp["minibatch_size"],
            writer=writer,
        )
        return TimeFeature(agent)
    def _ppo(envs, writer=DummyWriter()):
        """Build a PPO agent for Atari, wrapped in Deepmind preprocessing.

        ``envs`` is a list of parallel environments; the first is used for
        the action-space size. All learning rates anneal toward
        lr * min_lr_scale, and the clip parameter decays linearly.
        """
        env = envs[0]

        # Model construction order is kept as-is so that seeded weight
        # initialization matches the original preset exactly.
        value_model = nature_value_head().to(device)
        policy_model = nature_policy_head(envs[0]).to(device)
        feature_model = nature_features().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        def _cosine(optimizer):
            # Anneal the learning rate from lr down to lr * min_lr_scale.
            return CosineAnnealingLR(
                optimizer,
                final_anneal_step,
                eta_min=lr * min_lr_scale,
            )

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=_cosine(feature_optimizer),
                                  writer=writer)
        v = VNetwork(value_model,
                     value_optimizer,
                     loss_scaling=value_loss_scaling,
                     clip_grad=clip_grad,
                     writer=writer,
                     scheduler=_cosine(value_optimizer))
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               env.action_space.n,
                               entropy_loss_scaling=entropy_loss_scaling,
                               clip_grad=clip_grad,
                               writer=writer,
                               scheduler=_cosine(policy_optimizer))

        # The PPO clipping parameter decays linearly over training.
        clip_schedule = LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer)

        agent = PPO(features,
                    v,
                    policy,
                    epsilon=clip_schedule,
                    epochs=epochs,
                    minibatches=minibatches,
                    n_envs=n_envs,
                    n_steps=n_steps,
                    discount_factor=discount_factor,
                    lam=lam)
        return DeepmindAtariBody(agent)
示例#30
0
 def test_agent(self):
     """Build a test-time agent: approximators without optimizers,
     wrapped in Atari preprocessing."""
     features = FeatureNetwork(self.feature_model, None)
     values = VNetwork(self.value_model, None)
     rewards = QNetwork(self.reward_model, None)
     generator = Approximation(self.generator_model, None)
     agent = ModelBasedTestAgent(features, values, rewards, generator,
                                 self.hyperparameters["discount_factor"])
     return DeepmindAtariBody(agent)