Example #1
 def _vqn(envs, writer=DummyWriter()):
     env = envs[0]
     model = nature_ddqn(env).to(device)
     optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
     q = QNetwork(
         model,
         optimizer,
         env.action_space.n,
         loss=smooth_l1_loss,
         writer=writer
     )
     policy = GreedyPolicy(
         q,
         env.action_space.n,
         epsilon=LinearScheduler(
             initial_exploration,
             final_exploration,
             0,
             final_exploration_frame,
             name="epsilon",
             writer=writer
         )
     )
     return DeepmindAtariBody(
         VQN(q, policy, gamma=discount_factor),
     )
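The epsilon value above is annealed linearly during training. A minimal sketch of the behavior these presets appear to rely on (assumed semantics; whether LinearScheduler's third and fourth positional arguments mean an absolute start/end step or a start plus a duration is not obvious from these snippets alone):

def linear_epsilon(step, initial, final, begin, end):
    """Hold `initial` until `begin`, interpolate linearly to `final` by `end`,
    then hold `final` afterwards."""
    if step <= begin:
        return initial
    if step >= end:
        return final
    return initial + (step - begin) / (end - begin) * (final - initial)

Example #2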
    def __init__(
            self,
            model,
            optimizer=None,
            checkpointer=None,
            clip_grad=0,
            device=None,
            loss_scaling=1,
            name='approximation',
            scheduler=None,
            target=None,
            writer=DummyWriter(),
    ):
        self.model = model
        self.device = device if device else next(model.parameters()).device
        self._target = target or TrivialTarget()
        self._scheduler = scheduler
        self._target.init(model)
        self._updates = 0
        self._optimizer = optimizer
        self._loss_scaling = loss_scaling
        self._cache = []
        self._clip_grad = clip_grad
        self._writer = writer
        self._name = name

        if checkpointer is None:
            checkpointer = DummyCheckpointer()
        self._checkpointer = checkpointer
        self._checkpointer.init(
            self.model,
            os.path.join(writer.log_dir, name + '.pt')
        )
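When no target network or checkpointer is supplied, the constructor above falls back on no-op defaults. Hypothetical stand-ins that illustrate the interface those defaults have to satisfy (not the library's actual TrivialTarget / DummyCheckpointer implementations):

import torch

class NoOpTarget:
    """Stand-in for TrivialTarget: 'target' predictions come straight from the
    online model, and update() has nothing to synchronize."""
    def init(self, model):
        self._model = model

    def __call__(self, *inputs):
        with torch.no_grad():
            return self._model(*inputs)

    def update(self):
        pass

class NoOpCheckpointer:
    """Stand-in for DummyCheckpointer: accepts a model and a path but never
    writes anything to disk."""
    def init(self, model, filename):
        pass

    def __call__(self):
        pass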
Example #3
 def _dqn(env, writer=DummyWriter()):
     _model = nature_dqn(env).to(device)
     _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
     q = QNetwork(_model,
                  _optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=smooth_l1_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DeepmindAtariBody(
         DQN(
             q,
             policy,
             replay_buffer,
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
         ), )
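FixedTarget(target_update_frequency) implies a target network that is re-synchronized only periodically. A sketch of that rule in plain PyTorch (assumed behavior, not the library's FixedTarget code):

def sync_fixed_target(target_net, online_net, updates, target_update_frequency):
    """Copy the online weights into the frozen target network every
    `target_update_frequency` gradient updates; between copies the target
    stays fixed."""
    if updates % target_update_frequency == 0:
        target_net.load_state_dict(online_net.state_dict())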
Example #4
    def _ddpg(env, writer=DummyWriter()):
        value_model = fc_value(env).to(device)
        value_optimizer = Adam(value_model.parameters(), lr=lr_q)
        q = QContinuous(value_model,
                        value_optimizer,
                        target=PolyakTarget(polyak_rate),
                        writer=writer)

        policy_model = fc_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     noise,
                                     target=PolyakTarget(polyak_rate),
                                     writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return DDPG(q,
                    policy,
                    replay_buffer,
                    replay_start_size=replay_start_size,
                    discount_factor=discount_factor,
                    update_frequency=update_frequency,
                    minibatch_size=minibatch_size)
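Both the Q-function and the policy use PolyakTarget(polyak_rate), i.e. soft target updates. A sketch of the usual Polyak rule in plain PyTorch (the library may define the rate with the opposite convention):

import torch

def polyak_update(target_net, online_net, rate):
    """Soft update: target <- rate * online + (1 - rate) * target."""
    with torch.no_grad():
        for t, o in zip(target_net.parameters(), online_net.parameters()):
            t.mul_(1.0 - rate).add_(o, alpha=rate)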
Example #5
 def _rainbow(env, writer=DummyWriter()):
     model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QDist(
         model,
         optimizer,
         env.action_space.n,
         atoms,
         v_min=v_min,
         v_max=v_max,
         writer=writer,
     )
     # replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
     replay_buffer = PrioritizedReplayBuffer(
         replay_buffer_size,
         alpha=alpha,
         beta=beta,
         device=device
     )
     replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
     return Rainbow(
         q,
         replay_buffer,
         exploration=0.,
         discount_factor=discount_factor ** n_steps,
         minibatch_size=minibatch_size,
         replay_start_size=replay_start_size,
         update_frequency=update_frequency,
         writer=writer,
     )
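Wrapping the prioritized buffer in NStepReplayBuffer and passing discount_factor ** n_steps suggests the buffer hands the agent pre-accumulated n-step returns, leaving only the bootstrap term to be discounted by gamma^n. A small illustrative sketch of that accumulation (assumed, not the library's buffer code):

def n_step_return(rewards, bootstrap_value, gamma):
    """r_0 + gamma*r_1 + ... + gamma^(n-1)*r_(n-1) + gamma^n * bootstrap_value."""
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
    return g + (gamma ** len(rewards)) * bootstrap_value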
Example #6
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps / self.hyperparameters['n_envs']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            writer=writer
        )

        policy = ParallelGreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
                name="exploration",
                writer=writer
            )
        )

        return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor'])
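Example #7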
 def _vsarsa(envs, writer=DummyWriter()):
     env = envs[0]
     model = model_constructor(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr, eps=eps)
     q = QNetwork(model, optimizer, writer=writer)
     policy = ParallelGreedyPolicy(q, env.action_space.n, epsilon=epsilon)
     return VSarsa(q, policy, discount_factor=discount_factor)
Example #8
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps / self.hyperparameters["min_batch_size"]

        feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])

        features = FeatureNetwork(
            self.feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
            clip_grad=self.hyperparameters["clip_grad"],
            writer=writer
        )

        v = VNetwork(
            self.value_model,
            value_optimizer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
            loss_scaling=self.hyperparameters["value_loss_scaling"],
            clip_grad=self.hyperparameters["clip_grad"],
            writer=writer
        )

        policy = SoftmaxPolicy(
            self.policy_model,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            clip_grad=self.hyperparameters["clip_grad"],
            writer=writer
        )

        return DeepmindAtariBody(
            VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]),
        )
Example #9
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  clip_grad=self.hyperparameters["clip_grad"])

        v = VNetwork(self.value_model,
                     value_optimizer,
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return A2C(
            features,
            v,
            policy,
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer)
Example #10
 def _vqn(envs, writer=DummyWriter()):
     env = envs[0]
     model = fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr, eps=eps)
     q = QNetwork(model, optimizer, writer=writer)
     policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
     return VQN(q, policy, discount_factor=discount_factor)
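Bodies like _vqn above are typically returned from a preset factory; device, lr, eps, epsilon, and discount_factor come from the enclosing closure. A hypothetical wrapper showing that shape (the outer name and default values are illustrative only):

def vqn_preset(device='cpu', lr=1e-2, eps=1e-5, epsilon=0.1, discount_factor=0.99):
    def _vqn(envs, writer=DummyWriter()):
        env = envs[0]
        model = fc_relu_q(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(model, optimizer, writer=writer)
        policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
        return VQN(q, policy, discount_factor=discount_factor)
    return _vqn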
Example #11
    def _vpg(env, writer=DummyWriter()):
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            writer=writer
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            writer=writer
        )
        return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
Example #12
 def _ddqn(env, writer=DummyWriter()):
     model = model_constructor(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  target=FixedTarget(target_update_frequency),
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                             alpha=alpha,
                                             beta=beta,
                                             device=device)
     return DDQN(q,
                 policy,
                 replay_buffer,
                 discount_factor=discount_factor,
                 replay_start_size=replay_start_size,
                 update_frequency=update_frequency,
                 minibatch_size=minibatch_size)
Example #13
    def _vac(envs, writer=DummyWriter()):
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(envs[0]).to(device)
        feature_model = feature_model_constructor().to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
        feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)

        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer,
        )
        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  writer=writer)

        return DeepmindAtariBody(
            VAC(features, v, policy, discount_factor=discount_factor), )
Example #14
    def _vpg(env, writer=DummyWriter()):
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  writer=writer)
        v = VNetwork(value_model,
                     value_optimizer,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               env.action_space.n,
                               entropy_loss_scaling=entropy_loss_scaling,
                               clip_grad=clip_grad,
                               writer=writer)
        return VPG(features,
                   v,
                   policy,
                   gamma=gamma,
                   min_batch_size=min_batch_size)
Example #15
 def __init__(self,
              features,
              v,
              policy,
              buffer,
              action_space,
              discount_factor=0.99,
              sigma=1.0,
              sigma_decay=0.9995,
              sigma_min=0.1,
              n_iter=100,
              minibatch_size=32,
              log=True,
              writer=DummyWriter()):
     self.features = features
     self.v = v
     self.policy = policy
     self.replay_buffer = buffer
     self.minibatch_size = minibatch_size
     self.discount_factor = discount_factor
     self._log = log
     self.writer = writer
     self.sigma = sigma
     self.sigma_decay = sigma_decay
     self.sigma_min = sigma_min
     self.n_iter = n_iter
     self._features = None
     self._action = None
     self._state = None
     self._tde = None
     self._action_low = torch.tensor(action_space.low,
                                     device=policy.device).float()
     self._action_high = torch.tensor(action_space.high,
                                      device=policy.device).float()
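Example #16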
    def __init__(self,
                 model,
                 optimizer,
                 clip_grad=0,
                 loss_scaling=1,
                 loss=mse_loss,
                 name='approximation',
                 scheduler=None,
                 target=None,
                 writer=DummyWriter(),
                 checkpointer=None):
        self.model = model
        self.device = next(model.parameters()).device
        self._target = target or TrivialTarget()
        self._scheduler = scheduler
        self._target.init(model)
        self._updates = 0
        self._optimizer = optimizer
        self._loss = loss
        self._loss_scaling = loss_scaling
        self._cache = []
        self._clip_grad = clip_grad
        self._writer = writer
        self._name = name

        if checkpointer is None:
            checkpointer = PeriodicCheckpointer(DEFAULT_CHECKPOINT_FREQUENCY)
        self._checkpointer = checkpointer
        self._checkpointer.init(self.model,
                                os.path.join(writer.log_dir, name + '.pt'))
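Compared with the constructor in Example #2, this variant also accepts a loss function (defaulting to mse_loss), infers the device from the model rather than taking it as an argument, and defaults to a PeriodicCheckpointer instead of a no-op checkpointer.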
Example #17
 def _vsarsa(envs, writer=DummyWriter()):
     env = envs[0]
     model = fc_relu_q(env).to(device)
     optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
     q = QNetwork(model, optimizer, env.action_space.n, writer=writer)
     policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
     return VSarsa(q, policy, gamma=gamma)
Example #18
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 1
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(model,
                     optimizer,
                     scheduler=CosineAnnealingLR(optimizer, last_update),
                     target=FixedTarget(target_update_frequency),
                     writer=writer)
        policy = SharedAutonomyPolicy(q,
                                      env.action_space.n,
                                      epsilon=0,
                                      pilot_tol=pilot_tol)

        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                    alpha=alpha,
                                                    beta=beta,
                                                    device=device)
        else:
            replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                                   device=device)

        return co_DDQN(q,
                       policy,
                       replay_buffer,
                       loss=weighted_smooth_l1_loss,
                       discount_factor=discount_factor,
                       minibatch_size=minibatch_size,
                       replay_start_size=replay_start_size,
                       update_frequency=update_frequency)
Example #19
 def _dqn(env, writer=DummyWriter()):
     model = fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=mse_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DQN(q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
Example #20
 def _c51(env, writer=DummyWriter()):
     model = fc_relu_dist_q(env, atoms=atoms).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QDist(
         model,
         optimizer,
         env.action_space.n,
         atoms,
         v_min=v_min,
         v_max=v_max,
         writer=writer,
     )
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return C51(q,
                replay_buffer,
                exploration=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    replay_start_size,
                    final_exploration_frame,
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                writer=writer)
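QDist with `atoms` support points between v_min and v_max is a categorical (C51-style) value head. A sketch of how such a head reduces per-atom probabilities to scalar action values (illustrative, not the library's QDist code):

import torch

def expected_q(probabilities, v_min, v_max, atoms):
    """Q(s, a) = sum_i p_i(s, a) * z_i, where z is a fixed support of `atoms`
    evenly spaced points in [v_min, v_max]; `probabilities` has the atom
    dimension last."""
    support = torch.linspace(v_min, v_max, atoms)   # shape: (atoms,)
    return (probabilities * support).sum(dim=-1)    # shape: (..., n_actions)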
Example #21
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return C51(q,
                   replay_buffer,
                   exploration=LinearScheduler(
                       self.hyperparameters['initial_exploration'],
                       self.hyperparameters['final_exploration'],
                       0,
                       self.hyperparameters["final_exploration_step"] -
                       self.hyperparameters["replay_start_size"],
                       name="epsilon",
                       writer=writer,
                   ),
                   discount_factor=self.hyperparameters["discount_factor"],
                   minibatch_size=self.hyperparameters["minibatch_size"],
                   replay_start_size=self.hyperparameters["replay_start_size"],
                   update_frequency=self.hyperparameters["update_frequency"],
                   writer=writer)
Example #22
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QNetwork(self.model,
                     optimizer,
                     target=FixedTarget(
                         self.hyperparameters['target_update_frequency']),
                     writer=writer)

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] -
                self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer))

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        )
Example #23
    def _a2c(envs, writer=DummyWriter()):
        env = envs[0]
        feature_model = feature_model_constructor(env).to(device)
        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(
            feature_model, feature_optimizer, clip_grad=clip_grad)
        v = VNetwork(
            value_model,
            value_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer
        )
        return A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        )
Example #24
 def __init__(self,
              policy,
              q_1,
              q_2,
              v,
              replay_buffer,
              discount_factor=0.99,
              entropy_target=-2.,
              lr_temperature=1e-4,
              minibatch_size=32,
              replay_start_size=5000,
              temperature_initial=0.1,
              update_frequency=1,
              writer=DummyWriter()):
     # objects
     self.policy = policy
     self.v = v
     self.q_1 = q_1
     self.q_2 = q_2
     self.replay_buffer = replay_buffer
     self.writer = writer
     # hyperparameters
     self.discount_factor = discount_factor
     self.entropy_target = entropy_target
     self.lr_temperature = lr_temperature
     self.minibatch_size = minibatch_size
     self.replay_start_size = replay_start_size
     self.temperature = temperature_initial
     self.update_frequency = update_frequency
     # private
     self._state = None
     self._action = None
     self._frames_seen = 0
Example #25
    def _vqn(envs, writer=DummyWriter()):
        action_repeat = 4
        final_exploration_timestep = final_exploration_frame / action_repeat

        env = envs[0]
        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(
            model,
            optimizer,
            writer=writer
        )
        policy = ParallelGreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                final_exploration_timestep,
                name="epsilon",
                writer=writer
            )
        )
        return DeepmindAtariBody(
            VQN(q, policy, discount_factor=discount_factor),
        )
Example #26
 def __init__(
         self,
         q_dist,
         replay_buffer,
         discount_factor=0.99,
         eps=1e-5,
         exploration=0.02,
         minibatch_size=32,
         replay_start_size=5000,
         update_frequency=1,
         writer=DummyWriter(),
 ):
     # objects
     self.q_dist = q_dist
     self.replay_buffer = replay_buffer
     self.writer = writer
     # hyperparameters
     self.eps = eps
     self.exploration = exploration
     self.replay_start_size = replay_start_size
     self.update_frequency = update_frequency
     self.minibatch_size = minibatch_size
     self.discount_factor = discount_factor
     # private
     self._state = None
     self._action = None
     self._frames_seen = 0
Example #27
 def __init__(self,
              features,
              v,
              policy,
              discount_factor=0.99,
              entropy_loss_scaling=0.01,
              epochs=4,
              epsilon=0.2,
              lam=0.95,
              minibatches=4,
              n_envs=None,
              n_steps=4,
              writer=DummyWriter()):
     if n_envs is None:
         raise RuntimeError("Must specify n_envs.")
     # objects
     self.features = features
     self.v = v
     self.policy = policy
     self.writer = writer
     # hyperparameters
     self.discount_factor = discount_factor
     self.entropy_loss_scaling = entropy_loss_scaling
     self.epochs = epochs
     self.epsilon = epsilon
     self.lam = lam
     self.minibatches = minibatches
     self.n_envs = n_envs
     self.n_steps = n_steps
     # private
     self._states = None
     self._actions = None
     self._batch_size = n_envs * n_steps
     self._buffer = self._make_buffer()
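For a sense of scale: with, say, n_envs=16 and n_steps=128 (hypothetical values), _batch_size is 16 * 128 = 2048 transitions per update, and the default epochs=4 and minibatches=4 mean four passes over that rollout, each split into four minibatches.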
Example #28
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]
        feature_model = fc_relu_features(env).to(device)
        value_model = fc_value_head().to(device)
        policy_model = fc_policy_head(env).to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr)
        value_optimizer = Adam(value_model.parameters(), lr=lr)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad)
        v = VNetwork(value_model,
                     value_optimizer,
                     clip_grad=clip_grad,
                     writer=writer)
        policy = SoftmaxPolicy(policy_model,
                               policy_optimizer,
                               clip_grad=clip_grad,
                               writer=writer)
        return PPO(features,
                   v,
                   policy,
                   epsilon=epsilon,
                   epochs=epochs,
                   lam=lam,
                   minibatches=minibatches,
                   n_envs=n_envs,
                   n_steps=n_steps,
                   discount_factor=discount_factor,
                   entropy_loss_scaling=entropy_loss_scaling,
                   writer=writer)
Example #29
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        feature_optimizer = Adam(self.feature_model.parameters(),
                                 lr=self.hyperparameters["lr_pi"],
                                 eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters["lr_v"],
                               eps=self.hyperparameters["eps"])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"],
                                eps=self.hyperparameters["eps"])

        features = FeatureNetwork(self.feature_model,
                                  feature_optimizer,
                                  clip_grad=self.hyperparameters["clip_grad"],
                                  writer=writer)

        v = VNetwork(self.value_model,
                     value_optimizer,
                     loss_scaling=self.hyperparameters["value_loss_scaling"],
                     clip_grad=self.hyperparameters["clip_grad"],
                     writer=writer)

        policy = SoftmaxPolicy(self.policy_model,
                               policy_optimizer,
                               clip_grad=self.hyperparameters["clip_grad"],
                               writer=writer)

        return VPG(features,
                   v,
                   policy,
                   discount_factor=self.hyperparameters["discount_factor"],
                   min_batch_size=self.hyperparameters["min_batch_size"])
Example #30
    def _ppo(envs, writer=DummyWriter()):
        env = envs[0]

        # Update epoch * minibatches times per update,
        # but we only update once per n_steps,
        # with n_envs and 4 frames per step
        final_anneal_step = last_frame * epochs * minibatches / (n_steps *
                                                                 n_envs * 4)

        value_model = value_model_constructor().to(device)
        policy_model = policy_model_constructor(env).to(device)
        feature_model = feature_model_constructor().to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = SoftmaxPolicy(
            policy_model,
            policy_optimizer,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return DeepmindAtariBody(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
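To make the final_anneal_step arithmetic concrete: with hypothetical values last_frame=40_000_000, epochs=4, minibatches=4, n_steps=128 and n_envs=8, the cosine schedules run for 40_000_000 * 4 * 4 / (128 * 8 * 4) = 156,250 optimizer steps.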