Example #1
def test_explorer(env_factory):
    env = env_factory()

    def select_action(engine, iter):
        return env.action_space.sample()

    explorer = Explorer(select_action)
    explorer.run(env, 2)
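For context, one possible shape for the `env_factory` fixture used above, written purely as an illustration: the pytest fixture and the Gym environment id are assumptions, not part of the library.

import gym  # assumed dependency; any Gym-style environment would do
import pytest


@pytest.fixture
def env_factory():
    # Hypothetical fixture: return a callable so each test builds a fresh env.
    return lambda: gym.make("CartPole-v1")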
Example #2
def select_action(engine: Explorer, observation):
    policy.train()
    action_distrib = policy(observation)
    action = action_distrib.sample()
    engine.store_transition_members(
        log_prob=action_distrib.log_prob(action),
        entropy=action_distrib.entropy())
    return action
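This snippet is only the action-selection callback; as the REINFORCE example further down shows, it is plugged into an Explorer and the stored members must be registered beforehand. A minimal sketch of that wiring:

agent = Explorer(select_action)
agent.register_transition_members("log_prob", "entropy")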
Example #3
def test_explorer_mock():
    select_action = mock.MagicMock()
    select_action.return_value = 1, {}
    explorer = Explorer(select_action)
    explorer.run(Env(), 2)

    assert select_action.call_count == 22
    assert isinstance(explorer.state.transition, Transition)
Example #4
def test_explorer_cast(device):
    explorer = Explorer(lambda x, y: (None, {}),
                        dtype=torch.int,
                        device=device)
    explorer.run(Env(), 1)

    # Observations are cast lazily
    @explorer.on(Events.ITERATION_STARTED)
    def _test(engine):
        assert engine.state.observation.device == device
Example #5
def test_explorer_transition_members():
    explorer = Explorer(lambda x, y: None)
    explorer.register_transition_members("foo", "bar")

    @explorer.on(Events.ITERATION_STARTED)
    def _add_foo(engine):
        engine.store_transition_members(foo=3, bar=4)
        assert engine.state.extra_transition_members == {"foo": 3, "bar": 4}
        engine.store_transition_members(foo=0)
        assert engine.state.extra_transition_members == {"foo": 0, "bar": 4}

    explorer.run(Env(), 2)

    assert explorer.state.transition.bar == 4
    assert not hasattr(explorer.state, "extra_transition_members")
Example #6
def create_memory_qlearner(
    dqn: nn.Module,  # Callable[[Observation], QValues]
    random_action: Callable[[Observation], Action],
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    epsilon: Union[float, num.Stepable] = 0.05,
    evaluation_mode: trainers.QLearningMode = trainers.QLearningMode.DOUBLE,
    optimizing_steps: int = 4,
    double_target_weight_copy_steps: int = 1000,
    memory_capacity: int = 10000,
    batch_size: int = 32,
    clip_grad_norm: Optional[float] = None,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create a Q-learner.

    Optimization is done using TD(0) deep Q-learning with memory replay.
    Double and target evaluation are also available.

    Parameters
    ----------
    dqn:
        The neural network estimating Q-values that is being optimized.
    random_action:
        A function to make random actions.
    optimizer:
        The optimizer used to update the `dqn` parameters.
    discount:
        Discount factor of the future rewards.
    epsilon:
        Probability of making a random action. The value is stepped (via
        `step`) after each iteration when it is a `num.Stepable`.
    evaluation_mode:
        Change the way targets are evaluated, either with a target network, or
        using double q-learning.
    optimizing_steps:
        Number of steps between each optimization pass over the replay
        memory.
    double_target_weight_copy_steps:
        Number of steps between updates of the target/double network weights
        (when applicable).
    memory_capacity:
        Size of the replay memory (dataset).
    batch_size:
        Batch size when optimizing over the replay memory.
    clip_grad_norm:
        Optionally clip the norm of the `dqn` gradients before applying them.
    dtype:
        Type the observations/model are converted to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        An ignite engine that explores the environment and optimizes a deep
        Q-network over a replay memory.

    """
    # Enable converting from string
    evaluation_mode = trainers.QLearningMode(evaluation_mode)
    dqn.to(device=device, dtype=dtype)
    if evaluation_mode == trainers.QLearningMode.SIMPLE:
        target_dqn = None
    else:
        target_dqn = copy.deepcopy(dqn)

    def select_action(engine, observation):
        """Epsilon greedy action selection."""
        with torch.no_grad():
            dqn.eval()
            if torch.rand(1).item() < epsilon:
                return random_action(observation)
            else:
                return dqn(observation).greedy()

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)
    trainer = trainers.create_qlearning_trainer(
        dqn=dqn,
        target_dqn=target_dqn,
        optimizer=optimizer,
        discount=discount,
        evaluation_mode=evaluation_mode,
        clip_grad_norm=clip_grad_norm,
        dtype=dtype,
        device=device,
    )

    @agent.on(Events.STARTED)
    def add_memory_and_trainer_to_agent(engine):
        engine.state.memory = MemoryReplay(T.PinIfCuda(device=device),
                                           capacity=memory_capacity)
        engine.state.trainer = trainer

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition_and_step_epsilon(engine):
        engine.state.memory.append(engine.state.transition.cpu())
        if isinstance(epsilon, num.Stepable):
            epsilon.step()

    @agent.on(Events.ITERATION_COMPLETED)
    @utils.every(optimizing_steps)
    def optimize(engine):
        sample_elem = engine.state.memory[0]
        dataloader = DataLoader(
            dataset=engine.state.memory,
            batch_size=batch_size,
            collate_fn=sample_elem.__class__.collate,
            shuffle=True,
            drop_last=True,
        )
        engine.state.trainer.run(dataloader)

    @agent.on(Events.ITERATION_COMPLETED)
    @utils.every(double_target_weight_copy_steps)
    def copy_weights(engine):
        if target_dqn is not None:
            dqn.zero_grad()  # Avoid copying the gradients
            target_dqn.load_state_dict(copy.deepcopy(dqn.state_dict()))

    return agent
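A sketch of how this factory might be wired up. `make_dqn` and `make_env` are hypothetical placeholders for user code (the network must map observations to the library's Q-values object with a `.greedy()` method, as `select_action` above relies on); only the factory signature and `Explorer.run` are taken from the examples.

import torch
from torch import optim

# Hypothetical helpers, not part of the library: build a Q-network returning
# an object exposing `.greedy()`, and a Gym-style environment.
dqn = make_dqn()
env = make_env()

agent = create_memory_qlearner(
    dqn,
    random_action=lambda obs: env.action_space.sample(),
    optimizer=optim.Adam(dqn.parameters(), lr=1e-3),
    epsilon=0.1,
    batch_size=64,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
agent.run(env, 100)  # explore and optimize for 100 epochs (episodes)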
Example #7
def test_EpisodeLength():
    agent = Explorer(lambda eng, obs: None)
    metrics.EpisodeLength().attach(agent, "Len")
    agent.run(Env(), 2)
    assert agent.state.metrics["Len"] == 6
Example #8
def test_InfosMetric():
    agent = Explorer(lambda eng, obs: None)
    metrics.InfoMetric("info_member").attach(agent, "Member")
    agent.run(Env(), 2)
    assert agent.state.metrics["Member"] == 6 * 3
Example #9
def test_Return():
    agent = Explorer(lambda eng, obs: None,
                     metrics={"Return": metrics.Return()})
    agent.run(Env(), 2)
    assert agent.state.metrics["Return"] == 6
Example #10
def test_TransitionMetric():
    agent = Explorer(lambda eng, obs: None)
    metrics.TransitionMetric("reward").attach(agent, "Return")
    agent.run(Env(), 2)
    assert agent.state.metrics["Return"] == 6
Example #11
def create_ppo(
    actor_critic: nn.Module,
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    lambda_: float = 0.9,
    ppo_clip: float = 0.02,
    exploration_loss_coef: float = 0.001,
    critic_loss_coef: float = 1.0,
    critic_loss_function: Callable = F.mse_loss,
    norm_returns: bool = True,
    norm_gaes: bool = True,
    dataset_size: int = 1024,
    n_epochs: int = 10,
    # FIXME change the way the dataloader is passed on to the function
    batch_size: int = 16,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create an agent using Proximal Policy Optimization learning algorithm.

    Parameters
    ----------
    actor_critic:
        The neural network used to model the policy and critic. Must return a
        tuple (action probability distribution, critic value).
    optimizer:
        The optimizer used to update the `model` parameters.
    discount:
        The discount rate used for computing the returns.
    lambda_:
        Lambda discount as defined in Generalized Advantage Estimation.
    ppo_clip:
        Clip parameter for the PPO loss.
    exploration_loss_coef:
        The entropy bonus for encouraging exploration.
    critic_loss_coef:
        Multiplier for the critic loss.
    critic_loss_function:
        Loss function used by the critic.
    norm_returns:
        Whether to normalize returns. Running averages are kept per task
        (use `task_id` to differentiate tasks) and used to scale back the
        critic for bootstrapping and GAEs.
    norm_gaes:
        Whether to normalize the advantages. Independent from the
        normalization of returns used to scale back the critic. This happens
        on the final advantages.
    dataset_size:
        Size of the PPO dataset collected from the agent before an
        optimization pass is run.
    n_epochs:
        Number of optimization epochs performed on a single PPO dataset.
    batch_size:
        Batch size used when optimizing over the PPO dataset.
    dtype:
        Type the observations/model are cast to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
        The ignite engine, exploring the environment and optimizing.

    """
    actor_critic.to(device=device, dtype=dtype)

    def select_action(engine, observation):
        with torch.no_grad():
            actor_critic.eval()
            action_distrib, critic_value = actor_critic(observation)
            action = action_distrib.sample()
            engine.store_transition_members(
                log_prob=action_distrib.log_prob(action),
                entropy=action_distrib.entropy(),
                critic_value=critic_value,
            )
            return action

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)
    agent.register_transition_members("log_prob", "entropy", "critic_value")
    trainer = trainers.create_ppo_trainer(
        actor_critic=actor_critic,
        optimizer=optimizer,
        ppo_clip=ppo_clip,
        exploration_loss_coef=exploration_loss_coef,
        critic_loss_coef=critic_loss_coef,
        critic_loss_function=critic_loss_function,
        device=device,
        dtype=dtype,
    )

    @agent.on(Events.STARTED)
    def add_trajectories_and_trainer_to_engine(engine):
        engine.state.trajectories = Trajectories(
            T.compose(
                T.WithGAEs(
                    discount=discount,
                    lambda_=lambda_,
                    norm_gaes=norm_gaes,
                    norm_returns=norm_returns,
                ),
                partial(map, T.PinIfCuda(device=device)),
            ))
        engine.state.trainer = trainer

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition(engine):
        engine.state.trajectories.append(engine.state.transition.cpu())

    @agent.on(Events.EPOCH_COMPLETED)
    def terminate_trajectory_and_data_collection(engine):
        engine.state.trajectories.terminate_trajectory()

    @agent.on(Events.EPOCH_COMPLETED)
    def optimize(engine):
        if len(engine.state.trajectories) >= dataset_size:
            sample_elem = engine.state.trajectories[0]
            dataloader = DataLoader(
                dataset=engine.state.trajectories,
                batch_size=batch_size,
                collate_fn=sample_elem.__class__.collate,
                drop_last=True,
            )
            engine.state.trainer.run(dataloader, n_epochs)
            engine.state.trajectories.clear()

    return agent
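As an illustration, the factory could be invoked as follows; `make_actor_critic` and `make_env` are placeholders (assumptions) for user code, with the module returning the (action distribution, critic value) pair the docstring requires.

from torch import optim

actor_critic = make_actor_critic()  # hypothetical builder for the module
env = make_env()                    # hypothetical Gym-style environment builder

agent = create_ppo(
    actor_critic,
    optim.Adam(actor_critic.parameters(), lr=3e-4),
    ppo_clip=0.2,
    dataset_size=2048,
    n_epochs=4,
    batch_size=64,
)
# Optimization runs once at least `dataset_size` transitions were collected.
agent.run(env, 1000)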
Example #12
def create_reinforce(
    policy: nn.Module,
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    exploration: float = 0.001,
    norm_returns: bool = True,
    grad_norm_clip: Optional[float] = 1.0,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create an agent using Reinforce learning algorithm.

    Parameters
    ----------
    policy:
        The neural network used to model the policy.
    optimizer:
        The optimizer used to update the `model` parameters.
    discount:
        The discount rate used for computing the returns.
    exploration:
        The entropy bonus for encouraging exploration.
    norm_returns:
        Whether to normalize the returns to zero mean and unit variance.
        Computed over an episode. Raises an error for episodes of length 1.
    grad_norm_clip:
        Value to clip the norm of the gradient at before applying an update.
    dtype:
        Type the observations/model are cast to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        The ignite engine, exploring the environment and optimizing.

    """
    policy.to(device=device, dtype=dtype)

    def select_action(engine: Explorer, observation):
        policy.train()
        action_distrib = policy(observation)
        action = action_distrib.sample()
        engine.store_transition_members(
            log_prob=action_distrib.log_prob(action),
            entropy=action_distrib.entropy())
        return action

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)

    agent.register_transition_members("log_prob", "entropy")

    @agent.on(Events.STARTED)
    def add_trajectories_to_engine(engine):
        engine.state.trajectories = Trajectories(
            T.WithReturns(discount=discount, norm_returns=norm_returns))

    @agent.on(Events.EPOCH_STARTED)
    def empty_trajectories(engine):
        engine.state.trajectories.clear()

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition(engine):
        engine.state.trajectories.append(engine.state.transition)

    @agent.on(Events.EPOCH_COMPLETED)
    def optimize(engine):
        engine.state.trajectories.terminate_trajectory()
        # The setting is simple enough that using a dataloader is overkill.
        optimizer.zero_grad()
        for transition in engine.state.trajectories:
            loss = -transition.retrn * transition.log_prob
            loss -= exploration * transition.entropy
            loss.backward()

        if grad_norm_clip is not None:
            nn.utils.clip_grad_norm_(policy.parameters(), grad_norm_clip)
        optimizer.step()

    return agent
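To make the expectations on `policy` concrete, here is a toy categorical policy (an assumption, not the library's own model): it returns a distribution exposing the `.sample()`, `.log_prob()` and `.entropy()` calls that `select_action` uses, assuming the Explorer's dtype/device handling delivers tensor observations.

from torch import nn, optim
from torch.distributions import Categorical


class CategoricalPolicy(nn.Module):
    """Toy policy: flat observation -> categorical action distribution."""

    def __init__(self, n_obs, n_actions):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_obs, 64), nn.Tanh(),
                                 nn.Linear(64, n_actions))

    def forward(self, observation):
        return Categorical(logits=self.net(observation))


policy = CategoricalPolicy(n_obs=4, n_actions=2)  # CartPole-sized, for example
agent = create_reinforce(policy, optim.Adam(policy.parameters(), lr=1e-2))
agent.run(make_env(), 500)  # `make_env` is a hypothetical Gym-style env builder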
Example #13
def create_a2c(
    actor_critic: nn.Module,
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    exploration: float = 0.001,
    norm_returns: bool = True,
    critic_loss: Callable = F.mse_loss,
    critic_multiplier: float = 1.0,
    grad_norm_clip: Optional[float] = 1.0,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create an agent using Reinforce learning algorithm.

    Parameters
    ----------
    actor_critic:
        The neural network used to model the policy and critic. Must return a
        tuple (action probability distribution, critic value).
    optimizer:
        The optimizer used to update the `model` parameters.
    discount:
        The discount rate used for computing the returns.
    exploration:
        The entropy bonus for encouraging exploration.
    norm_returns:
        Whether to normalize the returns to zero mean and unit variance.
        Computed over an episode. Raises an error for episodes of length 1.
    critic_loss:
        The loss function used to learn the critic.
    critic_multiplier:
        Multiplier used for the critic loss in the total loss.
    grad_norm_clip:
        Value to clip the norm of the gradient at before applying an update.
    dtype:
        Type the observations/model are cast to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        The ignite engine, exploring the environment and optimizing.

    """
    actor_critic.to(device=device, dtype=dtype)

    def select_action(engine, observation):
        actor_critic.train()
        action_distrib, critic_value = actor_critic(observation)
        action = action_distrib.sample()
        engine.store_transition_members(
            log_prob=action_distrib.log_prob(action),
            entropy=action_distrib.entropy(),
            critic_value=critic_value,
        )
        return action

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)

    agent.register_transition_members("log_prob", "entropy", "critic_value")

    @agent.on(Events.STARTED)
    def add_trajectories_to_engine(engine):
        engine.state.trajectories = Trajectories(
            T.WithReturns(discount=discount, norm_returns=norm_returns))

    @agent.on(Events.EPOCH_STARTED)
    def empty_trajectories(engine):
        engine.state.trajectories.clear()

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition(engine):
        engine.state.trajectories.append(engine.state.transition)

    @agent.on(Events.EPOCH_COMPLETED)
    def optimize(engine):
        engine.state.trajectories.terminate_trajectory()
        # The setting is simple enough that using a dataloader is overkill.
        optimizer.zero_grad()
        for t in engine.state.trajectories:
            loss = -(t.retrn - t.critic_value.detach()) * t.log_prob
            loss -= exploration * t.entropy
            retrn = t.critic_value.new([t.retrn])  # Make tensor on same device
            loss += critic_multiplier * critic_loss(t.critic_value, retrn)
            loss.backward()

        if grad_norm_clip is not None:
            nn.utils.clip_grad_norm_(actor_critic.parameters(), grad_norm_clip)
        optimizer.step()

    return agent
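Similarly, a toy actor-critic module (an assumption, not the library's own model) showing the (distribution, value) pair that `select_action` above unpacks; the critic output keeps shape (1,) so the `critic_loss(t.critic_value, retrn)` call in `optimize` compares tensors of the same shape.

from torch import nn, optim
from torch.distributions import Categorical


class ToyActorCritic(nn.Module):
    """Toy actor-critic: shared body, categorical actor head, scalar critic."""

    def __init__(self, n_obs, n_actions):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(n_obs, 64), nn.Tanh())
        self.actor = nn.Linear(64, n_actions)
        self.critic = nn.Linear(64, 1)

    def forward(self, observation):
        hidden = self.body(observation)
        # Critic value of shape (1,) matches `critic_value.new([t.retrn])`.
        return Categorical(logits=self.actor(hidden)), self.critic(hidden)


model = ToyActorCritic(n_obs=4, n_actions=2)
agent = create_a2c(model, optim.Adam(model.parameters(), lr=7e-4))
agent.run(make_env(), 500)  # `make_env` is a hypothetical Gym-style env builder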
Example #14
def test_explorer_transition_members_info():
    explorer = Explorer(lambda x, y: None)
    explorer.register_transition_members("info_member")
    explorer.run(Env(), 2)
    assert hasattr(explorer.state.transition, "info_member")