Example No. 1
def test_replay_buffer_get_state_with_data():
    # Assign
    batch_size = 10
    buffer_size = 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)

    for (state, action, reward, next_state,
         done) in generate_sample_SARS(buffer_size + 1):
        buffer.add(state=state,
                   action=action,
                   reward=reward,
                   next_state=next_state,
                   done=done)

    # Act
    state: BufferState = buffer.get_state()
    state_data: BufferState = buffer.get_state(include_data=True)

    # Assert
    assert state == state_data, "Default option is to include all data"
    assert state.type == ReplayBuffer.type
    assert state.batch_size == batch_size
    assert state.buffer_size == buffer_size
    assert len(state.data) == buffer_size

    for d in state.data:
        b_keys = ("state", "action", "reward", "done", "next_state")
        assert all([k in b_keys for k in d.get_dict().keys()])
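
The tests above iterate over a `generate_sample_SARS` fixture that is not part of this listing. A minimal sketch of what such a helper could look like, assuming list-valued states and actions are acceptable to `ReplayBuffer.add`, is:

import random

def generate_sample_SARS(num_samples, state_size=4, action_size=2):
    """Hypothetical fixture: yield random (state, action, reward, next_state, done) tuples."""
    state = [random.random() for _ in range(state_size)]
    for _ in range(num_samples):
        next_state = [random.random() for _ in range(state_size)]
        action = [random.random() for _ in range(action_size)]
        reward = random.random()
        done = random.random() < 0.1
        yield state, action, reward, next_state, done
        state = next_state
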
Example No. 2
def test_replay_buffer_from_state_wrong_type():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    state = buffer.get_state()
    state.type = "WrongType"

    # Act
    with pytest.raises(ValueError):
        ReplayBuffer.from_state(state=state)
Example No. 3
def test_replay_buffer_from_state_without_data():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    state = buffer.get_state()

    # Act
    new_buffer = ReplayBuffer.from_state(state=state)

    # Assert
    assert new_buffer == buffer
    assert new_buffer.buffer_size == state.buffer_size
    assert new_buffer.batch_size == state.batch_size
    assert new_buffer.data == []
Example No. 4
def test_replay_buffer_get_state_empty():
    # Assign
    batch_size = 10
    buffer_size = 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)

    # Act
    state: BufferState = buffer.get_state()

    # Assert
    assert state.type == ReplayBuffer.type
    assert state.batch_size == batch_size
    assert state.buffer_size == buffer_size
    assert state.data is None
Example No. 5
def test_replay_buffer_from_state_with_data():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    buffer = populate_buffer(buffer, 30)
    state = buffer.get_state()

    # Act
    new_buffer = ReplayBuffer.from_state(state=state)

    # Assert
    assert new_buffer == buffer
    assert new_buffer.buffer_size == state.buffer_size
    assert new_buffer.batch_size == state.batch_size
    assert new_buffer.data == state.data
    assert len(buffer.data) == state.buffer_size
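
Example No. 5 also relies on a `populate_buffer` helper that is not shown here. A plausible sketch, reusing the hypothetical `generate_sample_SARS` above and the `add` signature used throughout these tests:

def populate_buffer(buffer, num_samples):
    """Hypothetical fixture: fill `buffer` with generated transitions and return it."""
    for state, action, reward, next_state, done in generate_sample_SARS(num_samples):
        buffer.add(state=state, action=action, reward=reward,
                   next_state=next_state, done=done)
    return buffer
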
Example No. 6
def test_replay_buffer_get_state_without_data():
    # Assign
    batch_size = 10
    buffer_size = 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)

    for (state, action, reward, next_state,
         done) in generate_sample_SARS(buffer_size + 1):
        buffer.add(state=state,
                   action=action,
                   reward=reward,
                   next_state=next_state,
                   done=done)

    # Act
    state: BufferState = buffer.get_state(include_data=False)

    # Assert
    assert state.type == ReplayBuffer.type
    assert state.batch_size == batch_size
    assert state.buffer_size == buffer_size
    assert state.data is None
Example No. 7
class DDPGAgent(AgentBase):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of the popular Ornstein-Uhlenbeck (OU) process, this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 actor_lr: float = 2e-3,
                 critic_lr: float = 2e-3,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)
        self.state_size = state_size
        self.action_size = action_size

        # Reason sequence initiation.
        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (128, 128)))
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=hidden_layers,
                               gate_out=torch.tanh).to(self.device)
        self.critic = CriticBody(state_size,
                                 action_size,
                                 hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size,
                                      action_size,
                                      hidden_layers=hidden_layers,
                                      gate_out=torch.tanh).to(self.device)
        self.target_critic = CriticBody(state_size,
                                        action_size,
                                        hidden_layers=hidden_layers).to(
                                            self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size, ),
                                   mu=1e-8,
                                   sigma=noise_sigma,
                                   scale=noise_scale,
                                   device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_lr = float(
            self._register_param(kwargs, 'actor_lr', actor_lr))
        self.critic_lr = float(
            self._register_param(kwargs, 'critic_lr', critic_lr))
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.max_grad_norm_actor = float(
            self._register_param(kwargs, "max_grad_norm_actor", 10.0))
        self.max_grad_norm_critic = float(
            self._register_param(kwargs, "max_grad_norm_critic", 10.0))
        self.action_min = float(self._register_param(kwargs, 'action_min', -1))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1))
        self.action_scale = float(
            self._register_param(kwargs, 'action_scale', 1))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))

        # Breathe, my child.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    def reset_agent(self) -> None:
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    @property
    def loss(self) -> Dict[str, float]:
        return {'actor': self._loss_actor, 'critic': self._loss_critic}

    @loss.setter
    def loss(self, value):
        if isinstance(value, dict):
            self._loss_actor = value['actor']
            self._loss_critic = value['critic']
        else:
            self._loss_actor = value
            self._loss_critic = value

    @torch.no_grad()
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        Returns:
            action: (list float) Action values.
        """
        obs = to_tensor(obs).float().to(self.device)
        action = self.actor(obs)
        action += noise * self.noise.sample()
        action = torch.clamp(action * self.action_scale, self.action_min,
                             self.action_max)
        return action.cpu().numpy().tolist()

    def step(self, state, action, reward, next_state, done) -> None:
        self.iteration += 1
        self.buffer.add(state=state,
                        action=action,
                        reward=reward,
                        next_state=next_state,
                        done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration %
                                                   self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample())

    def compute_value_loss(self, states, actions, next_states, rewards, dones):
        next_actions = self.target_actor.act(next_states)
        assert next_actions.shape == actions.shape
        Q_target_next = self.target_critic.act(next_states, next_actions)
        Q_target = rewards + self.gamma * Q_target_next * (1 - dones)
        Q_expected = self.critic(states, actions)
        assert Q_expected.shape == Q_target.shape == Q_target_next.shape
        return mse_loss(Q_expected, Q_target)

    def compute_policy_loss(self, states) -> torch.Tensor:
        """Compute Policy loss based on provided states.

        Loss = Mean(-Q(s, _a) ),
        where _a is actor's estimate based on state, _a = Actor(s).
        """
        pred_actions = self.actor(states)
        return -self.critic(states, pred_actions).mean()

    def learn(self, experiences) -> None:
        """Update critics and actors"""
        rewards = to_tensor(experiences['reward']).float().to(
            self.device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(
            self.device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(
            self.device)
        assert rewards.shape == dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size,
                                                     self.state_size)
        assert actions.shape == (self.batch_size, self.action_size)

        # Value (critic) optimization
        loss_critic = self.compute_value_loss(states, actions, next_states,
                                              rewards, dones)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(),
                                 self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.item())

        # Policy (actor) optimization
        loss_actor = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(),
                                 self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = loss_actor.item()

        # Soft update target weights
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def state_dict(self) -> Dict[str, dict]:
        """Describes agent's networks.

        Returns:
            state: (dict) Provides actors and critics states.

        """
        return {
            "actor": self.actor.state_dict(),
            "target_actor": self.target_actor.state_dict(),
            "critic": self.critic.state_dict(),
            "target_critic": self.target_critic.state_dict()
        }

    def log_metrics(self,
                    data_logger: DataLogger,
                    step: int,
                    full_log: bool = False):
        data_logger.log_value("loss/actor", self._loss_actor, step)
        data_logger.log_value("loss/critic", self._loss_critic, step)

        if full_log:
            for idx, layer in enumerate(self.actor.layers):
                if hasattr(layer, "weight"):
                    data_logger.create_histogram(f"actor/layer_weights_{idx}",
                                                 layer.weight, step)
                if hasattr(layer, "bias") and layer.bias is not None:
                    data_logger.create_histogram(f"actor/layer_bias_{idx}",
                                                 layer.bias, step)

            for idx, layer in enumerate(self.critic.layers):
                if hasattr(layer, "weight"):
                    data_logger.create_histogram(f"critic/layer_weights_{idx}",
                                                 layer.weight, step)
                if hasattr(layer, "bias") and layer.bias is not None:
                    data_logger.create_histogram(f"critic/layer_bias_{idx}",
                                                 layer.bias, step)

    def get_state(self) -> AgentState:
        net = dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
        )
        network_state: NetworkState = NetworkState(net=net)
        return AgentState(model=self.name,
                          state_space=self.state_size,
                          action_space=self.action_size,
                          config=self._config,
                          buffer=self.buffer.get_state(),
                          network=network_state)

    def save_state(self, path: str) -> None:
        agent_state = self.get_state()
        torch.save(agent_state, path)

    def load_state(self,
                   *,
                   path: Optional[str] = None,
                   agent_state: Optional[dict] = None):
        if path is None and agent_state is None:
            raise ValueError(
                "Either `path` or `agent_state` must be provided to load agent's state."
            )
        if path is not None and agent_state is None:
            agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)

        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
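
This agent is typically driven by an external interaction loop. A minimal, self-contained sketch of such a loop is shown below; the `_DummyEnv` class is a made-up stand-in for a Gym-style continuous-control environment and is not part of the listing above.

import random

class _DummyEnv:
    """Toy stand-in for a Gym-style environment (illustration only)."""

    def __init__(self, state_size=4):
        self.state_size = state_size

    def reset(self):
        return [random.random() for _ in range(self.state_size)]

    def step(self, action):
        next_state = [random.random() for _ in range(self.state_size)]
        reward = random.random()
        done = random.random() < 0.05
        return next_state, reward, done, {}

env = _DummyEnv(state_size=4)
agent = DDPGAgent(state_size=4, action_size=2, warm_up=64, update_freq=2)
state = env.reset()
for _ in range(500):
    action = agent.act(state, noise=0.1)                 # policy action plus scaled Gaussian noise
    next_state, reward, done, _ = env.step(action)       # Gym-style 4-tuple from the dummy env
    agent.step(state, action, reward, next_state, done)  # store transition; learn when due
    state = env.reset() if done else next_state
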
Example No. 8
class DDPGAgent(AgentBase):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of the popular Ornstein-Uhlenbeck (OU) process, this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 **kwargs):
        """
        Parameters:
            state_size: Number of input dimensions.
            action_size: Number of output dimensions.
            noise_scale (float): Added noise amplitude. Default: 0.2.
            noise_sigma (float): Added noise variance. Default: 0.1.

        Keyword parameters:
            hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (64, 64).
            gamma (float): Discount value. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.02.
            actor_lr (float): Learning rate for the actor (policy). Default: 0.0003.
            critic_lr (float): Learning rate for the critic (value function). Default: 0.0003.
            max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 10.
            max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 10.
            batch_size (int): Number of samples used in learning. Default: 64.
            buffer_size (int): Maximum number of samples to store. Default: 1e6.
            warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
            update_freq (int): Number of steps between each learning step. Default: 1.
            number_updates (int): How many times to use learning step in the learning phase. Default: 1.
            action_min (float): Minimum returned action value. Default: -1.
            action_max (float): Maximum returned action value. Default: 1.
            action_scale (float): Multiplier value for action. Default: 1.

        """
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)
        self.state_size = state_size
        self.action_size = action_size

        # Reason sequence initiation.
        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (64, 64)))
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=hidden_layers,
                               gate_out=torch.tanh).to(self.device)
        self.critic = CriticBody(state_size,
                                 action_size,
                                 hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size,
                                      action_size,
                                      hidden_layers=hidden_layers,
                                      gate_out=torch.tanh).to(self.device)
        self.target_critic = CriticBody(state_size,
                                        action_size,
                                        hidden_layers=hidden_layers).to(
                                            self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size, ),
                                   mu=1e-8,
                                   sigma=noise_sigma,
                                   scale=noise_scale,
                                   device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
        self.critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.max_grad_norm_actor = float(
            self._register_param(kwargs, "max_grad_norm_actor", 10.0))
        self.max_grad_norm_critic = float(
            self._register_param(kwargs, "max_grad_norm_critic", 10.0))
        self.action_min = float(self._register_param(kwargs, 'action_min', -1))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1))
        self.action_scale = float(
            self._register_param(kwargs, 'action_scale', 1))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))

        # Breathe, my child.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    def reset_agent(self) -> None:
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    @property
    def loss(self) -> Dict[str, float]:
        return {'actor': self._loss_actor, 'critic': self._loss_critic}

    @loss.setter
    def loss(self, value):
        if isinstance(value, dict):
            self._loss_actor = value['actor']
            self._loss_critic = value['critic']
        else:
            self._loss_actor = value
            self._loss_critic = value

    def __eq__(self, o: object) -> bool:
        return super().__eq__(o) \
            and self._config == o._config \
            and self.buffer == o.buffer \
            and self.get_network_state() == o.get_network_state()

    @torch.no_grad()
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        Returns:
            action: (list float) Action values.
        """
        obs = to_tensor(obs).float().to(self.device)
        action = self.actor(obs)
        action += noise * self.noise.sample()
        action = torch.clamp(action * self.action_scale, self.action_min,
                             self.action_max)
        return action.cpu().numpy().tolist()

    def step(self, state, action, reward, next_state, done) -> None:
        self.iteration += 1
        self.buffer.add(state=state,
                        action=action,
                        reward=reward,
                        next_state=next_state,
                        done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration %
                                                   self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample())

    def compute_value_loss(self, states, actions, next_states, rewards, dones):
        next_actions = self.target_actor.act(next_states)
        assert next_actions.shape == actions.shape
        Q_target_next = self.target_critic.act(next_states, next_actions)
        Q_target = rewards + self.gamma * Q_target_next * (1 - dones)
        Q_expected = self.critic(states, actions)
        assert Q_expected.shape == Q_target.shape == Q_target_next.shape
        return mse_loss(Q_expected, Q_target)

    def compute_policy_loss(self, states) -> torch.Tensor:
        """Compute Policy loss based on provided states.

        Loss = Mean(-Q(s, _a) ),
        where _a is actor's estimate based on state, _a = Actor(s).
        """
        pred_actions = self.actor(states)
        return -self.critic(states, pred_actions).mean()

    def learn(self, experiences) -> None:
        """Update critics and actors"""
        rewards = to_tensor(experiences['reward']).float().to(
            self.device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(
            self.device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(
            self.device)
        assert rewards.shape == dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size,
                                                     self.state_size)
        assert actions.shape == (self.batch_size, self.action_size)

        # Value (critic) optimization
        loss_critic = self.compute_value_loss(states, actions, next_states,
                                              rewards, dones)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(),
                                 self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.item())

        # Policy (actor) optimization
        loss_actor = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(),
                                 self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = loss_actor.item()

        # Soft update target weights
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def state_dict(self) -> Dict[str, dict]:
        """Describes agent's networks.

        Returns:
            state: (dict) Provides actors and critics states.

        """
        return {
            "actor": self.actor.state_dict(),
            "target_actor": self.target_actor.state_dict(),
            "critic": self.critic.state_dict(),
            "target_critic": self.target_critic.state_dict()
        }

    def log_metrics(self,
                    data_logger: DataLogger,
                    step: int,
                    full_log: bool = False):
        data_logger.log_value("loss/actor", self._loss_actor, step)
        data_logger.log_value("loss/critic", self._loss_critic, step)

        if full_log:
            for idx, layer in enumerate(self.actor.layers):
                if hasattr(layer, "weight"):
                    data_logger.create_histogram(f"actor/layer_weights_{idx}",
                                                 layer.weight, step)
                if hasattr(layer, "bias") and layer.bias is not None:
                    data_logger.create_histogram(f"actor/layer_bias_{idx}",
                                                 layer.bias, step)

            for idx, layer in enumerate(self.critic.layers):
                if hasattr(layer, "weight"):
                    data_logger.create_histogram(f"critic/layer_weights_{idx}",
                                                 layer.weight, step)
                if hasattr(layer, "bias") and layer.bias is not None:
                    data_logger.create_histogram(f"critic/layer_bias_{idx}",
                                                 layer.bias, step)

    def get_state(self) -> AgentState:
        return AgentState(
            model=self.name,
            state_space=self.state_size,
            action_space=self.action_size,
            config=self._config,
            buffer=copy.deepcopy(self.buffer.get_state()),
            network=copy.deepcopy(self.get_network_state()),
        )

    def get_network_state(self) -> NetworkState:
        net = dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
        )
        return NetworkState(net=net)

    @staticmethod
    def from_state(state: AgentState) -> AgentBase:
        config = copy.copy(state.config)
        config.update({
            'state_size': state.state_space,
            'action_size': state.action_space
        })
        agent = DDPGAgent(**config)
        if state.network is not None:
            agent.set_network(state.network)
        if state.buffer is not None:
            agent.set_buffer(state.buffer)
        return agent

    def set_buffer(self, buffer_state: BufferState) -> None:
        self.buffer = BufferFactory.from_state(buffer_state)

    def set_network(self, network_state: NetworkState) -> None:
        self.actor.load_state_dict(copy.deepcopy(network_state.net['actor']))
        self.target_actor.load_state_dict(network_state.net['target_actor'])
        self.critic.load_state_dict(network_state.net['critic'])
        self.target_critic.load_state_dict(network_state.net['target_critic'])

    def save_state(self, path: str) -> None:
        agent_state = self.get_state()
        torch.save(agent_state, path)

    def load_state(self,
                   *,
                   path: Optional[str] = None,
                   agent_state: Optional[dict] = None):
        if path is None and agent_state is None:
            raise ValueError(
                "Either `path` or `agent_state` must be provided to load agent's state."
            )
        if path is not None and agent_state is None:
            agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)

        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
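
Example No. 8 adds `get_state`/`from_state`, so the whole agent (configuration, replay buffer, and network weights) can be snapshotted and rebuilt. A minimal round-trip sketch, assuming the class and its dependencies are importable as in the listing above:

agent = DDPGAgent(state_size=4, action_size=2, batch_size=32)
# ... interact with an environment here so the buffer and networks hold real data ...
snapshot = agent.get_state()               # AgentState: config, replay buffer, network weights
restored = DDPGAgent.from_state(snapshot)  # rebuild an equivalent agent from the snapshot
assert restored == agent                   # __eq__ compares config, buffer, and network state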