Example #1
class ActorCritic(Agent):
    def __init__(self,
                 env: Env,
                 policy_lr: float,
                 value_lr: float,
                 gamma: float = 0.99,
                 value_iter=50,
                 policy_layers=(128, 128),
                 value_layers=(128, 128),
                 verbose=False,
                 save=True,
                 policy_path=None,
                 value_path=None):
        super().__init__(env, verbose, save)
        self.gamma = gamma

        if self.action_space.discrete:
            policy_head = nn.Softmax(dim=-1)
        else:
            policy_head = nn.Tanh()

        self.policy_path = policy_path
        self.value_path = value_path
        self.policy_model = MLP(self.state_space.shape[0],
                                self.action_space.shape[0], policy_layers,
                                policy_head)
        self.value_model = MLP(self.state_space.shape[0], 1, value_layers,
                               None)
        self.policy_optimizer = optim.Adam(self.policy_model.parameters(),
                                           lr=policy_lr)
        self.value_optimizer = optim.Adam(self.value_model.parameters(),
                                          lr=value_lr)
        self.value_loss = nn.MSELoss()
        self.reset()
        self.counter = 0
        self.value_iter = value_iter

    def setup_memory(self) -> None:
        columns = [
            "states", "next_states", "actions", "log_probs", "rewards", "done"
        ]
        self.episode_memory = Memory(columns)
        self.epoch_memory = Memory(columns)

    def act(self, state: List, train=True) -> Tuple:
        state = torch.from_numpy(state).type(torch.FloatTensor)
        action_probs = self.policy_model(state)
        distribution = self.action_space.distribution(action_probs)
        action = distribution.sample()
        if train:
            return action.data.numpy(), distribution.log_prob(action)
        else:
            # Greedy action for evaluation (assumes a discrete action space).
            return torch.argmax(action_probs).data.numpy(),

    def update(self) -> None:
        states, next_states, rewards, cumulated_rewards, log_probs, done = self.epoch_memory.get_columns(
            [
                "states", "next_states", "rewards", "cumulated_rewards",
                "log_probs", "done"
            ])
        # Compute the advantage using the value function from before this update
        with torch.no_grad():
            advantages = torch.Tensor(rewards) + (
                self.gamma * (1 - torch.tensor(done, dtype=int)) *
                self.value_model(torch.Tensor(next_states)).squeeze() -
                self.value_model(torch.Tensor(states)).squeeze())

        # Train the value function for a certain number of iterations
        for _ in range(int(self.value_iter) + 1):
            values = self.value_model(torch.Tensor(states)).squeeze()
            value_loss = self.value_loss(values,
                                         torch.Tensor(cumulated_rewards))
            self.value_optimizer.zero_grad()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1)
            self.value_optimizer.step()
        self.value_iter *= 0.95  # decay the number of value-fitting iterations
        print(f"Value Loss: {value_loss.item()}")
        # Compute the policy loss using the previous value function
        policy_loss = -torch.sum(torch.mul(torch.stack(log_probs),
                                           advantages)) / self.counter
        print(f"Policy Loss: {policy_loss.item()}")
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1)
        self.policy_optimizer.step()
        self.reset()

    def save_model(self) -> None:
        torch.save(self.policy_model.state_dict(), self.policy_path)
        torch.save(self.value_model.state_dict(), self.value_path)

    def load_model(self, policy_path: str, value_path: str) -> None:
        self.policy_model.load_state_dict(torch.load(policy_path))
        self.value_model.load_state_dict(torch.load(value_path))
        self.policy_model.eval()
        self.value_model.eval()

    def setup_schedulers(self, n_epochs: int) -> None:
        policy_scheduler = ExponentialLR(self.policy_optimizer, 0.97)
        value_scheduler = ExponentialLR(self.value_optimizer, 0.97)
        self.schedulers.append(policy_scheduler)
        self.schedulers.append(value_scheduler)

    def cumulate_rewards(self) -> None:
        cumulated_reward = 0
        cumulated_rewards = []
        rewards, = self.episode_memory.get_columns(["rewards"])
        for i in range(len(rewards) - 1, -1, -1):
            cumulated_reward = self.gamma * cumulated_reward + rewards[i]
            cumulated_rewards.append(cumulated_reward)
        self.episode_memory.extend_column("cumulated_rewards",
                                          cumulated_rewards[::-1])
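
For reference, the backward recursion implemented by cumulate_rewards above is G_t = r_t + gamma * G_{t+1}. The standalone sketch below is not part of the original example; it simply reproduces that recursion so the discounting can be checked in isolation.

# Standalone illustration of the discounted-return recursion used by
# cumulate_rewards above; this helper is illustrative only and is not
# defined anywhere in the original example.
def discounted_returns(rewards, gamma=0.99):
    returns = []
    running = 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return returns[::-1]

# For example, discounted_returns([1, 1, 1], gamma=0.5) == [1.75, 1.5, 1.0].
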
class PolicyGradient(Agent):
    def __init__(self,
                 env: Env,
                 lr: float,
                 gamma: float = 0.99,
                 layers=(128, 128),
                 verbose=False,
                 model_path=None,
                 save=False):
        super().__init__(env, verbose, save)
        self.gamma = gamma
        self.model_path = model_path
        if self.action_space.discrete:
            head = nn.Softmax(dim=-1)
        else:
            head = nn.Tanh()

        self.model = MLP(self.state_space.shape[0], self.action_space.shape[0],
                         layers, head)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.reset()

    def setup_memory(self) -> None:
        columns = ["states", "next_states", "actions", "log_probs", "rewards"]
        self.episode_memory = Memory(columns)
        self.epoch_memory = Memory(columns)

    def act(self, state: List, train: bool = True) -> Tuple:
        state = torch.from_numpy(state).type(torch.FloatTensor)
        action_probs = self.model(state)

        distribution = self.action_space.distribution(action_probs)
        action = distribution.sample()
        if train:
            return action.data.numpy(), distribution.log_prob(action)
        else:
            return torch.argmax(action_probs).data.numpy(),

    def update(self) -> None:
        self.optimizer.zero_grad()
        loss, = self.epoch_memory.get_columns(["loss"])
        loss = torch.mean(torch.stack(loss))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()
        print(f"Value Loss: {loss.item()}")
        self.reset()

    def save_model(self) -> None:
        torch.save(self.model.state_dict(), self.model_path)

    def load_model(self, model_path: str) -> None:
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()

    def setup_schedulers(self, n_epochs: int) -> None:
        scheduler = CosineAnnealingLR(self.optimizer, n_epochs)
        self.schedulers.append(scheduler)

    def cumulate_rewards(self) -> None:
        cumulated_reward = 0
        cumulated_rewards = []
        rewards, log_probs = self.episode_memory.get_columns(
            ["rewards", "log_probs"])
        for i in range(len(rewards) - 1, -1, -1):
            cumulated_reward = self.gamma * cumulated_reward + rewards[i]
            cumulated_rewards.append(cumulated_reward)

        cumulated_rewards = cumulated_rewards[::-1]
        loss = -torch.sum(
            torch.mul(torch.stack(log_probs), torch.Tensor(cumulated_rewards)))
        self.episode_memory.append_column("loss", loss)
        self.episode_memory.extend_column("cumulated_rewards",
                                          cumulated_rewards)
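
A minimal usage sketch for the two agents follows. It assumes a Gym-style environment handle exposed through this repo's Env class and a training loop provided by the Agent base class; the "CartPole-v1" id, the Env constructor, and the train(n_epochs=...) call are illustrative assumptions, not confirmed by this example.

# Hypothetical usage -- Env, the "CartPole-v1" id, and Agent.train() are
# assumed here and may differ in the actual repository.
env = Env("CartPole-v1")

pg_agent = PolicyGradient(env, lr=1e-3, gamma=0.99,
                          model_path="pg.pt", save=True)
pg_agent.train(n_epochs=100)   # assumed base-class training loop
pg_agent.save_model()

ac_agent = ActorCritic(env,
                       policy_lr=3e-4,
                       value_lr=1e-3,
                       policy_path="policy.pt",
                       value_path="value.pt")
ac_agent.train(n_epochs=100)   # assumed base-class training loop
ac_agent.save_model()
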