Example #1
class DDPGHedgingAgent:
    """DDPGAgent interacting with environment.
    
    Attribute:
        env (gym.Env): openAI Gym environment
        actor (nn.Module): target actor model to select actions
        actor_target (nn.Module): actor model to predict next actions
        actor_optimizer (Optimizer): optimizer for training actor
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        critic_optimizer (Optimizer): optimizer for training critic
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        gamma (float): discount factor
        tau (float): parameter for soft target update
        initial_random_episode (int): number of initial episodes filled with random actions
        noise (OUNoise): noise generator for exploration
        device (torch.device): cpu / gpu
        transition (list): temporary storage for the most recent transition
        total_step (int): total number of steps taken
        is_test (bool): flag to show the current mode (train / test)
    """
    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 10000,
                 name_cases='myproject'):
        """ Initialize. """

        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0
        # mode: train / test
        self.is_test = False
        self.populate(self.initial_random_episode)

    def populate(self, eps: int = 100) -> None:
        """Carry out several episodes of random actions to initially fill
        the replay buffer with experiences.

        Args:
            eps: number of random episodes used to populate the buffer
        """

        if not self.is_test:
            print("Populate Replay Buffer... ")
            kbar = pkbar.Kbar(target=eps, width=20)
            state = self.env.reset()

            for i in range(eps):
                while True:
                    # Get action from sample space
                    selected_action = self.env.action_space.sample()
                    # selected_action = 0
                    noise = self.noise.sample()
                    selected_action = np.clip(selected_action + noise, -1.0,
                                              1.0)

                    next_state, reward, done, _ = self.env.step(
                        selected_action)
                    self.transition = [
                        state, selected_action, reward, next_state,
                        int(done)
                    ]
                    self.memory.append(Experience(*self.transition))

                    state = next_state
                    if done:
                        state = self.env.reset()
                        break

                kbar.add(1)

            # fit the state scaler on the collected transitions;
            # select_action() and update_model() rely on self.scaler
            self.scaler = self.memory.standar_scaler()

    @torch.no_grad()
    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        state_s = self.scaler.transform([state])
        selected_action = self.actor(
            torch.FloatTensor(state_s).to(self.device)).item()
        # add noise for exploration during training
        if not self.is_test:
            noise = self.noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        self.transition = [state, selected_action]
        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, int(done)]
            self.memory.append(Experience(*self.transition))

        return next_state, reward, done

    def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Update the model by gradient descent.

        Note: the actor loss could be swapped for a mean-variance
        objective (see the commented-out alternative below).
        """
        device = self.device  # for shortening the following lines

        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size, self.device)

        state = torch.FloatTensor(self.scaler.transform(state)).to(device)
        next_state = torch.FloatTensor(
            self.scaler.transform(next_state)).to(device)
        # state = state.to(device)
        # next_state = next_state.to(device)
        action = action.to(device)
        reward = reward.to(device)
        done = done.to(device)

        masks = 1 - done
        next_action = self.actor_target(next_state)
        next_value = self.critic_target(next_state, next_action)
        curr_return = reward.reshape(
            -1, 1) + self.gamma * next_value * masks.reshape(-1, 1)

        # train critic
        values = self.critic(state, action)
        critic_loss = F.mse_loss(values, curr_return)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.critic.parameters():
            p.requires_grad = False

        # train actor
        q_values = self.critic(state, self.actor(state))
        actor_loss = -q_values.mean()
        # actor_loss = 0.5 * q_values.std() ** 2

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for p in self.critic.parameters():
            p.requires_grad = True

        # target update
        self._target_soft_update()

        return actor_loss.data, critic_loss.data

    def train(self, num_frames: int, plotting_interval: int = 200):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        actor_losses = []
        critic_losses = []
        scores = []
        score = 0

        print("Training...")
        kbar = pkbar.Kbar(target=num_frames, width=20)

        for self.total_step in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # if episode ends
            if done:
                state = self.env.reset()
                scores.append(score)
                score = 0

                self._plot(
                    self.total_step,
                    scores,
                    actor_losses,
                    critic_losses,
                )

            # if training is ready
            if len(self.memory) >= self.batch_size:
                actor_loss, critic_loss = self.update_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            kbar.add(1)

        self.env.close()

    def test(self):
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

        self.env.close()

        return score

    def _target_soft_update(self):
        """Soft-update: target = tau*local + (1-tau)*target."""
        tau = self.tau

        for t_param, l_param in zip(self.actor_target.parameters(),
                                    self.actor.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

        for t_param, l_param in zip(self.critic_target.parameters(),
                                    self.critic.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

    def _plot(
        self,
        frame_idx: int,
        scores: List[float],
        actor_losses: List[float],
        critic_losses: List[float],
    ):
        """Plot the training progresses."""

        self.wandb.log({
            'frame': frame_idx,
            'score': scores[-1],
            'actor_loss': actor_losses[-1],
            'critic_loss': critic_losses[-1]
        })
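
A minimal usage sketch for Example #1, assuming the Actor, Critic, ReplayBuffer, OUNoise and Experience classes referenced above are importable and a Gym-style hedging environment is available; the environment id "HedgingEnv-v0" and every hyperparameter value below are illustrative only:

import gym

env = gym.make("HedgingEnv-v0")  # hypothetical hedging environment id
agent = DDPGHedgingAgent(
    env,
    memory_size=100000,
    batch_size=128,
    ou_noise_theta=0.15,
    ou_noise_sigma=0.2,
    initial_random_episode=100,  # kept small for a quick smoke test
    name_cases="ddpg-hedging-demo",
)
agent.train(num_frames=50000)
print("test episode score:", agent.test())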
Example #2
class DDPG:
    def __init__(self,
                 n_state,
                 n_action,
                 a_limit,
                 model_folder=None,
                 memory_size=10000,
                 batch_size=32,
                 tau=0.01,
                 gamma=0.99,
                 var=3.0):
        # Record the parameters
        self.n_state = n_state
        self.n_action = n_action
        self.a_limit = a_limit
        self.memory_size = memory_size
        self.model_folder = model_folder
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.var = var

        # Create the network and related objects
        self.memory = np.zeros(
            [self.memory_size, 2 * self.n_state + self.n_action + 1],
            dtype=np.float32)
        self.memory_counter = 0
        self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
        self.eval_critic = Critic(self.n_state, self.n_action)
        self.target_actor = Actor(self.n_state,
                                  self.n_action,
                                  self.a_limit,
                                  trainable=False)
        self.target_critic = Critic(self.n_state,
                                    self.n_action,
                                    trainable=False)

        self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
        self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
        self.criterion = nn.MSELoss()

        # Make sure the parameters of the target networks match the evaluation networks
        self.hardCopy()

    def load(self):
        if os.path.exists(self.model_folder):
            self.eval_actor.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'actor.pth')))
            self.eval_critic.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'critic.pth')))
        self.hardCopy()

    def save(self):
        if not os.path.exists(self.model_folder):
            os.mkdir(self.model_folder)
        torch.save(self.eval_actor.state_dict(),
                   os.path.join(self.model_folder, 'actor.pth'))
        torch.save(self.eval_critic.state_dict(),
                   os.path.join(self.model_folder, 'critic.pth'))

    def chooseAction(self, s):
        """
            Given an input state, use the evaluation actor to output a
            real-valued (continuous) action.
        """
        s = to_var(s)
        a = self.eval_actor(s)
        a = a.cpu().data.numpy()
        if self.var > 0:
            a = np.clip(np.random.normal(a, self.var), -2, 2)
        return a

    def store_path(self, s, a, r, s_):
        """
            Store the information of one state transition (s, a, r, s_).
        """
        transition = np.hstack((s, a, [r], s_))
        idx = self.memory_counter % self.memory_size
        self.memory[idx, :] = transition
        self.memory_counter += 1

    def softCopy(self):
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_((1.0 - self.tau) * ta.data + self.tau * ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_((1.0 - self.tau) * tc.data + self.tau * ec.data)

    def hardCopy(self):
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_(ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_(ec.data)

    def update(self):
        # Do not update if too few transitions have been stored
        if self.memory_counter <= 5000:
            return

        # Softly copy the evaluation network parameters into the target networks
        self.softCopy()

        # Choose the indices of the input batch data
        if self.memory_counter > self.memory_size:
            sample_idx = np.random.choice(self.memory_size,
                                          size=self.batch_size)
        else:
            sample_idx = np.random.choice(self.memory_counter,
                                          size=self.batch_size)

        # Fetch the training data from the replay memory
        batch_data = self.memory[sample_idx, :]
        batch_s = batch_data[:, :self.n_state]
        batch_a = batch_data[:, self.n_state:self.n_state + self.n_action]
        batch_r = batch_data[:, -self.n_state - 1:-self.n_state]
        batch_s_ = batch_data[:, -self.n_state:]

        # Convert to PyTorch tensors
        batch_s = to_var(batch_s)
        batch_a = to_var(batch_a)
        batch_r = to_var(batch_r)
        batch_s_ = to_var(batch_s_)

        # Compute the target Q values with the target networks
        next_q_target = self.target_critic(batch_s_,
                                           self.target_actor(batch_s_))
        q_target = batch_r + self.gamma * next_q_target

        # Update the critic
        self.critic_optimizer.zero_grad()
        q_batch = self.eval_critic(batch_s, batch_a)
        value_loss = F.mse_loss(input=q_batch, target=q_target)
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor
        self.actor_optimizer.zero_grad()
        policy_loss = -self.eval_critic(batch_s,
                                        self.eval_actor(batch_s)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Decay the exploration noise scale
        self.var *= .9995
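
A minimal usage sketch for Example #2, assuming the Actor, Critic and to_var helpers come from the same module as the DDPG class; Pendulum-v0 (the old Gym API, where env.step returns four values as this class expects) is used only as an example of a bounded continuous-control task:

import gym

env = gym.make("Pendulum-v0")
n_state = env.observation_space.shape[0]
n_action = env.action_space.shape[0]
a_limit = float(env.action_space.high[0])

agent = DDPG(n_state, n_action, a_limit, model_folder="./ddpg_ckpt")

for episode in range(200):
    s = env.reset()
    done = False
    while not done:
        a = agent.chooseAction(s)      # noisy action for exploration
        s_, r, done, _ = env.step(a)
        agent.store_path(s, a, r, s_)  # append the transition to the buffer
        agent.update()                 # no-op until 5000 transitions are stored
        s = s_

agent.save()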