class Agent:
    """SAC-style agent with twin critics, a value network and a target value network.

    The agent owns a replay buffer, one stochastic actor, two critics, a
    state-value network and a slowly-tracking target copy of that value
    network.  Actions are assumed to be scaled to ``[-1, 1]``.
    """

    def __init__(
        self,
        input_dims,
        n_actions,
        layer_sizes,
        act_lr=0.00003,
        crt_lr=0.0003,
        gamma=0.99,
        max_size=1000000,
        tau=0.005,
        batch_size=64,
        reward_scale=1,
        name='sac',
        chkpt_dir='tmp/ddpg',
        layerNorm=True,
    ):
        """Build the replay buffer and all networks, then hard-sync the target.

        A higher reward scale gives more weight to rewards rather than entropy.

        Args:
            input_dims: Observation dimensions, forwarded to the replay buffer
                and to every network.
            n_actions: Number of action components.
            layer_sizes: Layer sizes forwarded to every network.
            act_lr: Learning rate handed to the actor network.
            crt_lr: Learning rate handed to the critics and value networks.
            gamma: Discount factor used in the critic target.
            max_size: First argument of ``ReplayBuffer`` (presumably its
                capacity -- confirm against ``ReplayBuffer``).
            tau: Default blending weight for the target-value update.
            batch_size: Number of transitions sampled per ``learn`` call.
            reward_scale: Multiplier applied to rewards in the critic target.
            name: Suffix appended to every network's name.
            chkpt_dir: Checkpoint directory forwarded to every network.
            layerNorm: Flag forwarded to every network.
        """
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.input_dims = input_dims
        self.n_actions = n_actions
        # The env action was scaled to [-1, 1]
        self.max_action = np.ones(self.n_actions)
        # Cannot use env.action_space.high, because env.action_space.high is not real action space
        self.layer_sizes = layer_sizes
        self.layerNorm = layerNorm

        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  self.max_action,
                                  fc_dims=self.layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)

        self.critic_1 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      self.layer_sizes,
                                      name='critic1_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=self.layerNorm)
        self.critic_2 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      self.layer_sizes,
                                      name='critic2_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=self.layerNorm)

        self.value = ValueNetwork(crt_lr,
                                  self.input_dims,
                                  self.layer_sizes,
                                  name='value_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)
        self.target_value = ValueNetwork(crt_lr,
                                         self.input_dims,
                                         self.layer_sizes,
                                         name='target_value_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=self.layerNorm)

        self.scale = reward_scale
        # tau=1 is used to start the target as a copy of the value network.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Args:
            observation: One observation (array-like); a leading batch axis of
                size one is added before it is handed to the actor.

        Returns:
            numpy.ndarray: The sampled action with the batch axis removed.
        """
        # Convert through one ndarray instead of a Python list of arrays,
        # which PyTorch converts slowly (and warns about).
        state = T.tensor(np.asarray(observation), dtype=T.float,
                         device=self.actor.device).unsqueeze(0)
        # Acting never needs gradients.
        with T.no_grad():
            actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        """Move the target value network towards the value network.

        Args:
            tau: Blending weight handed to
                ``update_single_target_network_parameters``; ``self.tau`` is
                used when omitted.
        """
        blend = self.tau if tau is None else tau

        updated_value = update_single_target_network_parameters(
            self.value, self.target_value, blend)

        self.target_value.load_state_dict(updated_value)

    def save_models(self):
        """Checkpoint the actor, the value network and both critics.

        The target value network is not saved.
        """
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        """Restore the actor, value network and critics, then re-sync the target.

        The target value network is never checkpointed, so after loading it is
        reset from the freshly loaded value network; otherwise it would keep
        whatever unrelated weights it had before the load.
        """
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        # Same tau=1 call that __init__ uses to initialise the target.
        self.update_network_parameters(tau=1)

    def learn(self):
        """Run one update of the value network, both critics and the actor.

        Returns:
            ``None`` while the replay buffer holds fewer than ``batch_size``
            transitions; otherwise ``(critic_loss, actor_loss)`` as floats.
        """
        if self.memory.mem_cntr < self.batch_size:
            return None

        # The done flags are sampled but deliberately unused (see below).
        state, action, reward, new_state, _ = \
            self.memory.sample_buffer(self.batch_size)

        device = self.actor.device
        reward = T.tensor(reward, dtype=T.float).to(device)
        state_ = T.tensor(new_state, dtype=T.float).to(device)
        state = T.tensor(state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)

        # ---- Update the value network ----
        self.value.optimizer.zero_grad()
        value = self.value.forward(state).view(-1)

        # The regression target is a constant for this step: build it without
        # autograd so no gradient leaks into the critics or the actor.
        with T.no_grad():
            # Use the action from the current policy, rather than the one
            # stored in the buffer.
            sampled_actions, sampled_log_probs = self.actor.sample_normal(
                state, reparameterize=False)
            q1_sampled = self.critic_1.forward(state, sampled_actions).view(-1)
            q2_sampled = self.critic_2.forward(state, sampled_actions).view(-1)
            # - log_probs is entropy
            value_target = T.min(q1_sampled, q2_sampled) \
                - sampled_log_probs.view(-1)

        value_loss = F.mse_loss(value, value_target)
        value_loss.backward()
        self.value.optimizer.step()

        # ---- Update the critic networks ----
        with T.no_grad():
            value_ = self.target_value.forward(state_).view(-1)
            # value_[done] = 0.0    # In building context, terminal state does not have 0 value
            q_hat = self.scale * reward + self.gamma * value_

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        # action and state are from replay buffer generated by old policy
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)

        critic_1_loss = F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        # ---- Update the actor network ----
        self.actor.optimizer.zero_grad()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        # Use the action from the current policy, rather than the one stored in the buffer
        q1_new_policy = self.critic_1.forward(state, actions).view(-1)
        q2_new_policy = self.critic_2.forward(state, actions).view(-1)
        critic_value = T.min(q1_new_policy, q2_new_policy)

        actor_loss = T.mean(log_probs - critic_value)
        # Critic gradients produced here are cleared by the critics'
        # zero_grad() before their next update.
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

        return critic_loss.item(), actor_loss.item()
# Example #2
0
class Agent:
    """TD3-style agent: deterministic actor, twin critics and target copies.

    Actions are clamped to ``[min_action, max_action] = [-1, 1]``.
    """

    def __init__(self,
                 input_dims,
                 n_actions,
                 layer_sizes,
                 act_lr=0.00001,
                 crt_lr=0.0001,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 batch_size=64,
                 update_actor_interval=2,
                 noise=0.1,
                 noise_targetAct=0.2,
                 chkpt_dir='tmp/td3',
                 name='td3',
                 layerNorm=True):
        """Build the replay buffer and all networks, then hard-sync the targets.

        Args:
            input_dims: Observation dimensions, forwarded to the replay buffer
                and to every network.
            n_actions: Number of action components.
            layer_sizes: Layer sizes forwarded to every network.
            act_lr: Learning rate handed to the actor networks.
            crt_lr: Learning rate handed to the critic networks.
            tau: Default blending weight for the target updates.
            gamma: Discount factor used in the critic target.
            max_size: First argument of ``ReplayBuffer`` (presumably its
                capacity -- confirm against ``ReplayBuffer``).
            batch_size: Number of transitions sampled per ``learn`` call.
            update_actor_interval: Stored as ``update_actor_iter``; the gate
                that would use it is disabled in ``learn``, so it currently
                has no effect.
            noise: Standard deviation of the exploration noise.
            noise_targetAct: Standard deviation of the noise added to the
                target actions before it is clipped to ``[-0.5, 0.5]``.
            chkpt_dir: Checkpoint directory forwarded to every network.
            name: Suffix appended to every network's name.
            layerNorm: Flag forwarded to every network.
        """
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.tau = tau
        self.max_action = 1
        self.min_action = -1
        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=layerNorm)

        self.critic_1 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      layer_sizes,
                                      name='Critic1_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=layerNorm)
        self.critic_2 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      layer_sizes,
                                      name='Critic2_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=layerNorm)

        self.target_actor = ActorNetwork(act_lr,
                                         self.input_dims,
                                         self.n_actions,
                                         layer_sizes,
                                         name='TargetActor_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=layerNorm)
        self.target_critic_1 = CriticNetwork(crt_lr,
                                             self.input_dims,
                                             self.n_actions,
                                             layer_sizes,
                                             name='TargetCritic1_' + name,
                                             chkpt_dir=chkpt_dir,
                                             layerNorm=layerNorm)
        self.target_critic_2 = CriticNetwork(crt_lr,
                                             self.input_dims,
                                             self.n_actions,
                                             layer_sizes,
                                             name='TargetCritic2_' + name,
                                             chkpt_dir=chkpt_dir,
                                             layerNorm=layerNorm)

        self.noise = noise
        self.noise_targetAct = noise_targetAct
        # tau=1 is used to start the targets as copies of the online networks.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Return the actor's action plus Gaussian exploration noise.

        Independent noise is drawn for every action component, and the result
        is clamped to ``[min_action, max_action]``.

        Args:
            observation: Observation (array-like) fed to the actor.

        Returns:
            numpy.ndarray: The noisy, clamped action.
        """
        device = self.actor.device
        state = T.tensor(observation, dtype=T.float).to(device)
        # Acting never needs gradients.
        with T.no_grad():
            mu = self.actor.forward(state)

        # One draw per action component; a single scalar draw would shift
        # every component by the same amount.
        exploration = T.tensor(
            np.random.normal(scale=self.noise, size=tuple(mu.shape)),
            dtype=T.float, device=device)
        mu_prime = T.clamp(mu + exploration, self.min_action, self.max_action)

        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        """Run one update of both critics and the actor, then the targets.

        Returns:
            ``None`` while the replay buffer holds fewer than ``batch_size``
            transitions; otherwise ``(critic_loss, actor_loss)`` as floats.
        """
        if self.memory.mem_cntr < self.batch_size:
            return None

        # The done flags are sampled but deliberately unused (see below).
        state, action, reward, new_state, _ = \
            self.memory.sample_buffer(self.batch_size)

        device = self.critic_1.device
        reward = T.tensor(reward, dtype=T.float).to(device)
        state_ = T.tensor(new_state, dtype=T.float).to(device)
        state = T.tensor(state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)

        # The bootstrapped target is a constant for this step: build it
        # without autograd so no gradient accumulates in the target networks.
        with T.no_grad():
            target_actions = self.target_actor.forward(state_)
            # Target policy smoothing: independent clipped noise for every
            # element, created with the dtype/device of the actions.
            smoothing = T.tensor(
                np.random.normal(scale=self.noise_targetAct,
                                 size=tuple(target_actions.shape)),
                dtype=T.float, device=target_actions.device)
            target_actions = T.clamp(
                target_actions + T.clamp(smoothing, -0.5, 0.5),
                self.min_action, self.max_action)

            q1_ = self.target_critic_1.forward(state_, target_actions).view(-1)
            q2_ = self.target_critic_2.forward(state_, target_actions).view(-1)
            # q1_[done] = 0.0   # In building context, the terminal state does not have 0 value
            # q2_[done] = 0.0
            target = reward + self.gamma * T.min(q1_, q2_)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q1 = self.critic_1.forward(state, action).view(-1)
        q2 = self.critic_2.forward(state, action).view(-1)
        q1_loss = F.mse_loss(q1, target)
        q2_loss = F.mse_loss(q2, target)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        # NOTE: the delayed-actor-update gate
        #   if self.learn_step_cntr % self.update_actor_iter != 0: return
        # is disabled, so the actor and the targets are updated on every call.

        self.actor.optimizer.zero_grad()
        # Only critic_1 scores the actor; can also use the mean of the two
        # critics, but it would be slower and does not really matter.
        actor_q1 = self.critic_1.forward(state, self.actor.forward(state))
        actor_loss = -T.mean(actor_q1)
        # Critic gradients produced here are cleared by zero_grad() before
        # the next critic update.
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

        return critic_loss.item(), actor_loss.item()

    def update_network_parameters(self, tau=None):
        """Move every target network towards its online counterpart.

        Args:
            tau: Blending weight handed to
                ``update_single_target_network_parameters``; ``self.tau`` is
                used when omitted.
        """
        blend = self.tau if tau is None else tau

        updated_actor = update_single_target_network_parameters(
            self.actor, self.target_actor, blend)
        updated_critic_1 = update_single_target_network_parameters(
            self.critic_1, self.target_critic_1, blend)
        updated_critic_2 = update_single_target_network_parameters(
            self.critic_2, self.target_critic_2, blend)

        self.target_actor.load_state_dict(updated_actor)
        self.target_critic_1.load_state_dict(updated_critic_1)
        self.target_critic_2.load_state_dict(updated_critic_2)

    def save_models(self):
        """Checkpoint the actor and both critics (targets are not saved)."""
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        """Restore the actor and both critics, then re-sync the targets.

        The target networks are never checkpointed, so after loading they are
        reset from the freshly loaded online networks; otherwise they would
        keep whatever unrelated weights they had before the load.
        """
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        # Same tau=1 call that __init__ uses to initialise the targets.
        self.update_network_parameters(tau=1)
# Example #3
0
class Agent(object):
    """DDPG-style agent: deterministic actor, one critic and target copies.

    Exploration uses an ``OUActionNoise`` process scaled by ``noise_scale``.
    """

    def __init__(self,
                 input_dims,
                 n_actions,
                 layer_sizes,
                 act_lr=0.00001,
                 crt_lr=0.0001,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 batch_size=64,
                 chkpt_dir='tmp/ddpg',
                 name='ddpg',
                 layerNorm=True,
                 noise_scale=0.05):
        """Build the replay buffer and all networks, then hard-sync the targets.

        Args:
            input_dims: Observation dimensions, forwarded to the replay buffer
                and to every network.
            n_actions: Number of action components.
            layer_sizes: Layer sizes forwarded to every network.
            act_lr: Learning rate handed to the actor networks.
            crt_lr: Learning rate handed to the critic networks.
            tau: Target network updating weight.
            gamma: Discount factor.
            max_size: First argument of ``ReplayBuffer`` (presumably its
                capacity -- confirm against ``ReplayBuffer``).
            batch_size: Number of transitions sampled per ``learn`` call.
            chkpt_dir: Checkpoint directory forwarded to every network.
            name: Suffix appended to every network's name.
            layerNorm: Flag forwarded to every network.
            noise_scale: Multiplier applied to the exploration noise in
                ``choose_action`` (previously the fixed constant 0.05).
        """
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.layer_sizes = layer_sizes
        self.layerNorm = layerNorm
        self.gamma = gamma  # discount factor
        self.tau = tau  # target network updating weight
        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)
        self.batch_size = batch_size
        self.noise_scale = noise_scale

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  self.layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)
        self.critic = CriticNetwork(crt_lr,
                                    self.input_dims,
                                    self.n_actions,
                                    self.layer_sizes,
                                    name='Critic_' + name,
                                    chkpt_dir=chkpt_dir,
                                    layerNorm=self.layerNorm)

        self.target_actor = ActorNetwork(act_lr,
                                         self.input_dims,
                                         self.n_actions,
                                         self.layer_sizes,
                                         name='TargetActor_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=self.layerNorm)
        self.target_critic = CriticNetwork(crt_lr,
                                           self.input_dims,
                                           self.n_actions,
                                           self.layer_sizes,
                                           name='TargetCritic_' + name,
                                           chkpt_dir=chkpt_dir,
                                           layerNorm=self.layerNorm)

        self.noise = OUActionNoise(mu=np.zeros(self.n_actions))

        # tau=1 is used to start the targets as copies of the online networks.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Return the actor's action plus scaled exploration noise.

        The actor is switched to eval mode for the forward pass and back to
        train mode afterwards, even if the forward pass raises.

        Args:
            observation: Observation (array-like) fed to the actor.

        Returns:
            numpy.ndarray: The noisy action (not clipped).
        """
        device = self.actor.device
        self.actor.eval()
        try:
            obs = T.tensor(observation, dtype=T.float).to(device)
            # Acting never needs gradients.
            with T.no_grad():
                mu = self.actor.forward(obs)
            exploration = T.tensor(self.noise() * self.noise_scale,
                                   dtype=T.float).to(device)
            mu_prime = mu + exploration
        finally:
            self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        """Run one update of the critic and the actor, then the targets.

        Returns:
            ``None`` while the replay buffer holds fewer than ``batch_size``
            transitions; otherwise ``(critic_loss, actor_loss)`` as floats.
        """
        if self.memory.mem_cntr < self.batch_size:
            return None

        # The done flags are sampled but deliberately unused (see below).
        state, action, reward, new_state, _ = \
            self.memory.sample_buffer(self.batch_size)

        device = self.critic.device
        reward = T.tensor(reward, dtype=T.float).to(device)
        new_state = T.tensor(new_state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)
        state = T.tensor(state, dtype=T.float).to(device)

        # calculate target
        self.target_actor.eval()
        self.target_critic.eval()
        # The bootstrapped target is a constant for this step: build it
        # without autograd so no gradient accumulates in the target networks.
        with T.no_grad():
            target_actions = self.target_actor.forward(new_state)
            critic_value_ = self.target_critic.forward(
                new_state, target_actions).view(-1)
            # critic_value_[done] = 0.0    # In building context, terminal state does not have value of 0
            target = reward + self.gamma * critic_value_

        # train critic
        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_value = self.critic.forward(state, action).view(-1)
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        # train actor
        self.critic.eval()
        self.actor.train()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        actor_loss = T.mean(-self.critic.forward(state, mu))
        # Critic gradients produced here are cleared by zero_grad() before
        # the next critic update.
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

        return critic_loss.item(), actor_loss.item()

    def update_network_parameters(self, tau=None):
        """Move both target networks towards their online counterparts.

        Args:
            tau: Blending weight handed to
                ``update_single_target_network_parameters``; ``self.tau`` is
                used when omitted.
        """
        blend = self.tau if tau is None else tau

        updated_actor = update_single_target_network_parameters(
            self.actor, self.target_actor, blend)
        updated_critic = update_single_target_network_parameters(
            self.critic, self.target_critic, blend)

        self.target_actor.load_state_dict(updated_actor)
        self.target_critic.load_state_dict(updated_critic)

    def save_models(self):
        """Checkpoint the actor and the critic (targets are not saved)."""
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        """Restore the actor and the critic, then re-sync the targets.

        The target networks are never checkpointed, so after loading they are
        reset from the freshly loaded online networks; otherwise they would
        keep whatever unrelated weights they had before the load.
        """
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        # Same tau=1 call that __init__ uses to initialise the targets.
        self.update_network_parameters(tau=1)