Example #1

import numpy as np
import torch as T
import torch.nn.functional as F

# ReplayBuffer, OUActionNoise, ActorNet and CriticNet are project-specific
# classes assumed to be defined elsewhere in the same codebase.

class DDpgAgent():
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 n_actions,
                 gamma=0.99,
                 fc1Dms=400,
                 fc2Dms=300,
                 max_size=1000000,
                 batch_size=64):
        self.alpha = alpha
        self.tau = tau
        self.beta = beta
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNet(alpha=alpha,
                              input_dims=input_dims,
                              n_actions=n_actions,
                              fc1Dms=fc1Dms,
                              fc2Dms=fc2Dms,
                              name='actor')
        self.critic = CriticNet(beta=beta,
                                input_dims=input_dims,
                                n_actions=n_actions,
                                fc1Dms=fc1Dms,
                                fc2Dms=fc2Dms,
                                name='critic')
        self.target_actor = ActorNet(alpha=alpha,
                                     input_dims=input_dims,
                                     n_actions=n_actions,
                                     fc1Dms=fc1Dms,
                                     fc2Dms=fc2Dms,
                                     name='target_actor')
        self.target_critic = CriticNet(beta=beta,
                                       input_dims=input_dims,
                                       n_actions=n_actions,
                                       fc1Dms=fc1Dms,
                                       fc2Dms=fc2Dms,
                                       name='target_critic')

        # with tau=1, the target networks start as exact copies of the online nets
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        # switch to evaluation mode because the network uses batch normalization
        self.actor.eval()
        state = T.tensor([state], dtype=T.float).to(self.actor.device)
        # the policy is deterministic, so exploration noise has to be added
        mu = self.actor.forward(state).to(self.actor.device)
        # add Ornstein-Uhlenbeck noise to the actor output (as in the DDPG paper)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(
            self.actor.device)
        # back to train mode
        self.actor.train()
        # detach() removes the tensor from the computation graph; cpu() moves it
        # to host memory so it can be converted to numpy for the environment
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, new_states, dones = \
                self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)
        target_actions = self.target_actor.forward(new_states)
        target_critic_value = self.target_critic.forward(
            new_states, target_actions)
        critic_value = self.critic.forward(states, actions)

        # terminal next states contribute no future value
        target_critic_value[dones] = 0.0
        # flatten from (batch_size, 1) to (batch_size,) so the addition with
        # rewards is element-wise instead of broadcasting
        target_critic_value = target_critic_value.view(-1)

        target = rewards + self.gamma * target_critic_value
        # reshape back to (batch_size, 1) to match critic_value for the loss
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()
        # print("im inside the learn function >>>>>>")
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        # Polyak averaging: theta_target = tau * theta + (1 - tau) * theta_target
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                                    (1-tau) * target_critic_state_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                                    (1-tau) * target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
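
A minimal driver loop for the DDPG agent above. This is a sketch only: it assumes a classic continuous-control environment ('Pendulum-v1' is a placeholder), the pre-0.26 gym API where reset() returns just the observation and step() returns four values, and illustrative hyperparameters rather than the author's.

import gym

env = gym.make('Pendulum-v1')  # placeholder environment
agent = DDpgAgent(alpha=0.0001, beta=0.001,
                  input_dims=env.observation_space.shape,
                  tau=0.001, n_actions=env.action_space.shape[0])

for episode in range(1000):
    state = env.reset()          # old gym API: returns the observation only
    done = False
    score = 0
    while not done:
        action = agent.choose_action(state)
        new_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, new_state, done)
        agent.learn()
        score += reward
        state = new_state
    print(f'episode {episode} score {score:.1f}')
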
Example #2

import numpy as np
import torch as T
import torch.nn.functional as F

# ReplayBuffer, ActorNet and CriticNet here are the TD3 counterparts of the
# helpers used in Example #1, assumed to be defined elsewhere in the codebase.

class TD3Agent():
    def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2,
                 gamma=0.99, update_actor_interval=2, fc1Dms=400, fc2Dms=300,
                 max_size=1000000, batch_size=100, warmup=1000, noise=0.1):
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.gamma = gamma 
        self.n_actions = n_actions
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.update_actor_iter = update_actor_interval

        self.critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                    name='Critic1')
        self.critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                    name='Critic2')
        self.actor = ActorNet(alpha, input_dims, n_actions, fc1Dms,
                    fc2Dms, name='actor')
        # target nets
        self.target_critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                    name='Target_critic1')
        self.target_critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                    name='Target_critic2')
        self.target_actor = ActorNet(alpha, input_dims, n_actions, fc1Dms,
                    fc2Dms, name='Target_actor')
        self.noise = noise
        # initialize the target networks as exact copies of the online networks
        self.update_network_parameters(tau=1)


    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward,
                                new_state, done)

    def choose_action(self, state):
        # during the warmup period, act randomly to fill the replay buffer
        if self.time_step < self.warmup:
            # scale is the standard deviation of the Gaussian
            mu = T.tensor(np.random.normal(scale=self.noise,
                                           size=(self.n_actions,)),
                          dtype=T.float).to(self.actor.device)
        else:
            state = T.tensor(state, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)

        # the deterministic policy needs exploration noise on top of its output
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise,
                                                  size=(self.n_actions,)),
                                 dtype=T.float).to(self.actor.device)

        # clamp the action to the environment's bounds
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1

        return mu_prime.cpu().detach().numpy()
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, new_states, dones = \
                self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        # target policy smoothing: clipped Gaussian noise drawn per sample and
        # per action dimension, as in the TD3 paper
        noise = T.clamp(T.tensor(np.random.normal(scale=0.2,
                                                  size=(self.batch_size,
                                                        self.n_actions)),
                                 dtype=T.float), -0.5, 0.5).to(self.actor.device)
        target_action = self.target_actor.forward(new_states) + noise
        # indexing [0] assumes every action dimension shares the same bounds
        target_action = T.clamp(target_action, self.min_action[0], self.max_action[0])

        target_critic1_q = self.target_critic_1.forward(new_states, target_action)
        target_critic2_q = self.target_critic_2.forward(new_states, target_action)

        # terminal next states contribute no future value
        target_critic1_q[dones] = 0.0
        target_critic2_q[dones] = 0.0

        # flatten from (batch_size, 1) to (batch_size,) so the addition with
        # rewards is element-wise instead of broadcasting
        target_critic1_q = target_critic1_q.view(-1)
        target_critic2_q = target_critic2_q.view(-1)

        q1 = self.critic_1.forward(states, actions)
        q2 = self.critic_2.forward(states, actions)
        

        # TD3 takes the minimum of the two target critics to reduce overestimation
        y = rewards + self.gamma * T.min(target_critic1_q, target_critic2_q)
        y = y.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q1_loss = F.mse_loss(y, q1)
        q2_loss = F.mse_loss(y, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()

        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1

        # delayed policy updates: only update the actor and the target networks
        # every update_actor_iter critic updates
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return
        self.actor.optimizer.zero_grad()
        actor_loss = self.critic_1.forward(states, self.actor.forward(states))
        actor_loss = -T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()
        
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        # Polyak averaging: theta_target = tau * theta + (1 - tau) * theta_target
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        target_actor_params = self.target_actor.named_parameters()

        critic1_params = self.critic_1.named_parameters()
        critic2_params = self.critic_2.named_parameters()
        target_critic1_params = self.target_critic_1.named_parameters()
        target_critic2_params = self.target_critic_2.named_parameters()

        critic1_state_dict = dict(critic1_params)
        critic2_state_dict = dict(critic2_params)
        target_critic1_state_dict = dict(target_critic1_params)
        target_critic2_state_dict = dict(target_critic2_params)

        actor_state_dict = dict(actor_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                    (1-tau) * target_actor_state_dict[name].clone()

        for name in critic1_state_dict:
            critic1_state_dict[name] = tau*critic1_state_dict[name].clone() + \
                    (1-tau) * target_critic1_state_dict[name].clone()

        for name in critic2_state_dict:
            critic2_state_dict[name] = tau*critic2_state_dict[name].clone() + \
                    (1-tau) * target_critic2_state_dict[name].clone()

           
        self.target_actor.load_state_dict(actor_state_dict) 
        self.target_critic_1.load_state_dict(critic1_state_dict)
        self.target_critic_2.load_state_dict(critic2_state_dict)


    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
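
Similarly, a hypothetical driver for the TD3 agent, under the same assumptions as before (old gym reset()/step() API, placeholder environment name, illustrative hyperparameters); note that this constructor also takes the env itself so the agent can read the action bounds.

import gym

env = gym.make('BipedalWalker-v3')  # placeholder environment
agent = TD3Agent(alpha=0.001, beta=0.001,
                 input_dims=env.observation_space.shape,
                 tau=0.005, env=env,
                 n_actions=env.action_space.shape[0])

for episode in range(1500):
    state = env.reset()          # old gym API: returns the observation only
    done = False
    score = 0
    while not done:
        action = agent.choose_action(state)
        new_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, new_state, done)
        agent.learn()
        score += reward
        state = new_state
    if episode % 25 == 0:
        agent.save_models()
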