Example No. 1
class AgentTrainer(pl.LightningModule):
    '''
    PyTorch Lightning module for drone reinforcement learning
    '''

    def __init__(self, hparams):
        '''
        Initializations
        '''
        super().__init__()
        self.hparams = hparams

        # Position of the human (signal source) used as the agent's goal
        source_position = torch.tensor([[self.hparams.environment.position.end.x],
                                        [self.hparams.environment.position.end.y],
                                        [self.hparams.environment.position.end.z]]).float()

        # Position of agent
        agent_position  = torch.tensor([[self.hparams.environment.position.start.x],
                                        [self.hparams.environment.position.start.y],
                                        [self.hparams.environment.position.start.z]]).float()


        # Initialize Replay buffer
        self.replay_buffer = ReplayBuffer(capacity = self.hparams.model.replay_buffer_size)


        # Initialize drone
        self.agent = Drone(start_position = agent_position,
                           goal_position = source_position,
                           velocity_factor = self.hparams.environment.agent.velocity_factor,
                           hparams = self.hparams,
                           buffer = self.replay_buffer)

        # Actor networks
        self.net = Actor(**self.hparams.model.actor)
        self.target_net = Actor(**self.hparams.model.actor)

        # Critic networks
        self.critic = Critic(**self.hparams.model.critic)
        self.target_critic = Critic(**self.hparams.model.critic)

        # Hard update
        self.target_net.load_state_dict(self.net.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.total_reward = -10000
        self.episode_steps = 0.0
        self.max_episode_steps = self.hparams.model.max_episode
        self.episode_reward = 0.0
        self.populate(self.hparams.model.replay_buffer_size)


    def soft_update(self, target, source, tau):
        '''
        Polyak averaging: target <- (1 - tau) * target + tau * source
        '''
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - tau) + param.data * tau
            )

    def configure_optimizers(self):
        '''
        One optimizer/scheduler pair for the critic and one for the actor
        '''
        optimizer = getattr(torch.optim, self.hparams.optimizer.type)(
            self.critic.parameters(), **self.hparams.optimizer.args, weight_decay=1e-3)
        optimizer2 = getattr(torch.optim, self.hparams.optimizer.type)(
            self.net.parameters(), **self.hparams.optimizer.args)

        scheduler = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)(
            optimizer, **self.hparams.scheduler.args)
        scheduler2 = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)(
            optimizer2, **self.hparams.scheduler.args)

        return [optimizer, optimizer2], [scheduler, scheduler2]

    def dqn_mse_loss(self, batch) -> dict:
        """
        Calculates the critic (MSE) and actor (policy) losses using a mini batch
        from the replay buffer
        Args:
            batch: current mini batch of replay data
        Returns:
            dict with the critic loss ("loss") and the actor loss ("policy_loss")
        """
        states, actions, rewards, dones, next_states = batch

        # Use the reward of the final step of the sampled sequence
        rewards_out = rewards[:, -1]

        # Actor objective: maximise the critic's estimate of the actions the
        # current policy proposes for the current states
        proposed_actions = self.net(states["image"])
        Q_value = self.critic(states["image"], proposed_actions).squeeze(-1)

        # Online critic estimate for the actions actually taken
        state_action_values = self.critic(states["image"], actions.float()).squeeze(-1)

        # Bootstrapped target computed with the target networks (no gradient)
        with torch.no_grad():
            next_actions = self.target_net(next_states["image"])
            next_Q_value = self.target_critic(next_states["image"], next_actions).squeeze(-1)
            # next_Q_value[dones] = 0.0
            expected_state_action_values = next_Q_value * self.hparams.model.gamma + rewards_out

        return {"loss": nn.MSELoss()(state_action_values, expected_state_action_values),
                "policy_loss": -Q_value.mean()}

    def populate(self, steps: int = 1000) -> None:
        '''
        Carries out several random steps through the environment to initially fill
        up the replay buffer with experiences
        '''
        for i in range(steps):
            # epsilon = 1.0 -> fully random exploration while warming up the buffer
            self.agent.playStep(self.net, 1.0, self.get_device())

            if i % self.max_episode_steps == 0:
                self.agent.reset()

        self.agent.reset()

    def playTrajectory(self):
        '''
        Play a trajectory greedily (epsilon = 0) until interrupted
        '''
        self.agent.reset()
        device = self.get_device()
        while True:
            self.agent.playStep(self.net, 0, device)

    def training_step(self, batch, batch_idx, optimizer_idx):
        '''
        Training step: one environment step plus one optimizer update
        (optimizer_idx 0 -> critic, optimizer_idx 1 -> actor)
        '''
        self.episode_steps = self.episode_steps + 1
        device = self.get_device()

        # Linearly decayed exploration rate
        epsilon = max(self.hparams.model.min_epsilon,
                      self.hparams.model.max_epsilon - (self.global_step + 1) / self.hparams.model.stop_decay)

        # step through environment with agent
        reward, done = self.agent.playStep(self.target_net, epsilon, device)
        self.episode_reward += reward

        # calculates training losses
        loss = self.dqn_mse_loss(batch)
        self.log("train_loss", loss["loss"], on_epoch = True, prog_bar = True, on_step = True, logger = True)
        self.log("policy_loss", loss["policy_loss"], on_epoch = True, prog_bar = True, on_step = True, logger = True)

        if done:
            if self.episode_reward > self.total_reward:
                self.total_reward = self.episode_reward

            self.episode_reward = 0
            self.episode_steps = 0

        # Critic loss for optimizer 0, policy loss for optimizer 1
        if optimizer_idx:
            loss_out = loss["policy_loss"]
        else:
            loss_out = loss["loss"]

        # Soft update of target networks
        if self.global_step % self.hparams.model.sync_rate == 0:
            self.soft_update(self.target_net, self.net, self.hparams.model.tau)
            self.soft_update(self.target_critic, self.critic, self.hparams.model.tau)

        log = {
            'total_reward': torch.tensor(self.total_reward).to(device),
            'reward': torch.tensor(reward).to(device),
            'steps': torch.tensor(self.global_step).to(device)
        }
        for key in log:
            self.log(key, log[key], logger = True, prog_bar = True, on_step = True)

        # Force a reset if the episode runs past the step limit
        if self.episode_steps > self.max_episode_steps:
            self.episode_steps = 0
            self.total_reward = self.episode_reward
            self.agent.reset()

        return loss_out


    def __dataloader(self) -> DataLoader:
        """
        Initialize the Replay Buffer dataset used for retrieving experiences
        """

        dataset = RLDataset(self.replay_buffer, self.hparams.model.sample_size)
        dataloader = DataLoader(
            dataset=dataset,
            **self.hparams.dataset.loader)

        return dataloader

    def train_dataloader(self) -> DataLoader:
        """
        Get train loader
        """

        return self.__dataloader()

    def get_device(self) -> str:
        """
        Retrieve device currently being used by minibatch
        """

        return self.device.index if self.on_gpu else 'cpu'

    def forward(self, x):

        return self.net(x)
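
For context, a minimal sketch of how a module like this is typically launched with PyTorch Lightning follows. The config field trainer.max_epochs and the GPU check are illustrative assumptions; the project's real entry point is not shown here.

def run_training(config):
    '''
    Hypothetical entry point: `config` is assumed to carry the fields the
    module references above (model, optimizer, scheduler, dataset, environment)
    plus a trainer.max_epochs value added here for illustration.
    '''
    model = AgentTrainer(config)                 # builds the drone, buffer and networks
    # older Lightning API, matching the optimizer_idx signature used above
    trainer = pl.Trainer(max_epochs=config.trainer.max_epochs,
                         gpus=1 if torch.cuda.is_available() else 0)
    trainer.fit(model)                           # drives training_step via train_dataloader()
    model.playTrajectory()                       # greedy rollout with the trained actor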
Example No. 2
class ActorCritic:
    def __init__(self, env):
        self.env = env
        self.num_robots = env.num_robots

        self.learning_rate = 0.0001
        self.epsilon = .9
        self.epsilon_decay = .99995
        self.eps_counter = 0
        self.gamma = .90
        self.tau = .01

        self.buffer_size = 1000000
        self.batch_size = 512

        self.hyper_parameters_lambda3 = 0.2
        self.hyper_parameters_eps = 0.2
        self.hyper_parameters_eps_d = 0.4

        self.demo_size = 1000
        self.time_str = time.strftime("%Y%m%d-%H%M%S")
        self.parent_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/DRLbasedController/weights"
        self.path = os.path.join(self.parent_dir, self.time_str)
        os.mkdir(self.path)

        # Replay buffer
        self.memory = deque(maxlen=1000000)
        # Replay Buffer
        self.replay_buffer = ExperienceReplayBuffer(total_timesteps=5000 * 256,
                                                    type_buffer="HER")
        # File name
        self.file_name = "reward_{}_{}_{}".format(
            self.time_str, self.num_robots, self.replay_buffer.type_buffer)
        # Hidden Layer list
        self.hid_list = [1024, 512, 512]
        # ===================================================================== #
        #                               Actor Model                             #
        # Chain rule: find the gradient of changing the actor network params    #
        # to get closest to the final value network predictions, i.e. de/dA.    #
        # Calculate de/dA as de/dC * dC/dA, where e is error, C critic, A actor #
        # ===================================================================== #

        self.actor_model = Actor(self.env.observation_space.shape,
                                 self.env.action_space.shape, self.hid_list)
        self.target_actor_model = Actor(self.env.observation_space.shape,
                                        self.env.action_space.shape,
                                        self.hid_list)
        self.actor_optim = optim.Adam(self.actor_model.parameters(),
                                      lr=self.learning_rate)

        # ===================================================================== #
        #                              Critic Model                             #
        # ===================================================================== #

        self.critic_model = Critic(self.env.observation_space.shape,
                                   self.env.action_space.shape, 1,
                                   self.hid_list)
        self.target_critic_model = Critic(self.env.observation_space.shape,
                                          self.env.action_space.shape, 1,
                                          self.hid_list)
        self.critic_optim = optim.Adam(self.critic_model.parameters(),
                                       lr=self.learning_rate)

        hard_update(
            self.target_actor_model,
            self.actor_model)  # Make sure target is with the same weight
        hard_update(self.target_critic_model, self.critic_model)

        self.cuda()

    # ========================================================================= #
    #                               Model Training                              #
    # ========================================================================= #

    def remember(self, cur_state, action, reward, new_state, done):
        for i in range(self.num_robots):
            self.memory.append(
                [cur_state[i], action[i], reward[i], new_state[i], done[i]])

    def _train_critic_actor(self, samples):

        Loss = nn.MSELoss()

        # 1. sample from the buffer (PER version; check whether stack_samples is still needed)
        cur_states, actions, rewards, new_states, dones, weights, batch_idxes = stack_samples(
            samples)
        target_actions = to_numpy(
            self.target_actor_model(to_tensor(new_states)))

        # Critic Update
        self.critic_model.zero_grad()
        Q_now = self.critic_model([cur_states, actions])
        next_Q = self.target_critic_model([new_states, target_actions])
        dones = dones.astype(bool)
        Q_target = to_tensor(rewards) + self.gamma * next_Q.reshape(
            next_Q.shape[0]) * to_tensor(1 - dones)

        td_errors = Q_target - Q_now.reshape(Q_now.shape[0])

        # Detach the bootstrapped target so gradients only flow through Q_now
        value_loss = Loss(Q_now.squeeze(), Q_target.detach())
        value_loss.backward()
        self.critic_optim.step()

        # Actor Update
        self.actor_model.zero_grad()
        policy_loss = -self.critic_model(
            [to_tensor(cur_states),
             self.actor_model(to_tensor(cur_states))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # NoisyNet noise reset
        self.actor_model.reset_noise()
        self.target_actor_model.reset_noise()

        return td_errors

    def read_Q_values(self, cur_states, actions):
        critic_values = self.critic_model.predict([cur_states, actions])
        return critic_values

    def train(self, t):
        batch_size = self.batch_size
        # wait until the (PER) replay buffer holds at least one full batch
        if len(self.replay_buffer.replay_buffer) < batch_size:
            return

        samples = self.replay_buffer.replay_buffer.sample(
            batch_size, beta=self.replay_buffer.beta_schedule.value(t))
        (obses_t, actions, rewards, obses_tp1, dones, weights,
         batch_idxes) = samples

        self.samples = samples
        td_errors = self._train_critic_actor(samples)

        # priority updates
        #new_priorities = np.abs(td_errors) + self.replay_buffer.prioritized_replay_eps
        #self.replay_buffer.replay_buffer.update_priorities(batch_idxes, new_priorities)

    # ========================================================================= #
    #                         Target Model Updating                             #
    # ========================================================================= #

    def _update_actor_target(self):
        soft_update(self.target_actor_model, self.actor_model, self.tau)

    def _update_critic_target(self):
        soft_update(self.target_critic_model, self.critic_model, self.tau)

    def update_target(self):
        self._update_actor_target()
        self._update_critic_target()

    # ========================================================================= #
    #                              Model Predictions                            #
    # ========================================================================= #

    def act(self, cur_state):
        # Returns the action predicted by the actor model and the current epsilon;
        # exploration noise is added to the action with probability epsilon.
        if self.eps_counter >= self.num_robots:
            self.epsilon *= self.epsilon_decay
            self.eps_counter = 0
        else:
            self.eps_counter += 1
        eps = self.epsilon
        cur_state = np.array(cur_state).reshape(1, 8)
        action = to_numpy(self.actor_model(to_tensor(cur_state))).squeeze(0)
        action = action.reshape(1, 2)
        if np.random.random() < self.epsilon:
            action[0][0] = action[0][0] + (np.random.random() - 0.5) * 0.4
            action[0][1] = action[0][1] + np.random.random() * 0.4
        return action, eps

    # ========================================================================= #
    #                              save weights                                 #
    # ========================================================================= #

    def save_weight(self, num_trials, trial_len):
        torch.save(
            self.actor_model.state_dict(), self.path + '/actormodel' + '-' +
            str(num_trials) + '-' + str(trial_len) + '.pkl')
        torch.save(
            self.critic_model.state_dict(), self.path + '/criticmodel' + '-' +
            str(num_trials) + '-' + str(trial_len) + '.pkl')

    # ========================================================================= #
    #                              load weights                                 #
    # ========================================================================= #

    def load_weights(self, output):

        self.actor_model.load_state_dict(torch.load('{}.pkl'.format(output)))

        self.critic_model.load_state_dict(torch.load('{}.pkl'.format(output)))

    def play(self, cur_state):
        # volatile=True was removed in modern PyTorch; use torch.no_grad() instead
        with torch.no_grad():
            return to_numpy(self.actor_model(to_tensor(cur_state))).squeeze(0)

    def cuda(self):
        self.actor_model.cuda()
        self.target_actor_model.cuda()
        self.critic_model.cuda()
        self.target_critic_model.cuda()
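
A rough sketch of the outer interaction loop this class is meant to be driven by follows. The environment API, episode counts, and the per-robot state handling are assumptions for illustration; the real ROS/Gazebo driver lives elsewhere in the repository.

def run(env, num_trials=500, trial_len=256):
    # Hypothetical driver; per-robot details and ROS specifics are omitted.
    agent = ActorCritic(env)
    step = 0
    for trial in range(num_trials):
        cur_states = env.reset()
        for _ in range(trial_len):
            # act() operates on a single robot's 8-dim state, so query it per robot
            actions = [agent.act(s)[0] for s in cur_states]
            new_states, rewards, dones, _ = env.step(actions)
            agent.remember(cur_states, actions, rewards, new_states, dones)
            agent.train(step)          # no-op until the buffer holds a full batch
            agent.update_target()      # Polyak-average both target networks
            cur_states = new_states
            step += 1
            if all(dones):
                break
        agent.save_weight(trial, trial_len)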
Example No. 3
class DDPG:
    def __init__(self,
                 n_states,
                 n_actions,
                 hidden_dim=90,
                 device="cpu",
                 critic_lr=5e-3,
                 actor_lr=5e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128):
        self.device = device
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        # detach() cuts the computation graph so no gradient flows back through the actor
        return action.detach().cpu().numpy()[0]

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size)
        # convert all sampled variables to tensors
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
        # note: the critic takes (s_t, a) as its input

        actor_loss = self.critic(state, self.actor(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value

        value = self.critic(state, action)
        critic_loss = nn.MSELoss()(value, expected_value.detach())

        # optimize the actor and critic networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # soft-update the target networks (Polyak averaging)
        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)

    def save_model(self, path):
        torch.save(self.target_actor.state_dict(), path)

    def load_model(self, path):
        self.actor.load_state_dict(torch.load(path))

    def buffer_model_save(self, saved_dir):
        self.memory.save(saved_dir)
        torch.save(self.critic.state_dict(),
                   saved_dir + "/critic_checkpoint.pth")
        torch.save(self.actor.state_dict(),
                   saved_dir + "/actor_checkpoint.pth")
        torch.save(self.target_critic.state_dict(),
                   saved_dir + "/target_critic_checkpoint.pth")
        torch.save(self.target_actor.state_dict(),
                   saved_dir + "/target_actor_checkpoint.pth")

    def buffer_model_load(self, saved_dir):
        if not os.path.exists(saved_dir):  # nothing saved yet: create the directory and skip loading
            os.makedirs(saved_dir)
            return
        self.memory.load(saved_dir)

        self.critic.load_state_dict(
            torch.load(saved_dir + "/critic_checkpoint.pth"))
        self.actor.load_state_dict(
            torch.load(saved_dir + "/actor_checkpoint.pth"))
        self.target_critic.load_state_dict(
            torch.load(saved_dir + "/target_critic_checkpoint.pth"))
        self.target_actor.load_state_dict(
            torch.load(saved_dir + "/target_actor_checkpoint.pth"))

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
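
As a usage sketch, this agent pairs with a Gym-style continuous-control loop roughly like the one below. The environment name, the older Gym reset/step API, and a ReplayBuffer.push method matching the layout sample() returns in update() are assumptions, not part of the class above.

import gym

def train_ddpg(env_name="Pendulum-v1", episodes=200, max_steps=200):
    env = gym.make(env_name)
    agent = DDPG(n_states=env.observation_space.shape[0],
                 n_actions=env.action_space.shape[0])
    for ep in range(episodes):
        state = env.reset()          # older Gym API: reset() returns only the observation
        ep_reward = 0.0
        for _ in range(max_steps):
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            # push() is assumed to store transitions in the layout sample() returns
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()           # one gradient step per environment step
            state = next_state
            ep_reward += reward
            if done:
                break
        print("episode {}: reward {:.1f}".format(ep, ep_reward))
    agent.save_model("ddpg_actor.pth")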