class AgentTrainer(pl.LightningModule):
    '''PyTorch Lightning trainer for drone reinforcement learning.'''

    def __init__(self, hparams):
        '''Initialize networks, replay buffer, and the drone agent.'''
        super().__init__()
        self.hparams = hparams

        # Position of the human (goal / signal source)
        source_position = torch.tensor([[self.hparams.environment.position.end.x],
                                        [self.hparams.environment.position.end.y],
                                        [self.hparams.environment.position.end.z]]).float()

        # Starting position of the agent
        agent_position = torch.tensor([[self.hparams.environment.position.start.x],
                                       [self.hparams.environment.position.start.y],
                                       [self.hparams.environment.position.start.z]]).float()

        # Replay buffer
        self.replay_buffer = ReplayBuffer(capacity=self.hparams.model.replay_buffer_size)

        # Drone agent
        self.agent = Drone(start_position=agent_position,
                           goal_position=source_position,
                           velocity_factor=self.hparams.environment.agent.velocity_factor,
                           hparams=self.hparams,
                           buffer=self.replay_buffer)

        # Actor networks (online and target)
        self.net = Actor(**self.hparams.model.actor)
        self.target_net = Actor(**self.hparams.model.actor)

        # Critic networks (online and target)
        self.critic = Critic(**self.hparams.model.critic)
        self.target_critic = Critic(**self.hparams.model.critic)

        # Hard update: start the target networks from the online weights
        self.target_net.load_state_dict(self.net.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.total_reward = -10000
        self.episode_steps = 0.0
        self.max_episode_steps = self.hparams.model.max_episode
        self.episode_reward = 0.0
        self.populate(self.hparams.model.replay_buffer_size)

    def soft_update(self, target, source, tau):
        '''Polyak-average the source parameters into the target network.'''
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def configure_optimizers(self):
        '''One optimizer/scheduler pair for the critic (index 0) and one for the actor (index 1).'''
        optimizer_cls = getattr(torch.optim, self.hparams.optimizer.type)
        scheduler_cls = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)

        critic_optimizer = optimizer_cls(self.critic.parameters(),
                                         **self.hparams.optimizer.args,
                                         weight_decay=1e-3)
        actor_optimizer = optimizer_cls(self.net.parameters(),
                                        **self.hparams.optimizer.args)

        critic_scheduler = scheduler_cls(critic_optimizer, **self.hparams.scheduler.args)
        actor_scheduler = scheduler_cls(actor_optimizer, **self.hparams.scheduler.args)

        # The ordering must match the optimizer_idx handling in training_step.
        return [critic_optimizer, actor_optimizer], [critic_scheduler, actor_scheduler]

    def dqn_mse_loss(self, batch) -> dict:
        """
        Compute the critic (MSE) loss and the actor (policy) loss from a
        mini-batch of replay data.

        Args:
            batch: mini-batch of (states, actions, rewards, dones, next_states)

        Returns:
            dict with "loss" (critic loss) and "policy_loss" (actor loss)
        """
        states, actions, rewards, dones, next_states = batch
        rewards_out = rewards[:, -1]

        # Online critic's value of the online actor's action on the next observation
        action_value = self.net(next_states["image"])
        Q_value = self.critic(next_states["image"], action_value).squeeze(-1)

        # Target critic's value of the stored (state, action) pair, without gradients
        with torch.no_grad():
            next_Q_value = self.target_critic(states["image"], actions.float()).squeeze(-1)
            next_Q_value = next_Q_value.detach()

        # Bootstrapped value used in the critic regression
        expected_state_action_values = Q_value * self.hparams.model.gamma + rewards_out

        return {"loss": nn.MSELoss()(next_Q_value, expected_state_action_values),
                "policy_loss": -Q_value.mean()}

    def populate(self, steps: int = 1000) -> None:
        '''
        Carry out `steps` random steps through the environment to initially
        fill the replay buffer with experiences.
        '''
        for i in range(steps):
            self.agent.playStep(self.net, 1.0, self.get_device())
            if i % self.max_episode_steps == 0:
                self.agent.reset()
        self.agent.reset()

    def playTrajectory(self):
        '''Play a trajectory greedily (epsilon = 0).'''
        self.agent.reset()
        device = self.get_device()
        while True:
            self.agent.playStep(self.net, 0, device)

    def training_step(self, batch, batch_idx, optimizer_idx):
        '''One training step: act in the environment, then update critic or actor.'''
        self.episode_steps = self.episode_steps + 1
        device = self.get_device()
        epsilon = max(self.hparams.model.min_epsilon,
                      self.hparams.model.max_epsilon - (self.global_step + 1) / self.hparams.model.stop_decay)
        self.log("epsilon", epsilon, prog_bar=True, on_step=True, logger=True)

        # Step through the environment with the agent
        reward, done = self.agent.playStep(self.target_net, epsilon, device)
        self.episode_reward += reward

        # Training losses
        loss = self.dqn_mse_loss(batch)
        self.log("train_loss", loss["loss"], on_epoch=True, prog_bar=True, on_step=True, logger=True)
        self.log("policy_loss", loss["policy_loss"], on_epoch=True, prog_bar=True, on_step=True, logger=True)

        if done:
            if self.episode_reward > self.total_reward:
                self.total_reward = self.episode_reward
            self.episode_reward = 0
            self.episode_steps = 0

        # Optimizer 0 updates the critic, optimizer 1 updates the actor
        if optimizer_idx:
            loss_out = loss["policy_loss"]
        else:
            loss_out = loss["loss"]

        # Soft update of the target networks
        if self.global_step % self.hparams.model.sync_rate == 0:
            self.soft_update(self.target_net, self.net, self.hparams.model.tau)
            self.soft_update(self.target_critic, self.critic, self.hparams.model.tau)

        log = {
            'total_reward': torch.tensor(self.total_reward).to(device),
            'reward': torch.tensor(reward).to(device),
            'steps': torch.tensor(self.global_step).to(device)
        }
        for key in log:
            self.log(key, log[key], logger=True, prog_bar=True, on_step=True)

        if self.episode_steps > self.max_episode_steps:
            self.episode_steps = 0
            self.total_reward = self.episode_reward
            self.agent.reset()

        return loss_out

    def __dataloader(self) -> DataLoader:
        """Initialize the replay-buffer dataset used for retrieving experiences."""
        dataset = RLDataset(self.replay_buffer, self.hparams.model.sample_size)
        dataloader = DataLoader(dataset=dataset, **self.hparams.dataset.loader)
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Get the train loader."""
        return self.__dataloader()

    def get_device(self) -> str:
        """Retrieve the device currently being used by the mini-batch."""
        return self.device.index if self.on_gpu else 'cpu'

    def forward(self, x):
        return self.net(x)
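
# --------------------------------------------------------------------------- #
# Usage sketch (illustrative only, not part of the original module): driving  #
# AgentTrainer with a plain PyTorch Lightning Trainer. The config path        #
# "configs/drone.yaml", the OmegaConf loader, and the max_epochs argument     #
# are assumptions; any object exposing the hparams attributes accessed above  #
# (environment.*, model.*, optimizer.*, scheduler.*, dataset.loader) works.   #
# --------------------------------------------------------------------------- #
def run_drone_training(config_path="configs/drone.yaml", max_epochs=100):
    from omegaconf import OmegaConf  # assumed config backend

    hparams = OmegaConf.load(config_path)            # hypothetical config file
    model = AgentTrainer(hparams)
    trainer = pl.Trainer(max_epochs=max_epochs,
                         gpus=1 if torch.cuda.is_available() else 0)
    trainer.fit(model)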
class ActorCritic:
    def __init__(self, env):
        self.env = env
        self.num_robots = env.num_robots
        self.learning_rate = 0.0001
        self.epsilon = .9
        self.epsilon_decay = .99995
        self.eps_counter = 0
        self.gamma = .90
        self.tau = .01

        self.buffer_size = 1000000
        self.batch_size = 512

        self.hyper_parameters_lambda3 = 0.2
        self.hyper_parameters_eps = 0.2
        self.hyper_parameters_eps_d = 0.4

        self.demo_size = 1000
        self.time_str = time.strftime("%Y%m%d-%H%M%S")
        self.parent_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/DRLbasedController/weights"
        self.path = os.path.join(self.parent_dir, self.time_str)
        os.mkdir(self.path)

        # Plain replay memory (per-robot transitions)
        self.memory = deque(maxlen=1000000)

        # Experience replay buffer (HER)
        self.replay_buffer = ExperienceReplayBuffer(total_timesteps=5000 * 256, type_buffer="HER")

        # File name for reward logs
        self.file_name = "reward_{}_{}_{}".format(
            self.time_str, self.num_robots, self.replay_buffer.type_buffer)

        # Hidden layer sizes
        self.hid_list = [1024, 512, 512]

        # ===================================================================== #
        #                               Actor Model                             #
        # Chain rule: find the gradient of changing the actor network params    #
        # that gets closest to the final value network predictions, i.e.        #
        # de/dA = de/dC * dC/dA, where e is the error, C the critic, A the actor#
        # ===================================================================== #
        self.actor_model = Actor(self.env.observation_space.shape, self.env.action_space.shape, self.hid_list)
        self.target_actor_model = Actor(self.env.observation_space.shape, self.env.action_space.shape, self.hid_list)
        self.actor_optim = optim.Adam(self.actor_model.parameters(), lr=self.learning_rate)

        # ===================================================================== #
        #                              Critic Model                             #
        # ===================================================================== #
        self.critic_model = Critic(self.env.observation_space.shape, self.env.action_space.shape, 1, self.hid_list)
        self.target_critic_model = Critic(self.env.observation_space.shape, self.env.action_space.shape, 1, self.hid_list)
        self.critic_optim = optim.Adam(self.critic_model.parameters(), lr=self.learning_rate)

        # Make sure the targets start with the same weights
        hard_update(self.target_actor_model, self.actor_model)
        hard_update(self.target_critic_model, self.critic_model)

        self.cuda()

    # ========================================================================= #
    #                              Model Training                               #
    # ========================================================================= #

    def remember(self, cur_state, action, reward, new_state, done):
        for i in range(self.num_robots):
            self.memory.append([cur_state[i], action[i], reward[i], new_state[i], done[i]])

    def _train_critic_actor(self, samples):
        Loss = nn.MSELoss()

        # 1. Unpack the PER sample (stack_samples collates the batch)
        cur_states, actions, rewards, new_states, dones, weights, batch_idxes = stack_samples(samples)
        target_actions = to_numpy(self.target_actor_model(to_tensor(new_states)))

        # Critic update
        self.critic_model.zero_grad()
        Q_now = self.critic_model([cur_states, actions])
        next_Q = self.target_critic_model([new_states, target_actions])
        dones = dones.astype(bool)
        Q_target = to_tensor(rewards) + self.gamma * next_Q.reshape(next_Q.shape[0]) * to_tensor(1 - dones)
        td_errors = Q_target - Q_now.reshape(Q_now.shape[0])
        # Regress the online critic towards the detached bootstrapped target
        value_loss = Loss(Q_now.squeeze(), Q_target.detach())
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: maximise the critic's value of the actor's own action
        self.actor_model.zero_grad()
        policy_loss = -self.critic_model([to_tensor(cur_states), self.actor_model(to_tensor(cur_states))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # NoisyNet noise reset
        self.actor_model.reset_noise()
        self.target_actor_model.reset_noise()

        return td_errors

    def read_Q_values(self, cur_states, actions):
        critic_values = self.critic_model.predict([cur_states, actions])
        return critic_values

    def train(self, t):
        batch_size = self.batch_size
        if len(self.replay_buffer.replay_buffer) < batch_size:  # PER buffer not yet warm
            return
        samples = self.replay_buffer.replay_buffer.sample(
            batch_size, beta=self.replay_buffer.beta_schedule.value(t))
        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = samples
        self.samples = samples
        td_errors = self._train_critic_actor(samples)

        # Priority updates (currently disabled)
        #new_priorities = np.abs(td_errors) + self.replay_buffer.prioritized_replay_eps
        #self.replay_buffer.replay_buffer.update_priorities(batch_idxes, new_priorities)

    # ========================================================================= #
    #                           Target Model Updating                           #
    # ========================================================================= #

    def _update_actor_target(self):
        soft_update(self.target_actor_model, self.actor_model, self.tau)

    def _update_critic_target(self):
        soft_update(self.target_critic_model, self.critic_model, self.tau)

    def update_target(self):
        self._update_actor_target()
        self._update_critic_target()

    # ========================================================================= #
    #                             Model Predictions                             #
    # ========================================================================= #

    def act(self, cur_state):
        '''Return the actor's action with epsilon-scaled exploration noise, plus the current epsilon.'''
        if self.eps_counter >= self.num_robots:
            self.epsilon *= self.epsilon_decay
            self.eps_counter = 0
        else:
            self.eps_counter += 1
        eps = self.epsilon

        cur_state = np.array(cur_state).reshape(1, 8)
        action = to_numpy(self.actor_model(to_tensor(cur_state))).squeeze(0)
        action = action.reshape(1, 2)
        if np.random.random() < self.epsilon:
            # Exploration: perturb the deterministic action with uniform noise
            action[0][0] = action[0][0] + (np.random.random() - 0.5) * 0.4
            action[0][1] = action[0][1] + np.random.random() * 0.4
        return action, eps

    # ========================================================================= #
    #                                save weights                               #
    # ========================================================================= #

    def save_weight(self, num_trials, trial_len):
        torch.save(self.actor_model.state_dict(),
                   self.path + '/actormodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.pkl')
        torch.save(self.critic_model.state_dict(),
                   self.path + '/criticmodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.pkl')

    # ========================================================================= #
    #                                load weights                               #
    # ========================================================================= #

    def load_weights(self, output):
        # Note: both networks are loaded from the same '{output}.pkl' checkpoint path.
        self.actor_model.load_state_dict(torch.load('{}.pkl'.format(output)))
        self.critic_model.load_state_dict(torch.load('{}.pkl'.format(output)))

    def play(self, cur_state):
        # Inference only: no gradient tracking needed
        with torch.no_grad():
            return to_numpy(self.actor_model(to_tensor(cur_state))).squeeze(0)

    def cuda(self):
        self.actor_model.cuda()
        self.target_actor_model.cuda()
        self.critic_model.cuda()
        self.target_critic_model.cuda()
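
# ------------------------------------------------------------------------- #
# Usage sketch (illustrative only): one possible interaction loop for the   #
# ActorCritic agent above. The environment API (reset/step returning        #
# per-robot lists) and the buffer insertion call                            #
# `agent.replay_buffer.add(...)` are assumptions; the real project wires    #
# transitions into its ExperienceReplayBuffer elsewhere.                    #
# ------------------------------------------------------------------------- #
def run_actor_critic(env, num_trials=500, trial_len=256):
    agent = ActorCritic(env)
    total_step = 0
    for trial in range(num_trials):
        cur_state = env.reset()
        for step in range(trial_len):
            action, eps = agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)

            agent.remember(cur_state, action, reward, new_state, done)
            # Hypothetical insertion into the HER/PER buffer actually sampled by train():
            # agent.replay_buffer.add(cur_state, action, reward, new_state, done)

            agent.train(total_step)
            agent.update_target()

            cur_state = new_state
            total_step += 1
            if all(done):
                break
        agent.save_weight(trial, trial_len)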
class DDPG:
    def __init__(self,
                 n_states,
                 n_actions,
                 hidden_dim=90,
                 device="cpu",
                 critic_lr=5e-3,
                 actor_lr=5e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128):
        self.device = device
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        # Hard copy of the online weights into the target networks
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        # detach() cuts the action off from the computation graph before converting to numpy
        return action.detach().cpu().numpy()[0]

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)

        # Convert all batch arrays to tensors
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)

        # Note that the critic takes (s_t, a) as input
        actor_loss = self.critic(state, self.actor(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        # Effectively a no-op with infinite bounds; tighten these to clip the target
        expected_value = torch.clamp(expected_value, -np.inf, np.inf)

        value = self.critic(state, action)
        critic_loss = nn.MSELoss()(value, expected_value.detach())

        # Optimize the actor and critic networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft (Polyak) update of the target networks
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau)

    def save_model(self, path):
        torch.save(self.target_actor.state_dict(), path)

    def load_model(self, path):
        self.actor.load_state_dict(torch.load(path))

    def buffer_model_save(self, saved_dir):
        self.memory.save(saved_dir)
        torch.save(self.critic.state_dict(), saved_dir + "/critic_checkpoint.pth")
        torch.save(self.actor.state_dict(), saved_dir + "/actor_checkpoint.pth")
        torch.save(self.target_critic.state_dict(), saved_dir + "/target_critic_checkpoint.pth")
        torch.save(self.target_actor.state_dict(), saved_dir + "/target_actor_checkpoint.pth")

    def buffer_model_load(self, saved_dir):
        if not os.path.exists(saved_dir):
            # Nothing to load yet: create the directory and start from scratch
            os.makedirs(saved_dir)
            return
        self.memory.load(saved_dir)
        self.critic.load_state_dict(torch.load(saved_dir + "/critic_checkpoint.pth"))
        self.actor.load_state_dict(torch.load(saved_dir + "/actor_checkpoint.pth"))
        self.target_critic.load_state_dict(torch.load(saved_dir + "/target_critic_checkpoint.pth"))
        self.target_actor.load_state_dict(torch.load(saved_dir + "/target_actor_checkpoint.pth"))
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
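
# ------------------------------------------------------------------------- #
# Usage sketch (illustrative only): training the DDPG agent above on a      #
# continuous-control Gym task. The environment name "Pendulum-v1" and the   #
# replay-buffer call `agent.memory.push(...)` are assumptions; the buffer   #
# only needs to store transitions in the (state, action, reward,            #
# next_state, done) order that update() samples them in.                    #
# ------------------------------------------------------------------------- #
def run_ddpg(episodes=200, max_steps=200):
    import gym  # assumed dependency (classic single-return reset/step API)

    env = gym.make("Pendulum-v1")
    agent = DDPG(n_states=env.observation_space.shape[0],
                 n_actions=env.action_space.shape[0],
                 device="cuda" if torch.cuda.is_available() else "cpu")

    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            # Hypothetical buffer API: store the transition for update() to sample
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            episode_reward += reward
            if done:
                break
        print("episode {}: reward {:.1f}".format(episode, episode_reward))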