class DDpgAgent():
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 fc1Dms=400, fc2Dms=300, max_size=1000000, batch_size=64):
        self.alpha = alpha
        self.tau = tau
        self.beta = beta
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNet(alpha=alpha, input_dims=input_dims,
                              n_actions=n_actions, fc1Dms=fc1Dms,
                              fc2Dms=fc2Dms, name='actor')
        self.critic = CriticNet(beta=beta, input_dims=input_dims,
                                n_actions=n_actions, fc1Dms=fc1Dms,
                                fc2Dms=fc2Dms, name='critic')
        self.target_actor = ActorNet(alpha=alpha, input_dims=input_dims,
                                     n_actions=n_actions, fc1Dms=fc1Dms,
                                     fc2Dms=fc2Dms, name='target_actor')
        self.target_critic = CriticNet(beta=beta, input_dims=input_dims,
                                       n_actions=n_actions, fc1Dms=fc1Dms,
                                       fc2Dms=fc2Dms, name='target_critic')

        # hard copy of the online networks into the target networks (tau=1)
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        # switch to evaluation mode because the network uses batch normalization
        self.actor.eval()
        state = T.tensor([state], dtype=T.float).to(self.actor.device)
        # the actor output is deterministic, so exploration noise must be added
        mu = self.actor.forward(state).to(self.actor.device)
        # add Ornstein-Uhlenbeck noise to the actor output (as in the paper, p. 4)
        mu_prime = mu + T.tensor(self.noise(),
                                 dtype=T.float).to(self.actor.device)
        # back to train mode
        self.actor.train()
        # detach from the graph, move to the cpu and convert to numpy
        # so the action can be fed to the environment
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        # do not learn until the buffer holds at least one full batch
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, new_states, dones = \
            self.memory.sampling(self.batch_size)

        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        target_actions = self.target_actor.forward(new_states)
        target_critic_value = self.target_critic.forward(new_states,
                                                         target_actions)
        critic_value = self.critic.forward(states, actions)

        # the value of terminal states is zero
        target_critic_value[dones] = 0.0
        # flatten to shape (batch_size,) so it broadcasts with the rewards vector
        target_critic_value = target_critic_value.view(-1)

        target = rewards + self.gamma * target_critic_value
        # reshape back to (batch_size, 1) to match critic_value
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        # soft update: theta_target = tau * theta + (1 - tau) * theta_target
        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_state_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
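# -----------------------------------------------------------------------------
# Usage sketch (not part of the agent): a minimal training loop for DDpgAgent,
# assuming a Gym-style continuous-control environment with the classic
# (obs, reward, done, info) step API. The environment id, episode count and
# hyperparameter values below are illustrative, not taken from this repo.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    import gym
    import numpy as np

    env = gym.make('LunarLanderContinuous-v2')
    agent = DDpgAgent(alpha=0.0001, beta=0.001,
                      input_dims=env.observation_space.shape, tau=0.001,
                      n_actions=env.action_space.shape[0], batch_size=64)

    score_history = []
    for episode in range(1000):
        state = env.reset()
        done, score = False, 0.0
        while not done:
            action = agent.choose_action(state)
            new_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, new_state, done)
            agent.learn()            # one gradient update per environment step
            score += reward
            state = new_state
        score_history.append(score)
        print('episode', episode, 'score %.1f' % score,
              'avg(100) %.1f' % np.mean(score_history[-100:]))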
class DDPGAgent:
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # build the actor/critic evaluation networks
        self.actor_net = Actor(env_params, hidden_units=256)
        self.critic_net = Critic(env_params, hidden_units=256)
        # synchronize the network weights across the MPI workers for parallel training
        sync_networks(self.actor_net)
        sync_networks(self.critic_net)
        # build the actor/critic target networks
        self.actor_target_net = Actor(env_params, hidden_units=256)
        self.critic_target_net = Critic(env_params, hidden_units=256)
        # move the networks to the gpu if one is used
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()
        # the optimizers of the networks
        self.actor_optimizer = torch.optim.Adam(
            self.actor_net.parameters(), lr=self.args.learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_net.parameters(), lr=self.args.learning_rate_critic)
        # HER sampling function
        self.her_sample = HER(self.args.replay_strategy, self.args.replay_ratio,
                              self.env.compute_reward)
        # experience buffer
        self.exp_buffer = ReplayBuffer(self.env_params, self.args.buffer_size,
                                       self.her_sample.her_sample_transitions)
        # normalizers for the observation and the goal
        self.obs_norm = Normalizer(size=env_params['obs'],
                                   clip_range=self.args.clip_range)
        self.goal_norm = Normalizer(size=env_params['d_goal'],
                                    clip_range=self.args.clip_range)
        # create the directory to save the model (only on the first MPI rank)
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # get the model path
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)

    ###############################
    # Name: learning
    # Function: Training the model
    # Comment:
    ###############################
    def learning(self):
        success_rate_history = []
        for epoch in range(self.args.n_epochs):
            for _ in range(self.args.n_cycles):
                exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff, exp_actions_buff = [], [], [], []
                for _ in range(self.args.num_exp_per_mpi):
                    # reset the environment and the episode storage
                    exp_obs, exp_a_goal, exp_d_goal, exp_actions = [], [], [], []
                    observations = self.env.reset()
                    obs = observations['observation']
                    a_goal = observations['achieved_goal']
                    d_goal = observations['desired_goal']
                    # interact with the environment
                    for t in range(self.env_params['max_timesteps']):
                        with torch.no_grad():
                            input_tensor = self._pre_process_inputs(obs, d_goal)
                            policy_predictions = self.actor_net(input_tensor)
                            action = self._choose_action(policy_predictions)
                        # get the next observations from the action
                        observations_next, _, _, info = self.env.step(action)
                        obs_next = observations_next['observation']
                        a_goal_next = observations_next['achieved_goal']
                        exp_obs.append(obs.copy())
                        exp_a_goal.append(a_goal.copy())
                        exp_d_goal.append(d_goal.copy())
                        exp_actions.append(action.copy())
                        # update the state
                        obs = obs_next
                        a_goal = a_goal_next
                    exp_obs.append(obs.copy())
                    exp_a_goal.append(a_goal.copy())
                    exp_obs_buff.append(exp_obs)
                    exp_a_goal_buff.append(exp_a_goal)
                    exp_d_goal_buff.append(exp_d_goal)
                    exp_actions_buff.append(exp_actions)
                exp_obs_buff = np.array(exp_obs_buff)
                exp_a_goal_buff = np.array(exp_a_goal_buff)
                exp_d_goal_buff = np.array(exp_d_goal_buff)
                exp_actions_buff = np.array(exp_actions_buff)
                # store the transitions
                self.exp_buffer.store_transition([
                    exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff,
                    exp_actions_buff
                ])
                self._update_normalizer([
                    exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff,
                    exp_actions_buff
                ])
                # train the networks
                for _ in range(self.args.n_batches):
                    self._update_network()
                # soft update of the target network parameters
                self._soft_update_target_network(self.actor_target_net,
                                                 self.actor_net)
                self._soft_update_target_network(self.critic_target_net,
                                                 self.critic_net)
            # start evaluation
            success_rate = self._evaluate_agent()
            if MPI.COMM_WORLD.Get_rank() == 0:
                print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(
                    datetime.now(), epoch, success_rate))
                torch.save([
                    self.obs_norm.mean, self.obs_norm.std, self.goal_norm.mean,
                    self.goal_norm.std, self.actor_net.state_dict()
                ], self.model_path + '/model.pt')
                success_rate_history.append(success_rate)
        success_rate_history = np.array(success_rate_history)
        np.savetxt('Plot_Data/Pen_HER.txt', success_rate_history,
                   fmt='%f', delimiter=',')

    ###############################
    # Name: _pre_process_inputs
    # Function: process the inputs for the actor network
    # Comment:
    ###############################
    def _pre_process_inputs(self, obs, goal):
        obs_norm = self.obs_norm.normalize(obs)
        goal_norm = self.goal_norm.normalize(goal)
        # concatenate the normalized observation and goal
        inputs = np.concatenate([obs_norm, goal_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()
        return inputs

    def _choose_action(self, policy_predictions):
        action = policy_predictions.cpu().numpy().squeeze()
        # add Gaussian exploration noise and clip to the action bounds
        action += self.args.noise_epsilon * self.env_params[
            'action_max'] * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'],
                         self.env_params['action_max'])
        random_action = np.random.uniform(low=-self.env_params['action_max'],
                                          high=self.env_params['action_max'],
                                          size=self.env_params['action'])
        # with probability random_epsilon take the random action instead
        action += np.random.binomial(1, self.args.random_epsilon,
                                     1)[0] * (random_action - action)
        return action

    def _update_normalizer(self, experience_buff):
        exp_obs, exp_a_goal, exp_d_goal, exp_actions = experience_buff
        exp_obs_next = exp_obs[:, 1:, :]
        exp_a_goal_next = exp_a_goal[:, 1:, :]
        num_exps = exp_actions.shape[1]
        buffer_temp = {
            'obs': exp_obs,
            'a_goal': exp_a_goal,
            'd_goal': exp_d_goal,
            'actions': exp_actions,
            'obs_next': exp_obs_next,
            'a_goal_next': exp_a_goal_next,
        }
        transitions = self.her_sample.her_sample_transitions(
            buffer_temp, num_exps)
        obs, d_goal = transitions['obs'], transitions['d_goal']
        transitions['obs'], transitions['d_goal'] = self._pre_process_obs_goal(
            obs, d_goal)
        # update the running statistics
        self.obs_norm.update(transitions['obs'])
        self.goal_norm.update(transitions['d_goal'])
        # recompute the stats
        self.obs_norm.recompute_stats()
        self.goal_norm.recompute_stats()

    ###############################
    # Name: _pre_process_obs_goal
    # Function: process the observation and desired goal for the normalization
    # Comment:
    ###############################
    def _pre_process_obs_goal(self, obs, goal):
        obs_proceed = np.clip(obs, -self.args.clip_obs, self.args.clip_obs)
        goal_proceed = np.clip(goal, -self.args.clip_obs, self.args.clip_obs)
        return obs_proceed, goal_proceed

    ###############################
    # Name: _soft_update_target_network
    # Function: soft update the parameters of the target network
    # Comment:
    ###############################
    def _soft_update_target_network(self, target_net, eval_net):
        for target_param, param in zip(target_net.parameters(),
                                       eval_net.parameters()):
            target_param.data.copy_((1 - self.args.avg_coeff) * param.data +
                                    self.args.avg_coeff * target_param.data)

    ###############################
    # Name: _update_network
    # Function: train the parameters of the actor network and critic network
    # Comment:
    ###############################
    def _update_network(self):
        # sample the transitions
        transitions = self.exp_buffer.sample(self.args.batch_size)
        obs, obs_next, d_goal = transitions['obs'], transitions[
            'obs_next'], transitions['d_goal']
        transitions['obs'], transitions['d_goal'] = self._pre_process_obs_goal(
            obs, d_goal)
        transitions['obs_next'], transitions[
            'd_goal_next'] = self._pre_process_obs_goal(obs_next, d_goal)
        observation_norm = self.obs_norm.normalize(transitions['obs'])
        d_goal_norm = self.goal_norm.normalize(transitions['d_goal'])
        inputs_norm = np.concatenate([observation_norm, d_goal_norm], axis=1)
        observation_next_norm = self.obs_norm.normalize(
            transitions['obs_next'])
        d_goal_next_norm = self.goal_norm.normalize(transitions['d_goal_next'])
        inputs_next_norm = np.concatenate(
            [observation_next_norm, d_goal_next_norm], axis=1)
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm,
                                               dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'],
                                      dtype=torch.float32)
        reward_tensor = torch.tensor(transitions['reward'],
                                     dtype=torch.float32)
        if self.args.cuda:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            reward_tensor = reward_tensor.cuda()
        # calculate the target Q value function
        with torch.no_grad():
            actions_next = self.actor_target_net(inputs_next_norm_tensor)
            q_next_value = self.critic_target_net(inputs_next_norm_tensor,
                                                  actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = reward_tensor + self.args.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # the sparse reward is in {-1, 0}, so the discounted return lies
            # in [-1 / (1 - gamma), 0]; clip the target to that range
            clip_return = 1 / (1 - self.args.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # calculate the critic loss
        real_q_value = self.critic_net(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss, with an L2 penalty on the (rescaled) actions
        actions_real = self.actor_net(inputs_norm_tensor)
        actor_loss = -self.critic_net(inputs_norm_tensor, actions_real).mean()
        actor_loss += self.args.action_l2 * (
            actions_real / self.env_params['action_max']).pow(2).mean()
        # start to train the networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_net)
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_net)
        self.critic_optimizer.step()

    ###############################
    # Name: _evaluate_agent
    # Function: evaluate the agent
    # Comment:
    ###############################
    def _evaluate_agent(self):
        all_success_rate = []
        for _ in range(self.args.n_eval):
            per_success_rate = []
            observations = self.env.reset()
            obs = observations['observation']
            d_goal = observations['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._pre_process_inputs(obs, d_goal)
                    policy_predictions = self.actor_net(input_tensor)
                    action = policy_predictions.detach().cpu().numpy().squeeze()
                observations_next, _, _, info = self.env.step(action)
                obs = observations_next['observation']
                d_goal = observations_next['desired_goal']
                per_success_rate.append(info['is_success'])
            all_success_rate.append(per_success_rate)
        all_success_rate = np.array(all_success_rate)
        local_success_rate = np.mean(all_success_rate[:, -1])
        # average the success rate over all MPI workers
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate,
                                                       op=MPI.SUM)
        return global_success_rate / MPI.COMM_WORLD.Get_size()
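# -----------------------------------------------------------------------------
# Launch sketch (not part of the agent): how this HER/DDPG agent is typically
# driven, assuming a goal-based Gym environment such as 'FetchReach-v1' with
# the classic (obs, reward, done, info) step API. The env_params keys mirror
# the ones the class reads above; every value in `args` below is an
# illustrative placeholder, not a value taken from this repo. Run under MPI,
# e.g. with `mpirun`.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    from types import SimpleNamespace
    import gym
    import numpy as np
    from mpi4py import MPI

    def get_env_params(env):
        observations = env.reset()
        return {
            'obs': observations['observation'].shape[0],
            'd_goal': observations['desired_goal'].shape[0],
            'action': env.action_space.shape[0],
            'action_max': env.action_space.high[0],
            'max_timesteps': env._max_episode_steps,
        }

    args = SimpleNamespace(
        n_epochs=50, n_cycles=50, num_exp_per_mpi=2, n_batches=40,
        buffer_size=int(1e6), batch_size=256, gamma=0.98, avg_coeff=0.95,
        learning_rate_actor=0.001, learning_rate_critic=0.001,
        replay_strategy='future', replay_ratio=4, clip_range=5, clip_obs=200,
        noise_epsilon=0.2, random_epsilon=0.3, action_l2=1.0, n_eval=10,
        cuda=False, save_dir='saved_models', env_name='FetchReach-v1')

    env = gym.make(args.env_name)
    # give each MPI worker a different seed so they collect different episodes
    rank = MPI.COMM_WORLD.Get_rank()
    env.seed(123 + rank)
    np.random.seed(123 + rank)
    agent = DDPGAgent(args, env, get_env_params(env))
    agent.learning()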
class TD3Agent():
    def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2,
                 gamma=0.99, update_actor_interval=2, fc1Dms=400, fc2Dms=300,
                 max_size=1000000, batch_size=100, warmup=1000, noise=0.1):
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.gamma = gamma
        self.n_actions = n_actions
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.update_actor_iter = update_actor_interval

        self.critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                                  name='Critic1')
        self.critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                                  name='Critic2')
        self.actor = ActorNet(alpha, input_dims, n_actions, fc1Dms, fc2Dms,
                              name='actor')
        # target nets
        self.target_critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms,
                                         fc2Dms, name='Target_critic1')
        self.target_critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms,
                                         fc2Dms, name='Target_critic2')
        self.target_actor = ActorNet(alpha, input_dims, n_actions, fc1Dms,
                                     fc2Dms, name='Target_actor')
        self.noise = noise
        # initialise the target nets as exact copies of the online nets
        self.update_network_parameters(tau=1)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        if self.time_step < self.warmup:
            # during the warmup phase take purely random actions;
            # scale is the standard deviation of the Gaussian noise
            mu = T.tensor(np.random.normal(scale=self.noise,
                                           size=(self.n_actions,)),
                          dtype=T.float).to(self.actor.device)
        else:
            state = T.tensor(state, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
        # add exploration noise to the deterministic action
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)
        # make sure the action does not exceed the bounds of the environment
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1
        return mu_prime.cpu().detach().numpy()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, new_states, dones = \
            self.memory.sampling(self.batch_size)

        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        # target policy smoothing: add clipped noise to the target action;
        # note that clamping with index [0] assumes every action dimension
        # shares the same bounds
        target_action = self.target_actor.forward(new_states) + \
            T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_action = T.clamp(target_action, self.min_action[0],
                                self.max_action[0])

        target_critic1_q = self.target_critic_1.forward(new_states,
                                                        target_action)
        target_critic2_q = self.target_critic_2.forward(new_states,
                                                        target_action)
        # the value of terminal states is zero
        target_critic1_q[dones] = 0
        target_critic2_q[dones] = 0
        target_critic1_q = target_critic1_q.view(-1)
        target_critic2_q = target_critic2_q.view(-1)

        q1 = self.critic_1.forward(states, actions)
        q2 = self.critic_2.forward(states, actions)

        # clipped double-Q target: bootstrap from the smaller of the two critics
        y = rewards + self.gamma * T.min(target_critic1_q, target_critic2_q)
        y = y.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q1_loss = F.mse_loss(y, q1)
        q2_loss = F.mse_loss(y, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        # delayed policy update: only train the actor (and the target nets)
        # every update_actor_iter critic updates
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_loss = self.critic_1.forward(states, self.actor.forward(states))
        actor_loss = -T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        critic1_params = self.critic_1.named_parameters()
        critic2_params = self.critic_2.named_parameters()
        target_critic1_params = self.target_critic_1.named_parameters()
        target_critic2_params = self.target_critic_2.named_parameters()

        critic1_state_dict = dict(critic1_params)
        critic2_state_dict = dict(critic2_params)
        target_critic1_state_dict = dict(target_critic1_params)
        target_critic2_state_dict = dict(target_critic2_params)
        actor_state_dict = dict(actor_params)
        target_actor_state_dict = dict(target_actor_params)

        # soft update: theta_target = tau * theta + (1 - tau) * theta_target
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()
        for name in critic1_state_dict:
            critic1_state_dict[name] = tau * critic1_state_dict[name].clone() + \
                (1 - tau) * target_critic1_state_dict[name].clone()
        for name in critic2_state_dict:
            critic2_state_dict[name] = tau * critic2_state_dict[name].clone() + \
                (1 - tau) * target_critic2_state_dict[name].clone()

        self.target_actor.load_state_dict(actor_state_dict)
        self.target_critic_1.load_state_dict(critic1_state_dict)
        self.target_critic_2.load_state_dict(critic2_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
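# -----------------------------------------------------------------------------
# Usage sketch (not part of the agent): driving TD3Agent with the same kind of
# Gym-style loop as the DDPG agent above. Environment id, episode count and
# hyperparameter values are illustrative. Note that the constructor takes the
# env itself (for the action bounds) and that the first `warmup` calls to
# choose_action return purely random exploration actions.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('BipedalWalker-v3')
    agent = TD3Agent(alpha=0.001, beta=0.001,
                     input_dims=env.observation_space.shape, tau=0.005,
                     env=env, n_actions=env.action_space.shape[0],
                     warmup=1000)

    for episode in range(1500):
        state = env.reset()
        done, score = False, 0.0
        while not done:
            action = agent.choose_action(state)
            new_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, new_state, done)
            # critics update every step; the actor and target nets only update
            # every update_actor_interval learn() calls
            agent.learn()
            state, score = new_state, score + reward
        print('episode', episode, 'score %.1f' % score)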