########################## Make env and save inform. ###########################
env = gym.make(args.env_name)
args.action_dim = env.action_space.shape[0]
args.max_action = int(env.action_space.high[0])
args.state_dim = env.observation_space.shape[0]

################################### Set seed ###################################
env.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

############## Set replay-buffer, rl-agent, es-agents, and actor ###############
replay_buffer = ReplayBuffer(args, args.state_dim, args.action_dim,
                             max_size=args.buffer_size)
rl_agent = TD3(args.state_dim, args.action_dim, args.max_action, args)
es_agent = CEM(args, num_params=rl_agent.actor.get_size(),
               mu_init=rl_agent.actor.get_params())
actor = copy.deepcopy(rl_agent.actor)

##################### Set data-frame for saving results ########################
df_log = pd.DataFrame(columns=["Step", "AvgES", "BestES", "RL"])
df_steps = pd.DataFrame(columns=["Step"] + [f"Ind{i}" for i in range(1, args.pop_size + 1)])
df_fitness = pd.DataFrame(columns=["Step"] + [f"Ind{i}" for i in range(1, args.pop_size + 1)])
df_mu = pd.DataFrame(columns=["Step", "Mean", "Std"] + [f"Reward{i}" for i in range(1, args.n_eval + 1)])
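# ------------------------------------------------------------------------------
# A minimal rollout sketch (not from the original source) for evaluating a CEM
# candidate with the shared `actor` and filling the replay buffer. The method
# names `actor.set_params`, `actor.select_action`, and `replay_buffer.add` are
# assumptions mirroring get_params()/get_size() above, not confirmed APIs.
# ------------------------------------------------------------------------------
def evaluate(params, n_episodes=1):
    """Load `params` into the shared actor, roll out episodes, return the mean return."""
    actor.set_params(params)                    # assumed setter, counterpart of get_params()
    returns = []
    for _ in range(n_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            action = actor.select_action(np.array(state))          # assumed inference helper
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, next_state, reward,   # assumed buffer signature
                              float(done))
            total += reward
            state = next_state
        returns.append(total)
    return np.mean(returns)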
class DDPGAgent:
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # build up the actor/critic evaluated networks
        self.actor_net = Actor(env_params, hidden_units=256)
        self.critic_net = Critic(env_params, hidden_units=256)
        # sync the networks across the cpus for parallel training (when running at a workstation)
        sync_networks(self.actor_net)
        sync_networks(self.critic_net)
        # build up the actor/critic target networks
        self.actor_target_net = Actor(env_params, hidden_units=256)
        self.critic_target_net = Critic(env_params, hidden_units=256)
        # initialize the target networks with the weights of the evaluated networks
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())
        # if gpu is used
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()
        # the optimizers of the networks
        self.actor_optimizer = torch.optim.Adam(
            self.actor_net.parameters(), lr=self.args.learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_net.parameters(), lr=self.args.learning_rate_critic)
        # HER sample function
        self.her_sample = HER(self.args.replay_strategy, self.args.replay_ratio,
                              self.env.compute_reward)
        # experience buffer
        self.exp_buffer = ReplayBuffer(self.env_params, self.args.buffer_size,
                                       self.her_sample.her_sample_transitions)
        # the normalizers of the observation and goal
        self.obs_norm = Normalizer(size=env_params['obs'], clip_range=self.args.clip_range)
        self.goal_norm = Normalizer(size=env_params['d_goal'], clip_range=self.args.clip_range)
        # create the directory to save the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # get the model path
            self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)

    ###############################
    # Name: learning
    # Function: Training the model
    ###############################
    def learning(self):
        success_rate_history = []
        for epoch in range(self.args.n_epochs):
            for _ in range(self.args.n_cycles):
                exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff, exp_actions_buff = [], [], [], []
                for _ in range(self.args.num_exp_per_mpi):
                    # reset the environment and the episode storage
                    exp_obs, exp_a_goal, exp_d_goal, exp_actions = [], [], [], []
                    observations = self.env.reset()
                    obs = observations['observation']
                    a_goal = observations['achieved_goal']
                    d_goal = observations['desired_goal']
                    # interact with the environment
                    for t in range(self.env_params['max_timesteps']):
                        with torch.no_grad():
                            input_tensor = self._pre_process_inputs(obs, d_goal)
                            policy_predictions = self.actor_net(input_tensor)
                            action = self._choose_action(policy_predictions)
                        # get the observations resulting from the action
                        observations_next, _, _, info = self.env.step(action)
                        obs_next = observations_next['observation']
                        a_goal_next = observations_next['achieved_goal']
                        exp_obs.append(obs.copy())
                        exp_a_goal.append(a_goal.copy())
                        exp_d_goal.append(d_goal.copy())
                        exp_actions.append(action.copy())
                        # update the state
                        obs = obs_next
                        a_goal = a_goal_next
                    exp_obs.append(obs.copy())
                    exp_a_goal.append(a_goal.copy())
                    exp_obs_buff.append(exp_obs)
                    exp_a_goal_buff.append(exp_a_goal)
                    exp_d_goal_buff.append(exp_d_goal)
                    exp_actions_buff.append(exp_actions)
                exp_obs_buff = np.array(exp_obs_buff)
                exp_a_goal_buff = np.array(exp_a_goal_buff)
                exp_d_goal_buff = np.array(exp_d_goal_buff)
                exp_actions_buff = np.array(exp_actions_buff)
                # store the transitions
                self.exp_buffer.store_transition(
                    [exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff, exp_actions_buff])
                self._update_normalizer(
                    [exp_obs_buff, exp_a_goal_buff, exp_d_goal_buff, exp_actions_buff])
                # train the networks
                for _ in range(self.args.n_batches):
                    self._update_network()
                # soft update the target network parameters
                self._soft_update_target_network(self.actor_target_net, self.actor_net)
                self._soft_update_target_network(self.critic_target_net, self.critic_net)
            # start evaluation
            success_rate = self._evaluate_agent()
            if MPI.COMM_WORLD.Get_rank() == 0:
                print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(
                    datetime.now(), epoch, success_rate))
                torch.save([self.obs_norm.mean, self.obs_norm.std,
                            self.goal_norm.mean, self.goal_norm.std,
                            self.actor_net.state_dict()],
                           self.model_path + '/model.pt')
                success_rate_history.append(success_rate)
        success_rate_history = np.array(success_rate_history)
        np.savetxt('Plot_Data/Pen_HER.txt', success_rate_history, fmt='%f', delimiter=',')

    ###############################
    # Name: _pre_process_inputs
    # Function: process the inputs for the actor network
    ###############################
    def _pre_process_inputs(self, obs, goal):
        obs_norm = self.obs_norm.normalize(obs)
        goal_norm = self.goal_norm.normalize(goal)
        # concatenate the observation and the goal
        inputs = np.concatenate([obs_norm, goal_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()
        return inputs

    def _choose_action(self, policy_predictions):
        action = policy_predictions.cpu().numpy().squeeze()
        # add Gaussian exploration noise
        action += self.args.noise_epsilon * self.env_params['action_max'] * \
            np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
        random_action = np.random.uniform(low=-self.env_params['action_max'],
                                          high=self.env_params['action_max'],
                                          size=self.env_params['action'])
        # with probability random_epsilon, take the random action instead
        action += np.random.binomial(1, self.args.random_epsilon, 1)[0] * (random_action - action)
        return action

    def _update_normalizer(self, experience_buff):
        exp_obs, exp_a_goal, exp_d_goal, exp_actions = experience_buff
        exp_obs_next = exp_obs[:, 1:, :]
        exp_a_goal_next = exp_a_goal[:, 1:, :]
        num_exps = exp_actions.shape[1]
        buffer_temp = {
            'obs': exp_obs,
            'a_goal': exp_a_goal,
            'd_goal': exp_d_goal,
            'actions': exp_actions,
            'obs_next': exp_obs_next,
            'a_goal_next': exp_a_goal_next,
        }
        transitions = self.her_sample.her_sample_transitions(buffer_temp, num_exps)
        obs, d_goal = transitions['obs'], transitions['d_goal']
        transitions['obs'], transitions['d_goal'] = self._pre_process_obs_goal(obs, d_goal)
        # update the running statistics
        self.obs_norm.update(transitions['obs'])
        self.goal_norm.update(transitions['d_goal'])
        # recompute the stats
        self.obs_norm.recompute_stats()
        self.goal_norm.recompute_stats()

    ###############################
    # Name: _pre_process_obs_goal
    # Function: process the observation and desired goal for the normalization
    ###############################
    def _pre_process_obs_goal(self, obs, goal):
        obs_proceed = np.clip(obs, -self.args.clip_obs, self.args.clip_obs)
        goal_proceed = np.clip(goal, -self.args.clip_obs, self.args.clip_obs)
        return obs_proceed, goal_proceed

    ###############################
    # Name: _soft_update_target_network
    # Function: soft update the parameters of the target network
    ###############################
    def _soft_update_target_network(self, target_net, eval_net):
        for target_param, param in zip(target_net.parameters(), eval_net.parameters()):
            target_param.data.copy_((1 - self.args.avg_coeff) * param.data +
                                    self.args.avg_coeff * target_param.data)

    ###############################
    # Name: _update_network
    # Function: train the parameters of the actor network and critic network
    ###############################
    def _update_network(self):
        # sample the transitions
        transitions = self.exp_buffer.sample(self.args.batch_size)
        obs, obs_next, d_goal = transitions['obs'], transitions['obs_next'], transitions['d_goal']
        transitions['obs'], transitions['d_goal'] = self._pre_process_obs_goal(obs, d_goal)
        transitions['obs_next'], transitions['d_goal_next'] = \
            self._pre_process_obs_goal(obs_next, d_goal)
        observation_norm = self.obs_norm.normalize(transitions['obs'])
        d_goal_norm = self.goal_norm.normalize(transitions['d_goal'])
        inputs_norm = np.concatenate([observation_norm, d_goal_norm], axis=1)
        observation_next_norm = self.obs_norm.normalize(transitions['obs_next'])
        d_goal_next_norm = self.goal_norm.normalize(transitions['d_goal_next'])
        inputs_next_norm = np.concatenate([observation_next_norm, d_goal_next_norm], axis=1)
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32)
        reward_tensor = torch.tensor(transitions['reward'], dtype=torch.float32)
        if self.args.cuda:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            reward_tensor = reward_tensor.cuda()
        # calculate the target Q value function
        with torch.no_grad():
            actions_next = self.actor_target_net(inputs_next_norm_tensor)
            q_next_value = self.critic_target_net(inputs_next_norm_tensor, actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = reward_tensor + self.args.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # with the sparse reward in {-1, 0}, the discounted return is bounded
            # in [-1 / (1 - gamma), 0], so clip the target into that range
            clip_return = 1 / (1 - self.args.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # calculate the critic loss
        real_q_value = self.critic_net(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss (plus an L2 penalty on the actions)
        actions_real = self.actor_net(inputs_norm_tensor)
        actor_loss = -self.critic_net(inputs_norm_tensor, actions_real).mean()
        actor_loss += self.args.action_l2 * \
            (actions_real / self.env_params['action_max']).pow(2).mean()
        # train the networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_net)
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_net)
        self.critic_optimizer.step()

    ###############################
    # Name: _evaluate_agent
    # Function: evaluate the agent
    ###############################
    def _evaluate_agent(self):
        all_success_rate = []
        for _ in range(self.args.n_eval):
            per_success_rate = []
            observations = self.env.reset()
            obs = observations['observation']
            d_goal = observations['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._pre_process_inputs(obs, d_goal)
                    policy_predictions = self.actor_net(input_tensor)
                    action = policy_predictions.detach().cpu().numpy().squeeze()
                observations_next, _, _, info = self.env.step(action)
                obs = observations_next['observation']
                d_goal = observations_next['desired_goal']
                per_success_rate.append(info['is_success'])
            all_success_rate.append(per_success_rate)
        all_success_rate = np.array(all_success_rate)
        local_success_rate = np.mean(all_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM)
        return global_success_rate / MPI.COMM_WORLD.Get_size()
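# ------------------------------------------------------------------------------
# A minimal launch sketch (not part of the original class), assuming a goal-based
# gym environment such as FetchReach-v1 and an `args` namespace (e.g. from
# argparse) that carries the fields used above (cuda, n_epochs, n_cycles,
# buffer_size, learning rates, ...).
# ------------------------------------------------------------------------------
if __name__ == '__main__':
    env = gym.make('FetchReach-v1')
    observations = env.reset()
    env_params = {
        'obs': observations['observation'].shape[0],
        'd_goal': observations['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
        'max_timesteps': env._max_episode_steps,   # provided by gym's TimeLimit wrapper
    }
    agent = DDPGAgent(args, env, env_params)
    agent.learning()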
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),
                                      (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
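# ------------------------------------------------------------------------------
# A minimal training-loop sketch (not from the original source), assuming a
# `Task` wrapper whose step() returns (next_state, reward, done) and a
# `num_episodes` value defined elsewhere.
# ------------------------------------------------------------------------------
agent = DDPG(task)
for episode in range(num_episodes):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)   # assumed Task.step signature
        agent.step(action, reward, next_state, done)
        state = next_state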
class Agent:
    def __init__(self, env, sess):
        # Environment
        self.n_state = env.observation_space.shape[0]
        self.n_action = env.action_space.shape[0]

        # Neural Networks
        self.sess = sess
        self.actor = Actor(self.sess, self.n_state, self.n_action)
        self.critic = Critic(self.sess, self.n_state, self.n_action)

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # Ornstein-Uhlenbeck Noise
        self.exploration_noise = OUNoise(self.n_action)

    def noise_action(self, state):
        '''Get action with noise'''
        return self.action(state) + self.exploration_noise.noise()

    def action(self, state):
        '''Get action from online actor'''
        return self.actor.action(state)

    def train(self):
        '''Train networks'''
        # Draw sample from Replay Buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([d[0] for d in minibatch])
        action_batch = np.asarray([d[1] for d in minibatch])
        reward_batch = np.asarray([d[2] for d in minibatch])
        next_state_batch = np.asarray([d[3] for d in minibatch])
        done_batch = np.asarray([d[4] for d in minibatch])

        # Train Critic
        next_action_batch = self.actor.target_actions(next_state_batch)
        target_q_value_batch = self.critic.target_q(next_state_batch, next_action_batch)
        # q = r if done else r + gamma * target_q
        q_batch = reward_batch.reshape((BATCH_SIZE, 1)) + \
            (1. - done_batch.reshape(BATCH_SIZE, 1).astype(float)) * GAMMA * target_q_value_batch
        self.critic.train(q_batch, state_batch, action_batch)

        # Train Actor
        action_batch_grads = self.actor.actions(state_batch)
        q_grads_batch = self.critic.gradients(state_batch, action_batch_grads)
        self.actor.train(q_grads_batch, state_batch)

        # Slowly update Target Networks
        self.actor.update_target()
        self.critic.update_target()

    def perceive(self, state, action, reward, next_state, done):
        '''Add transition to replay buffer and train if there is a sufficient number of transitions'''
        # Add samples
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Train if there is a sufficient number of samples
        if self.replay_buffer.count() > REPLAY_START:
            self.train()

        # Reset the noise for the next episode
        if done:
            self.exploration_noise.reset()
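# ------------------------------------------------------------------------------
# A minimal interaction-loop sketch (not from the original source), assuming a
# TensorFlow 1.x session, a continuous-control gym env, and that BUFFER_SIZE,
# BATCH_SIZE, GAMMA, REPLAY_START and EPISODES are module-level constants.
# ------------------------------------------------------------------------------
with tf.Session() as sess:
    env = gym.make('Pendulum-v0')
    agent = Agent(env, sess)
    for episode in range(EPISODES):
        state = env.reset()
        done = False
        while not done:
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state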
class DDpgAgent():
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 fc1Dms=400, fc2Dms=300, max_size=1000000, batch_size=64):
        self.alpha = alpha
        self.tau = tau
        self.beta = beta
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions
        print(batch_size, fc1Dms, fc2Dms)
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.actor = ActorNet(alpha=alpha, input_dims=input_dims, n_actions=n_actions,
                              fc1Dms=fc1Dms, fc2Dms=fc2Dms, name='actor')
        self.critic = CriticNet(beta=beta, input_dims=input_dims, n_actions=n_actions,
                                fc1Dms=fc1Dms, fc2Dms=fc2Dms, name='critic')
        self.target_actor = ActorNet(alpha=alpha, input_dims=input_dims, n_actions=n_actions,
                                     fc1Dms=fc1Dms, fc2Dms=fc2Dms, name='target_actor')
        self.target_critic = CriticNet(beta=beta, input_dims=input_dims, n_actions=n_actions,
                                       fc1Dms=fc1Dms, fc2Dms=fc2Dms, name='target_critic')
        # set the target networks to be exact copies of the online networks
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        # set the network into evaluation mode (needed because of batch norm)
        self.actor.eval()
        state = T.tensor([state], dtype=T.float).to(self.actor.device)
        # the actor output is deterministic, so we add noise for exploration
        mu = self.actor.forward(state).to(self.actor.device)
        # add Ornstein-Uhlenbeck noise to the actor output (as in the paper, p. 4)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        # back to train mode
        self.actor.train()
        # detach() removes the tensor from the computation graph; then convert it
        # to numpy so it can be fed to the environment
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, new_states, dones = \
            self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        target_actions = self.target_actor.forward(new_states)
        target_critic_value = self.target_critic.forward(new_states, target_actions)
        critic_value = self.critic.forward(states, actions)
        # the value of terminal states is 0
        target_critic_value[dones] = 0.0
        # flatten so it broadcasts correctly against the 1-D reward tensor
        target_critic_value = target_critic_value.view(-1)
        target = rewards + self.gamma * target_critic_value
        target = target.view(self.batch_size, 1)  # reshape to match critic_value

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        # soft update: theta_target <- tau * theta + (1 - tau) * theta_target
        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_state_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
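# ------------------------------------------------------------------------------
# A minimal training-loop sketch (not from the original source); the env name and
# hyperparameters are example values only, and input_dims is assumed to be the
# observation-space shape expected by ReplayBuffer and the networks.
# ------------------------------------------------------------------------------
env = gym.make('LunarLanderContinuous-v2')
agent = DDpgAgent(alpha=0.0001, beta=0.001, input_dims=env.observation_space.shape,
                  tau=0.001, n_actions=env.action_space.shape[0])
for episode in range(1000):
    state, done, score = env.reset(), False, 0.0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.learn()
        score += reward
        state = next_state
    print('episode', episode, 'score %.1f' % score)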
class TD3Agent():
    def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2, gamma=0.99,
                 update_actor_interval=2, fc1Dms=400, fc2Dms=300, max_size=1000000,
                 batch_size=100, warmup=1000, noise=0.1):
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.gamma = gamma
        self.n_actions = n_actions
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.update_actor_iter = update_actor_interval
        self.critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms, name='Critic1')
        self.critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms, name='Critic2')
        self.actor = ActorNet(alpha, input_dims, n_actions, fc1Dms, fc2Dms, name='actor')
        # target nets
        self.target_critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms, name='Target_critic1')
        self.target_critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms, name='Target_critic2')
        self.target_actor = ActorNet(alpha, input_dims, n_actions, fc1Dms, fc2Dms, name='Target_actor')
        self.noise = noise
        # set the target nets to be exact copies of the online nets
        self.update_network_parameters(tau=1)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # during the warmup period, take purely random (Gaussian) actions
        if self.time_step < self.warmup:
            # scale is the standard deviation
            mu = T.tensor(np.random.normal(scale=self.noise, size=(self.n_actions,)))
        else:
            state = T.tensor(state, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
        # add exploration noise to the (deterministic) policy output
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)
        # make sure the action stays inside the bounds the env can take
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1
        return mu_prime.cpu().detach().numpy()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, new_states, dones = \
            self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        # target policy smoothing: add clipped noise to the target action
        # (may break if the elements of min_action and max_action are not all equal)
        target_action = self.target_actor.forward(new_states) + \
            T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_action = T.clamp(target_action, self.min_action[0], self.max_action[0])

        target_critic1_q = self.target_critic_1.forward(new_states, target_action)
        target_critic2_q = self.target_critic_2.forward(new_states, target_action)
        target_critic1_q[dones] = 0
        target_critic2_q[dones] = 0
        target_critic1_q = target_critic1_q.view(-1)
        target_critic2_q = target_critic2_q.view(-1)

        q1 = self.critic_1.forward(states, actions)
        q2 = self.critic_2.forward(states, actions)

        # clipped double-Q target
        y = rewards + self.gamma * T.min(target_critic1_q, target_critic2_q)
        y = y.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q1_loss = F.mse_loss(y, q1)
        q2_loss = F.mse_loss(y, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        # delayed policy update: update the actor only every update_actor_iter steps
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_loss = self.critic_1.forward(states, self.actor.forward(states))
        actor_loss = -T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        actor_params = self.actor.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        critic1_params = self.critic_1.named_parameters()
        critic2_params = self.critic_2.named_parameters()
        target_critic1_params = self.target_critic_1.named_parameters()
        target_critic2_params = self.target_critic_2.named_parameters()

        critic1_state_dict = dict(critic1_params)
        critic2_state_dict = dict(critic2_params)
        target_critic1_state_dict = dict(target_critic1_params)
        target_critic2_state_dict = dict(target_critic2_params)
        actor_state_dict = dict(actor_params)
        target_actor_state_dict = dict(target_actor_params)

        # soft update: theta_target <- tau * theta + (1 - tau) * theta_target
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()
        for name in critic1_state_dict:
            critic1_state_dict[name] = tau * critic1_state_dict[name].clone() + \
                (1 - tau) * target_critic1_state_dict[name].clone()
        for name in critic2_state_dict:
            critic2_state_dict[name] = tau * critic2_state_dict[name].clone() + \
                (1 - tau) * target_critic2_state_dict[name].clone()

        self.target_actor.load_state_dict(actor_state_dict)
        self.target_critic_1.load_state_dict(critic1_state_dict)
        self.target_critic_2.load_state_dict(critic2_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
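# ------------------------------------------------------------------------------
# A minimal training-loop sketch (not from the original source): during the first
# `warmup` time steps choose_action() returns purely random actions, the actor
# and target nets are updated only every `update_actor_interval` learn() calls,
# and the best policy so far is checkpointed. Env name and hyperparameters are
# example values only.
# ------------------------------------------------------------------------------
env = gym.make('BipedalWalker-v3')
agent = TD3Agent(alpha=0.001, beta=0.001, input_dims=env.observation_space.shape,
                 tau=0.005, env=env, n_actions=env.action_space.shape[0])
best_score = -np.inf
for episode in range(1000):
    state, done, score = env.reset(), False, 0.0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.learn()
        score += reward
        state = next_state
    if score > best_score:
        best_score = score
        agent.save_models()   # checkpoint the best-performing policy so far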