class DDPGAgents(): """ Agent used to interact with and learns from the environment """ def __init__(self, state_size, action_size, config): """ Initialize an agent object """ self.state_size = state_size self.action_size = action_size self.config = config # retrieve number of agents self.num_agents = config["DDPG"]["num_agents"] # logging for this class self.logger = logging.getLogger(self.__class__.__name__) # gpu support self.device = pick_device(config, self.logger) ## Actor local and target networks self.actor_local = Actor(state_size, action_size, config).to(self.device) self.actor_target = Actor(state_size, action_size, config).to(self.device) self.actor_optimizer = getattr( optim, config["optimizer_actor"]["optimizer_type"])( self.actor_local.parameters(), betas=tuple(config["optimizer_actor"]["betas"]), **config["optimizer_actor"]["optimizer_params"]) ## Critic local and target networks self.critic_local = Critic(state_size, action_size, config).to(self.device) self.critic_target = Critic(state_size, action_size, config).to(self.device) self.critic_optimizer = getattr( optim, config["optimizer_critic"]["optimizer_type"])( self.critic_local.parameters(), betas=tuple(config["optimizer_critic"]["betas"]), **config["optimizer_critic"]["optimizer_params"]) ## Noise process self.noise = OUNoise((self.num_agents, action_size)) ## Replay memory self.memory = ReplayBuffer(config=config, action_size=action_size, buffer_size=int( config["DDPG"]["buffer_size"]), batch_size=config["trainer"]["batch_size"]) def step(self, state, action, reward, next_state, done): """ Save experience in replay memory, and use random sample from buffer to learn """ # Save experience in replay memory shared by all agents for agent in range(self.num_agents): self.memory.add(state[agent, :], action[agent, :], reward[agent], next_state[agent, :], done[agent]) # learn every timestep as long as enough samples are available in memory if len(self.memory) > self.config["trainer"]["batch_size"]: experiences = self.memory.sample() self.learn(experiences, self.config["DDPG"]["gamma"]) def act(self, states, add_noise=False): """ Returns actions for given state as per current policy """ # Convert state to tensor² states = torch.from_numpy(states).float().to(self.device) # prepare actions numpy array for all agents actions = np.zeros((self.num_agents, self.action_size)) ## Evaluation mode self.actor_local.eval() with torch.no_grad(): # Forward pass of local actor network for agent, state in enumerate(states): action_values = self.actor_local.forward( state).cpu().data.numpy() actions[agent, :] = action_values # pdb.set_trace() ## Training mode self.actor_local.train() if add_noise: # Add noise to improve exploration to our actor policy # action_values += torch.from_numpy(self.noise.sample()).type(torch.FloatTensor).to(self.device) actions += self.noise.sample() # Clip action to stay in the range [-1, 1] for our task actions = np.clip(actions, -1, 1) return actions def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples """ states, actions, rewards, next_states, dones = experiences ## Update actor (policy) network using the sampled policy gradient # Compute actor loss actions_pred = self.actor_local.forward(states) actor_loss = -self.critic_local.forward(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() ## Update critic (value) network # Get predicted next-state actions and Q-values from target models 
actions_next = self.actor_target.forward(next_states) Q_targets_next = self.critic_target.forward(next_states, actions_next) # Compute Q-targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q-values from local critic model Q_expected = self.critic_local.forward(states, actions) # Compute loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() ## Update target networks with a soft update self.soft_update(self.actor_local, self.actor_target, self.config["DDPG"]["tau"]) self.soft_update(self.critic_local, self.critic_target, self.config["DDPG"]["tau"]) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters, improves the stability of learning """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset(self): """ Reset noise """ self.noise.reset()
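# The DDPGAgents class above depends on an OUNoise helper that is not shown in this
# listing. The sketch below is a minimal Ornstein-Uhlenbeck noise process matching the
# OUNoise((num_agents, action_size)) call; the mu/theta/sigma defaults are assumptions,
# not values taken from the original code.
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process producing temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the long-running mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Drift back towards mu, plus Gaussian diffusion, and return the new state.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state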
class TD3Agent(): def __init__(self, env: object, gamma: float, delay_step: int, tau: float, buffer_maxlen: int, noise_std: float, noise_bound: float, critic_lr: float, actor_lr: float): # Selecting the device to use, wheter CUDA (GPU) if available or CPU self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # Creating the Gym environments for training and evaluation self.env = env # Get max and min values of the action of this environment self.action_range = [ self.env.action_space.low, self.env.action_space.high ] # Get dimension of of the state and the state self.obs_dim = self.env.observation_space.shape[0] self.action_dim = self.env.action_space.shape[0] # Total_step initialization self.steps = 0 # hyperparameters self.gamma = gamma self.tau = tau self.critic_lr = critic_lr self.actor_lr = actor_lr self.buffer_maxlen = buffer_maxlen self.noise_std = noise_std self.noise_bound = noise_bound self.delay_step = delay_step # Scaling and bias factor for the actions -> We need scaling of the actions because each environment has different min and max values of actions self.scale = (self.action_range[1] - self.action_range[0]) / 2.0 self.bias = (self.action_range[1] + self.action_range[0]) / 2.0 # initialize networks self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device) self.target_critic1 = Critic(self.obs_dim, self.action_dim).to(self.device) self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device) self.target_critic2 = Critic(self.obs_dim, self.action_dim).to(self.device) self.actor = Actor(self.obs_dim, self.action_dim).to(self.device) self.target_actor = Actor(self.obs_dim, self.action_dim).to(self.device) # copy weight parameters to the target Q network and actor network for target_param, param in zip(self.target_critic1.parameters(), self.critic1.parameters()): target_param.data.copy_(param) for target_param, param in zip(self.target_critic2.parameters(), self.critic2.parameters()): target_param.data.copy_(param) for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()): target_param.data.copy_(param) # initialize optimizers self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=self.critic_lr) self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=self.critic_lr) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr) # Create a replay buffer self.replay_buffer = BasicBuffer(self.buffer_maxlen) def update(self, batch_size: int, steps: int): self.steps = steps # Sampling experiences from the replay buffer states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) # Convert numpy arrays of experience tuples into pytorch tensors states = torch.FloatTensor(states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) # Critic update (computing the loss # Sample actions for the next states (s_t+1) using the target actor next_actions = self.target_actor.forward(next_states) next_actions = self.rescale_action(next_actions) # Adding gaussian noise to the actions noise = self.get_noise(next_actions, self.noise_std + 0.1, -self.noise_bound, self.noise_bound) noisy_next_actions = next_actions + noise # Compute Q(s_t+1,a_t+1) next_q1 = self.target_critic1(next_states, noisy_next_actions) next_q2 = self.target_critic2(next_states, 
noisy_next_actions)
        # Choose minimum Q
        min_q = torch.min(next_q1, next_q2)
        # Find expected Q, i.e., r(t) + gamma*next_q
        expected_q = rewards + (1 - dones) * self.gamma * min_q
        # Find current Q values for the given states and actions from replay buffer
        curr_q1 = self.critic1.forward(states, actions)
        curr_q2 = self.critic2.forward(states, actions)
        # Compute loss between Q network and expected Q
        critic1_loss = F.mse_loss(curr_q1, expected_q.detach())
        critic2_loss = F.mse_loss(curr_q2, expected_q.detach())
        # Backpropagate the losses and update Q network parameters
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # actor update (computing the loss)
        if self.steps % self.delay_step == 0:
            # Sample new actions for the current states (s_t) using the current actor
            new_actions = self.actor.forward(states)
            # Compute Q(s_t,a_t)
            new_q1 = self.critic1.forward(states, new_actions)
            # Compute the actor loss, i.e., -Q1
            actor_loss = -new_q1.mean()
            # Backpropagate the losses and update actor network parameters
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the target networks
            for target_param, param in zip(self.target_critic1.parameters(), self.critic1.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_critic2.parameters(), self.critic2.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

    def get_noise(self, action: torch.Tensor, sigma: float, bottom: float, top: float) -> torch.Tensor:
        # sigma: standard deviation of the noise
        # bottom, top: minimum and maximum values for the given noise
        return torch.normal(torch.zeros(action.size()), sigma).clamp(bottom, top).to(self.device)

    def get_action(self, state: np.ndarray, stochastic: bool) -> np.ndarray:
        # state: the state input to the pi network
        # stochastic: boolean (True -> use noisy action, False -> use noiseless, deterministic action)
        # Convert state numpy to tensor
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        if stochastic:
            # Add gaussian noise to the rescaled action
            action = self.rescale_action(action) + self.get_noise(
                action, self.noise_std, -self.noise_bound, self.noise_bound)
        else:
            action = self.rescale_action(action)
        # Convert action tensor to numpy
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
        # We rescale the action because the actor network outputs values in [-1, 1],
        # while MuJoCo environments may use action ranges [-n, n] for an arbitrary real n.
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # save the model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"
        torch.save(self.actor.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # load the model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"  # Best
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(savePath))
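# Both TD3 agents in this listing store transitions in a BasicBuffer whose implementation
# is not included. The following is a plain FIFO replay buffer sketch whose sample()
# return order matches how the agents unpack it; the push() name and internal layout are
# assumptions rather than the project's actual API.
import random
from collections import deque

import numpy as np


class BasicBuffer:
    """Uniform-sampling FIFO replay buffer (sketch)."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        # Store one transition tuple.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Sample a random minibatch and stack each field into a NumPy array.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)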
def main():
    env = gym.make('InvertedPendulum-v2')
    # states: [x, theta, x', theta']
    # action: [horizontal force]
    nstates = 4
    nactions = 1

    T = 2048                 # environment steps per update
    batch_size = 64
    epochs = 10
    lr = 0.01
    discount = 0.99
    clipping_epsilon = 0.2
    lam = 0.95               # GAE parameter
    total_timesteps = 1000000
    max_ep_length = 1000     # default horizon for InvertedPendulum-v2

    actor = Actor(nstates, nactions)
    critic = Critic(nstates)

    n_updates = total_timesteps // T
    if total_timesteps % T != 0:
        n_updates += 1
    n_batches_per_update = T // batch_size
    if T % batch_size != 0:
        n_batches_per_update += 1

    episode_rewards = []
    actor_losses = []
    critic_losses = []

    for update in tqdm(range(n_updates)):
        states, actions, rewards, dones, values, log_probs, ep_rewards = rollout(
            env, actor, critic, T, nstates, max_ep_length)
        episode_rewards += ep_rewards
        advantages, returns = get_advantages_and_returns(
            dones, rewards, values, discount, lam, T)

        idx = np.arange(T)
        for k in range(epochs):
            np.random.default_rng().shuffle(idx)
            # iterate over minibatches of size batch_size
            for n in range(0, T, batch_size):
                batch_idx = idx[n:n + batch_size]
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_log_probs = log_probs[batch_idx]
                batch_A = advantages[batch_idx]
                batch_returns = returns[batch_idx]

                _, current_log_probs = actor.forward(batch_states, batch_actions, requires_grad=True)
                ratios = np.exp(current_log_probs - batch_log_probs)
                clipped_ratios = np.minimum(1 + clipping_epsilon,
                                            np.maximum(1 - clipping_epsilon, ratios))
                unclipped_surrogate = ratios * batch_A
                clipped_surrogate = clipped_ratios * batch_A
                actor_loss = -np.minimum(unclipped_surrogate, clipped_surrogate).mean()

                current_state_values = critic.forward(batch_states, requires_grad=True)
                critic_loss = ((current_state_values - batch_returns)**2).mean()

                # derivative of actor_loss w.r.t current_log_probs
                dAL_dlp = -unclipped_surrogate
                # derivative of clipped_ratios w.r.t ratios
                dcr_dr = np.zeros_like(ratios)
                dcr_dr[(ratios < 1 + clipping_epsilon) & (ratios > 1 - clipping_epsilon)] = 1.0
                # only include the derivative of the clipped_ratio if the clipped_ratio was used
                clipped_used_idx = clipped_surrogate < unclipped_surrogate
                dAL_dlp[clipped_used_idx] *= dcr_dr[clipped_used_idx]
                # derivative of critic_loss w.r.t current_state_values
                dCL_dsv = current_state_values - batch_returns

                actor.backward(dAL_dlp)
                critic.backward(dCL_dsv)
                actor.optimization_step(lr)
                critic.optimization_step(lr)

                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

    env.close()

    fig, ax = plt.subplots()
    ax.plot(moving_average(episode_rewards, 100))
    plt.show()
    plt.close()

    fig, ax = plt.subplots()
    ax.plot(moving_average(critic_losses, 10))
    plt.show()
    plt.close()
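# main() calls get_advantages_and_returns(), which is not part of this listing. A NumPy
# sketch of Generalized Advantage Estimation with the same call signature is shown below;
# it assumes `values` holds one bootstrap value beyond the last step (length T + 1), which
# may differ from the original rollout() contract.
import numpy as np


def get_advantages_and_returns(dones, rewards, values, discount, lam, T):
    """GAE(lambda) advantages and discounted returns over a T-step rollout."""
    advantages = np.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - dones[t]
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t), zeroed past episode ends
        delta = rewards[t] + discount * values[t + 1] * nonterminal - values[t]
        gae = delta + discount * lam * nonterminal * gae
        advantages[t] = gae
    returns = advantages + values[:T]
    # Normalizing advantages per update is a common, optional stabilisation step.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns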
class AgentDDPG: def __init__(self, params): action_size = params['action_size'] state_size = params['state_size'] buf_params = params['buf_params'] nn_params = params['nn_params'] nn_params['nn_actor']['l1'][0] = state_size nn_params['nn_actor']['l3'][1] = action_size nn_params['nn_critic']['l1'][0] = state_size + action_size self.__actor_local = Actor(nn_params['nn_actor']).to(device) self.__actor_target = Actor(nn_params['nn_actor']).to(device) self.__critic_local = Critic(nn_params['nn_critic']).to(device) self.__critic_target = Critic(nn_params['nn_critic']).to(device) self.__action_size = action_size self.__state_size = state_size self.__memory = ReplayBuffer(buf_params) self.__t = 0 self.gamma = params['gamma'] self.learning_rate_actor = params['learning_rate_actor'] self.learning_rate_critic = params['learning_rate_critic'] self.tau = params['tau'] self.__optimiser_actor = optim.Adam(self.__actor_local.parameters(), self.learning_rate_actor) self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(), self.learning_rate_critic) self.__uo_process = UOProcess() # other parameters self.agent_loss = 0.0 # Set methods def set_learning_rate(self, lr_actor, lr_critic): self.learning_rate_actor = lr_actor self.learning_rate_critic = lr_critic for param_group in self.__optimiser_actor.param_groups: param_group['lr'] = lr_actor for param_group in self.__optimiser_critic.param_groups: param_group['lr'] = lr_critic # Get methods def get_actor(self): return self.__actor_local def get_critic(self): return self.__critic_local # Other methods def step(self, state, action, reward, next_state, done): # add experience to memory self.__memory.add(state, action, reward, next_state, done) if self.__memory.is_ready(): experiences = self.__memory.sample() self.__update(experiences) def choose_action(self, state, mode='train'): if mode == 'train': # state should be transformed to a tensor state = torch.from_numpy( np.array(state)).float().unsqueeze(0).to(device) self.__actor_local.eval() with torch.no_grad(): action = self.__actor_local(state) + self.__uo_process.sample() self.__actor_local.train() return list(np.clip(action.cpu().numpy().squeeze(), -1, 1)) elif mode == 'test': # state should be transformed to a tensor state = torch.from_numpy( np.array(state)).float().unsqueeze(0).to(device) self.__actor_local.eval() with torch.no_grad(): action = self.__actor_local(state) self.__actor_local.train() return list(np.clip(action.cpu().numpy().squeeze(), -1, 1)) else: print("Invalid mode value") def reset(self, sigma): self.__uo_process.reset(sigma) def __update(self, experiences): states, actions, rewards, next_states, dones = experiences # update critic # ---------------------------------------------------------- loss_fn = nn.MSELoss() self.__optimiser_critic.zero_grad() # form target next_actions = self.__actor_target(next_states) Q_target_next = self.__critic_target.forward( torch.cat((next_states, next_actions), dim=1)).detach() targets = rewards + self.gamma * Q_target_next * (1 - dones) # form output outputs = self.__critic_local.forward( torch.cat((states, actions), dim=1)) mean_loss_critic = loss_fn( outputs, targets) # minus added since it's gradient ascent mean_loss_critic.backward() self.__optimiser_critic.step() # update actor # ---------------------------------------------------------- self.__optimiser_actor.zero_grad() predicted_actions = self.__actor_local(states) mean_loss_actor = -self.__critic_local.forward( torch.cat((states, predicted_actions), dim=1)).mean() 
mean_loss_actor.backward()
        self.__optimiser_actor.step()

        # update target networks
        self.__soft_update(self.__critic_local, self.__critic_target, self.tau)
        self.__soft_update(self.__actor_local, self.__actor_target, self.tau)

    @staticmethod
    def __soft_update(local_model, target_model, tau):
        """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
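# AgentDDPG expects a ReplayBuffer with add(), is_ready() and sample() methods; that class
# is not shown here. The sketch below reproduces that interface under the assumption that
# buf_params carries 'buffer_size' and 'batch_size' keys and that sample() should return
# float tensors on the same global `device` used by the agent.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Uniform experience replay matching the add / is_ready / sample calls above (sketch)."""

    def __init__(self, buf_params):
        self.batch_size = buf_params['batch_size']              # assumed key name
        self.memory = deque(maxlen=buf_params['buffer_size'])   # assumed key name

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def is_ready(self):
        # Enough samples collected to draw a full minibatch.
        return len(self.memory) >= self.batch_size

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        fields = ([getattr(e, f) for e in batch] for f in Experience._fields)
        return tuple(torch.from_numpy(np.array(f)).float().to(device) for f in fields)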
class AgentDDPG: """Deep Deterministic Policy Gradient implementation for continuous action space reinforcement learning tasks""" def __init__(self, state_size, hidden_size, action_size, actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-2, use_cuda=False, actor_path=None, critic_path=None): # Params self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size self.gamma, self.tau = gamma, tau self.use_cuda = use_cuda # Networks self.actor = Actor(state_size, hidden_size, action_size) self.actor_target = Actor(state_size, hidden_size, action_size) self.critic = Critic(state_size + action_size, hidden_size, action_size) self.critic_target = Critic(state_size + action_size, hidden_size, action_size) # Load model state_dicts from saved file if actor_path and path.exists(actor_path): self.actor.load_state_dict(torch.load(actor_path)) if critic_path and path.exists(critic_path): self.critic.load_state_dict(torch.load(critic_path)) # Hard copy params from original networks to target networks copy_params(self.actor, self.actor_target) copy_params(self.critic, self.critic_target) if self.use_cuda: self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() # Create replay buffer for storing experience self.replay_buffer = ReplayBuffer(cache_size=int(1e6)) # Training self.critic_criterion = nn.MSELoss() self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate) def save_to_file(self, actor_file, critic_file): # Save the state_dict's of the Actor and Critic networks torch.save(self.actor.state_dict(), actor_file) torch.save(self.critic.state_dict(), critic_file) def get_action(self, state): """Select action with respect to state according to current policy and exploration noise""" state = Variable(torch.from_numpy(state).float()) if self.use_cuda: state = state.cuda() a = self.actor.forward(state) if self.use_cuda: return a.detach().cpu().numpy() return a.detach().numpy() def save_experience(self, state_t, action_t, reward_t, state_t1): self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1) def update(self, batch_size): states, actions, rewards, next_states = self.replay_buffer.get_samples( batch_size) states = torch.FloatTensor(states) actions = torch.FloatTensor(actions) rewards = torch.FloatTensor(rewards) next_states = torch.FloatTensor(next_states) if self.use_cuda: states = states.cuda() next_states = next_states.cuda() actions = actions.cuda() rewards = rewards.cuda() # Critic loss Qvals = self.critic.forward(states, actions) next_actions = self.actor_target.forward(next_states) next_Q = self.critic_target.forward(next_states, next_actions.detach()) Qprime = rewards + self.gamma * next_Q critic_loss = self.critic_criterion(Qvals, Qprime) # Update critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Actor loss policy_loss = -self.critic.forward(states, self.actor.forward(states)).mean() # Update actor self.actor_optimizer.zero_grad() policy_loss.backward() self.actor_optimizer.step() # update target networks soft_copy_params(self.actor, self.actor_target, self.tau) soft_copy_params(self.critic, self.critic_target, self.tau) def add_noise_to_weights(self, amount=0.1): self.actor.apply( lambda x: _add_noise_to_weights(x, amount, self.use_cuda)) self.critic.apply( lambda x: _add_noise_to_weights(x, amount, self.use_cuda)) self.actor_target.apply( lambda x: 
_add_noise_to_weights(x, amount, self.use_cuda)) self.critic_target.apply( lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
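# This AgentDDPG variant relies on three module-level helpers -- copy_params,
# soft_copy_params and _add_noise_to_weights -- that are not included in the listing.
# Minimal sketches consistent with how they are called are given below; the decision to
# perturb only nn.Linear weights in _add_noise_to_weights is an assumption.
import torch
import torch.nn as nn


def copy_params(source, target):
    # Hard copy: the target network starts as an exact clone of the source network.
    for tgt, src in zip(target.parameters(), source.parameters()):
        tgt.data.copy_(src.data)


def soft_copy_params(source, target, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target.
    for tgt, src in zip(target.parameters(), source.parameters()):
        tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)


def _add_noise_to_weights(module, amount, use_cuda):
    # In-place Gaussian perturbation of Linear-layer weights, used via nn.Module.apply().
    with torch.no_grad():
        if isinstance(module, nn.Linear):
            noise = torch.randn(module.weight.size()) * amount
            if use_cuda:
                noise = noise.cuda()
            module.weight.add_(noise)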
class TD3Agent:
    def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std, noise_bound, critic_lr, actor_lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step

        # initialize actor and critic networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic1_target = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2_target = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):
            target_param.data.copy_(param.data)
        # Copy actor target parameters
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # target policy smoothing: next actions come from the target actor plus clipped noise
        action_space_noise = self.generate_action_space_noise(action_batch)
        next_actions = self.actor_target.forward(next_state_batch) + action_space_noise
        next_Q1 = self.critic1_target.forward(next_state_batch, next_actions)
        next_Q2 = self.critic2_target.forward(next_state_batch, next_actions)
        expected_Q = reward_batch + self.gamma * torch.min(next_Q1, next_Q2)

        # critic loss
        curr_Q1 = self.critic1.forward(state_batch, action_batch)
        curr_Q2 = self.critic2.forward(state_batch, action_batch)
        critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
        critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())

        # update critics
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # delayed update for actor & target networks
        if self.update_step % self.delay_step == 0:
            # actor
            self.actor_optimizer.zero_grad()
            policy_gradient = -self.critic1(state_batch, self.actor(state_batch)).mean()
            policy_gradient.backward()
            self.actor_optimizer.step()

            # target networks
            self.update_targets()

        self.update_step += 1

    def generate_action_space_noise(self, action_batch):
        noise = torch.normal(torch.zeros(action_batch.size()), self.noise_std).clamp(
            -self.noise_bound, self.noise_bound).to(self.device)
        return noise

    def update_targets(self):
        for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
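# get_action() above returns the deterministic policy output; during training TD3 normally
# adds Gaussian exploration noise and clips to the environment's action bounds. The helper
# below illustrates that pattern; the noise_scale default is an assumption.
import numpy as np


def exploration_action(agent, obs, noise_scale=0.1):
    """Deterministic action from the current policy plus clipped Gaussian exploration noise."""
    action = agent.get_action(obs)
    low, high = agent.env.action_space.low, agent.env.action_space.high
    action = action + noise_scale * np.random.randn(*action.shape)
    return np.clip(action, low, high)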
class Agent: def __init__(self,env, env_params, args, models=None, record_episodes=[0,.1,.25,.5,.75,1.]): self.env= env self.env_params = env_params self.args = args # networks if models == None: self.actor = Actor(self.env_params).double() self.critic = Critic(self.env_params).double() else: self.actor , self.critic = self.LoadModels() # target networks used to predict env actions with self.actor_target = Actor(self.env_params,).double() self.critic_target = Critic(self.env_params).double() self.actor_target.load_state_dict(self.actor.state_dict()) self.critic_target.load_state_dict(self.critic.state_dict()) if self.args.cuda: self.actor.cuda() self.critic.cuda() self.actor_target.cuda() self.critic_target.cuda() self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001) self.normalize = Normalizer(env_params,self.args.gamma) self.buffer = ReplayBuffer(1_000_000, self.env_params) self.tensorboard = ModifiedTensorBoard(log_dir = f"logs") self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes] def ModelsEval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def ModelsTrain(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def GreedyAction(self, state): self.ModelsEval() with torch.no_grad(): state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0) if self.args.cuda: state = state.cuda() action = self.actor.forward(state).detach().cpu().numpy().squeeze() return action def NoiseAction(self, state): self.ModelsEval() with torch.no_grad(): state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0) if self.args.cuda: state = state.cuda() action = self.actor.forward(state).detach().cpu().numpy() action += self.args.noise_eps * self.env_params['max_action'] * np.random.randn(*action.shape) action = np.clip(action, -self.env_params['max_action'], self.env_params['max_action']) return action.squeeze() def Update(self): self.ModelsTrain() for i in range(self.args.n_batch): state, a_batch, r_batch, nextstate, d_batch = self.buffer.SampleBuffer(self.args.batch_size) a_batch = torch.tensor(a_batch,dtype=torch.double) r_batch = torch.tensor(r_batch,dtype=torch.double) # d_batch = torch.tensor(d_batch,dtype=torch.double) state = torch.tensor(state,dtype=torch.double) nextstate = torch.tensor(nextstate,dtype=torch.double) # d_batch = 1 - d_batch if self.args.cuda: a_batch = a_batch.cuda() r_batch = r_batch.cuda() # d_batch = d_batch.cuda() state = state.cuda() nextstate = nextstate.cuda() with torch.no_grad(): action_next = self.actor_target.forward(nextstate) q_next = self.critic_target.forward(nextstate,action_next) q_next = q_next.detach().squeeze() q_target = r_batch + self.args.gamma * q_next q_target = q_target.detach().squeeze() q_prime = self.critic.forward(state, a_batch).squeeze() critic_loss = F.mse_loss(q_target, q_prime) action = self.actor.forward(state) actor_loss = -self.critic.forward(state, action).mean() # params = torch.cat([x.view(-1) for x in self.actor.parameters()]) # l2_reg = self.args.l2_norm *torch.norm(params,2) # actor_loss += l2_reg self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() self.critic_optim.zero_grad() critic_loss.backward() self.critic_optim.step() self.SoftUpdateTarget(self.critic, self.critic_target) self.SoftUpdateTarget(self.actor, self.actor_target) def Explore(self): for epoch in range(self.args.n_epochs +1): start_time = 
time.process_time() for cycle in range(self.args.n_cycles): for _ in range(self.args.num_rollouts_per_mpi): state = self.env.reset() for t in range(self.env_params['max_timesteps']): action = self.NoiseAction(state) nextstate, reward, done, info = self.env.step([action]) nextstate = nextstate.squeeze() reward = self.normalize.normalize_reward(reward) self.buffer.StoreTransition(state, action, reward, nextstate, done) state = nextstate self.Update() avg_reward = self.Evaluate() self.tensorboard.step = epoch elapsed_time = time.process_time() - start_time print(f"Epoch {epoch} of total of {self.args.n_epochs +1} epochs, average reward is: {avg_reward}.\ Elapsedtime: {int(elapsed_time /60)} minutes {int(elapsed_time %60)} seconds") if epoch % 5 or epoch + 1 == self.args.n_epochs: self.SaveModels(epoch) self.record(epoch) def Evaluate(self): self.ModelsEval() total_reward = [] episode_reward = 0 succes_rate = [] for episode in range(self.args.n_evaluate): state = self.env.reset() episode_reward = 0 for t in range(self.env_params['max_timesteps']): action = self.GreedyAction(state) nextstate, reward, done, info = self.env.step([action]) episode_reward += reward state = nextstate if done or t + 1 == self.env_params['max_timesteps']: total_reward.append(episode_reward) episode_reward = 0 average_reward = sum(total_reward)/len(total_reward) min_reward = min(total_reward) max_reward = max(total_reward) self.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward) return average_reward def record(self, epoch): self.ModelsEval() try: if not os.path.exists("videos"): os.mkdir('videos') recorder = VideoRecorder(self.env, path=f'videos/epoch-{epoch}.mp4') for _ in range(self.args.n_record): done =False state = self.env.reset() while not done: recorder.capture_frame() action = self.GreedyAction(state) nextstate,reward,done,info = self.env.step([action]) state = nextstate recorder.close() except Exception as e: print(e) def SaveModels(self, ep): if not os.path.exists("models"): os.mkdir('models') torch.save(self.actor.state_dict(), os.path.join('models', 'Actor.pt')) torch.save(self.critic.state_dict(), os.path.join('models', 'Critic.pt')) def LoadModels(self, actorpath, criticpath): actor = Actor(self.env_params, self.hidden_neurons) critic = Critic(self.env_params, self.hidden_neurons) actor.load_state_dict(torch.load(actorpath)) critic.load_state_dict(torch.load(criticpath)) return actor, critic def SoftUpdateTarget(self, source, target): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
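# The Agent above constructs a Normalizer(env_params, gamma) and only uses its
# normalize_reward() method; the Normalizer class itself is not in this listing. One
# plausible implementation, sketched under the assumption that rewards are rescaled by a
# running estimate of the discounted return's standard deviation, is shown below.
import numpy as np


class Normalizer:
    """Reward scaling via running return statistics (sketch, not the project's original)."""

    def __init__(self, env_params, gamma):
        self.gamma = gamma
        self.ret = 0.0       # running discounted return
        self.count = 1e-4    # avoids division by zero before any updates
        self.mean = 0.0
        self.var = 1.0

    def normalize_reward(self, reward):
        # Update running mean/variance of the discounted return with Welford-style steps.
        self.ret = self.gamma * self.ret + reward
        self.count += 1
        delta = self.ret - self.mean
        self.mean += delta / self.count
        self.var += (delta * (self.ret - self.mean) - self.var) / self.count
        return reward / (np.sqrt(self.var) + 1e-8)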
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        # Copy actor target parameters
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
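# A short driver loop for the DDPGAgent above. The agent's OUNoise helper is not shown in
# this listing, so for illustration plain Gaussian noise is added to the deterministic
# action instead; replay_buffer.push(), len(replay_buffer) and the environment name are
# likewise assumptions.
import gym
import numpy as np

env = gym.make("Pendulum-v0")
agent = DDPGAgent(env, gamma=0.99, tau=0.005, buffer_maxlen=100_000,
                  critic_learning_rate=1e-3, actor_learning_rate=1e-4)

batch_size = 128
state = env.reset()
for step in range(50_000):
    action = agent.get_action(state) + 0.1 * np.random.randn(env.action_space.shape[0])
    action = np.clip(action, env.action_space.low, env.action_space.high)
    next_state, reward, done, _ = env.step(action)
    agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
    state = env.reset() if done else next_state
    if len(agent.replay_buffer) > batch_size:
        agent.update(batch_size)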
class Agents: def __init__(self, params): action_size = params['action_size'] state_size = params['state_size'] buf_params = params['buf_params'] num_agents = params['num_of_agents'] nn_params = params['nn_params'] nn_params['nn_actor']['l1'][0] = state_size nn_params['nn_actor']['l3'][1] = action_size nn_params['nn_critic']['l1'][0] = (state_size + action_size) * num_agents self.__actors_local = [Actor(nn_params['nn_actor']).to(device), Actor(nn_params['nn_actor']).to(device)] self.__actors_target = [Actor(nn_params['nn_actor']).to(device), Actor(nn_params['nn_actor']).to(device)] self.__critic_local = Critic(nn_params['nn_critic']).to(device) self.__critic_target = Critic(nn_params['nn_critic']).to(device) self.__action_size = action_size self.__state_size = state_size self.__num_agents = num_agents self.__memory = ReplayBuffer(buf_params) self.__t = 0 self.gamma = params['gamma'] self.learning_rate_actor = params['learning_rate_actor'] self.learning_rate_critic = params['learning_rate_critic'] self.tau = params['tau'] self.__optimisers_actor = [optim.Adam(self.__actors_local[0].parameters(), self.learning_rate_actor), optim.Adam(self.__actors_local[1].parameters(), self.learning_rate_actor)] self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(), self.learning_rate_critic) self.__uo_process = UOProcess(shape=(self.__num_agents, self.__action_size)) # other parameters self.agent_loss = 0.0 # Set methods def set_learning_rate(self, lr_actor, lr_critic): self.learning_rate_actor = lr_actor self.learning_rate_critic = lr_critic for n in range(self.__num_agents): for param_group in self.__optimisers_actor[n].param_groups: param_group['lr'] = lr_actor for param_group in self.__optimiser_critic.param_groups: param_group['lr'] = lr_critic # Get methods def get_actor(self): return self.__actors_local def get_critic(self): return self.__critic_local # Other methods def step(self, state, action, reward, next_state, done): # add experience to memory self.__memory.add(state, action, reward, next_state, done) if self.__memory.is_ready(): self.__update() def choose_action(self, states, mode='train'): if mode == 'train': # state should be transformed to a tensor states = torch.from_numpy(np.array(states)).float().to(device) actions = np.zeros((self.__num_agents, self.__action_size)) for i, actor in enumerate(self.__actors_local): state = states[i, :] actor.eval() with torch.no_grad(): action = actor(state) actor.train() actions[i, :] = action.cpu().numpy() actions += np.array(self.__uo_process.sample()) return np.clip(actions, -1, 1) elif mode == 'test': # state should be transformed to a tensor states = torch.from_numpy(np.array(states)).float().to(device) actions = np.zeros((self.__num_agents, self.__action_size)) for i, actor in enumerate(self.__actors_local): state = states[i, :] actor.eval() with torch.no_grad(): action = actor(state) actions[i, :] = action.cpu().numpy() actions += np.array(self.__uo_process.sample()) return np.clip(actions, -1, 1) else: print("Invalid mode value") def reset(self, sigma): self.__uo_process.reset(sigma) def __update(self): for i in range(self.__num_agents): # update critic # ---------------------------------------------------------- # states, actions, rewards, next_states, dones = self.__memory.sample() states_i = states[:, i, :] actions_i = actions[:, i, :] rewards_i = rewards[:, i] next_states_i = next_states[:, i, :] dones_i = dones[:, i] loss_fn = nn.MSELoss() self.__optimiser_critic.zero_grad() # form target next_states_actions = 
torch.cat((next_states[:, 0, :], next_states[:, 1, :],
                       self.__actors_target[0].forward(next_states[:, 0, :]),
                       self.__actors_target[1].forward(next_states[:, 1, :])), dim=1)
            Q_target_next = self.__critic_target.forward(next_states_actions).detach()
            targets = (rewards_i + self.gamma * Q_target_next[:, i] * (1 - dones_i))

            # form output
            states_actions = torch.cat((states[:, 0, :], states[:, 1, :],
                                        actions[:, 0, :], actions[:, 1, :]), dim=1)
            outputs = self.__critic_local.forward(states_actions)
            mean_loss_critic = loss_fn(outputs[:, i], targets)
            mean_loss_critic.backward()
            self.__optimiser_critic.step()

            # update actor
            # ----------------------------------------------------------
            self.__optimisers_actor[i].zero_grad()
            predicted_actions = copy.copy(actions)
            predicted_actions[:, i, :] = self.__actors_local[i](states_i)
            # minus sign: gradient ascent on the critic's value estimate
            mean_loss_actor = -self.__critic_local.forward(
                torch.cat((states[:, 0, :], states[:, 1, :],
                           predicted_actions[:, 0, :], predicted_actions[:, 1, :]),
                          dim=1))[:, i].mean()
            mean_loss_actor.backward()
            self.__optimisers_actor[i].step()

            # update target networks
            self.__soft_update(self.__critic_local, self.__critic_target, self.tau)
            self.__soft_update(self.__actors_local[i], self.__actors_target[i], self.tau)

    @staticmethod
    def __soft_update(local_model, target_model, tau):
        """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
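# The centralised critic above consumes the concatenation of both agents' observations and
# actions. The small helper below builds that joint input from batched per-agent tensors;
# it is an equivalent reformulation of the torch.cat calls in __update, added for clarity.
import torch


def joint_critic_input(states, actions):
    """Flatten (batch, num_agents, state_size) and (batch, num_agents, action_size)
    tensors into the (batch, num_agents * (state_size + action_size)) critic input."""
    batch = states.shape[0]
    return torch.cat((states.reshape(batch, -1), actions.reshape(batch, -1)), dim=1)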
class RaCT(): def __init__(self, M, eh1, eh2, dh2, ci, lr_ac=0.001, lr_cr=0.001): ## Network initializations # Actor self.actor = VAE( M, eh1, eh2, dh2 ) # Number of inputs, units in encoder_hidden_layer1, encoder_hidden_layer2, #decoder_hidden_layer1 # Critic self.critic = Critic(ci) # Length of feature vector # Optimizers self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_ac) self.optim_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_cr) self.mse = torch.nn.MSELoss() def pretrain_actor(self, X, batch_size, beta_max, epochs, epochs_annealing, val_set, masked=True): ''' Pretraining of actor using MLE cost = NLL + Beta*KL Minimize NLL: Maximize the probability of interactions in the reconstruction which are 1 in the input. KL: Regulatory, makes sure the distribution of z is not very different than prior. X: Interaction Matrix, training dataset beta_max : Max Beta that will be reached after annealing epochs : Total number of epochs epochs_annealing: Number of epochs for annealing. Since beta_max is set, controls how quick beta grows. val_set : Validation set for validation masked: Controls the training task. If True, only a partial history is given to actor and only unobserved interactions are considered in NLL. Proposed method is not clear in the paper. ''' beta = 0 beta_increase = beta_max / epochs_annealing # Linear Growth of Beta for epoch in range(epochs): self.optim_actor.zero_grad() ## Sample a batch batch_ind = np.random.choice(X.shape[0], batch_size) xbatch = X[batch_ind, :] xbatch = torch.tensor( xbatch.toarray(), dtype=torch.float32) # Scipy Sparse to Tensor Dense ## UNMASKED if not masked: xlog, KL = self.actor.forward(xbatch) nll = -torch.mean(xlog * xbatch, dim=1) elbo_beta = torch.mean(nll + beta * KL) ## MASKED else: # Sample masks mask, xbatch_masked = self.mask(xbatch) xbatch_reverse_masked = xbatch * (1 - mask) xlog, KL = self.actor.forward(xbatch_masked) nll = -torch.mean(xlog * xbatch_reverse_masked, dim=1) elbo_beta = torch.mean(nll + beta * KL) print('NLLL : ', torch.mean(nll.detach())) print('Elbo : ', elbo_beta.detach()) elbo_beta.backward() self.optim_actor.step() # Update the actor if epoch < epochs_annealing: beta = beta + beta_increase if epoch % 20 == 0: self.evaluate(val_set) def pretrain_critic(self, X, batch_size, epochs): ''' Pretraining of critic using MSE between score predictions of Critic network and NDCG@100. Critic tries to learn giving similar results with NDCG. No unmasked option here, since NDCG only accounts for unobserver interactions. ''' for epoch in range(epochs): self.optim_actor.zero_grad() self.optim_critic.zero_grad() # Sample a batch batch_ind = np.random.choice(X.shape[0], batch_size) xbatch_spr = X[batch_ind, :] xbatch = torch.tensor(xbatch_spr.toarray(), dtype=torch.float32) # Prepare masks mask, xbatch_masked = self.mask(xbatch) xbatch_reverse_masked = xbatch * (1 - mask) # Find score prediction of critic given masked input xlog, KL = self.actor.forward(xbatch_masked) nll = -torch.mean(xlog * xbatch_reverse_masked, dim=1) score_pred = self.critic.forward(xbatch, nll, mask) ## I will try the one from implementation. 
1st-arg=prediction, 2nd-arg = reverse-masked-input # 4th-arg = masked_input ndcg = NDCG_binary_at_k_batch(xlog.detach().numpy(), xbatch_reverse_masked, 100, xbatch_masked) ndcg = torch.tensor(ndcg.reshape(-1, 1), dtype=torch.float32) print('NDCG mean :', torch.mean(ndcg)) mse_loss = self.mse( score_pred, ndcg) ## Minimize the difference between Critic and NDCG print('MSE : ', mse_loss.detach()) mse_loss.backward() self.optim_critic.step() def alternative_training(self, X, batch_size, beta, epochs, recalculate_actor=False): ''' Train both of them together. Do the following epochs times. 1. Train Actor to maximize the score of predictions.Use Critic as a both differentiable and accurate metric.(At least this is what we hope to get.) 2. Train Critic using MSE cost with NDCG. We need this to make sure that we can predict the score of distributions produced by the new Actor. Note that in the tests, this stage is observed to be too unstable. Unlucky seeds can cause collapse of the whole training. TODO: Work on the unstability. recalculate_actor : Experimental parameter for Critic Phase. If True, reconstruct the graph of actor network for the training of Critic. If false, use the results from Actor phase as constants. ''' for epoch in range(epochs): # Sample a batch. Will use the same batch for both phases. batch_ind = np.random.choice(X.shape[0], batch_size) xbatch_spr = X[batch_ind, :] xbatch = torch.tensor(xbatch_spr.toarray(), dtype=torch.float32) # Mask it mask, xbatch_masked = self.mask(xbatch) xbatch_reverse_masked = xbatch * (1 - mask) ### Actor Phase self.optim_actor.zero_grad() self.optim_critic.zero_grad() xlog, KL = self.actor.forward(xbatch_masked) nll = -torch.mean(xlog * xbatch_reverse_masked, dim=1) actor_loss = -self.critic.forward( xbatch, nll, mask).mean() # Use -critic_score as the loss. #So maximize the critic score actor_loss.backward() self.optim_actor.step() print('Critic ', epoch, ' , ', actor_loss.detach()) print('NLLL : ', torch.mean(nll.detach())) ### Critic Phase self.optim_actor.zero_grad() self.optim_critic.zero_grad() if recalculate_actor: xlog, KL = self.actor.forward(xbatch) nll = -torch.mean(xlog * xbatch, dim=1) else: nll.detach_() score_pred = self.critic.forward(xbatch, nll, mask) ndcg = NDCG_binary_at_k_batch(xlog.detach().numpy(), xbatch_reverse_masked, 100, xbatch_masked) ndcg = torch.tensor(ndcg.reshape(-1, 1), dtype=torch.float32) print('NDCG mean :', torch.mean(ndcg)) mse_loss = self.mse(score_pred, ndcg) mse_loss.backward() # print('MSE Loss : ',mse_loss.detach()) self.optim_critic.step() def mask(self, X, p=0.5): ''' Generates a random(Bernoulli) matrix(mask) of same shape with X. p is the probability of each element being 1. Note that elements in the matrix sampled from independent distributions. ''' mask = torch.distributions.bernoulli.Bernoulli(p).sample( sample_shape=X.shape) X_masked = X * mask return mask, X_masked def evaluate(self, val_set): with torch.no_grad(): ## Convert from Scipy sparse to Torch Tensor. xbatch = torch.tensor(val_set.toarray(), dtype=torch.float32) mask, xbatch_masked = self.mask(xbatch) xbatch_reverse_masked = xbatch * ( 1 - mask) # Reverse Mask, 1 if not observed xlog, KL = self.actor.forward( xbatch_masked ) # Accepts a 'partial'(masked) interaction history. nll = -torch.mean( xlog * xbatch_reverse_masked, dim=1) # We only care about guessing unobserved interactionsl score_pred = self.critic.forward(xbatch, nll, mask) # Note that first argument # should be the original(unmasked) matrix. # Calculate NDCG@100. 
ndcg = NDCG_binary_at_k_batch(xlog.detach().numpy(), xbatch_reverse_masked, 100, xbatch_masked) ndcg = torch.tensor(ndcg.reshape(-1, 1), dtype=torch.float32) print('NDCG mean :', torch.mean(ndcg))
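# RaCT's critic is trained against NDCG_binary_at_k_batch, which is not part of this
# listing. Below is a simplified two-argument NDCG@k sketch operating on dense NumPy
# arrays; the original helper also receives the masked input and may handle sparse
# matrices, so treat this only as an illustration of the metric.
import numpy as np


def ndcg_binary_at_k(x_pred, x_true, k=100):
    """Per-user binary NDCG@k. x_pred: predicted scores, x_true: 0/1 relevance,
    both of shape (n_users, n_items) with k < n_items."""
    # indices of the k highest-scoring items per user, sorted by score
    topk = np.argpartition(-x_pred, k, axis=1)[:, :k]
    order = np.argsort(-np.take_along_axis(x_pred, topk, axis=1), axis=1)
    topk = np.take_along_axis(topk, order, axis=1)
    gains = np.take_along_axis(x_true, topk, axis=1)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    dcg = (gains * discounts).sum(axis=1)
    # ideal DCG: every relevant (held-out) item ranked at the top
    n_pos = np.minimum(x_true.sum(axis=1), k).astype(int)
    idcg = np.array([discounts[:n].sum() for n in n_pos])
    return dcg / np.maximum(idcg, 1e-10)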