class TD3Agent():
    def __init__(self, env_id, alpha, beta, input_dims, tau, env, gamma=0.99,
                 update_actor_interval=2, warmup=1000, n_actions=2,
                 max_size=1000000, layer1_size=256, layer2_size=256,
                 batch_size=256, noise=0):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id + '_actor')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_2')
        self.target_actor = ActorNetwork(alpha, input_dims, layer1_size,
                                         layer2_size, n_actions=n_actions,
                                         name=env_id + '_target_actor')
        self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                             layer2_size, n_actions=n_actions,
                                             name=env_id + '_target_critic_1')
        self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                             layer2_size, n_actions=n_actions,
                                             name=env_id + '_target_critic_2')

        self.noise = noise
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        if self.time_step < self.warmup:
            mu = T.tensor(np.random.normal(scale=self.noise,
                                           size=(self.n_actions,))).to(self.actor.device)
        else:
            state = T.tensor(observation, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        #mu_prime = T.clamp(mu_prime, self.min_action, self.max_action)
        self.time_step += 1
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device)
        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)

        target_actions = self.target_actor.forward(state_)
        target_actions = target_actions + \
            T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_actions = T.clamp(target_actions, self.min_action[0],
                                 self.max_action[0])

        q1_ = self.target_critic_1.forward(state_, target_actions)
        q2_ = self.target_critic_2.forward(state_, target_actions)
        q1 = self.critic_1.forward(state, action)
        q2 = self.critic_2.forward(state, action)

        q1_[done] = 0.0
        q2_[done] = 0.0

        q1_ = q1_.view(-1)
        q2_ = q2_.view(-1)

        critic_value_ = T.min(q1_, q2_)
        target = reward + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q1_loss = F.mse_loss(target, q1)
        q2_loss = F.mse_loss(target, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()

        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state))
        actor_loss = -T.mean(actor_q1_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_1_params = self.critic_1.named_parameters()
        critic_2_params = self.critic_2.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_1_params = self.target_critic_1.named_parameters()
        target_critic_2_params = self.target_critic_2.named_parameters()

        critic_1 = dict(critic_1_params)
        critic_2 = dict(critic_2_params)
        actor = dict(actor_params)
        target_actor = dict(target_actor_params)
        target_critic_1 = dict(target_critic_1_params)
        target_critic_2 = dict(target_critic_2_params)

        for name in critic_1:
            critic_1[name] = tau*critic_1[name].clone() + \
                (1-tau)*target_critic_1[name].clone()
        for name in critic_2:
            critic_2[name] = tau*critic_2[name].clone() + \
                (1-tau)*target_critic_2[name].clone()
        for name in actor:
            actor[name] = tau*actor[name].clone() + \
                (1-tau)*target_actor[name].clone()

        self.target_critic_1.load_state_dict(critic_1)
        self.target_critic_2.load_state_dict(critic_2)
        self.target_actor.load_state_dict(actor)

    def load_models(self):
        print('... loading checkpoint ...')
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
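

# --- Illustrative usage sketch (not part of the original class) ---
# A minimal training loop showing how TD3Agent is typically driven against a
# continuous-control Gym environment (pre-0.26 Gym API, matching the rest of
# this code). The environment id, episode count and hyperparameter values are
# assumptions for illustration only.
if __name__ == '__main__':
    import gym

    env = gym.make('LunarLanderContinuous-v2')
    agent = TD3Agent('LunarLanderContinuous-v2', alpha=0.001, beta=0.001,
                     input_dims=env.observation_space.shape, tau=0.005, env=env,
                     n_actions=env.action_space.shape[0], noise=0.1)
    for episode in range(1000):
        observation = env.reset()
        done = False
        score = 0.0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            score += reward
            observation = observation_
        print(f'episode {episode} score {score:.1f}')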
class DDPGAgent():
    def __init__(self, env_id, alpha, beta, input_dims, tau, n_actions,
                 gamma=0.99, max_size=1000000, fc1_dims=256, fc2_dims=256,
                 batch_size=256):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        #self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                  n_actions=n_actions, name=env_id+'_actor')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name=env_id+'_critic')
        self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                         n_actions=n_actions,
                                         name=env_id+'_target_actor')
        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                           n_actions=n_actions,
                                           name=env_id+'_target_critic')

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu  # + T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def load_models(self):
        print("... loading checkpoint")
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)

        target_actions = self.target_actor.forward(states_)
        critic_value_ = self.target_critic.forward(states_, target_actions)
        critic_value = self.critic.forward(states, actions)

        critic_value_[dones] = 0.0
        critic_value_ = critic_value_.view(-1)

        target = rewards + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                (1-tau) * target_critic_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                (1-tau) * target_actor_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
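

# --- Illustrative variant (not part of the original class) ---
# The dict-based update above rebuilds full state dicts on every call. An
# equivalent, slightly leaner formulation of the same Polyak update
# (theta_target <- tau*theta + (1-tau)*theta_target) iterates over paired
# parameters directly; shown here only as a sketch, assuming the module's
# usual `import torch as T`.
def soft_update(online_net, target_net, tau):
    with T.no_grad():
        for param, target_param in zip(online_net.parameters(),
                                       target_net.parameters()):
            target_param.data.mul_(1 - tau)
            target_param.data.add_(tau * param.data)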
class Agent:
    def __init__(self, input_dims, n_actions, env, fc1_dims, fc2_dims, alpha,
                 beta, gamma, tau, noise1, noise2, clamp, delay, max_size,
                 batch_size, warmup):
        self.gamma = gamma
        self.tau = tau
        self.noise1 = noise1
        self.noise2 = noise2
        self.clamp = clamp
        self.delay = delay
        self.batch_size = batch_size
        self.warmup = warmup
        self.learn_cntr = 0
        self.env = env
        self.n_actions = n_actions

        self.actor = ActorNetwork(
            input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims,
            fc2_dims=fc2_dims, alpha=alpha, name='Actor_TD3PG.cpt',
            checkpoint_dir='tmp/models')
        self.critic_1 = CriticNetwork(
            input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims,
            fc2_dims=fc2_dims, beta=beta, name='Critic_1_TD3PG.cpt',
            checkpoint_dir='tmp/models')
        self.critic_2 = CriticNetwork(
            input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims,
            fc2_dims=fc2_dims, beta=beta, name='Critic_2_TD3PG.cpt',
            checkpoint_dir='tmp/models')
        self.target_actor = ActorNetwork(
            input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims,
            fc2_dims=fc2_dims, alpha=alpha, name='Target_Actor_TD3PG.cpt',
            checkpoint_dir='tmp/models')
        self.target_critic_1 = CriticNetwork(
            input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims,
            fc2_dims=fc2_dims, beta=beta, name='Target_Critic_1_TD3PG.cpt',
            checkpoint_dir='tmp/models')
        self.target_critic_2 = CriticNetwork(
            input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims,
            fc2_dims=fc2_dims, beta=beta, name='Target_Critic_2_TD3PG.cpt',
            checkpoint_dir='tmp/models')

        self.memory = ReplayBuffer(
            max_size=max_size, input_shape=input_dims, n_actions=n_actions)

        # hard-copy the online networks into the targets at initialization
        self.update_target_networks(tau=1)

    def update_target_networks(self, tau=None):
        if tau is None:
            tau = self.tau

        actor = dict(self.actor.named_parameters())
        critic_1 = dict(self.critic_1.named_parameters())
        critic_2 = dict(self.critic_2.named_parameters())
        target_actor = dict(self.target_actor.named_parameters())
        target_critic_1 = dict(self.target_critic_1.named_parameters())
        target_critic_2 = dict(self.target_critic_2.named_parameters())

        for name in actor:
            actor[name] = tau*actor[name].clone() + (1-tau)*target_actor[name].clone()
        for name in critic_1:
            critic_1[name] = tau*critic_1[name].clone() + (1-tau)*target_critic_1[name].clone()
        for name in critic_2:
            critic_2[name] = tau*critic_2[name].clone() + (1-tau)*target_critic_2[name].clone()

        self.target_actor.load_state_dict(actor)
        self.target_critic_1.load_state_dict(critic_1)
        self.target_critic_2.load_state_dict(critic_2)

    def choose_action(self, observation):
        if self.learn_cntr < self.warmup:
            mu = np.random.normal(scale=self.noise1, size=self.n_actions)
            mu = T.tensor(mu).to(self.actor.device)
        else:
            state = T.tensor(observation, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state)
        noise = T.tensor(np.random.normal(scale=self.noise1, size=self.n_actions),
                         dtype=T.float).to(self.actor.device)
        mu_ = T.clamp(T.add(mu, noise),
                      min=self.env.action_space.low[0],
                      max=self.env.action_space.high[0])
        self.learn_cntr += 1
        return mu_.cpu().detach().numpy()

    def save_models(self):
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample(self):
        states, actions, rewards, states_, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.critic_1.device)
        actions = T.tensor(actions, dtype=T.float).to(self.critic_1.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.critic_1.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done, dtype=T.int).to(self.critic_1.device)

        return states, actions, rewards, states_, done

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = self.sample()

        Vs1 = self.critic_1.forward(states, actions)
        Vs2 = self.critic_2.forward(states, actions)

        actions_ = self.target_actor.forward(states_)
        # target policy smoothing: noise2 is the smoothing scale, clipped to +/- clamp
        noise = T.tensor(np.random.normal(scale=self.noise2, size=self.n_actions),
                         dtype=T.float).to(self.actor.device)
        noise = T.clamp(noise, min=-self.clamp, max=self.clamp)
        actions_ = T.add(actions_, noise)
        actions_ = T.clamp(actions_,
                           min=self.env.action_space.low[0],
                           max=self.env.action_space.high[0])

        critic_1_Vs_ = self.target_critic_1.forward(states_, actions_)
        critic_2_Vs_ = self.target_critic_2.forward(states_, actions_)
        min_Vs_ = T.min(critic_1_Vs_, critic_2_Vs_)

        target = rewards + self.gamma*min_Vs_*(1-done)

        self.critic_1.optim.zero_grad()
        self.critic_2.optim.zero_grad()
        critic_1_loss = F.mse_loss(Vs1, target)
        critic_2_loss = F.mse_loss(Vs2, target)
        critic_loss = T.add(critic_1_loss, critic_2_loss)
        critic_loss.backward()
        self.critic_1.optim.step()
        self.critic_2.optim.step()

        # delayed policy update
        if self.learn_cntr % self.delay == 0:
            self.actor.optim.zero_grad()
            # deterministic policy gradient, evaluated on the sampled states
            actor_loss = self.critic_1.forward(states, self.actor.forward(states))
            actor_loss = -T.mean(actor_loss)
            actor_loss.backward()
            self.actor.optim.step()
            self.update_target_networks()
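

# --- Illustrative note (not part of the original class) ---
# learn() above draws a single smoothing-noise vector of shape (n_actions,) that
# is broadcast across the whole batch. The TD3 paper samples independent clipped
# noise per transition; a per-sample variant would look like the sketch below.
# The sigma/clip defaults and the symmetric [low, high] action range are
# assumptions for illustration.
def smooth_target_actions(target_actions, sigma=0.2, clip=0.5, low=-1.0, high=1.0):
    noise = T.clamp(T.randn_like(target_actions) * sigma, -clip, clip)
    return T.clamp(target_actions + noise, low, high)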
class DQNAgent:
    def __init__(self, env, render, config_info):
        self.env = env
        self._reset_env()
        self.render = render

        # Set seeds
        self.seed = 0
        env.seed(self.seed)
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define checkpoint
        checkpoint = Checkpoint(self.device, **config_info)

        # Create / load checkpoint dict
        (
            self.ckpt,
            self.path_ckpt_dict,
            self.path_ckpt,
            config,
        ) = checkpoint.manage_checkpoint()

        # Unroll useful parameters from config dict
        self.batch_size = config["training"]["batch_size"]
        self.max_timesteps = config["training"]["max_timesteps"]
        self.replay_size = config["training"]["replay_size"]
        self.start_temp = config["training"]["start_temperature"]
        self.final_temp = config["training"]["final_temperature"]
        self.decay_temp = config["training"]["decay_temperature"]
        self.gamma = config["training"]["gamma"]
        self.early_stopping = config["training"]["early_stopping"]
        self.update_frequency = config["training"]["update_frequency"]
        self.eval_frequency = config["training"]["eval_frequency"]

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        # Define Q-network and target Q-network
        self.network = DQN(state_dim, action_dim, **config["model"]).to(self.device)
        self.target_network = DQN(state_dim, action_dim, **config["model"]).to(
            self.device
        )

        # Loss and optimizer
        self.criterion = nn.MSELoss()
        lr = config["optimizer"]["learning_rate"]
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

        # Load network's weights if resuming training
        checkpoint.load_weights(
            self.ckpt, self.network, self.target_network, self.optimizer
        )

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

    def _reset_env(self):
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0

    def play_step(self, temperature=1):
        reward_signal = None

        # Boltzmann exploration
        state_v = torch.tensor(self.state, dtype=torch.float32).to(self.device)
        q_values = self.network(state_v)
        probas = Categorical(F.softmax(q_values / temperature, dim=0))
        action = probas.sample().item()

        # Perform one step in the environment
        next_state, reward, self.done, _ = self.env.step(action)

        # Create a tuple for the new transition
        new_transition = self.transition(
            self.state, action, reward, self.done, next_state
        )

        # Add transition to the replay buffer
        self.replay_buffer.store_transition(new_transition)

        self.state = next_state
        self.episode_reward += reward

        if self.render:
            self.env.render()

        if self.done:
            reward_signal = self.episode_reward
            self._reset_env()

        return reward_signal

    def train(self):
        # Initializations
        all_episode_rewards = []
        episode_timestep = 0
        best_mean_reward = None
        episode_num = 0
        temp = self.start_temp  # starting temperature used while filling the buffer
        writer = SummaryWriter(log_dir=self.path_ckpt, comment="-dqn")

        # Evaluate untrained policy
        evaluations = [self.eval_policy()]

        # Training loop
        for t in range(int(self.max_timesteps)):
            episode_timestep += 1

            # -> is None if the episode is not terminated
            # -> is the episode reward when the episode is terminated
            reward_signal = self.play_step(temp)

            # when the episode is terminated
            if reward_signal is not None:
                episode_reward = reward_signal

                # Save the episode's reward first so the running mean includes it
                all_episode_rewards.append(episode_reward)
                mean_reward = np.mean(all_episode_rewards[-10:])

                print(
                    f"Timestep [{t + 1}/{int(self.max_timesteps)}] ; "
                    f"Episode num : {episode_num + 1} ; "
                    f"Episode length : {episode_timestep} ; "
                    f"Reward : {episode_reward:.2f} ; "
                    f"Mean reward {mean_reward:.2f}"
                )

                # Reset counters
                episode_timestep = 0
                episode_num += 1

                # Save checkpoint
                self.ckpt["episode_num"] = episode_num
                self.ckpt["all_episode_rewards"].append(episode_reward)
                self.ckpt["optimizer_state_dict"] = self.optimizer.state_dict()
                torch.save(self.ckpt, self.path_ckpt_dict)

                writer.add_scalar("episode reward", episode_reward, t)
                writer.add_scalar("mean reward", mean_reward, t)

                # Save network if performance is better than the previous best
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    self.ckpt["best_mean_reward"] = mean_reward
                    self.ckpt["model_state_dict"] = self.network.state_dict()
                    self.ckpt[
                        "target_model_state_dict"
                    ] = self.target_network.state_dict()
                    if best_mean_reward is not None:
                        print(f"Best mean reward updated : {mean_reward:.2f}")
                    best_mean_reward = mean_reward

                # Criterion to early stop training
                if mean_reward > self.early_stopping:
                    self.plot_reward()
                    print(f"Solved in {t + 1} timesteps!")
                    break

            # Fill the replay buffer before learning
            if len(self.replay_buffer) < self.replay_size:
                continue
            else:
                # Adjust exploration parameter
                temp = np.maximum(
                    self.final_temp, self.start_temp - (t / self.decay_temp)
                )
                writer.add_scalar("temperature", temp, t)

            # Get the weights of the network before update
            weights_network = self.network.state_dict()

            # when it's time, perform a batch gradient descent step
            if t % self.update_frequency == 0:
                # Backward and optimize
                self.optimizer.zero_grad()
                batch = self.replay_buffer.sample_buffer(self.batch_size)
                loss = self.train_on_batch(batch)
                loss.backward()
                self.optimizer.step()

                # Synchronize target network
                self.target_network.load_state_dict(weights_network)

            # Evaluate episode
            if (t + 1) % self.eval_frequency == 0:
                evaluations.append(self.eval_policy())
                np.save(self.path_ckpt, evaluations)

    def train_on_batch(self, batch_samples):
        # Unpack batch_size transitions randomly drawn from the replay buffer
        states, actions, rewards, dones, next_states = batch_samples

        # Transform np arrays into tensors and send them to the device
        states_v = torch.tensor(states).to(self.device)
        next_states_v = torch.tensor(next_states).to(self.device)
        actions_v = torch.tensor(actions).to(self.device)
        rewards_v = torch.tensor(rewards).to(self.device)
        dones_bool = torch.tensor(dones, dtype=torch.bool).to(self.device)

        # Vectorized version
        q_vals = self.network(states_v)  # dim = batch_size x num_actions
        # Get the Q-values corresponding to the taken actions
        q_vals = q_vals.gather(1, actions_v.view(-1, 1))
        q_vals = q_vals.view(1, -1)[0]

        target_next_q_vals = self.target_network(next_states_v)
        # Max action of the target Q-values
        target_max_next_q_vals, _ = torch.max(target_next_q_vals, dim=1)
        # If the next state is terminal
        target_max_next_q_vals[dones_bool] = 0.0
        # No update of the target during backpropagation
        target_max_next_q_vals = target_max_next_q_vals.detach()

        # Bellman approximation for target Q-values
        target_q_vals = rewards_v + self.gamma * target_max_next_q_vals

        return self.criterion(q_vals, target_q_vals)

    def eval_policy(self, eval_episodes=10):
        # Runs the policy for eval_episodes episodes and returns the average reward
        # A fixed seed is used for the eval environment
        self.env.seed(self.seed + 100)

        avg_reward = 0.0
        temperature = 1
        for _ in range(eval_episodes):
            self._reset_env()
            reward_signal = None
            while reward_signal is None:
                reward_signal = self.play_step(temperature)
            avg_reward += reward_signal

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward

    def plot_reward(self):
        plt.plot(self.ckpt["all_episode_rewards"])
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment")
        plt.tight_layout()
        path_fig = os.path.join(self.path_ckpt, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")
        plt.show()
class MADDPG:
    def __init__(self, agent_init_params, batch_size=1024,
                 replay_buffer_capacity=100000, gamma=0.95, tau=0.01, lr=0.01,
                 hidden_dim=64, discrete_action=False, env='simple_reference'):
        """
        Inputs:
            agent_init_params (list of dict): list of dicts with the parameters
                used to initialize each agent (input_dims, n_actions_physical,
                n_actions_communication, ...)
            batch_size (int): number of transitions sampled from the replay buffer
            replay_buffer_capacity (int): maximum number of stored transitions
            gamma (float): discount factor
            tau (float): target update rate
            lr (float): learning rate for policy and critic
            hidden_dim (int): number of hidden dimensions for networks
            discrete_action (bool): whether or not to use a discrete action space
            env (str): name of the multi-agent environment passed to make_env()
        """
        self.nagents = len(agent_init_params)
        self.agents = []
        n_actions_total = np.sum([agent['n_actions_physical'] +
                                  agent['n_actions_communication']
                                  for agent in agent_init_params])
        n_observations_total = np.sum([agent['input_dims']
                                       for agent in agent_init_params])
        for i in range(self.nagents):
            current_agent = Agent(id_number=i, **agent_init_params[i])
            current_agent.initialize_critic(n_actions_total + n_observations_total)
            self.agents.append(current_agent)
        self.agent_init_params = agent_init_params
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        '''
        self.discrete_action = discrete_action
        self.pol_dev = 'cpu'  # device for policies
        self.critic_dev = 'cpu'  # device for critics
        self.trgt_pol_dev = 'cpu'  # device for target policies
        self.trgt_critic_dev = 'cpu'  # device for target critics
        self.niter = 0
        '''
        self.replay_buffer = ReplayBuffer(replay_buffer_capacity)
        self.batch_size = batch_size
        self.env = make_env(env)
        self.loss = nn.MSELoss()

    def reset(self):
        return self.env.reset()

    def step(self, observations):
        # All agents and their policies are available here; given the current
        # observations we can compute each agent's action and step the environment.
        # 'observations' is a list of np arrays, one per agent.
        # 'actions' is likewise a list of np arrays, one per agent.
        actions = []
        for i, observation in enumerate(observations):
            current_action = self.agents[i].Action(observation)
            actions.append(current_action)
        new_states, rewards, terminals, _ = self.env.step(actions)
        self.replay_buffer.store_transition(observations, actions, rewards,
                                            new_states, terminals)
        return new_states, rewards, terminals

    def update_networks(self, hard=False):
        # Carry out soft updates
        for agent in self.agents:
            agent.update_network_parameters(self.tau)

    def next_actions(self, next_states):
        next_step_actions = []
        for i, agent in enumerate(self.agents):
            action = agent.Action(next_states[i], target=True)
            next_step_actions.append(action)
        return next_step_actions

    def learn(self):
        if self.replay_buffer.mem_cntr < self.batch_size:
            return
        states_batch, actions_batch, rewards_batch, next_states_batch, terminal_batch = \
            self.replay_buffer.sample_buffer(self.batch_size)
        for agent in self.agents:
            # Critic update, then actor update
            self.update_critic(agent, states_batch, actions_batch, rewards_batch,
                               next_states_batch, terminal_batch)
            self.update_actor(agent, states_batch, actions_batch, rewards_batch,
                              next_states_batch, terminal_batch)

    def update_critic(self, agent, states_batch, actions_batch, rewards_batch,
                      next_states_batch, terminal_batch):
        critic_losses = []
        for i in range(self.batch_size):
            current_states = states_batch[i]
            current_actions = actions_batch[i]
            current_rewards = rewards_batch[i]
            next_states = next_states_batch[i]
            next_step_actions = []
            for j, next_agent in enumerate(self.agents):
                action = next_agent.Action(next_states[j], target=True)
                next_step_actions.append(action)
            agent.critic.eval()
            Q = agent.critic.forward(current_states, current_actions).to(agent.critic.device)
            target = current_rewards[agent.id] + self.gamma * \
                agent.critic.forward(next_states, next_step_actions).to(agent.critic.device).detach()
            loss = self.loss(Q, target)
            critic_losses.append(loss)
            agent.critic.train()
        critic_losses = torch.stack(critic_losses, 0)
        mean_critic_loss = torch.mean(critic_losses).to(agent.critic.device)
        agent.critic.optimizer.zero_grad()
        mean_critic_loss.backward()
        nn.utils.clip_grad_norm_(agent.critic.parameters(), 0.5)
        agent.critic.optimizer.step()

    def update_actor(self, agent, states_batch, actions_batch, rewards_batch,
                     next_states_batch, terminal_batch):
        Q_values = []
        for i in range(self.batch_size):
            current_states = states_batch[i]
            current_actions = actions_batch[i]
            current_rewards = rewards_batch[i]
            agent.actor.eval()
            # This is a tensor, not discretized; gumbel softmax approximates the
            # discretization of the communication actions.
            agent_action = agent.actor.forward(current_states[agent.id])
            physical_actions = agent_action[0:agent.n_actions_physical]
            comm_actions = F.gumbel_softmax(agent_action[agent.n_actions_physical:],
                                            hard=True)
            agent_action = torch.cat((physical_actions, comm_actions))
            actions_for_critic = deepcopy(current_actions)
            # use a loop variable distinct from the outer batch index
            for j in range(self.nagents):
                if j == agent.id:
                    actions_for_critic[agent.id] = agent_action
                else:
                    actions_for_critic[j] = torch.tensor(
                        actions_for_critic[j], dtype=torch.float32).to(agent.critic.device)
            actions_for_critic = list(chain.from_iterable(actions_for_critic))
            actions_for_critic = torch.stack(actions_for_critic)
            Q = -agent.critic.forward(current_states, actions_for_critic,
                                      actions_need_processing=False)
            Q_values.append(Q)
        Q_values = torch.stack(Q_values, 0)
        mean_Q = torch.mean(Q_values)
        agent.actor.optimizer.zero_grad()
        agent.actor.train()
        mean_Q.backward()
        nn.utils.clip_grad_norm_(agent.actor.parameters(), 0.5)
        agent.actor.optimizer.step()
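

# --- Illustrative usage sketch (not part of the original class) ---
# MADDPG drives all agents against the multi-agent particle environment created
# by make_env(). The agent_init_params entries only indicate the dict keys read
# in __init__ (input_dims, n_actions_physical, n_actions_communication); the
# values, any additional per-agent keys, and the episode/step counts are
# assumptions for illustration only.
agent_init_params = [
    {"input_dims": 21, "n_actions_physical": 5, "n_actions_communication": 10},
    {"input_dims": 21, "n_actions_physical": 5, "n_actions_communication": 10},
]
maddpg = MADDPG(agent_init_params, env='simple_reference')
for episode in range(10000):
    observations = maddpg.reset()
    for step in range(25):
        observations, rewards, terminals = maddpg.step(observations)
        maddpg.learn()
        maddpg.update_networks()
        if all(terminals):
            break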
class Agent():
    def __init__(self, env_id, alpha=0.0003, beta=0.0003, input_dims=[8],
                 env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(env_id, alpha, input_dims, n_actions=n_actions,
                                  name='actor', max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(env_id, beta, input_dims,
                                      n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(env_id, beta, input_dims,
                                      n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(env_id, beta, input_dims, name='value')
        self.target_value = ValueNetwork(env_id, beta, input_dims,
                                         name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
class SACAgent():
    def __init__(self, alpha, beta, tau, env, env_id, input_dims, gamma=0.99,
                 n_actions=2, max_size=1000000, layer1_size=256, layer2_size=256,
                 batch_size=256, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        # could use a shared critic input layer with different output heads,
        # but in this case we use 2 separate critics
        # env.action_space.max_action switched for env.action_space.high for
        # LunarLanderContinuous
        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                  env.action_space.high[0], n_actions,
                                  env_id+'_actor')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                      n_actions, env_id + '_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                      n_actions, env_id + '_critic_2')
        self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                  env_id+'_value')
        # prefix the target value network's checkpoint name with env_id as well,
        # matching the other networks
        self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                         layer2_size, env_id+'_target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        # returned as an array of arrays on the gpu as a torch tensor
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        # overwriting parameters - setting new values
        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print("... saving models")
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print("... loading models")
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        # self.critic_1.device or self.actor.device
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)
        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)

        # pass states and new states through the value and target value networks,
        # collapsing along the batch dimension since we don't need a 2d tensor
        # for scalar quantities
        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0  # setting terminal states to 0

        # pass current states through current policy, get action & log prob values
        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        # critic values for current policy state action pairs
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        # take critic min and collapse
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        # actor loss (using reparam trick)
        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        # take critic min for the new policy's (reparameterized) actions and
        # collapse - not the replayed actions
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        # calculating actor loss
        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        q_hat = self.scale * reward + self.gamma*value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)  # old policy (from replay buffer)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
class Agent():
    # needs functions: init, choose_action, store_transition
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 update_actor_interval=2, n_actions=2, warmup=1000,
                 max_size=1000000, layer1_size=400, layer2_size=300,
                 batch_size=100, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        #self.max_action = n_actions
        #self.min_action = 0
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0  # counts calls to the learning function, for the delayed actor update
        self.time_step = 0  # handles countdown to end of warmup
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                  n_actions, 'actor_net')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                      n_actions, 'critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                      n_actions, 'critic_2')
        self.target_actor = ActorNetwork(alpha, input_dims, layer1_size,
                                         layer2_size, n_actions,
                                         name='target_actor')
        self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                             layer2_size, n_actions,
                                             name='target_critic_1')
        self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                             layer2_size, n_actions,
                                             name='target_critic_2')

        self.noise = noise
        # sets the target network parameters equal to the originals
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        if self.time_step < self.warmup:
            mu = T.tensor(np.random.normal(scale=self.noise,
                                           size=(self.n_actions,))).to(self.actor.device)
        else:
            state = T.tensor(observation, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)
        # clamp the action to make sure it stays in range
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, state_, done = self.memory.sample_buffer(
            self.batch_size)

        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)
        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        state_ = T.tensor(state_, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)

        # target actions for the new states (use the target actor, per TD3)
        target_actions = self.target_actor.forward(state_)
        # add clipped noise for target policy smoothing
        target_actions = target_actions + T.clamp(
            T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        # clamp to keep the target action within the environment's bounds
        # (assumes a symmetric action range)
        target_actions = T.clamp(target_actions, self.min_action[0],
                                 self.max_action[0])

        q1_ = self.target_critic_1.forward(state_, target_actions)
        q2_ = self.target_critic_2.forward(state_, target_actions)
        # needed for the loss function
        q1 = self.critic_1.forward(state, action)
        q2 = self.critic_2.forward(state, action)

        # handle when new states are terminal
        q1_[done] = 0.0
        q2_[done] = 0.0

        # collapse on the batch dimension
        q1_ = q1_.view(-1)
        q2_ = q2_.view(-1)

        # element-wise minimum of the two target critics
        critic_value_ = T.min(q1_, q2_)
        target = reward + self.gamma * critic_value_
        # add batch dimension to feed through the loss function
        target = target.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        # calculate and sum losses (can only backprop once in pytorch)
        q1_loss = F.mse_loss(target, q1)
        q2_loss = F.mse_loss(target, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()  # backprop

        # step optimizers
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        # actor loss proportional to the value assigned by critic net 1
        actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state))
        actor_loss = -T.mean(actor_q1_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        # using the soft update rule
        # called at the end of the initializer (with tau=1) to copy the initial
        # network parameters into the targets
        if tau is None:
            tau = self.tau

        # get the named parameters of every network
        actor_params = self.actor.named_parameters()
        critic_1_params = self.critic_1.named_parameters()
        critic_2_params = self.critic_2.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_1_params = self.target_critic_1.named_parameters()
        target_critic_2_params = self.target_critic_2.named_parameters()

        # converting to dicts
        actor_state_dict = dict(actor_params)
        critic_1_state_dict = dict(critic_1_params)
        critic_2_state_dict = dict(critic_2_params)
        target_actor_state_dict = dict(target_actor_params)
        target_critic_1_state_dict = dict(target_critic_1_params)
        target_critic_2_state_dict = dict(target_critic_2_params)

        # overwriting parameters - setting new values
        for name in critic_1_state_dict:
            critic_1_state_dict[name] = tau*critic_1_state_dict[name].clone() + \
                (1-tau)*target_critic_1_state_dict[name].clone()
        for name in critic_2_state_dict:
            critic_2_state_dict[name] = tau*critic_2_state_dict[name].clone() + \
                (1-tau)*target_critic_2_state_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                (1-tau)*target_actor_state_dict[name].clone()

        self.target_critic_1.load_state_dict(critic_1_state_dict)
        self.target_critic_2.load_state_dict(critic_2_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)

    def save_model(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_model(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
class DQN:
    def __init__(
        self,
        input_dims=198,
        n_actions=6,
        gamma=0.1,
        epsilon=0.9,
        lr=0.0005,
        mem_size=10000,
        batch_size=32,
        eps_min=0.01,
        eps_dec=5e-10,
        replace=1000,
        algo="dnqagent",
        env_name="minerai",
        chkpt_dir="tmp/dqn",
    ):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DQNetwork(
            self.lr,
            self.n_actions,
            input_dims=self.input_dims,
            name=self.env_name + "_" + self.algo + "_q_eval",
            chkpt_dir=self.chkpt_dir,
        )
        self.q_next = DQNetwork(
            self.lr,
            self.n_actions,
            input_dims=self.input_dims,
            name=self.env_name + "_" + self.algo + "_q_next",
            chkpt_dir=self.chkpt_dir,
        )
        # self.load_models()

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation], dtype=torch.float).to(
                self.q_eval.device
            )
            actions = self.q_eval.forward(state, self.get_state2(observation))
            action = torch.argmax(actions).item()
        else:
            action = randrange(self.n_actions)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size
        )
        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_ = torch.tensor(new_state).to(self.q_eval.device)
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
                self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = (
            self.epsilon - self.eps_dec
            if self.epsilon > self.eps_min
            else self.eps_min
        )

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states, self.get_state2(states))[indices, actions]
        q_next = self.q_next.forward(states_, self.get_state2(states_)).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def get_state2(self, observation):
        observation = np.array(
            torch.tensor(observation, requires_grad=False).cpu()
        ).reshape(-1, 198)
        for i in range(observation.shape[0]):
            observation[
                i, min(int(observation[i, 192]) + int(observation[i, 193]) * 9, 0)
            ] = 1000
            observation[
                i, min(int(observation[i, 194]) + int(observation[i, 195]) * 9, 0)
            ] = 1000
            observation[
                i, min(int(observation[i, 196]) + int(observation[i, 197]) * 9, 0)
            ] = 1000
            observation[
                i, min(int(observation[i, 189]) + int(observation[i, 190]) * 9, 0)
            ] = 10000
        return (
            torch.tensor([observation], dtype=torch.float, requires_grad=True)
            .to(self.q_eval.device)
            .view(-1, 198)
        )