import torch

# `Actor` is the repo's policy network (a sketch of the assumed
# interface follows this class).


class TD3RolloutActor:
    """CPU-side copy of the policy used for environment rollouts.

    Kept in eval mode and periodically synced from the training actor so
    that environment interaction never touches the GPU model.
    """

    def __init__(self, state_dim, action_dim, action_max, exploration_noise):
        self.actor = Actor(state_dim, 256, action_dim, action_max).eval()
        self.action_max = action_max
        self.exploration_noise = exploration_noise

    def select_action(self, state):
        # Noisy action for exploration, clipped to the valid action range.
        state = torch.tensor(state.reshape(1, -1), dtype=torch.float)
        with torch.no_grad():
            action = self.actor(state)
        noise = torch.randn_like(action) * self.exploration_noise
        action = (action + noise).clamp(-self.action_max, self.action_max)
        return action.cpu().numpy().flatten()

    def deterministic_action(self, state):
        # Noise-free action for evaluation.
        state = torch.tensor(state.reshape(1, -1), dtype=torch.float)
        with torch.no_grad():
            action = self.actor(state)
        return action.cpu().numpy().flatten()

    def parameters(self):
        return self.actor.parameters()
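# The `Actor` used above is not shown in this section. Below is a minimal
# sketch of the assumed interface, matching the call
# Actor(state_dim, hidden_dim, action_dim, action_max): an MLP whose tanh
# output is scaled into [-action_max, action_max]. The layer count and
# activations here are assumptions, not the repo's actual definition.
import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, action_max):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, action_dim), nn.Tanh(),
        )
        self.action_max = action_max

    def forward(self, state):
        # Scale the tanh output into the environment's action range.
        return self.net(state) * self.action_max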
import os

import numpy as np
import tensorflow as tf

# `Buffer`, `GaussianNoise`, `Actor` and `Critic` are the repo's own
# Keras-based helper classes.


class DDPGAgent:
    def __init__(self, state_space_dim, action_space_dim, min_action_val,
                 max_action_val, hidden_layer_size=512, gamma=0.99,
                 tau=0.0001, path_to_load=None):
        self.gamma = gamma
        self.tau = tau
        self.min_action_val = min_action_val
        self.max_action_val = max_action_val
        self.buffer = Buffer(state_space_dim, action_space_dim)
        self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

        self.actor = Actor(state_space_dim, action_space_dim,
                           max_action_val, hidden_layer_size)
        self.critic = Critic(state_space_dim, action_space_dim,
                             hidden_layer_size)

        if path_to_load is not None:
            if os.path.exists(path_to_load + "_actor.h5") and \
                    os.path.exists(path_to_load + "_critic.h5"):
                self.load(path_to_load)

        # Targets start as exact copies of the online networks.
        self.target_actor = Actor(state_space_dim, action_space_dim,
                                  max_action_val, hidden_layer_size)
        self.target_critic = Critic(state_space_dim, action_space_dim,
                                    hidden_layer_size)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

        critic_lr = 0.002
        actor_lr = 0.001
        self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    @tf.function
    def _apply_gradients(self, states, actions, next_states, rewards):
        # Critic update: regress Q(s, a) onto the bootstrapped target.
        # Note: the buffer stores no done flags, so terminal transitions
        # are not masked out of the target.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor.forward(next_states)
            y = tf.cast(rewards, tf.float32) + self.gamma * \
                self.target_critic.forward([next_states, target_actions])
            critic_value = self.critic.forward([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.model.trainable_variables))

        # Actor update: ascend the critic's value of the actor's actions.
        with tf.GradientTape() as tape:
            actions = self.actor.forward(states)
            critic_value = self.critic.forward([states, actions])
            actor_loss = -tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss,
                                   self.actor.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.model.trainable_variables))

    def learn(self):
        states, actions, next_states, rewards = self.buffer.sample()
        self._apply_gradients(states, actions, next_states, rewards)

    def remember_step(self, info):
        self.buffer.remember(info)

    def update_targets(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        new_weights = []
        target_variables = self.target_critic.model.weights
        for i, variable in enumerate(self.critic.model.weights):
            new_weights.append(variable * self.tau +
                               target_variables[i] * (1 - self.tau))
        self.target_critic.model.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.model.weights
        for i, variable in enumerate(self.actor.model.weights):
            new_weights.append(variable * self.tau +
                               target_variables[i] * (1 - self.tau))
        self.target_actor.model.set_weights(new_weights)

    def get_best_action(self, state):
        tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        return tf.squeeze(self.actor.forward(tf_state)).numpy()

    def get_action(self, state):
        # Exploration: Gaussian noise on top of the greedy action,
        # clipped to the valid action range.
        actions = self.get_best_action(state) + self.noise_generator.get_noise()
        return np.clip(actions, self.min_action_val, self.max_action_val)

    def save(self, path):
        print(f"Model has been saved as '{path}'")
        self.actor.save(path)
        self.critic.save(path)

    def load(self, path):
        print(f"Model has been loaded from '{path}'")
        self.actor.load(path)
        self.critic.load(path)
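# A minimal training-loop sketch for DDPGAgent, assuming an old-style
# Gym environment (reset() returns the observation), that Buffer.sample()
# copes with a partially filled buffer, and that remember_step() takes a
# (state, action, reward, next_state) tuple. The environment name, episode
# count, and tuple layout are assumptions, not part of the repo.
import gym

env = gym.make("Pendulum-v1")
agent = DDPGAgent(state_space_dim=env.observation_space.shape[0],
                  action_space_dim=env.action_space.shape[0],
                  min_action_val=float(env.action_space.low[0]),
                  max_action_val=float(env.action_space.high[0]))

for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember_step((state, action, reward, next_state))
        agent.learn()
        agent.update_targets()
        state = next_state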
import time

import numpy as np
import torch
import torch.nn.functional as F

# `ReplayBuffer`, `OUActionNoise`, `Actor` and `Critic` are the repo's
# own PyTorch helper classes.


class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions,
                           actor_l1_size, actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions,
                             critic_l1_size, critic_l2_size)
        self.target_actor = Actor(lr_actor, n_states, n_actions,
                                  actor_l1_size, actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        # tau=1 performs a hard copy so the targets start identical.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # Add Ornstein-Uhlenbeck noise to the action for exploration.
        mu_prime = mu + torch.tensor(self.noise(),
                                     dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        # Deterministic (noise-free) action for evaluation.
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # Not enough data in the replay buffer yet.
            return

        # Sample a random batch of transitions.
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done, dtype=torch.float).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        # Bellman target; `done` is assumed to be stored as a continuation
        # mask (0 at terminal transitions), so it multiplies directly.
        with torch.no_grad():
            target_actions = self.target_actor.forward(new_state)
            critic_value_ = self.target_critic.forward(new_state,
                                                       target_actions)
            target = reward + self.gamma * critic_value_.view(-1) * done
            target = target.view(self.batch_size, 1)

        critic_value = self.critic.forward(state, action)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        # Actor update: ascend the critic's value of the actor's actions.
        self.critic.eval()
        self.actor.optimizer.zero_grad()
        self.actor.train()
        mu = self.actor.forward(state)
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        # Timestamp the checkpoints so repeated saves do not overwrite
        # each other.
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
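# A minimal training-loop sketch for this Agent, assuming an old-style Gym
# environment and that `remember` receives a continuation mask (0 at
# terminal states), matching the target computation in learn(). The
# environment name and hyperparameters are assumptions.
import gym

env = gym.make("LunarLanderContinuous-v2")
agent = Agent(n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0],
              lr_actor=2.5e-5, lr_critic=2.5e-4, tau=0.001, gamma=0.99,
              mem_size=1_000_000, actor_l1_size=400, actor_l2_size=300,
              critic_l1_size=400, critic_l2_size=300, batch_size=64)

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        # Store the continuation mask (0 at terminal states).
        agent.remember(state, action, reward, next_state, 1 - int(done))
        agent.learn()
        state = next_state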
import copy
import os

import torch
import torch.nn.functional as F
import torch.optim as optim

# `Actor`, `Critic` and `TD3RolloutActor` are defined above / elsewhere
# in the repo.


class TD3:
    def __init__(self, device, state_dim, action_dim, action_max, gamma=0.99,
                 tau=0.005, lr=3e-4, policy_noise=0.2, noise_clip=0.5,
                 exploration_noise=0.1, policy_freq=2):
        self.actor = Actor(state_dim, 256, action_dim, action_max).to(device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=lr)

        self.critic = Critic(state_dim, 256, action_dim).to(device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(),
                                           lr=lr)

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.action_max = action_max
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        # CPU-side copy of the policy used for environment rollouts.
        self.rollout_actor = TD3RolloutActor(state_dim, action_dim, action_max,
                                             exploration_noise)
        self.sync_rollout_actor()
        self.iteration_num = 0

    def train(self, replay_buffer, batch_size=256):
        self.iteration_num += 1
        st, nx_st, ac, rw, mask = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # Target policy smoothing: perturb the target action with
            # clipped noise, then clamp to the valid action range.
            noise = (torch.randn_like(ac) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            nx_ac = (self.target_actor(nx_st) + noise).clamp(
                -self.action_max, self.action_max)
            # Clipped double-Q: bootstrap from the smaller target estimate.
            target_q1, target_q2 = self.target_critic(nx_st, nx_ac)
            min_q = torch.min(target_q1, target_q2)
            target_q = rw + mask * self.gamma * min_q

        q1, q2 = self.critic(st, ac)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates: refresh the actor and both targets only
        # every `policy_freq` critic updates.
        if self.iteration_num % self.policy_freq == 0:
            actor_loss = -self.critic.q1(st, self.actor(st)).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            self.sync_rollout_actor()

    def sync_rollout_actor(self):
        # Copy the GPU training weights into the CPU rollout policy.
        for param, target_param in zip(self.actor.parameters(),
                                       self.rollout_actor.parameters()):
            target_param.data.copy_(param.data.cpu())

    def save(self, path):
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
        torch.save(self.target_critic.state_dict(),
                   os.path.join(path, 'target_critic.pth'))
        torch.save(self.critic_optimizer.state_dict(),
                   os.path.join(path, 'critic_optimizer.pth'))
        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.target_actor.state_dict(),
                   os.path.join(path, 'target_actor.pth'))
        torch.save(self.actor_optimizer.state_dict(),
                   os.path.join(path, 'actor_optimizer.pth'))

    def load(self, path):
        self.critic.load_state_dict(
            torch.load(os.path.join(path, 'critic.pth')))
        self.target_critic.load_state_dict(
            torch.load(os.path.join(path, 'target_critic.pth')))
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'critic_optimizer.pth')))
        self.actor.load_state_dict(torch.load(os.path.join(path, 'actor.pth')))
        self.target_actor.load_state_dict(
            torch.load(os.path.join(path, 'target_actor.pth')))
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'actor_optimizer.pth')))
        self.sync_rollout_actor()
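# A minimal interaction/training-loop sketch for TD3, assuming an old-style
# Gym environment and a replay buffer whose sample() returns
# (state, next_state, action, reward, mask) tensors on `device`, with
# mask = 1 - done. The buffer class, its constructor and add() signature,
# the environment name, and the warm-up length are all assumptions.
import gym
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v1")
agent = TD3(device,
            state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_max=float(env.action_space.high[0]))
replay_buffer = ReplayBuffer(env.observation_space.shape[0],
                             env.action_space.shape[0])  # assumed buffer class

state = env.reset()
for step in range(100_000):
    # Rollouts run on the CPU copy of the policy.
    action = agent.rollout_actor.select_action(state)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.add(state, next_state, action, reward, 1.0 - float(done))
    state = env.reset() if done else next_state
    if step >= 1_000:  # warm-up before the first gradient step
        agent.train(replay_buffer, batch_size=256)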