import numpy as np
import tensorflow as tf  # TF1.x graph-mode API (tf.Session, tf.global_variables_initializer)

# ReplayBuffer, Actor, Critic and OUActionNoise are assumed to be defined
# elsewhere in this repo.


class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 max_size=10000, layer1_size=400, layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.actor = Actor(alpha, n_actions, 'Actor', input_dims, self.sess,
                           layer1_size, layer2_size, env.action_space.high,
                           self.batch_size, ckpt_dir='tmp/ddpg/actor')
        self.critic = Critic(beta, n_actions, 'Critic', input_dims, self.sess,
                             layer1_size, layer2_size, self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')
        self.target_actor = Actor(alpha, n_actions, 'TargetActor', input_dims,
                                  self.sess, layer1_size, layer2_size,
                                  env.action_space.high, self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')
        self.target_critic = Critic(beta, n_actions, 'TargetCritic',
                                    input_dims, self.sess, layer1_size,
                                    layer2_size, self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # Soft-update ops: theta_target <- tau*theta + (1 - tau)*theta_target
        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau)
                + tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]
        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau)
                + tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.sess.run(tf.global_variables_initializer())
        # Hard copy (tau=1) so the target networks start identical.
        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                if first:
                    old_tau = self.tau
                    self.tau = 1.0
                    self.target_actor.sess.run(self.update_actor)
                    self.target_critic.sess.run(self.update_critic)
                    self.tau = old_tau
                else:
                    self.target_critic.sess.run(self.update_critic)
                    self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # The environment returns a two-part observation; add a batch
        # dimension to each part before the forward pass.
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                mu = self.actor.predict(state)
                noise = self.noise()
                mu_prime = mu + noise  # exploration noise
                return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                state, action, reward, new_state, done = \
                    self.memory.sample_buffer(self.batch_size)

                # Target Q-value of new_state, using the target actor's
                # bounded-action forward pass.
                critic_value_ = self.target_critic.predict(
                    new_state, self.target_actor.predict(new_state))
                target = []
                for j in range(self.batch_size):
                    # done is stored as (1 - terminal) in the buffer, so the
                    # bootstrap term vanishes at episode ends.
                    target.append(reward[j]
                                  + self.gamma * critic_value_[j] * done[j])
                target = np.reshape(target, (self.batch_size, 1))

                _ = self.critic.train(state, action, target)  # s_i, a_i, y_i

                # a = mu(s_i)
                a_outs = self.actor.predict(state)
                # gradients of Q w.r.t. actions
                grads = self.critic.get_action_gradients(state, a_outs)
                self.actor.train(state, grads[0])

                # Soft update only; passing first=True here would hard-copy
                # the online networks into the targets on every learn step.
                self.update_target_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
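# Minimal usage sketch for the TF1 agent above. The environment here is
# hypothetical: make_env() and env.observation_dims are stand-ins for whatever
# produces the two-part observation that choose_action() expects, the
# hyperparameters are illustrative, and the ReplayBuffer is assumed to convert
# the done flag into the (1 - terminal) mask consumed in learn().
if __name__ == '__main__':
    env = make_env()                    # hypothetical two-part-observation env
    input_dims = env.observation_dims   # hypothetical attribute: part shapes
    agent = Agent(alpha=0.0001, beta=0.001, input_dims=input_dims,
                  tau=0.001, env=env, batch_size=64)
    for episode in range(1000):
        obs = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(obs)
            new_obs, reward, done, info = env.step(action)  # classic Gym-style step
            agent.remember(obs, action, reward, new_obs, int(done))
            agent.learn()  # one gradient step per environment step
            score += reward
            obs = new_obs
        print('episode', episode, 'score %.2f' % score)
    agent.save_models()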
import time

import numpy as np
import torch
import torch.nn.functional as F

# ReplayBuffer, Actor, Critic and OUActionNoise are assumed to be defined
# elsewhere in this repo.


class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size
        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)
        self.target_actor = Actor(lr_actor, n_states, n_actions,
                                  actor_l1_size, actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)
        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)
        # Hard copy (tau=1) so the target networks start identical.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # add noise to action - for exploration
        mu_prime = mu + torch.tensor(self.noise(),
                                     dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        # Deterministic (noise-free) action, e.g. for evaluation.
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # not enough data in the replay buffer yet
            return
        # sample a random batch of transitions
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # done is stored as (1 - terminal), so the bootstrap term vanishes at
        # episode ends; detach() keeps gradients out of the target networks.
        # (Computed in one vectorized step rather than an element-wise loop,
        # since torch.tensor() on a list of tensors is not supported.)
        target = (reward + self.gamma * critic_value_.view(-1) * done).detach()
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        # maximize Q(s, mu(s)) by minimizing its negative mean
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        # Polyak averaging: theta_target <- tau*theta + (1 - tau)*theta_target
        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
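# Minimal usage sketch for the PyTorch agent above on a standard continuous-
# control task. Hyperparameters are illustrative, the classic Gym API
# (gym < 0.26, reset() returning only the observation) is assumed, and the
# ReplayBuffer is assumed to store done as the (1 - terminal) mask used in
# learn().
import gym

env = gym.make('Pendulum-v1')
agent = Agent(n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0],
              lr_actor=0.000025, lr_critic=0.00025, tau=0.001, gamma=0.99,
              mem_size=1000000, actor_l1_size=400, actor_l2_size=300,
              critic_l1_size=400, critic_l2_size=300, batch_size=64)

for episode in range(250):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(obs)
        new_obs, reward, done, info = env.step(action)
        agent.remember(obs, action, reward, new_obs, int(done))
        agent.learn()  # one gradient step per environment step
        score += reward
        obs = new_obs
    print('episode', episode, 'score %.2f' % score)

agent.save_models()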