import numpy as np
from tensorflow.keras import backend as K

# Assumes the base Agent class plus the project's Actor, Critic, ReplayBuffer and
# OrnsteinUhlenbeck helpers are defined elsewhere and in scope.


class DDPGAgent(Agent):
    def __init__(self, actor_model, tgt_actor_model, critic_model, tgt_critic_model,
                 action_limits, actor_lr=1e-4, critic_lr=1e-3, critic_decay=1e-2,
                 process=None, rb_size=1e6, minibatch_size=64, tau=1e-3, gamma=0.99,
                 warmup_episodes=None, logging=True):
        super(DDPGAgent, self).__init__(warmup_episodes, logging)
        # Online and target actor/critic pairs; targets start as copies of the online nets.
        self.actor = Actor(actor_model, critic_model, lr=actor_lr)
        self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
        self.tgt_actor.set_weights(self.actor.get_weights())
        self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic = Critic(tgt_critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic.set_weights(self.critic.get_weights())
        self.action_limits = action_limits
        self.buffer = ReplayBuffer(rb_size)
        self.minibatch_size = minibatch_size
        self.tau = tau
        self.gamma = gamma
        # Infer state/action dimensions from the critic's two inputs.
        self.state_space = K.int_shape(critic_model.inputs[0])[1]
        self.action_space = K.int_shape(critic_model.inputs[1])[1]
        if process is None:
            # Default exploration noise: Ornstein-Uhlenbeck process.
            self.process = OrnsteinUhlenbeck(x0=np.zeros(self.action_space),
                                             theta=0.15, mu=0, sigma=0.2)
        else:
            self.process = process

    def sense(self, s, a, r, s_new):
        # Store one transition in the replay buffer.
        s = np.reshape(s, [-1, self.state_space])
        s_new = np.reshape(s_new, [-1, self.state_space])
        self.buffer.add((s, a, r, s_new))

    def act(self, s):
        s = np.reshape(s, [-1, self.state_space])
        a = self.actor(s)  # act with the online actor, not the target actor
        # Cache for logging.
        self.last_state = np.copy(s)
        self.last_action = np.copy(a)
        if self.learning_phase:
            # Add exploration noise and clip to the valid action range.
            a = self.process(a)
            a = np.clip(a, self.action_limits[0], self.action_limits[1])
        self.last_action_noisy = np.copy(a)
        return a[0]

    def train_step(self):
        minibatch = self.buffer.sample(self.minibatch_size)
        states = np.zeros([len(minibatch), self.state_space])
        states_new = np.zeros([len(minibatch), self.state_space])
        actions = np.zeros([len(minibatch), self.action_space])
        r = np.zeros([len(minibatch), 1])
        for i in range(len(minibatch)):
            states[i], actions[i], r[i], states_new[i] = minibatch[i]

        critic_out = self.critic(states_new, self.actor(states_new))
        tgt_critic_out = self.tgt_critic(states_new, self.tgt_actor(states_new))

        if self.logging:
            log = [('s', self.last_state),
                   ('a', self.last_action),
                   ('a_noisy', self.last_action_noisy),
                   ('q', self.critic(self.last_state, self.last_action)),
                   ('q_tgt', self.tgt_critic(self.last_state, self.last_action)),
                   ('mse', np.mean(np.square(critic_out - tgt_critic_out)))]
            self.add_log(log)

        # Bellman targets from the target networks.
        ys = r + self.gamma * tgt_critic_out
        # Update the critic by minimizing the TD loss.
        loss = self.critic.step(states, actions, ys)
        # Update the actor using the sampled policy gradient.
        self.actor.step(states)

        # Soft (Polyak) update of the target networks.
        critic_weights = self.critic.get_weights()
        tgt_critic_weights = self.tgt_critic.get_weights()
        actor_weights = self.actor.get_weights()
        tgt_actor_weights = self.tgt_actor.get_weights()
        for i in range(len(critic_weights)):
            tgt_critic_weights[i] = (1 - self.tau) * tgt_critic_weights[i] + \
                self.tau * critic_weights[i]
        self.tgt_critic.set_weights(tgt_critic_weights)
        for i in range(len(actor_weights)):
            tgt_actor_weights[i] = (1 - self.tau) * tgt_actor_weights[i] + \
                self.tau * actor_weights[i]
        self.tgt_actor.set_weights(tgt_actor_weights)
        return loss

    def new_episode(self):
        self.process.clear()
        if self.logging:
            self.logs.append({})
            if len(self.logs) == 1:
                self.logs[-1]['episode'] = 1  # Initial episode.
            else:
                self.logs[-1]['episode'] = self.logs[-2]['episode'] + 1

    def save_weights(self, actor_suffix, critic_suffix):
        self.actor.save_model_weights(actor_suffix)
        self.critic.save_model_weights(critic_suffix)
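The `DDPGAgent` above leans on two helpers that are defined elsewhere in the project: an `OrnsteinUhlenbeck` noise process (constructed with `x0`, `theta`, `mu`, `sigma`, called with the current action, and reset with `clear()`) and a `ReplayBuffer` (with `add(transition)` and `sample(n)`). As a point of reference, here is a minimal sketch of what those two classes could look like, assuming only the interfaces implied by the call sites; the actual project implementations may differ.

import random
from collections import deque

import numpy as np


class OrnsteinUhlenbeck:
    """Temporally correlated exploration noise; called with an action, returns the noisy action."""

    def __init__(self, x0, theta=0.15, mu=0.0, sigma=0.2):
        self.x0 = np.asarray(x0, dtype=float)
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.clear()

    def clear(self):
        # Reset the noise state at the start of each episode.
        self.x = np.copy(self.x0)

    def __call__(self, a):
        # One Euler step of dx = theta * (mu - x) dt + sigma dW, with dt = 1.
        self.x = self.x + self.theta * (self.mu - self.x) \
            + self.sigma * np.random.randn(*self.x.shape)
        return a + self.x


class ReplayBuffer:
    """Fixed-size FIFO transition store with uniform random sampling."""

    def __init__(self, size):
        self.buffer = deque(maxlen=int(size))

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        # Sample without replacement; cap at the current buffer size.
        return random.sample(self.buffer, min(int(n), len(self.buffer)))

The noise state decays back toward `mu` between calls, which gives smoother, temporally correlated exploration than independent Gaussian noise; clearing it at every `new_episode()` keeps episodes statistically independent.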
import tensorflow as tf

# A second, self-contained DDPG agent in eager TensorFlow 2 style, independent of the
# DDPGAgent above. It assumes its own Actor, Critic and RBuffer classes defined elsewhere.


class Agent:
    def __init__(self, env, hparams):
        n_action = len(env.action_space.high)
        self.actor_main = Actor(n_action, hparams)
        self.actor_target = Actor(n_action, hparams)
        self.critic_main = Critic(hparams)
        self.critic_target = Critic(hparams)
        self.batch_size = 64
        self.n_actions = n_action
        self.a_opt = tf.keras.optimizers.Adam(hparams['lr'])
        self.c_opt = tf.keras.optimizers.Adam(hparams['lr'])
        self.memory = RBuffer(100_000, env.observation_space.shape, n_action)
        self.trainstep = 0
        self.replace = 5          # interval (in train steps) between hard target updates
        self.gamma = 0.99
        self.min_action = env.action_space.low[0]
        self.max_action = env.action_space.high[0]

    def act(self, state, evaluate=False):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor_main(state)
        if not evaluate:
            # Gaussian exploration noise during training.
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)
        # Scale to the environment's action range, then clip so the result stays inside it
        # (clipping before scaling could push actions outside the limits).
        actions = tf.clip_by_value(self.max_action * actions,
                                   self.min_action, self.max_action)
        return actions[0]

    def savexp(self, state, next_state, action, done, reward):
        self.memory.storexp(state, next_state, action, done, reward)

    def update_target(self):
        # Hard update: copy the online weights into the target networks.
        self.actor_target.set_weights(self.actor_main.get_weights())
        self.critic_target.set_weights(self.critic_main.get_weights())

    def train(self):
        if self.memory.cnt < self.batch_size:
            return
        states, next_states, rewards, actions, dones = self.memory.sample(self.batch_size)
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            target_actions = self.actor_target(next_states)
            target_next_state_values = tf.squeeze(
                self.critic_target(next_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic_main(states, actions), 1)
            # Assumes the buffer stores (1 - done), so terminal transitions get no bootstrap term.
            target_values = rewards + self.gamma * target_next_state_values * dones
            critic_loss = tf.keras.losses.MSE(target_values, critic_value)
            new_policy_actions = self.actor_main(states)
            # Maximize Q(s, pi(s)) by minimizing its negative.
            actor_loss = -self.critic_main(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)
        grads1 = tape1.gradient(actor_loss, self.actor_main.trainable_variables)
        grads2 = tape2.gradient(critic_loss, self.critic_main.trainable_variables)
        self.a_opt.apply_gradients(zip(grads1, self.actor_main.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic_main.trainable_variables))
        if self.trainstep % self.replace == 0:
            self.update_target()
        self.trainstep += 1
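This second agent follows the same DDPG recipe but uses Gaussian exploration noise and periodic hard target updates instead of Ornstein-Uhlenbeck noise and Polyak averaging. Below is a minimal training-loop sketch showing how its public API (`act`, `savexp`, `train`) fits together. The environment name, episode count, and `hparams` contents are assumptions for illustration (only `'lr'` is read by the `Agent` shown here; the `Actor`, `Critic`, and `RBuffer` constructors defined elsewhere may need more keys), and it assumes the classic Gym API where `reset()` returns only the observation and `step()` returns a 4-tuple.

import gym

env = gym.make('Pendulum-v1')          # illustrative choice of environment
agent = Agent(env, hparams={'lr': 1e-3})

for episode in range(200):
    state = env.reset()
    done = False
    episode_return = 0.0
    while not done:
        action = agent.act(state)                      # noisy action for exploration
        next_state, reward, done, _ = env.step(action.numpy())
        agent.savexp(state, next_state, action, done, reward)
        agent.train()                                  # one gradient step per env step
        state = next_state
        episode_return += reward
    print(f'episode {episode}: return {episode_return:.1f}')

For evaluation runs, call `agent.act(state, evaluate=True)` so the exploration noise is skipped and the deterministic policy is used.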