# One-step actor-critic update (TensorFlow): TD(0) targets/errors from the most
# recent transition, with the policy term discounted by gamma**t.
def on_step_end(self):
    data = self.transitions.get()  # contains only one transition
    S, A, R, Snext, dones = data
    batch_size = len(self.transitions)
    batch_shape = (batch_size, )
    gamma, policy, critic = self.gamma, self.policy, self.critic
    t = self.episode_step

    targets, deltas = self.compute_td_zero(data, V=critic.predict)

    with tf.GradientTape() as tape:
        # Policy Objective
        probs = policy(S)
        probs = tf.gather(probs, A.reshape([-1, 1]), batch_dims=1)
        probs = tf.reshape(probs, [-1])
        U.check_shape(probs, batch_shape)
        policy_objective = (gamma**t) * deltas * tf.math.log(probs)
        U.check_shape(policy_objective, batch_shape)
        policy_objective = tf.reduce_mean(policy_objective)
        U.check_shape(policy_objective, ())

        # Critic Loss
        V = critic(S)
        V = tf.reshape(V, [-1])
        U.check_shape(V, batch_shape)
        critic_loss = tf.reduce_mean(tf.square(targets - V))
        U.check_shape(critic_loss, ())

        # Total Loss
        loss = -policy_objective + critic_loss

    grads = tape.gradient(loss, self.parameters)
    self.optimizer.apply_gradients(zip(grads, self.parameters))
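# The methods in this listing rely on a few helpers that are not shown:
# `U.check_shape` and `self.compute_td_zero` here (plus `compute_gae` and
# `compute_returns`, sketched further below). The sketches that follow are
# assumptions inferred from the call sites, not the original implementations,
# written as they would appear on the agent class.

def check_shape(x, expected_shape):
    # Sketch of U.check_shape: assert that a tensor/array has the expected
    # shape so that shape bugs fail loudly at the call site.
    actual = tuple(x.shape)
    assert actual == tuple(expected_shape), (
        f"expected shape {tuple(expected_shape)}, got {actual}")

def compute_td_zero(self, data, V):
    # Sketch of compute_td_zero: one-step (TD(0)) bootstrapped targets and
    # TD errors. `V` is assumed to return state values with gradients already
    # stopped/detached, and `dones` is assumed to behave as a 0/1 float array.
    S, A, R, Snext, dones = data
    targets = R + self.gamma * V(Snext).flatten() * (1.0 - dones)
    deltas = targets - V(S).flatten()
    return targets, deltas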
# PPO clipped-surrogate update on one minibatch.
def optimize(self, batch):
    S, A, old_probs, advantages, targets = batch
    batch_size = S.shape[0]
    batch_shape = (batch_size, )
    epsilon, policy, critic = self.epsilon, self.policy, self.critic

    with nn.GradientTape() as tape:
        # Policy Objective
        probs = policy(S).gather(A, batch_dims=1).flatten()
        U.check_shape(probs, batch_shape)
        ratios = probs / old_probs
        Lcpi = ratios * advantages
        Lclip = ratios.clip(1 - epsilon, 1 + epsilon) * advantages
        policy_objective = Lcpi.minimum(Lclip)
        U.check_shape(policy_objective, batch_shape)
        policy_objective = policy_objective.mean()
        U.check_shape(policy_objective, ())

        # Critic Loss
        V = critic(S).flatten()
        U.check_shape(V, batch_shape)
        critic_loss = (targets - V).pow(2).mean()
        U.check_shape(critic_loss, ())

        # Total Loss
        loss = -policy_objective + critic_loss

    grads = tape.gradient(loss, self.parameters)
    self.optimizer.apply_gradients(zip(grads, self.parameters))
# PPO clipped-surrogate update on one minibatch (TensorFlow).
def optimize(self, S, A, old_probs, advantages, targets):
    batch_size = len(A)
    batch_shape = (batch_size, )
    epsilon, policy, critic = self.epsilon, self.policy, self.critic

    with tf.GradientTape() as tape:
        # Policy Objective
        probs = policy(S)
        probs = tf.gather(probs, A.reshape([-1, 1]), batch_dims=1)
        probs = tf.reshape(probs, [-1])
        U.check_shape(probs, batch_shape)
        ratios = probs / old_probs
        Lcpi = ratios * advantages
        Lclip = tf.clip_by_value(ratios, 1 - epsilon, 1 + epsilon) * advantages
        policy_objective = tf.minimum(Lcpi, Lclip)
        U.check_shape(policy_objective, batch_shape)
        policy_objective = tf.reduce_mean(policy_objective)
        U.check_shape(policy_objective, ())

        # Critic Loss
        V = critic(S)
        V = tf.reshape(V, [-1])
        U.check_shape(V, batch_shape)
        critic_loss = tf.reduce_mean(tf.square(targets - V))
        U.check_shape(critic_loss, ())

        # Total Loss
        loss = -policy_objective + critic_loss

    grads = tape.gradient(loss, self.parameters)
    self.optimizer.apply_gradients(zip(grads, self.parameters))
# PPO clipped-surrogate update on one minibatch (PyTorch).
def optimize(self, batch):
    S, A, old_probs, advantages, targets = batch
    batch_size = S.shape[0]
    batch_shape = (batch_size, )
    epsilon, policy, critic, optimizer = self.epsilon, self.policy, self.critic, self.optimizer

    # Policy Objective
    probs = policy(S).gather(1, A).flatten()
    U.check_shape(probs, batch_shape)
    ratios = probs / old_probs
    Lcpi = ratios * advantages
    Lclip = ratios.clamp(1 - epsilon, 1 + epsilon) * advantages
    policy_objective = torch.min(Lcpi, Lclip)
    U.check_shape(policy_objective, batch_shape)
    policy_objective = policy_objective.mean()
    U.check_shape(policy_objective, ())

    # Critic Loss
    V = critic(S).flatten()
    U.check_shape(V, batch_shape)
    critic_loss = (targets - V).pow(2).mean()
    U.check_shape(critic_loss, ())

    # Total Loss
    loss = -policy_objective + critic_loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# PPO data preparation and training loop: old action probabilities, TD(0)
# targets, GAE advantages, then several epochs of shuffled minibatch updates.
def learn(self, data):
    S, A, R, Snext, dones = data
    S, A = nn.tensors((S, A))
    A = A.reshape([-1, 1])
    T = S.shape[0]
    batch_shape = (T, )
    gamma, lambd = self.gamma, self.lambd
    policy, old, critic = self.policy, self.old, self.critic

    old_probs = old(S).detach().gather(A, batch_dims=1).flatten()
    U.check_shape(old_probs, batch_shape)

    targets, deltas = self.compute_td_zero(data, V=lambda x: critic(x).detach())
    advantages = self.compute_gae(deltas=deltas.numpy(), dones=dones)
    advantages = nn.tensor(advantages)

    indices = np.arange(T)
    for _ in range(self.epochs):
        np.random.shuffle(indices)
        for batch in np.array_split(indices, self.batch_size):
            batch = nn.tensor(batch)
            self.optimize((S.gather(batch),
                           A.gather(batch),
                           old_probs.gather(batch),
                           advantages.gather(batch),
                           targets.gather(batch)))
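# `compute_gae` above is not shown either; the sketch below is an assumption
# based on its call sites (per-step TD errors `deltas` plus `dones` in, one
# advantage per step out): the standard GAE(lambda) backward recursion,
# written as it would appear on the agent class.

import numpy as np

def compute_gae(self, deltas, dones):
    # A[t] = delta[t] + gamma * lambda * (1 - done[t]) * A[t+1], computed
    # backwards over the rollout; episode boundaries reset the accumulator.
    deltas = np.asarray(deltas, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + self.gamma * self.lambd * (1 - dones[t]) * running
        advantages[t] = running
    return advantages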
# DQN update, with optional Double-DQN action selection (TensorFlow).
def learn(self, data):
    S, A, R, Snext, dones = data
    dones = tf.cast(dones, 'float32')
    batch_shape = (S.shape[0], )
    gamma, model, target = self.gamma, self.model, self.target

    Qtarget = tf.stop_gradient(target(Snext))
    if self.double:
        Qnext = tf.stop_gradient(model(Snext))
        Amax = tf.argmax(Qnext, axis=-1)
        Qmax = tf.gather(Qtarget, tf.reshape(Amax, [-1, 1]), batch_dims=1)
        Qmax = tf.reshape(Qmax, [-1])
    else:
        Qmax = tf.reduce_max(Qtarget, axis=-1)
    U.check_shape(Qmax, batch_shape)

    targets = R + gamma * Qmax * (1 - dones)
    U.check_shape(targets, batch_shape)

    with tf.GradientTape() as tape:
        Q = model(S)
        Q = tf.gather(Q, tf.reshape(A, [-1, 1]), batch_dims=1)
        Q = tf.reshape(Q, [-1])
        U.check_shape(Q, batch_shape)
        loss = tf.reduce_mean(tf.square(targets - Q))
        U.check_shape(loss, ())

    grads = tape.gradient(loss, model.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
# REINFORCE with an optional value baseline (TensorFlow): one gradient step
# per timestep of the completed episode, using Monte Carlo returns.
def on_episode_end(self):
    batch_size = len(self.transitions)
    data = self.transitions.get()
    self.transitions.reset()
    batch_shape = (batch_size, )
    S, A, R, Snext, dones = data
    gamma, policy, baseline = self.gamma, self.policy, self.baseline

    if baseline:
        V = baseline.predict(S).flatten()
    else:
        V = np.zeros_like(R)
    U.check_shape(V, batch_shape)

    G, T = self.compute_returns(R), len(R)
    for t in range(T):
        s, a, g = S[t], A[t], G[t]
        delta = g - V[t]

        with tf.GradientTape() as tape:
            # Policy Objective
            probs = policy(s[None])[0]
            p = probs[a]
            policy_objective = (gamma**t) * delta * tf.math.log(p)
            U.check_shape(policy_objective, ())

            # Baseline Loss
            if baseline:
                v = baseline(s[None])[0][0]
                baseline_loss = tf.square(g - v)
                U.check_shape(baseline_loss, ())
            else:
                baseline_loss = 0

            # Total Loss
            loss = -policy_objective + baseline_loss

        grads = tape.gradient(loss, self.parameters)
        self.optimizer.apply_gradients(zip(grads, self.parameters))
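# `compute_returns` is not shown; the sketch below assumes it is the plain
# discounted return-to-go G[t] = R[t] + gamma * G[t+1] over one rollout
# (callers that need bootstrapping patch R[-1] before calling it), returning
# a NumPy array. Written as it would appear on the agent class.

import numpy as np

def compute_returns(self, R):
    R = np.asarray(R, dtype=np.float32)
    G = np.zeros_like(R)
    running = 0.0
    for t in reversed(range(len(R))):
        running = R[t] + self.gamma * running
        G[t] = running
    return G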
# PPO data preparation and training loop (PyTorch).
def learn(self, data):
    data = [torch.from_numpy(v) for v in data]
    S, A, R, Snext, dones = data
    A = A.long().reshape([-1, 1])
    T = len(R)
    batch_shape = (T, )
    gamma, lambd = self.gamma, self.lambd
    policy, old, critic = self.policy, self.old, self.critic

    old_probs = old(S).detach().gather(1, A).flatten()
    U.check_shape(old_probs, batch_shape)

    targets, deltas = self.compute_td_zero(data, V=lambda x: critic(x).detach())
    advantages = self.compute_gae(deltas=deltas, dones=dones)
    advantages = torch.from_numpy(advantages)

    for _ in range(self.epochs):
        indices = torch.randperm(T)
        for batch in torch.split(indices, self.batch_size):
            self.optimize((S[batch],
                           A[batch],
                           old_probs[batch],
                           advantages[batch],
                           targets[batch]))
# PPO data preparation and training loop (NumPy / .predict interface).
def learn(self, data):
    S, A, R, Snext, dones = data
    T = len(R)
    batch_shape = (T, )
    gamma, lambd = self.gamma, self.lambd
    policy, old, critic = self.policy, self.old, self.critic

    old_probs = old.predict(S)
    old_probs = old_probs[range(old_probs.shape[0]), A]
    U.check_shape(old_probs, batch_shape)

    targets, deltas = self.compute_td_zero(data, V=critic.predict)
    advantages = self.compute_gae(deltas=deltas, dones=dones)

    indices = np.arange(T)
    for _ in range(self.epochs):
        np.random.shuffle(indices)
        for batch in np.array_split(indices, self.batch_size):
            self.optimize(S=S[batch],
                          A=A[batch],
                          old_probs=old_probs[batch],
                          advantages=advantages[batch],
                          targets=targets[batch])
# DQN update, with optional Double-DQN action selection (PyTorch).
def learn(self, data):
    S, A, R, Snext, dones = [torch.from_numpy(v) for v in data]
    A = A.long()
    dones = dones.float()
    batch_shape = (S.shape[0], )
    gamma, model, target, optimizer = self.gamma, self.model, self.target, self.optimizer

    Qtarget = target(Snext).detach()
    if self.double:
        Amax = model(Snext).detach().argmax(-1)
        Qmax = Qtarget.gather(1, Amax.reshape([-1, 1])).flatten()
    else:
        Qmax = Qtarget.max(-1).values  # .max(-1) returns (values, indices)
    U.check_shape(Qmax, batch_shape)

    targets = R + gamma * Qmax * (1 - dones)
    U.check_shape(targets, batch_shape)

    Q = model(S).gather(1, A.reshape([-1, 1])).flatten()
    U.check_shape(Q, batch_shape)
    loss = (targets - Q).pow(2).mean()
    U.check_shape(loss, ())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# DQN update, with optional Double-DQN action selection.
def learn(self, data):
    S, A, R, Snext, dones = nn.tensors(data)
    dones = dones.cast('float32')
    batch_shape = (S.shape[0], )
    gamma, model, target = self.gamma, self.model, self.target

    Qtarget = target(Snext).detach()
    if self.double:
        Amax = model(Snext).detach().argmax(-1)
        Qmax = Qtarget.gather(Amax.reshape([-1, 1]), batch_dims=1).flatten()
    else:
        Qmax = Qtarget.max(-1)
    U.check_shape(Qmax, batch_shape)

    targets = R + gamma * Qmax * (1 - dones)
    U.check_shape(targets, batch_shape)

    with nn.GradientTape() as tape:
        Q = model(S).gather(A.reshape([-1, 1]), batch_dims=1).flatten()
        U.check_shape(Q, batch_shape)
        loss = (targets - Q).square().mean()
        U.check_shape(loss, ())

    grads = tape.gradient(loss, model.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
# Train the predictor network to match a target network's outputs on
# standardized inputs; the per-sample errors are returned (e.g. as a
# novelty/curiosity signal, random-network-distillation style).
def step(self, x):
    x_shape, batch_shape = x.shape, (x.shape[0], )
    x = (x - tf.reduce_mean(x, axis=0)) / tf.math.reduce_std(x, axis=0)
    # x = tf.clip_by_value(x, -5, 5)
    U.check_shape(x, x_shape)
    target, model = self.target, self.model

    targets = tf.stop_gradient(target(x))
    with tf.GradientTape() as tape:
        predictions = model(x)
        errors = tf.reduce_sum(tf.square(targets - predictions), axis=-1)
        U.check_shape(errors, batch_shape)
        loss = tf.reduce_mean(errors)
        U.check_shape(loss, ())

    grads = tape.gradient(loss, model.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return errors
# Actor-critic worker update (PyTorch): computes gradients locally from one
# batch of transitions, sends them to the central learner, and receives
# updated parameters.
def learn(self):
    batch_size = len(self.transitions)
    data = self.transitions.get()
    self.transitions.reset()
    data = [torch.from_numpy(v) for v in data]
    S, A, R, Snext, dones = data
    A = A.long().reshape([-1, 1])
    batch_shape = (batch_size, )
    gamma, policy, critic = self.gamma, self.policy, self.critic

    # If last state is not terminal then bootstrap from it
    if not dones[-1]:
        R[-1] += gamma * critic(
            Snext[-1:])[0][0].detach().numpy()  # handle batching

    G = self.compute_returns(R)
    G = torch.from_numpy(G)
    deltas = G - critic(S).detach().flatten()
    U.check_shape(deltas, batch_shape)

    # Policy Objective
    probs = policy(S).gather(1, A).flatten()
    U.check_shape(probs, batch_shape)
    policy_objective = deltas * probs.log()
    U.check_shape(policy_objective, batch_shape)
    policy_objective = policy_objective.mean()
    U.check_shape(policy_objective, ())

    # Critic Loss
    V = critic(S).flatten()
    U.check_shape(V, batch_shape)
    critic_loss = (G - V).pow(2).mean()
    U.check_shape(critic_loss, ())

    # Total Loss
    loss = -policy_objective + critic_loss

    self.optimizer.zero_grad()
    loss.backward()
    grads = self.get_gradients()
    self.send_gradients(grads)
    self.receive_parameters()
# Actor-critic worker update (TensorFlow): computes gradients locally, sends
# them to the central learner, and receives updated parameters.
def learn(self):
    batch_size = len(self.transitions)
    data = self.transitions.get()
    self.transitions.reset()
    S, A, R, Snext, dones = data
    batch_shape = (batch_size, )
    gamma, policy, critic = self.gamma, self.policy, self.critic

    # If last state is not terminal then bootstrap from it
    if not dones[-1]:
        R[-1] += gamma * critic.predict(
            Snext[-1:])[0][0]  # handle batching

    G = self.compute_returns(R)
    deltas = G - critic.predict(S).flatten()
    U.check_shape(deltas, batch_shape)

    with tf.GradientTape() as tape:
        # Policy Objective
        probs = policy(S)
        probs = tf.gather(probs, A.reshape([-1, 1]), batch_dims=1)
        probs = tf.reshape(probs, [-1])
        U.check_shape(probs, batch_shape)
        policy_objective = deltas * tf.math.log(probs)
        U.check_shape(policy_objective, batch_shape)
        policy_objective = tf.reduce_mean(policy_objective)
        U.check_shape(policy_objective, ())

        # Critic Loss
        V = critic(S)
        V = tf.reshape(V, [-1])
        U.check_shape(V, batch_shape)
        critic_loss = tf.reduce_mean(tf.square(G - V))
        U.check_shape(critic_loss, ())

        # Total Loss
        loss = -policy_objective + critic_loss

    grads = tape.gradient(loss, self.parameters)
    self.send_gradients(grads)
    self.receive_parameters()
# Actor-critic worker update: computes gradients locally, sends them to the
# central learner, and receives updated parameters.
def learn(self):
    batch_size = len(self.transitions)
    data = self.transitions.get()
    self.transitions.reset()
    S, A, R, Snext, dones = data
    A = A.reshape([-1, 1])
    batch_shape = (batch_size, )
    gamma, policy, critic = self.gamma, self.policy, self.critic

    # If last state is not terminal then bootstrap from it
    if not dones[-1]:
        R[-1] += gamma * critic(
            Snext[-1:])[0][0].numpy()  # handle batching

    G = self.compute_returns(R)
    deltas = G - critic(S).detach().flatten()
    U.check_shape(deltas, batch_shape)

    with nn.GradientTape() as tape:
        # Policy Objective
        probs = policy(S).gather(A, batch_dims=1).flatten()
        U.check_shape(probs, batch_shape)
        policy_objective = deltas * probs.log()
        U.check_shape(policy_objective, batch_shape)
        policy_objective = policy_objective.mean()
        U.check_shape(policy_objective, ())

        # Critic Loss
        V = critic(S).flatten()
        U.check_shape(V, batch_shape)
        critic_loss = (G - V).pow(2).mean()
        U.check_shape(critic_loss, ())

        # Total Loss
        loss = -policy_objective + critic_loss

    grads = tape.gradient(loss, self.parameters)
    self.send_gradients(grads)
    self.receive_parameters()