Example #1
 def on_step_end(self):
     data = self.transitions.get()  # contains only one transition
     S, A, R, Snext, dones = data
     batch_size = len(self.transitions)
     batch_shape = (batch_size, )
     gamma, policy, critic = self.gamma, self.policy, self.critic
     t = self.episode_step
     targets, deltas = self.compute_td_zero(data, V=critic.predict)
     with tf.GradientTape() as tape:
         # Policy Objective
         probs = policy(S)
         probs = tf.gather(probs, A.reshape([-1, 1]), batch_dims=1)
         probs = tf.reshape(probs, [-1])
         U.check_shape(probs, batch_shape)
         policy_objective = (gamma**t) * deltas * tf.math.log(probs)
         U.check_shape(policy_objective, batch_shape)
         policy_objective = tf.reduce_mean(policy_objective)
         U.check_shape(policy_objective, ())
         # Critic Loss
         V = critic(S)
         V = tf.reshape(V, [-1])
         U.check_shape(V, batch_shape)
         critic_loss = tf.reduce_mean(tf.square(targets - V))
         U.check_shape(critic_loss, ())
         # Total Loss
         loss = -policy_objective + critic_loss
     grads = tape.gradient(loss, self.parameters)
     self.optimizer.apply_gradients(zip(grads, self.parameters))
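Example #1 calls a compute_td_zero helper that is not shown in any of these snippets (Examples #5, #8 and #9 use it as well). A minimal sketch of what it presumably returns, one-step TD targets and TD errors, assuming NumPy inputs and a gamma attribute, could look like this; treat the signature as an assumption rather than the original implementation:

 def compute_td_zero(self, data, V):
     # Hypothetical helper (not in the original code): one-step TD targets
     # and TD errors (deltas) from a value-function callable V.
     S, A, R, Snext, dones = data
     values = V(S).flatten()            # V(s_t)
     next_values = V(Snext).flatten()   # V(s_{t+1})
     targets = R + self.gamma * next_values * (1 - dones)
     deltas = targets - values          # TD errors, used as advantages
     return targets, deltas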
Example #2
 def optimize(self, batch):
     S, A, old_probs, advantages, targets = batch
     batch_size = S.shape[0]
     batch_shape = (batch_size, )
     epsilon, policy, critic = self.epsilon, self.policy, self.critic
     with nn.GradientTape() as tape:
         # Policy Objective
         probs = policy(S).gather(A, batch_dims=1).flatten()
         U.check_shape(probs, batch_shape)
         ratios = probs / old_probs
         Lcpi = ratios * advantages
         Lclip = ratios.clip(1 - epsilon, 1 + epsilon) * advantages
         policy_objective = Lcpi.minimum(Lclip)
         U.check_shape(policy_objective, batch_shape)
         policy_objective = policy_objective.mean()
         U.check_shape(policy_objective, ())
         # Critic Loss
         V = critic(S).flatten()
         U.check_shape(V, batch_shape)
         critic_loss = (targets - V).pow(2).mean()
         U.check_shape(critic_loss, ())
         # Total Loss
         loss = -policy_objective + critic_loss
     grads = tape.gradient(loss, self.parameters)
     self.optimizer.apply_gradients(zip(grads, self.parameters))
Example #3
 def optimize(self, S, A, old_probs, advantages, targets):
     batch_size = len(A)
     batch_shape = (batch_size, )
     epsilon, policy, critic = self.epsilon, self.policy, self.critic
     with tf.GradientTape() as tape:
         # Policy Objective
         probs = policy(S)
         probs = tf.gather(probs, A.reshape([-1, 1]), batch_dims=1)
         probs = tf.reshape(probs, [-1])
         U.check_shape(probs, batch_shape)
         ratios = probs / old_probs
         Lcpi = ratios * advantages
         Lclip = tf.clip_by_value(ratios, 1 - epsilon,
                                  1 + epsilon) * advantages
         policy_objective = tf.minimum(Lcpi, Lclip)
         U.check_shape(policy_objective, batch_shape)
         policy_objective = tf.reduce_mean(policy_objective)
         U.check_shape(policy_objective, ())
         # Critic Loss
         V = critic(S)
         V = tf.reshape(V, [-1])
         U.check_shape(V, batch_shape)
         critic_loss = tf.reduce_mean(tf.square(targets - V))
         U.check_shape(critic_loss, ())
         # Total Loss
         loss = -policy_objective + critic_loss
     grads = tape.gradient(loss, self.parameters)
     self.optimizer.apply_gradients(zip(grads, self.parameters))
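Every example guards its intermediate tensors with U.check_shape. The U module is not included here, but the calls suggest a simple shape assertion along these lines (a sketch under that assumption, not the actual utility):

 def check_shape(tensor, expected_shape):
     # Hypothetical shape assertion: works for NumPy arrays and for
     # TensorFlow / PyTorch tensors, whose .shape converts to a tuple.
     actual = tuple(tensor.shape)
     expected = tuple(expected_shape)
     assert actual == expected, f'shape mismatch: got {actual}, expected {expected}'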
Example #4
 def optimize(self, batch):
     S, A, old_probs, advantages, targets = batch
     batch_size = S.shape[0]
     batch_shape = (batch_size, )
     epsilon, policy, critic, optimizer = self.epsilon, self.policy, self.critic, self.optimizer
     # Policy Objective
     probs = policy(S).gather(1, A).flatten()
     U.check_shape(probs, batch_shape)
     ratios = probs / old_probs
     Lcpi = ratios * advantages
     Lclip = ratios.clamp(1 - epsilon, 1 + epsilon) * advantages
     policy_objective = torch.min(Lcpi, Lclip)
     U.check_shape(policy_objective, batch_shape)
     policy_objective = policy_objective.mean()
     U.check_shape(policy_objective, ())
     # Critic Loss
     V = critic(S).flatten()
     U.check_shape(V, batch_shape)
     critic_loss = (targets - V).pow(2).mean()
     U.check_shape(critic_loss, ())
     # Total Loss
     loss = -policy_objective + critic_loss
     optimizer.zero_grad()
     loss.backward()
     optimizer.step()
Example #5
    def learn(self, data):
        S, A, R, Snext, dones = data
        S, A = nn.tensors((S, A))
        A = A.reshape([-1, 1])
        T = S.shape[0]
        batch_shape = (T, )
        gamma, lambd = self.gamma, self.lambd
        policy, old, critic = self.policy, self.old, self.critic

        old_probs = old(S).detach().gather(A, batch_dims=1).flatten()
        U.check_shape(old_probs, batch_shape)

        targets, deltas = self.compute_td_zero(data,
                                               V=lambda x: critic(x).detach())
        advantages = self.compute_gae(deltas=deltas.numpy(), dones=dones)
        advantages = nn.tensor(advantages)

        indices = np.arange(T)
        for _ in range(self.epochs):
            np.random.shuffle(indices)
            # note: array_split(x, n) yields n chunks, not chunks of size n
            for batch in np.array_split(indices, self.batch_size):
                batch = nn.tensor(batch)
                self.optimize(
                    (S.gather(batch), A.gather(batch), old_probs.gather(batch),
                     advantages.gather(batch), targets.gather(batch)))
Example #6
 def learn(self, data):
     S, A, R, Snext, dones = data
     dones = tf.cast(dones, 'float32')
     batch_shape = (S.shape[0], )
     gamma, model, target = self.gamma, self.model, self.target
     Qtarget = tf.stop_gradient(target(Snext))
     if self.double:
         Qnext = tf.stop_gradient(model(Snext))
         Amax = tf.argmax(Qnext, axis=-1)
         Qmax = tf.gather(Qtarget, tf.reshape(Amax, [-1, 1]), batch_dims=1)
         Qmax = tf.reshape(Qmax, [-1])
     else:
         Qmax = tf.reduce_max(Qtarget, axis=-1)
     U.check_shape(Qmax, batch_shape)
     targets = R + gamma * Qmax * (1 - dones)
     U.check_shape(targets, batch_shape)
     with tf.GradientTape() as tape:
         Q = model(S)
         Q = tf.gather(Q, tf.reshape(A, [-1, 1]), batch_dims=1)
         Q = tf.reshape(Q, [-1])
         U.check_shape(Q, batch_shape)
         loss = tf.reduce_mean(tf.square(targets - Q))
         U.check_shape(loss, ())
     grads = tape.gradient(loss, model.trainable_variables)
     self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
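Examples #6, #10 and #11 assume a separate target network, but none of them show how it is kept in sync with the online model. A common pattern is a periodic hard copy of the weights; a sketch for the Keras-style models of Example #6, with the step counter and update frequency attributes being assumptions, might be:

 def maybe_update_target(self):
     # Hypothetical periodic hard update of the target network.
     self.learn_steps += 1
     if self.learn_steps % self.target_update_freq == 0:
         self.target.set_weights(self.model.get_weights())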
Example #7
 def on_episode_end(self):
     batch_size = len(self.transitions)
     data = self.transitions.get()
     self.transitions.reset()
     batch_shape = (batch_size, )
     S, A, R, Snext, dones = data
     gamma, policy, baseline = self.gamma, self.policy, self.baseline
     if baseline:
         V = baseline.predict(S).flatten()
     else:
         V = np.zeros_like(R)
     U.check_shape(V, batch_shape)
     G, T = self.compute_returns(R), len(R)
     for t in range(T):
         s, a, g = S[t], A[t], G[t]
         delta = g - V[t]
         with tf.GradientTape() as tape:
             # Policy Objective
             probs = policy(s[None])[0]
             p = probs[a]
             policy_objective = (gamma**t) * delta * tf.math.log(p)
             U.check_shape(policy_objective, ())
             # Baseline Loss
             if baseline:
                 v = baseline(s[None])[0][0]
                 baseline_loss = tf.square(g - v)
                 U.check_shape(baseline_loss, ())
             else:
                 baseline_loss = 0
             # Total Loss
             loss = -policy_objective + baseline_loss
         grads = tape.gradient(loss, self.parameters)
         self.optimizer.apply_gradients(zip(grads, self.parameters))
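Example #7 (and Examples #13 to #15) relies on a compute_returns helper that is also not shown. A straightforward sketch computing discounted episode returns G_t = r_t + gamma * G_{t+1} over a NumPy reward array, assuming a gamma attribute, is:

 def compute_returns(self, R):
     # Hypothetical helper: discounted returns, accumulated backwards
     # over a single episode.
     G = np.zeros_like(R, dtype=np.float32)
     running = 0.0
     for t in reversed(range(len(R))):
         running = R[t] + self.gamma * running
         G[t] = running
     return G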
Example #8
    def learn(self, data):
        data = [torch.from_numpy(v) for v in data]
        S, A, R, Snext, dones = data
        A = A.long().reshape([-1, 1])
        T = len(R)
        batch_shape = (T, )
        gamma, lambd = self.gamma, self.lambd
        policy, old, critic = self.policy, self.old, self.critic

        old_probs = old(S).detach().gather(1, A).flatten()
        U.check_shape(old_probs, batch_shape)

        targets, deltas = self.compute_td_zero(data,
                                               V=lambda x: critic(x).detach())
        advantages = self.compute_gae(deltas=deltas, dones=dones)
        advantages = torch.from_numpy(advantages)

        for _ in range(self.epochs):
            indices = torch.randperm(T)
            for batch in torch.split(indices, self.batch_size):
                self.optimize((S[batch], A[batch], old_probs[batch],
                               advantages[batch], targets[batch]))
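The PPO examples (#5, #8 and #9) also call a compute_gae helper that is not defined here. A minimal sketch of generalised advantage estimation over the TD errors, resetting at episode boundaries and working on NumPy arrays (the signature is an assumption), might be:

    def compute_gae(self, deltas, dones):
        # Hypothetical helper: GAE(gamma, lambda), accumulated backwards and
        # reset whenever a transition ends an episode.
        advantages = np.zeros_like(deltas, dtype=np.float32)
        gae = 0.0
        for t in reversed(range(len(deltas))):
            gae = deltas[t] + self.gamma * self.lambd * gae * (1 - dones[t])
            advantages[t] = gae
        return advantages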
Example #9
    def learn(self, data):
        S, A, R, Snext, dones = data
        T = len(R)
        batch_shape = (T, )
        gamma, lambd = self.gamma, self.lambd
        policy, old, critic = self.policy, self.old, self.critic

        old_probs = old.predict(S)
        old_probs = old_probs[range(old_probs.shape[0]), A]
        U.check_shape(old_probs, batch_shape)

        targets, deltas = self.compute_td_zero(data, V=critic.predict)
        advantages = self.compute_gae(deltas=deltas, dones=dones)

        indices = np.arange(T)
        for _ in range(self.epochs):
            np.random.shuffle(indices)
            # note: array_split(x, n) yields n chunks, not chunks of size n
            for batch in np.array_split(indices, self.batch_size):
                self.optimize(S=S[batch],
                              A=A[batch],
                              old_probs=old_probs[batch],
                              advantages=advantages[batch],
                              targets=targets[batch])
Example #10
 def learn(self, data):
     S, A, R, Snext, dones = [torch.from_numpy(v) for v in data]
     A = A.long()
     dones = dones.float()  # cast as in the TF variants (Examples #6, #11) so (1 - dones) works if dones is bool
     batch_shape = (S.shape[0], )
     gamma, model, target, optimizer = self.gamma, self.model, self.target, self.optimizer
     Qtarget = target(Snext).detach()
     if self.double:
         Amax = model(Snext).detach().argmax(-1)
         Qmax = Qtarget.gather(1, Amax.reshape([-1, 1])).flatten()
     else:
         Qmax = Qtarget.max(-1).values  # torch .max(dim) returns (values, indices)
     U.check_shape(Qmax, batch_shape)
     targets = R + gamma * Qmax * (1 - dones)
     U.check_shape(targets, batch_shape)
     Q = model(S).gather(1, A.reshape([-1, 1])).flatten()
     U.check_shape(Q, batch_shape)
     loss = (targets - Q).pow(2).mean()
     U.check_shape(loss, ())
     optimizer.zero_grad()
     loss.backward()
     optimizer.step()
Example #11
 def learn(self, data):
     S, A, R, Snext, dones = nn.tensors(data)
     dones = dones.cast('float32')
     batch_shape = (S.shape[0], )
     gamma, model, target = self.gamma, self.model, self.target
     Qtarget = target(Snext).detach()
     if self.double:
         Amax = model(Snext).detach().argmax(-1)
         Qmax = Qtarget.gather(Amax.reshape([-1, 1]),
                               batch_dims=1).flatten()
     else:
         Qmax = Qtarget.max(-1)
     U.check_shape(Qmax, batch_shape)
     targets = R + gamma * Qmax * (1 - dones)
     U.check_shape(targets, batch_shape)
     with nn.GradientTape() as tape:
         Q = model(S).gather(A.reshape([-1, 1]), batch_dims=1).flatten()
         U.check_shape(Q, batch_shape)
         loss = (targets - Q).square().mean()
         U.check_shape(loss, ())
     grads = tape.gradient(loss, model.trainable_variables)
     self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
Example #12
 def step(self, x):
     x_shape, batch_shape = x.shape, (x.shape[0], )
     x = (x - tf.reduce_mean(x, axis=0)) / tf.math.reduce_std(x, axis=0)
     # x = tf.clip_by_value(x, -5, 5)
     U.check_shape(x, x_shape)
     target, model = self.target, self.model
     targets = tf.stop_gradient(target(x))
     with tf.GradientTape() as tape:
         predictions = model(x)
         errors = tf.reduce_sum(tf.square(targets - predictions), axis=-1)
         U.check_shape(errors, batch_shape)
         loss = tf.reduce_mean(errors)
         U.check_shape(loss, ())
     grads = tape.gradient(loss, model.trainable_variables)
     self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
     return errors
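Example #12 looks like the predictor-training step of random network distillation: target is a fixed, randomly initialised network, model is trained to imitate it, and the per-sample prediction errors it returns can serve as an intrinsic reward. A hypothetical usage sketch (the rnd attribute and the coefficient are assumptions, not part of the original code):

 # Hypothetical usage: add the prediction errors as an exploration bonus.
 intrinsic = self.rnd.step(Snext)                    # errors from Example #12
 R_total = R + self.intrinsic_coef * intrinsic.numpy()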
Example #13
 def learn(self):
     batch_size = len(self.transitions)
     data = self.transitions.get()
     self.transitions.reset()
     data = [torch.from_numpy(v) for v in data]
     S, A, R, Snext, dones = data
     A = A.long().reshape([-1, 1])
     batch_shape = (batch_size, )
     gamma, policy, critic = self.gamma, self.policy, self.critic
     # If last state is not terminal then bootstrap from it
     if not dones[-1]:
         R[-1] += gamma * critic(
             Snext[-1:])[0][0].detach().item()  # handle batching
     G = self.compute_returns(R)
     G = torch.from_numpy(G)
     deltas = G - critic(S).detach().flatten()
     U.check_shape(deltas, batch_shape)
     # Policy Objective
     probs = policy(S).gather(1, A).flatten()
     U.check_shape(probs, batch_shape)
     policy_objective = deltas * probs.log()
     U.check_shape(policy_objective, batch_shape)
     policy_objective = policy_objective.mean()
     U.check_shape(policy_objective, ())
     # Critic Loss
     V = critic(S).flatten()
     U.check_shape(V, batch_shape)
     critic_loss = (G - V).pow(2).mean()
     U.check_shape(critic_loss, ())
     # Total Loss
     loss = -policy_objective + critic_loss
     self.optimizer.zero_grad()
     loss.backward()
     grads = self.get_gradients()
     self.send_gradients(grads)
     self.receive_parameters()
Example #14
 def learn(self):
     batch_size = len(self.transitions)
     data = self.transitions.get()
     self.transitions.reset()
     S, A, R, Snext, dones = data
     batch_shape = (batch_size, )
     gamma, policy, critic = self.gamma, self.policy, self.critic
     # If last state is not terminal then bootstrap from it
     if not dones[-1]:
         R[-1] += gamma * critic.predict(
             Snext[-1:])[0][0]  # handle batching
     G = self.compute_returns(R)
     deltas = G - critic.predict(S).flatten()
     U.check_shape(deltas, batch_shape)
     with tf.GradientTape() as tape:
         # Policy Objective
         probs = policy(S)
         probs = tf.gather(probs, A.reshape([-1, 1]), batch_dims=1)
         probs = tf.reshape(probs, [-1])
         U.check_shape(probs, batch_shape)
         policy_objective = deltas * tf.math.log(probs)
         U.check_shape(policy_objective, batch_shape)
         policy_objective = tf.reduce_mean(policy_objective)
         U.check_shape(policy_objective, ())
         # Critic Loss
         V = critic(S)
         V = tf.reshape(V, [-1])
         U.check_shape(V, batch_shape)
         critic_loss = tf.reduce_mean(tf.square(G - V))
         U.check_shape(critic_loss, ())
         # Total Loss
         loss = -policy_objective + critic_loss
     grads = tape.gradient(loss, self.parameters)
     self.send_gradients(grads)
     self.receive_parameters()
Example #15
 def learn(self):
     batch_size = len(self.transitions)
     data = self.transitions.get()
     self.transitions.reset()
     S, A, R, Snext, dones = data
     A = A.reshape([-1, 1])
     batch_shape = (batch_size, )
     gamma, policy, critic = self.gamma, self.policy, self.critic
     # If last state is not terminal then bootstrap from it
     if not dones[-1]:
         R[-1] += gamma * critic(
             Snext[-1:])[0][0].numpy()  # handle batching
     G = self.compute_returns(R)
     deltas = G - critic(S).detach().flatten()
     U.check_shape(deltas, batch_shape)
     with nn.GradientTape() as tape:
         # Policy Objective
         probs = policy(S).gather(A, batch_dims=1).flatten()
         U.check_shape(probs, batch_shape)
         policy_objective = deltas * probs.log()
         U.check_shape(policy_objective, batch_shape)
         policy_objective = policy_objective.mean()
         U.check_shape(policy_objective, ())
         # Critic Loss
         V = critic(S).flatten()
         U.check_shape(V, batch_shape)
         critic_loss = (G - V).pow(2).mean()
         U.check_shape(critic_loss, ())
         # Total Loss
         loss = -policy_objective + critic_loss
     grads = tape.gradient(loss, self.parameters)
     self.send_gradients(grads)
     self.receive_parameters()