def build_trajectory(size, from_action_space=False, action_space=None):
    """ Build a trajectory of one-step transitions (x, u, r, x', done) using random actions """
    T = []
    d = Domain()
    x = d.initial_state()
    while len(T) < size:
        if not from_action_space:
            # continuous action in (-1, 1)
            u = d.random_action()
        else:
            # choose an action from a custom discrete action space
            u = np.random.choice(action_space)
        new_x, r = d.f(u)
        # add the transition to the trajectory, with a flag marking terminal states
        T.append([x, u, r, new_x, d.is_final_state()])
        if d.is_final_state():
            x = d.initial_state()
        else:
            x = new_x
    # shuffle the trajectory
    np.random.shuffle(T)
    return T
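# Usage sketch for build_trajectory (the sizes and the discrete action space below are
# illustrative choices, not values prescribed by the project): the continuous variant
# samples actions in (-1, 1), the discrete variant draws them from the given action space.
T_continuous = build_trajectory(size=5000)
T_discrete = build_trajectory(size=5000, from_action_space=True, action_space=[-1, 1])
# each element is a shuffled transition [x, u, r, x', done]
x, u, r, next_x, done = T_discrete[0]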
def J(policy, N, d=None, x=None):
    """ Compute one N-step estimate of the return of a policy (average several calls to approximate the expected return) """
    if N == 0:
        return 0
    if d is None:
        # if no domain is given, create one and start from an initial state
        d = Domain()
        x = d.initial_state()
    # apply the policy, observe the reward and recurse on the next state
    u = policy(x)
    new_x, r = d.f(u)
    return r + gamma * J(policy, N - 1, d, new_x)
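# Usage sketch for J (the `random_policy` below is a hypothetical policy used only for
# illustration): a single call returns one N-step rollout, so the expected return is
# approximated by averaging many independent calls, as done in the main script.
random_policy = lambda x: np.random.uniform(-1, 1)
estimates = [J(random_policy, 100) for _ in range(50)]
print(f'Monte-Carlo estimate of the expected return : {np.mean(estimates)}')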
def td_error(model, action_space, nb_approximations=20):
    """ Compute an estimate of the TD error of a model, averaged over random one-step transitions """
    d = Domain()
    s = d.initial_state()
    deltas = []
    for i in range(nb_approximations):
        # take a random action and observe the transition
        u = d.random_action()
        next_s, r = d.f(u)
        # TD error of this one-step transition
        td = delta([s, u, r, next_s], model, action_space)
        deltas.append(td)
        if d.is_final_state():
            s = d.initial_state()
        else:
            s = next_s
    return np.mean(deltas)
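# Usage sketch for td_error (assumes that train_ExtraTree, defined in the main script, is
# importable from here; the action space passed to td_error must match the one used for
# training the model).
tree = train_ExtraTree(action_space=[-1, 1])
print(f'Mean TD error of the Extra-Trees model : {td_error(tree, action_space=[-1, 1], nb_approximations=50)}')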
# create an Extra-Trees model and train it on the discrete action space {-1, 1}
tree = train_ExtraTree(action_space=[-1, 1])

# compute the expected return of the discrete actor-critic model
j_list = []
for i in range(1000):
    j_list.append(utils.J(discrete_ac, 100))
print(f'Expected return of discrete actor critic : {np.mean(j_list)}')

# run a simulation using the continuous actor-critic as policy
d = Domain()
d.env.render()
s = d.initial_state()
while not d.is_final_state():
    # continue until a final state is reached
    u = continuous_ac(s)
    next_s, r = d.f(u)
    time.sleep(0.01)  # leave time for the rendering
    s = next_s

# compute the expected return of the Extra-Trees model
# careful: the action space used here has to match the action space used for training!
mu = Policy(tree, action_space=[-1, 1])
j_list = []
for i in range(1000):
    # average 1000 episodes to estimate the expected return
    j_list.append(utils.J(mu, 100))
print(f'Expected return of Extra-Tree : {np.mean(j_list)}')
def train(self, episode):
    # critic network optimizer
    critic_optimizer = optim.SGD(self.critic.parameters(), lr=0.001)
    critic_optimizer.zero_grad()
    # actor network optimizer
    actor_optimizer = optim.SGD(self.actor.parameters(), lr=0.001)
    actor_optimizer.zero_grad()

    actor_losses = []
    critic_losses = []
    rewards = []
    d = Domain()
    for e in range(episode):
        print(f'========== episode {e} ==========')
        transitions = []
        log_probs = []
        values = []
        s = d.initial_state()
        while not d.is_final_state():
            # predict the parameters of the Gaussian policy
            mu, sigma = self.get_distribution(s)
            # sample an action from the distribution
            u = torch.randn(1) * sigma + mu
            # clip the value between -1 and 1
            u = u.detach().numpy()
            u = np.clip(u, a_min=-1, a_max=1).item()
            # check that u is a finite number, otherwise skip to the next episode
            if not np.isfinite(u):
                print('Warning : action is not a finite number.')
                break
            # apply the action and observe the next state and reward
            next_s, r = d.f(u)
            transitions.append([s, u, r, next_s])
            # value V(s_t) of the current state predicted by the critic network
            value = self.critic(torch.tensor(s, dtype=torch.float32))
            values.append(value)
            # log-probability of the sampled action, used in the actor loss
            log_prob = -((u - mu) ** 2) / (2 * sigma ** 2) - torch.log(sigma * math.sqrt(2 * math.pi))
            log_probs.append(log_prob)
            # keep track of the next state
            s = next_s
        if not np.isfinite(u):
            continue

        # sum of the rewards collected during the episode
        rewards.append(sum(t[2] for t in transitions))

        # discounted returns accumulated backwards from the end of the episode
        R = 0
        A = torch.zeros(len(values))
        for t in reversed(range(len(transitions))):
            R = transitions[t][2] + utils.gamma * R
            A[t] = R
        # advantage = discounted return - value predicted by the critic
        A = A - torch.cat(values)

        # actor and critic losses
        critic_loss = (A ** 2).mean()
        A = A.detach()
        log_probs = torch.stack(log_probs)
        actor_loss = (-log_probs * A).mean()

        # critic update
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()
        # actor update
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

        # save the losses
        actor_losses.append(actor_loss.item())
        critic_losses.append(critic_loss.item())
        print(f' critic loss : {critic_losses[-1]} | actor loss : {actor_losses[-1]}')
    return actor_losses, critic_losses, rewards
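# Worked sketch of the return/advantage computation used in train above, with
# self-contained illustrative values: the discounted return R_t = r_t + gamma * R_{t+1}
# is accumulated backwards from the end of the episode, and the advantage is R_t minus
# the critic's value estimate V(s_t).
import torch

discount = 0.95                              # illustrative discount factor
episode_r = [1.0, 0.0, 2.0]                  # illustrative rewards r_0, r_1, r_2
values = torch.tensor([0.5, 0.8, 1.0])       # illustrative critic outputs V(s_0), V(s_1), V(s_2)

R = 0.0
returns = torch.zeros(len(episode_r))
for t in reversed(range(len(episode_r))):
    R = episode_r[t] + discount * R
    returns[t] = R
advantage = returns - values
print(returns)     # tensor([2.8050, 1.9000, 2.0000])
print(advantage)   # tensor([2.3050, 1.1000, 1.0000])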
def train(self, episode=10):
    # critic network optimizer
    critic_optimizer = optim.SGD(self.critic.parameters(), lr=0.001)
    critic_optimizer.zero_grad()
    # actor network optimizer
    actor_optimizer = optim.SGD(self.actor.parameters(), lr=0.001)
    actor_optimizer.zero_grad()

    actor_losses = []
    critic_losses = []
    rewards = []
    d = Domain()
    for e in range(episode):
        print(f'========== episode {e} ==========')
        transitions = []
        log_probs = []
        values = []
        s = d.initial_state()
        while not d.is_final_state():
            # the episode terminates when a final state is reached
            p = self.get_distribution(s)
            if not np.isfinite(p.detach().numpy()).all():
                print('Warning : probabilities are not finite numbers.')
                break
            # take the action with the highest probability
            idx = torch.argmax(p).detach().numpy().item()
            u = self.action_space[idx]
            # apply the action and observe the next state and reward
            next_s, r = d.f(u)
            transitions.append([s, u, r, next_s])
            # log-probability of the chosen action, used in the actor loss
            log_probs.append(torch.log(p[idx]))
            # value V(s_t) of the current state predicted by the critic network
            value = self.critic(torch.tensor(s, dtype=torch.float32))
            values.append(value)
            # keep track of the next state
            s = next_s
        if not np.isfinite(p.detach().numpy()).all():
            continue

        # sum of the rewards collected during the episode
        rewards.append(sum(t[2] for t in transitions))

        # discounted returns accumulated backwards from the end of the episode
        R = 0
        A = torch.zeros(len(values))
        for t in reversed(range(len(transitions))):
            R = transitions[t][2] + utils.gamma * R
            A[t] = R
        # advantage = discounted return - value predicted by the critic
        A = A - torch.cat(values)

        # actor and critic losses
        critic_loss = (A ** 2).mean()
        A = A.detach()
        log_probs = torch.stack(log_probs)
        actor_loss = (-log_probs * A).mean()

        # update the critic network
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()
        # update the actor network
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

        # save the losses
        actor_losses.append(actor_loss.item())
        critic_losses.append(critic_loss.item())
        print(f' critic loss : {critic_losses[-1]} | actor loss : {actor_losses[-1]}')
    return actor_losses, critic_losses, rewards
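# Usage sketch for the discrete actor-critic (the class name DiscreteActorCritic and its
# constructor argument are assumptions made for illustration; the project may use
# different names): train for a few episodes, then use the resulting greedy policy in a
# rollout as done in the main script.
agent = DiscreteActorCritic(action_space=[-1, 1])   # hypothetical class name / constructor
actor_losses, critic_losses, episode_rewards = agent.train(episode=50)
print(f'total reward of the last training episode : {episode_rewards[-1]}')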