Example #1
def build_trajectory(size, from_action_space=False, action_space=None):
    """
    Build a trajectory of (x, u, r, next_x, done) transitions using random actions
    """
    T = []
    d = Domain()
    x = d.initial_state()
    while len(T) < size:
        if not from_action_space:  # continuous action in (-1,1)
            u = d.random_action()
        else:  # choose an action from a custom discrete action space
            u = np.random.choice(action_space)  # draw a scalar action, not a length-1 array

        new_x, r = d.f(u)

        # add the transition (with its terminal flag) to the trajectory
        T.append([x, u, r, new_x, d.is_final_state()])

        if d.is_final_state():
            x = d.initial_state()
        else:
            x = new_x

    # shuffle the trajectory
    np.random.shuffle(T)
    return T
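
# A minimal usage sketch for build_trajectory, assuming the Domain class and the
# numpy import of the surrounding module are available. It builds one trajectory
# with continuous actions and one with a discrete two-action space.
T_continuous = build_trajectory(size=1000)
T_discrete = build_trajectory(size=1000, from_action_space=True, action_space=[-1, 1])
print(T_continuous[0])  # one entry: [x, u, r, next_x, done]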
Example #2
def J(policy, N, d=None, x=None):
    """
    Estimate the expected return of a policy from a state over an N-step horizon
    """
    if N == 0:
        return 0
    if d is None:  # create the domain and its initial state on the first call
        d = Domain()
        x = d.initial_state()
    u = policy(x)
    new_x, r = d.f(u)
    return r + gamma*J(policy, N-1, d, new_x)
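
# A minimal usage sketch for J; gamma is assumed to be defined in the surrounding
# module and the constant-action policy below is hypothetical. Averaging several
# recursive estimates gives a Monte Carlo approximation of the expected return.
def constant_policy(x):  # hypothetical policy: always apply action 0
    return 0.0

estimates = [J(constant_policy, N=100) for _ in range(50)]
print(f'Estimated expected return : {np.mean(estimates)}')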
Example #3
def td_error(model, action_space, nb_approximations=20):
    """
    Estimate the TD error of a model by averaging delta over randomly sampled transitions
    """
    d = Domain()
    s = d.initial_state()
    deltas = []

    for i in range(nb_approximations):
        u = d.random_action()
        next_s, r = d.f(u)

        td = delta([s, u, r, next_s], model, action_space)
        deltas.append(td)

        if d.is_final_state():
            s = d.initial_state()
        else:
            s = next_s

    return np.mean(deltas)
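
# A minimal usage sketch for td_error, assuming the delta helper and the
# train_ExtraTree function used in Example #4 below are available in this module.
# A mean TD error close to zero suggests the model is consistent with the
# Bellman equation on the sampled transitions.
tree = train_ExtraTree(action_space=[-1, 1])
print(f'Mean TD error : {td_error(tree, action_space=[-1, 1], nb_approximations=50)}')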
Example #4
# create an Extra-Tree model and train it on a two-action discrete action space
tree = train_ExtraTree(action_space=[-1, 1])

# compute the expected return of the discrete actor-critic model
j_list = []
for i in range(1000):
    j_list.append(utils.J(discrete_ac, 100))
print(f'Expected return of discrete actor critic : {np.mean(j_list)}')

# run a simulation using the continuous actor-critic as policy
d = Domain()
d.env.render()
s = d.initial_state()
while not d.is_final_state():  # we continue until we reach a final state
    u = continuous_ac(s)
    next_s, r = d.f(u)
    time.sleep(0.01)  # leave time for the rendering
    s = next_s  # the while condition ends the episode once a final state is reached

# compute the expected return of the Extra-Tree model
# careful: the action space used here has to match the one used for training!
mu = Policy(tree, action_space=[-1, 1])
j_list = []
for i in range(1000):  # average over 1000 episodes to estimate the expected return
    j_list.append(utils.J(mu, 100))
print(f'Expected return of Extra-Tree : {np.mean(j_list)}')
Example #5
    def train(self, episode):
        # critic network optimizer
        critic_optimizer = optim.SGD(self.critic.parameters(), lr=0.001)
        critic_optimizer.zero_grad()

        # actor network optimizer
        actor_optimizer = optim.SGD(self.actor.parameters(), lr=0.001)
        actor_optimizer.zero_grad()

        actor_losses = []
        critic_losses = []
        rewards = []

        d = Domain()
        for e in range(episode):
            print(f'========== episode {e} ==========')
            transitions = []
            log_probs = []
            values = []

            s = d.initial_state()
            while not d.is_final_state():
                # predict the distribution parameters
                mu, sigma = self.get_distribution(s)

                # sample an action from distribution
                u = torch.randn(1)*sigma + mu

                # clip the value between -1 and 1
                u = u.detach().numpy()
                u = np.clip(u, a_min=-1, a_max=1).item()

                # check that u is a finite number, otherwise skip to the next episode
                if not np.isfinite(u):
                    print('Warning: action is not a finite number.')
                    break

                # apply the action and observe next state and reward
                next_s, r = d.f(u)
                transitions.append([s, u, r, next_s])

                # value of the current state predicted by the critic network
                value = self.critic(torch.tensor(s, dtype=torch.float32))
                values.append(value)

                # log used in actor loss
                log_prob = -((u - mu) ** 2) / (2 * sigma ** 2) - torch.log(sigma * math.sqrt(2 * math.pi))
                log_probs.append(log_prob)

                # keep track of next state
                s = next_s

            if not np.isfinite(u):
                continue

            # sum of rewards collected during the episode
            episode_rewards = [t[2] for t in transitions]
            rewards.append(sum(episode_rewards))

            R = 0
            A = torch.zeros(len(values))
            for t in reversed(range(len(transitions))):
                R = transitions[t][2] + utils.gamma * R
                A[t] = R

            # advantage
            A = A - torch.cat(values)

            # actor and critic loss
            critic_loss = (A**2).mean()
            A = A.detach()
            log_probs = torch.stack(log_probs)
            actor_loss = (-log_probs*A).mean()

            # critic update
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # actor update
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # save the loss
            actor_losses.append(actor_loss.item())
            critic_losses.append(critic_loss.item())
            print(f' critic loss : {critic_losses[-1]} | actor loss : {actor_losses[-1]}')

        return actor_losses, critic_losses, rewards
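
# A minimal usage sketch for the trainer above. The class name
# ContinuousActorCritic is an assumption; substitute the actual class that
# defines this train method.
continuous_ac = ContinuousActorCritic()
actor_losses, critic_losses, rewards = continuous_ac.train(episode=50)
print(f'Mean episode reward : {np.mean(rewards)} | final critic loss : {critic_losses[-1]}')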
Example #6
    def train(self, episode=10):
        # optimizer of critic network
        critic_optimizer = optim.SGD(self.critic.parameters(), lr=0.001)
        critic_optimizer.zero_grad()

        # actor optimizer
        actor_optimizer = optim.SGD(self.actor.parameters(), lr=0.001)
        actor_optimizer.zero_grad()

        actor_losses = []
        critic_losses = []
        rewards = []

        d = Domain()
        for e in range(episode):
            print(f'========== episode {e} ==========')
            transitions = []
            log_probs = []
            values = []

            s = d.initial_state()
            while not d.is_final_state():  # episode terminates when we reach a final state
                p = self.get_distribution(s)

                if not np.isfinite(p.detach().numpy()).all():
                    print('Warning: probabilities are not finite numbers.')
                    break

                # get action with highest probability
                idx = torch.argmax(p).detach().numpy().item()
                u = self.action_space[idx]

                # apply the action and observe next state and reward
                next_s, r = d.f(u)
                transitions.append([s, u, r, next_s])

                # save log probability
                log_probs.append(torch.log(p[idx]))

                value = self.critic(torch.tensor(s, dtype=torch.float32))
                values.append(value)

                # keep track of next state
                s = next_s

            if not np.isfinite(p.detach().numpy()).all():
                continue

            # save the sum of episode rewards
            episode_rewards = [t[2] for t in transitions]
            rewards.append(sum(episode_rewards))

            R = 0
            A = torch.zeros(len(values))
            for t in reversed(range(len(transitions))):
                R = transitions[t][2] + utils.gamma * R
                A[t] = R

            # advantage
            A = A - torch.cat(values)

            # actor and critic loss
            critic_loss = (A**2).mean()
            A = A.detach()
            log_probs = torch.stack(log_probs)
            actor_loss = (-log_probs * A).mean()

            # update the critic network
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # update the actor network
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # save the loss
            actor_losses.append(actor_loss.item())
            critic_losses.append(critic_loss.item())
            print(
                f' critic loss : {critic_losses[-1]} | actor loss : {actor_losses[-1]}'
            )

        return actor_losses, critic_losses, rewards
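
# A self-contained sketch of the return/advantage computation shared by both
# train methods above, run on toy data (the discount factor here is an assumed
# placeholder for utils.gamma).
import torch

gamma = 0.95
toy_rewards = [1.0, 0.0, 1.0]                 # per-step rewards of a toy episode
toy_values = torch.tensor([0.5, 0.4, 0.3])    # toy critic predictions V(s_t)

R = 0.0
returns = torch.zeros(len(toy_rewards))
for t in reversed(range(len(toy_rewards))):
    R = toy_rewards[t] + gamma * R            # discounted return-to-go G_t
    returns[t] = R

advantage = returns - toy_values              # A_t = G_t - V(s_t)
critic_loss = (advantage ** 2).mean()         # critic regresses V(s_t) towards G_t
actor_weight = advantage.detach()             # detached so the actor loss only moves the actor
print(returns, advantage, critic_loss)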