Example #1
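All of the examples below are solutions to the Udacity Pong policy-gradient exercise and rely on the same globals. A minimal sketch of the assumed common setup (the exact notebook may differ) is:

import numpy as np
import torch

import pong_utils  # exercise helper module: RIGHT, LEFT, states_to_prob, collect_trajectories, ...

# run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# some examples refer to the action constant without the module prefix
RIGHT = pong_utils.RIGHT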
def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995, epsilon=0.1, beta=0.01):
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # discounted cumulative reward
    R_future = discounted_future_rewards(rewards, discount)

    # subtract baseline (= mean of reward)
    R_mean = torch.mean(R_future)
    R_future -= R_mean

    ratio = new_probs / (old_probs + 1e-6)
    ratio_clamped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)

    # PPO clipped surrogate: elementwise minimum of the unclipped and
    # clipped terms (policy gradient target to maximize)
    surrogates = torch.min(ratio * R_future, ratio_clamped * R_future).mean()

    # optionally include a regularization term:
    # the entropy bonus steers new_probs towards 0.5, which prevents the
    # policy from becoming exactly 0 or 1 and helps with exploration
    # (add 1.e-10 inside the log to avoid log(0), which gives nan)
    # entropy = -(new_probs*torch.log(old_probs+1.e-10) + (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))
    # surrogates += torch.mean(beta*entropy)

    return surrogates
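discounted_future_rewards is called above but never defined in this collection. A plausible sketch, assuming rewards is the (time_steps, num_agents) tensor produced by the conversion at the top of the function, is:

def discounted_future_rewards(rewards, discount=0.995):
    # rewards: tensor of shape (time_steps, num_agents)
    n_steps = rewards.shape[0]
    # discount factors 1, d, d^2, ... along the time axis
    d = discount ** torch.arange(n_steps, dtype=torch.float, device=rewards.device)
    discounted = rewards * d[:, None]
    # reversed cumulative sum = discounted return from each step onward
    flipped = torch.flip(discounted, dims=[0])
    return torch.flip(torch.cumsum(flipped, dim=0), dims=[0])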
Example #2
def clipped_surrogate(policy,
                      old_probs,
                      states,
                      actions,
                      rewards,
                      discount=0.995,
                      epsilon=0.1,
                      beta=0.01):
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs,
                            1.0 - new_probs)

    # discounted cumulative reward
    R_future = discounted_future_rewards(rewards, discount)

    # subtract baseline (= mean of reward)
    R_mean = torch.mean(R_future)
    R_future -= R_mean

    ratio = new_probs / (old_probs + 1e-6)
    ratio_clamped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)

    # PPO clipped surrogate: elementwise minimum of the unclipped and clipped terms
    surrogates = torch.min(ratio * R_future, ratio_clamped * R_future).mean()
    return surrogates
Example #3
def surrogate(policy, old_probs, states, actions, rewards,
              discount=0.995, beta=0.01):

    discount = discount ** np.arange(len(rewards))
    rewards = np.asarray(rewards) * discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    # normalize rewards
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    rewards_normalized = (rewards_future - mean[:, np.newaxis]) / std[:, np.newaxis]
    
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)
    
    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0-new_probs)

    # include a regularization term:
    # the entropy bonus steers new_probs towards 0.5, which prevents the
    # policy from becoming exactly 0 or 1 and helps with exploration
    # (add 1.e-10 inside the log to avoid log(0), which gives nan)
    entropy = -(new_probs * torch.log(old_probs + 1.e-10) +
                (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(torch.log(new_probs)*rewards + beta*entropy)
Example #4
def clipped_surrogate_PPO(policy,
                          old_probs,
                          states,
                          actions,
                          rewards,
                          gamma=0.995,
                          epsilon=0.1,
                          beta=0.01):

    # get number of trajectories = num of agents
    steps_in_trajectories = len(states)
    num_trajectories = len(states[0])

    actions = torch.tensor(actions, dtype=torch.int8, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs,
                            1.0 - new_probs)

    rewards_future = get_future_rewards(rewards, num_trajectories, gamma)
    R = torch.tensor(reward_normalization(rewards_future)).float().to(device)

    # ratio for clipping
    old_probs = torch.tensor(old_probs).to(device)
    ratio = new_probs / old_probs

    # clipped function
    clip = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    clipped_surrogate = torch.min(ratio * R, clip * R)
    return torch.sum(clipped_surrogate) / num_trajectories
    """
Example #5
def surrogate(policy,
              old_probs,
              states,
              actions,
              rewards,
              discount=0.995,
              beta=0.01,
              epsilon=0.1,
              use_ppo_clip=False):

    norm_rewards = disc_rewards(rewards, discount)

    t_actions = torch.tensor(actions, dtype=torch.int8, device=device)
    t_old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    t_norm_rewards = torch.tensor(norm_rewards,
                                  dtype=torch.float,
                                  device=device)

    # convert states to policy (or probability)
    t_new_probs = pong_utils.states_to_prob(policy, states)
    t_new_probs = torch.where(t_actions == pong_utils.RIGHT, t_new_probs,
                              1.0 - t_new_probs)
    """
     now we can either calc log(t_new_probs) or 
     take directly t_new_probs/t_old_probs (old_probs is same same and fixed)
    """
    t_rap = t_new_probs / t_old_probs  # only t_new_probs is diferentiable

    if use_ppo_clip:
        t_rap_clip = torch.clamp(t_rap, 1 - epsilon, 1 + epsilon)

        t_main_loss = torch.min(t_norm_rewards * t_rap,
                                t_norm_rewards * t_rap_clip)
    else:
        t_main_loss = t_norm_rewards * t_rap

    # include a regularization term:
    # the entropy bonus steers t_new_probs towards 0.5, which prevents the
    # policy from becoming exactly 0 or 1 and helps with exploration
    # (add 1.e-10 inside the log to avoid log(0), which gives nan)
    entropy = -(t_new_probs * torch.log(t_old_probs + 1.e-10) +
                (1.0 - t_new_probs) * torch.log(1.0 - t_old_probs + 1.e-10))

    return torch.mean(t_main_loss + beta * entropy)
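disc_rewards is another helper that is not shown here. Given that its result is used directly as a normalized reward weight, a plausible sketch is:

def disc_rewards(rewards, discount=0.995):
    # rewards: nested list of shape (time_steps, num_agents)
    rewards = np.asarray(rewards, dtype=np.float32)
    d = discount ** np.arange(len(rewards))
    # discounted return from each step onward (reversed cumulative sum)
    rewards_future = (rewards * d[:, np.newaxis])[::-1].cumsum(axis=0)[::-1]
    # normalize per time step across agents
    mean = np.mean(rewards_future, axis=1, keepdims=True)
    std = np.std(rewards_future, axis=1, keepdims=True) + 1.0e-10
    return (rewards_future - mean) / std

The docstring's remark about log versus ratio holds because t_old_probs is collected during rollout and carries no gradient: at the start of an update t_new_probs equals t_old_probs numerically, so the gradient of t_new_probs / t_old_probs coincides with the gradient of log(t_new_probs).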
Example #6
def calculate_loss_REINFORCE(policy,
                             old_probs,
                             states,
                             actions,
                             rewards,
                             gamma=0.995,
                             beta=0.01):

    ########
    ## CREDIT ASSIGNMENT --> TAKING INTO ACCOUNT ONLY FUTURE REWARDS
    ##                   --> DISCOUNTED REWARD IMPLEMENTED WITH GAMMA
    ## NOISE REDUCTION   --> NORMALIZATION OF REWARD
    ########

    # get number of trajectories = num of agents
    steps_in_trajectories = len(states)
    num_trajectories = len(states[0])

    # ACTIONS: 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5
    actions = torch.tensor(actions, dtype=torch.int8, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs,
                            1.0 - new_probs)

    # REWARDS
    #rewards_future = get_future_rewards_recursive(rewards,gamma)
    rewards_future = get_future_rewards(rewards, num_trajectories, gamma)
    R_np = reward_normalization(rewards_future)
    with torch.no_grad():
        # move the rewards to the same device as the policy (CPU or GPU)
        R = torch.from_numpy(R_np).float().to(device)

    # POLICY OBJECTIVE
    policy_loss = []
    for i, prob in enumerate(new_probs):
        log_prob = torch.log(prob)
        # weight each log-probability by its normalized future reward (element-wise)
        result = torch.mul(log_prob, R[i]).to(device)
        policy_loss.append(result)
    policy_loss = torch.cat(policy_loss)  # concatenate into a single 1-D tensor
    policy_loss = policy_loss.sum(dim=0)  # sum all values
    policy_loss /= num_trajectories  # average over trajectories (gradient estimate)
    return policy_loss
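For reference, the per-trajectory loop in Example #6 can be collapsed into a single vectorized expression. The sketch below (the function name is hypothetical, and it reuses the helper sketches above) computes the same quantity:

def calculate_loss_REINFORCE_vectorized(policy, old_probs, states, actions, rewards,
                                        gamma=0.995, beta=0.01):
    num_trajectories = len(states[0])
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)
    R = torch.from_numpy(reward_normalization(
        get_future_rewards(rewards, num_trajectories, gamma))).float().to(device)
    # same quantity as the loop in Example #6, computed in one expression
    return (torch.log(new_probs) * R).sum() / num_trajectories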
Example #7
def surrogate(policy,
              old_probs,
              states,
              actions,
              rewards,
              discount=0.995,
              beta=0.01):

    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards) * discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10

    rewards_normalized = (rewards_future -
                          mean[:, np.newaxis]) / std[:, np.newaxis]

    # convert everything into pytorch tensors and move to gpu if available
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    # old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized,
                           dtype=torch.float,
                           device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == RIGHT, new_probs, 1.0 - new_probs)

    log_probs_new = torch.log(new_probs)

    log_prob_actions_v = rewards * log_probs_new

    loss_policy_v = -log_prob_actions_v.mean()

    entropy_v = -(new_probs * log_probs_new).sum(dim=1).mean()

    entropy_loss_v = -beta * entropy_v

    loss_v = loss_policy_v + entropy_loss_v

    return loss_v
Example #8
def surrogate(policy,
              old_probs,
              states,
              actions,
              rewards,
              discount=0.995,
              beta=0.01,
              epsilon=0.1):

    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards) * discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10

    rewards_normalized = (rewards_future -
                          mean[:, np.newaxis]) / std[:, np.newaxis]

    # convert everything into pytorch tensors and move to gpu if available
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized,
                           dtype=torch.float,
                           device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == RIGHT, new_probs, 1.0 - new_probs)

    reweighting_factor = new_probs / old_probs

    clipped = torch.clamp(reweighting_factor, 1 - epsilon, 1 + epsilon)

    clipped_surrogate = torch.min(reweighting_factor * rewards,
                                  clipped * rewards)

    entropy = -(new_probs * torch.log(old_probs + 1.e-10) +
                (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(clipped_surrogate + beta * entropy)
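None of the examples show how the surrogate is driven. A hedged sketch of a typical training loop follows; envs (the parallel Pong environments) and policy (the CNN policy network) are assumed to be set up as in the exercise notebook, and clipped_surrogate is the function from Examples #1/#2:

optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)

for episode in range(500):
    # roll out the current policy; collect_trajectories comes from pong_utils
    old_probs, states, actions, rewards = pong_utils.collect_trajectories(
        envs, policy, tmax=320)

    # several optimization epochs over the same trajectories (PPO style)
    for _ in range(4):
        # the surrogate is an objective to maximize, so minimize its negative
        loss = -clipped_surrogate(policy, old_probs, states, actions, rewards,
                                  epsilon=0.1, beta=0.01)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()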