예제 #1
0
파일: ppo.py 프로젝트: UCASChief/RL_related
def GAE(reward, value, mask, gamma, lam):
    """Compute normalized GAE advantages and the matching value targets.

    Walks dim 0 of the inputs from the last index to the first, accumulating

        delta[i] = reward[i] + gamma * value[i + 1] * mask[i] - value[i]
        adv[i]   = delta[i] + gamma * lam * adv[i + 1] * mask[i]

    where ``value`` and ``adv`` past the final index are taken to be zero.

    Args:
        reward: Tensor indexed by step along dim 0.
        value: Tensor of value estimates; assumed to be floating point and
            to share ``reward``'s shape -- TODO confirm against caller.
        mask: Tensor multiplied into both carried terms; presumably 0 where
            an episode ends and 1 elsewhere -- verify against caller.
        gamma: Discount factor applied to the carried value and advantage.
        lam: GAE lambda, applied together with ``gamma`` to the carried
            advantage.

    Returns:
        Tuple ``(adv, returns)``. ``returns`` is ``value`` plus the raw
        (un-normalized) advantages; ``adv`` is centered over all elements
        and, when it has more than one element, divided by its standard
        deviation.
    """
    # Keeps the division finite when every advantage is identical (std == 0).
    eps = 1e-8

    # Allocate next to the inputs so the buffers always share their device
    # and dtype instead of depending on module-level tensor/device aliases.
    adv = value.new_zeros(reward.shape)
    next_value = value.new_zeros(reward.shape[1:])
    next_adv = value.new_zeros(reward.shape[1:])

    for i in reversed(range(reward.shape[0])):
        delta = reward[i] + gamma * next_value * mask[i] - value[i]
        adv[i] = delta + gamma * lam * next_adv * mask[i]
        next_adv = adv[i]
        next_value = value[i]

    returns = value + adv

    centered = adv - adv.mean()
    if adv.numel() > 1:
        adv = centered / (adv.std() + eps)
    else:
        # The sample standard deviation of a single element is NaN; only
        # center in that case.
        adv = centered
    return adv, returns
예제 #2
0
def GAE(reward, value, mask, gamma, lam):
    """Compute normalized GAE advantages and value targets per trajectory.

    The inputs are reshaped to ``(-1, args.sample_traj_length, 1)`` and the
    recursion runs backwards along dim 1 for every row at once:

        delta[:, i] = reward[:, i] + gamma * value[:, i + 1] * mask[:, i]
                      - value[:, i]
        adv[:, i]   = delta[:, i] + gamma * lam * adv[:, i + 1] * mask[:, i]

    where ``value`` and ``adv`` past the last step of a row are taken to be
    zero.

    Args:
        reward: Tensor whose element count is a multiple of
            ``args.sample_traj_length`` (required by the reshape).
        value: Tensor of value estimates with the same element count;
            assumed to be floating point -- TODO confirm against caller.
        mask: Tensor with the same element count, multiplied into both
            carried terms; presumably 0 where an episode ends and 1
            elsewhere -- verify against caller.
        gamma: Discount factor applied to the carried value and advantage.
        lam: GAE lambda, applied together with ``gamma`` to the carried
            advantage.

    Returns:
        Tuple ``(adv, returns)``, both reshaped to ``(-1, 1)``. ``returns``
        is ``value`` plus the raw (un-normalized) advantages; ``adv`` is
        centered over all elements and, when it has more than one element,
        divided by its standard deviation.
    """
    # Keeps the division finite when every advantage is identical (std == 0).
    eps = 1e-8

    # ``args`` is read from module scope; it is not a parameter of GAE.
    traj_len = args.sample_traj_length
    reward = reward.reshape(-1, traj_len, 1)
    value = value.reshape(-1, traj_len, 1)
    mask = mask.reshape(-1, traj_len, 1)

    # Allocate next to the inputs so the buffers always share their device
    # and dtype instead of depending on module-level tensor/device aliases.
    adv = value.new_zeros(reward.shape)
    next_value = value.new_zeros((reward.shape[0], 1))
    next_adv = value.new_zeros((reward.shape[0], 1))

    for i in reversed(range(reward.shape[1])):
        delta = reward[:, i] + gamma * next_value * mask[:, i] - value[:, i]
        adv[:, i] = delta + gamma * lam * next_adv * mask[:, i]
        next_adv = adv[:, i]
        next_value = value[:, i]

    returns = (value + adv).reshape(-1, 1)

    centered = adv - adv.mean()
    if adv.numel() > 1:
        adv = centered / (adv.std() + eps)
    else:
        # The sample standard deviation of a single element is NaN; only
        # center in that case.
        adv = centered
    adv = adv.reshape(-1, 1)

    return adv, returns