Example 1
import numpy as np
import torch

def backward(rolls, anns, valWeight=0.5, entWeight=0):
    # `rollouts`, `loss`, and `param` are project-local modules (not shown here)
    atns, vals, rets = rollouts.mergeRollouts(rolls.values())
    returns = torch.tensor(rets).view(-1, 1).float()
    vals = torch.cat(vals)

    # Accumulate policy-gradient loss and entropy bonus over action heads
    pg, entropy, attackentropy = 0, 0, 0
    for i, atnList in enumerate(atns):
        aArg, aArgIdx = list(zip(*atnList))
        aArgIdx = torch.stack(aArgIdx)
        l, e = loss.PG(aArg, aArgIdx, vals, returns)
        pg += l
        entropy += e

    # Weighted sum of policy, value, and entropy terms
    valLoss = loss.valueLoss(vals, returns)
    totLoss = pg + valWeight * valLoss + entWeight * entropy

    # Backpropagate, then hand per-network gradients back to the caller
    totLoss.backward()
    grads = [param.getGrads(ann) for ann in anns]
    reward = np.mean(rets)

    return reward, vals.mean(), grads, pg, valLoss, entropy
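
The `loss` module used above is project-specific. As a rough reference only, the policy-gradient and value terms could be computed along the lines below; the function names match the calls above, but the signatures, the advantage formulation, and the assumption that actions arrive as a single logits tensor are assumptions, not the project's implementation.

import torch
import torch.nn.functional as F

def PG(logits, idxs, vals, returns):
    # Hypothetical sketch: log-probability of each chosen action, weighted by
    # the advantage (return minus the detached critic estimate)
    logProbs = F.log_softmax(logits, dim=-1)
    chosen = logProbs.gather(1, idxs.view(-1, 1))
    adv = (returns - vals).detach()
    pgLoss = -(chosen * adv).mean()
    entropy = -(logProbs.exp() * logProbs).sum(dim=-1).mean()
    return pgLoss, entropy

def valueLoss(vals, returns):
    # Simple mean-squared error between value estimates and returns
    return F.mse_loss(vals, returns)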
Example 2
import torch

def backward(rollouts, config):
    '''Computes gradients from a list of rollouts

   Args:
      rollouts: A list of rollouts
      config: Experiment configuration providing DEVICE, PG_WEIGHT,
         VAL_WEIGHT, and ENTROPY

   Returns:
      pgLoss: Policy gradient loss
      valLoss: Value loss
      entLoss: Entropy bonus
   '''
    # `merge` and `loss` are project-local helpers
    device = config.DEVICE
    outs, n = merge(rollouts)

    # Accumulate losses over action heads
    pgLoss, valLoss, entLoss = 0, 0, 0
    for k, out in outs.items():
        atns = out['atns']
        vals = torch.stack(out['vals'])
        idxs = torch.tensor(out['idxs']).to(device)
        rets = torch.tensor(out['rets']).view(-1, 1).to(device)

        l, v, e = loss.PG(atns, idxs, vals, rets)

        # Averaging results in no learning. Need to retune LR?
        pgLoss += l  # / n
        valLoss += v  # / n
        entLoss += e  # / n

    # Weighted sum of loss terms; gradients accumulate on the networks
    totLoss = (config.PG_WEIGHT * pgLoss + config.VAL_WEIGHT * valLoss +
               config.ENTROPY * entLoss)

    totLoss.backward(retain_graph=True)

    return pgLoss, valLoss, entLoss
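
Because this variant only accumulates gradients inside `backward()` (note `retain_graph=True` and the absence of an optimizer step), the caller is expected to zero and step an optimizer itself. A hypothetical driver loop, with `collect_rollouts` and the hyperparameters standing in for the project's own machinery, might look like this:

import torch

def train(policy, config, collect_rollouts, epochs=10, lr=1e-4):
    # Hypothetical wrapper around Example 2's backward(); only the config
    # fields DEVICE/PG_WEIGHT/VAL_WEIGHT/ENTROPY come from the example above,
    # everything else is illustrative
    opt = torch.optim.Adam(policy.parameters(), lr=lr)
    for _ in range(epochs):
        rolls = collect_rollouts(policy)
        opt.zero_grad()
        pgLoss, valLoss, entLoss = backward(rolls, config)
        opt.step()
    return policy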
Example 3
import numpy as np
import torch

def backward(rollouts, valWeight=0.5, entWeight=0, device='cpu'):
    '''Computes gradients from a list of rollouts

   Args:
      rollouts: A list of rollouts
      valWeight (float): Scale to apply to the value loss
      entWeight (float): Scale to apply to the entropy bonus
      device (str): Hardware to run backward on

   Returns:
      reward: Mean reward achieved across rollouts
      val: Mean value function estimate across rollouts
      pg: Policy gradient loss
      valLoss: Value loss
      entropy: Entropy bonus
   '''
    # `merge` and `loss` are project-local helpers
    outs = merge(rollouts)

    # Accumulate policy-gradient loss and entropy bonus over action heads
    pg, entropy, attackentropy = 0, 0, 0
    for k, out in outs['action'].items():
        atns = out['atns']
        vals = torch.stack(out['vals']).to(device)
        idxs = torch.tensor(out['idxs']).to(device)
        rets = torch.tensor(out['rets']).to(device).view(-1, 1)
        l, e = loss.PG(atns, idxs, vals, rets)
        pg += l
        entropy += e

    # Value loss: critic estimates vs. empirical returns
    values = torch.stack(outs['value']).to(device)
    returns = torch.tensor(outs['return']).to(device).view(-1, 1)
    valLoss = loss.valueLoss(values, returns)
    totLoss = pg + valWeight * valLoss + entWeight * entropy

    totLoss.backward()
    reward = np.mean(outs['return'])

    return reward, values.mean(), pg, valLoss, entropy
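
For orientation, the merged structure this example indexes appears to be a nested dictionary. The key names below come from the code; the nesting, the per-head entry ('Move'), and the shapes are inferred and may not match the project exactly.

# Inferred layout of merge(rollouts); shapes and nesting are assumptions
outs = {
    'action': {
        'Move': {            # one entry per action head; 'Move' is illustrative
            'atns': ...,     # policy outputs fed to loss.PG
            'vals': [...],   # value tensors, later torch.stack'ed
            'idxs': [...],   # indices of the sampled actions
            'rets': [...],   # per-step returns for this head
        },
    },
    'value':  [...],         # value estimates across all rollouts
    'return': [...],         # empirical returns across all rollouts
}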
Example 4
def backwardActorOffPolicy(rho, adv, pi):
    # Off-policy actor loss: importance ratios `rho` weight advantages `adv`
    # under the current policy output `pi`
    pg, entropy = loss.PG(pi, rho, adv)
    return pg, entropy
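
This variant passes an importance ratio and an advantage straight into `loss.PG`, so its signature differs from the on-policy examples. A minimal importance-weighted actor loss in that spirit might read as follows; the explicit action index, the ratio clipping, and the shapes are assumptions rather than the project's implementation.

import torch
import torch.nn.functional as F

def offPolicyPG(logits, actions, rho, adv):
    # Hypothetical off-policy actor loss: the ratio rho = pi_new/pi_old for the
    # sampled action reweights the on-policy gradient; clipping it bounds the
    # variance of the correction
    logProbs = F.log_softmax(logits, dim=-1)
    chosen = logProbs.gather(1, actions.view(-1, 1))
    rhoClipped = rho.clamp(max=1.0).detach()
    pgLoss = -(rhoClipped * adv.detach() * chosen).mean()
    entropy = -(logProbs.exp() * logProbs).sum(dim=-1).mean()
    return pgLoss, entropy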