Example #1
File: cGANWIL.py Project: Ark0617/CoTGAIL
def update_params(batch):
    rewards = torch.Tensor(batch.reward).to(device)
    masks = torch.Tensor(batch.mask).to(device)
    actions = torch.Tensor(np.concatenate(batch.action, 0)).to(device)
    states = torch.Tensor(batch.state).to(device)
    values = value_net(states)

    returns = torch.Tensor(actions.size(0), 1).to(device)
    deltas = torch.Tensor(actions.size(0), 1).to(device)
    advantages = torch.Tensor(actions.size(0), 1).to(device)

    prev_return = 0
    prev_value = 0
    prev_advantage = 0

    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + args.gamma * args.tau * prev_advantage * masks[i]
        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    targets = returns
    batch_size = math.ceil(states.shape[0] / args.vf_iters)
    idx = np.random.permutation(states.shape[0])
    for i in range(args.vf_iters):
        smp_idx = idx[i * batch_size:(i + 1) * batch_size]
        smp_states = states[smp_idx, :]
        smp_targets = targets[smp_idx, :]
        value_optimizer.zero_grad()
        value_loss = value_criterion(value_net(smp_states), smp_targets)
        value_loss.backward()
        value_optimizer.step()

    advantages = (advantages - advantages.mean()) / advantages.std()
    action_means, action_log_stds, action_stds = policy_net(states.cpu())
    fixed_log_prob = normal_log_density(actions.cpu(), action_means,
                                        action_log_stds,
                                        action_stds).data.clone()

    def get_loss():
        action_means, action_log_stds, action_stds = policy_net(states.cpu())
        log_prob = normal_log_density(actions.cpu(), action_means,
                                      action_log_stds, action_stds)
        action_loss = -advantages.cpu() * torch.exp(log_prob - fixed_log_prob)
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(states.cpu())
        mean0 = mean1.data
        log_std0 = log_std1.data
        std0 = std1.data
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)
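
Examples #1 and #4 to #6 build their returns and advantages in place with the same backward recursion: generalized advantage estimation (GAE), with args.gamma as the discount and args.tau as the GAE lambda. A minimal standalone sketch of that recursion, using illustrative names (compute_gae, lam) rather than anything from the projects above:

import torch

def compute_gae(rewards, values, masks, gamma=0.995, lam=0.97):
    # Backward pass over a flattened batch of trajectories. masks[i] == 0 marks
    # a terminal step and resets the running return, value and advantage:
    #   R_t     = r_t + gamma * mask_t * R_{t+1}
    #   delta_t = r_t + gamma * mask_t * V(s_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma * lam * mask_t * A_{t+1}
    n = rewards.size(0)
    returns = torch.zeros(n)
    advantages = torch.zeros(n)
    prev_return = prev_value = prev_advantage = 0.0
    for i in reversed(range(n)):
        returns[i] = rewards[i] + gamma * prev_return * masks[i]
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = delta + gamma * lam * prev_advantage * masks[i]
        prev_return = returns[i]
        prev_value = values[i]
        prev_advantage = advantages[i]
    return returns, advantages

These examples then standardize the advantages to zero mean and unit variance before the policy step, and regress the value network onto the discounted returns.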
Example #2
File: train.py Project: 5l1v3r1/trpo
def update_policy(batch):
    advantages = batch["advantages"]
    states = batch["states"]
    actions = batch["actions"]

    fixed_log_prob = policy_net.getLogProbabilityDensity(
        Variable(states), actions).detach()
    trpo_step(policy_net, states, actions, advantages, fixed_log_prob,
              args.max_kl, args.damping)
Example #3
def update_policy(batch):
    """ Get advantage , states and action and calls trpo step
    Parameters:
    batch (dict of arrays of numpy) : TODO (batch is different than prepare_data by structure)
    Returns:
    """
    advantages = batch["advantages"]
    states = batch["states"]
    actions = batch["actions"]
    trpo_step(policy_net, states, actions, advantages, args.max_kl, args.damping)
Example #4
File: main.py Project: HuiminHe/rl-swing
def update_params(batch):
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(batch.state)
    values = value_net(Variable(states))

    returns = torch.Tensor(actions.size(0), 1)
    deltas = torch.Tensor(actions.size(0), 1)
    advantages = torch.Tensor(actions.size(0), 1)

    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + args.gamma * args.tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    targets = Variable(returns)

    # Original code uses the same LBFGS to optimize the value loss
    def get_value_loss(flat_params):
        set_flat_params_to(value_net, torch.Tensor(flat_params))
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        values_ = value_net(Variable(states))

        value_loss = (values_ - targets).pow(2).mean()

        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * args.l2_reg
        value_loss.backward()
        return (value_loss.data.double().numpy()[0],
                get_flat_grad_from(value_net).data.double().numpy())

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_value_loss,
        get_flat_params_from(value_net).double().numpy(),
        maxiter=25)
    set_flat_params_to(value_net, torch.Tensor(flat_params))

    advantages = (advantages - advantages.mean()) / advantages.std()

    action_means, action_log_stds, action_stds = policy_net(Variable(states))
    fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                        action_log_stds,
                                        action_stds).data.clone()

    def get_loss(volatile=False):
        action_means, action_log_stds, action_stds = policy_net(
            Variable(states, volatile=volatile))
        log_prob = normal_log_density(Variable(actions), action_means,
                                      action_log_stds, action_stds)
        action_loss = -Variable(advantages) * torch.exp(
            log_prob - Variable(fixed_log_prob))
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(Variable(states))

        mean0 = Variable(mean1.data)
        log_std0 = Variable(log_std1.data)
        std0 = Variable(std1.data)
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)
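
Examples #2 and #4 target the pre-0.4 PyTorch API: torch.autograd.Variable is now a no-op wrapper and the volatile=True flag has been removed in favor of torch.no_grad(). A sketch of how example #4's get_loss closure might look on current PyTorch, rewritten as a free function so every input is a parameter (surrogate_loss is an illustrative name, not from the project):

import contextlib
import torch

def surrogate_loss(policy_net, states, actions, advantages, fixed_log_prob,
                   normal_log_density, volatile=False):
    # Recompute the Gaussian log-density of the sampled actions and form the
    # TRPO surrogate -A(s, a) * exp(log pi_new - log pi_old).
    # torch.no_grad() stands in for the removed volatile=True flag.
    ctx = torch.no_grad() if volatile else contextlib.nullcontext()
    with ctx:
        action_means, action_log_stds, action_stds = policy_net(states)
        log_prob = normal_log_density(actions, action_means,
                                      action_log_stds, action_stds)
        action_loss = -advantages * torch.exp(log_prob - fixed_log_prob)
        return action_loss.mean()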
Example #5
def update_params(batch):
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(batch.state)
    n = actions.size(0)
    values = value_net(states)

    ############## GAE ###############
    returns = torch.Tensor(n, 1)
    deltas = torch.Tensor(n, 1)
    advantages = torch.Tensor(n, 1)
    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):

        returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + args.gamma * args.tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    ##################################

    ###################### Sever ############################

    if args.sever == 1:
        add_hooks(policy_net)
        clear_backprops(policy_net)
        policy_net.zero_grad()
        action_means, action_log_stds, action_stds = policy_net(states)
        log_policy = normal_log_density(actions, action_means, action_log_stds,
                                        action_stds)
        torch.autograd.grad(log_policy.mean(), policy_net.parameters())
        ## compute gradient of log policy for every single data point, the trick only works for linear and conv layers
        compute_grad1(policy_net, loss_type='mean')
        actor_grad_logp = []
        for param in policy_net.parameters():
            actor_grad_logp.append(param.grad1.view(param.grad1.shape[0], -1))
        actor_grad_logp = torch.cat(actor_grad_logp, 1)
        remove_hooks(policy_net)
        policy_net.zero_grad()

        ## Standardize the advantage estimate for stable training. Anticipating
        ## outliers, use Huber's robust estimate of mean and std instead of the
        ## vanilla sample mean and std.
        h = huber
        h.maxiter = 100
        try:
            mean, std = h(advantages)
        except Exception:
            # Huber estimation failed; fall back to the sample mean and std.
            mean = advantages.mean()
            std = advantages.std()
        advantages = (advantages - mean) / std
        actor_loss_grad = actor_grad_logp * advantages  # vanilla policy gradient
        start_time = time.time()

        ## robust CG procedure
        search_dir, indices = Sever_CG(actor_loss_grad,
                                       actor_grad_logp,
                                       n,
                                       nsteps=10,
                                       r=4,
                                       p=args.eps / 2)
    else:
        advantages = (advantages - advantages.mean()) / advantages.std()
        indices = list(range(n))
        search_dir = None

    #########################################################

    # Use same LBFGS to optimize the value loss
    def get_value_loss(flat_params):
        set_flat_params_to(value_net, torch.Tensor(flat_params))
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        values_ = value_net(states[indices])

        value_loss = (values_ - returns[indices]).pow(2).mean()
        #         print('value loss:',value_loss)
        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * args.l2_reg
        value_loss.backward()
        return (value_loss.data.double().numpy(),
                get_flat_grad_from(value_net).data.double().numpy())

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_value_loss,
        get_flat_params_from(value_net).double().numpy(),
        maxiter=25)
    set_flat_params_to(value_net, torch.Tensor(flat_params))

    action_means, action_log_stds, action_stds = policy_net(states[indices])
    fixed_log_prob = normal_log_density(actions[indices], action_means,
                                        action_log_stds,
                                        action_stds).data.clone()

    # Policy loss
    def get_loss(volatile=False):
        if volatile:
            with torch.no_grad():
                action_means, action_log_stds, action_stds = policy_net(
                    states[indices])
        else:
            action_means, action_log_stds, action_stds = policy_net(
                states[indices])

        log_prob = normal_log_density(actions[indices], action_means,
                                      action_log_stds, action_stds)
        action_loss = -advantages[indices] * torch.exp(log_prob -
                                                       fixed_log_prob)
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(states[indices])

        mean0 = mean1.data
        log_std0 = log_std1.data
        std0 = std1.data
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net,
              get_loss,
              get_kl,
              args.max_kl,
              args.damping,
              xinit=search_dir)
    num_attack = args.batch_size * args.eps
    # fraction of outliers detected (attacked samples filtered out of indices)
    return 1 - sum(1 for i in indices if i < num_attack) / num_attack
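
The value returned by example #5 measures how many of the first args.batch_size * args.eps samples (the ones assumed to be corrupted) the Sever step removed from indices. A small self-contained illustration of that bookkeeping, with made-up numbers:

# Hypothetical numbers: 20 of 200 samples are corrupted and the robust step
# happened to drop indices 0..4, so 5 of the 20 corrupted samples were caught.
num_attack = 20
indices = list(range(5, 200))
survivors = sum(1 for i in indices if i < num_attack)  # corrupted samples kept: 15
detection_rate = 1 - survivors / num_attack            # 1 - 15/20 = 0.25
print(detection_rate)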
Example #6
    def update_params(batch, gamma, tau, l2_reg, max_kl, damping):
        rewards = torch.Tensor(batch.reward)
        masks = torch.Tensor(batch.mask)
        actions = torch.Tensor(np.concatenate(batch.action, 0))
        states = torch.Tensor(batch.state).squeeze()
        values = value_net(Variable(states))
        x_poses = torch.Tensor(batch.x_pos)

        returns = torch.Tensor(actions.size(0), 1)
        deltas = torch.Tensor(actions.size(0), 1)
        advantages = torch.Tensor(actions.size(0), 1)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(rewards.size(0))):
            returns[i] = rewards[i] + gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values.data[i]
            advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]

            prev_return = returns[i, 0]
            prev_value = values.data[i, 0]
            prev_advantage = advantages[i, 0]

        targets = Variable(returns)

        # Original code uses the same LBFGS to optimize the value loss
        def get_value_loss(flat_params):
            global val_loss
            set_flat_params_to(value_net, torch.Tensor(flat_params))
            for param in value_net.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)

            values_ = value_net(Variable(states))

            value_loss = (values_ - targets).pow(2).mean()

            # weight decay
            for param in value_net.parameters():
                value_loss += param.pow(2).sum() * l2_reg
            value_loss.backward()
            val_loss = value_loss.item()
            # print("Value Loss: ", val_loss)
            return (value_loss.data.double().item(),
                    get_flat_grad_from(value_net).data.double().numpy())

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_value_loss,
            get_flat_params_from(value_net).double().numpy(),
            maxiter=25)
        set_flat_params_to(value_net, torch.Tensor(flat_params))

        advantages = (advantages - advantages.mean()) / advantages.std()

        # print("States: ", states)
        # print("States Size: ", states.size())

        probs = policy_net((states, x_poses)).squeeze()

        # print("Actions: ", actions)
        # print("probs: ", probs)
        fixed_log_prob = (torch.log(probs) * Variable(actions)).sum(1).data.clone()
        # replaces normal_log_density(Variable(actions), action_means, action_log_stds, action_stds).data.clone()

        def get_loss(volatile=False):

            # print("Action Size: ", actions.size())

            if volatile:
                with torch.no_grad():
                    # pass x_poses along with states, matching the other policy_net calls
                    probs = policy_net((Variable(states), Variable(x_poses))).squeeze()
            else:
                probs = policy_net((Variable(states), Variable(x_poses))).squeeze()

            # print("Probs Size: ", probs.size())
            log_prob = (torch.log(probs) * Variable(actions)).sum(1)
            # replaces normal_log_density(Variable(actions), action_means, action_log_stds, action_stds)

            # print("Log Probs Size: ", log_prob.size())
            # print("Advantages Size: ", advantages.size())
            action_loss = -Variable(advantages).squeeze() * torch.exp(
                log_prob - Variable(fixed_log_prob))
            # print("Action Loss Size: ", action_loss.size())
            # print("Action Loss: ", action_loss.mean().item())
            return action_loss.mean()

        def get_kl():
            actprobs = policy_net((Variable(states), Variable(x_poses))) + 1e-8

            old_actprobs = Variable(actprobs.data)

            kl = old_actprobs * torch.log(old_actprobs / actprobs)

            return kl.sum(1, keepdim=True)

        probs_old = policy_net((Variable(states), Variable(x_poses)))

        loss = trpo_step(policy_net, get_loss, get_kl, max_kl, damping)

        probs_new = policy_net((Variable(states), Variable(x_poses))) + 1e-8

        kl = torch.sum(probs_old * torch.log(probs_old / probs_new), 1)

        return loss, kl.mean()
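
Unlike the Gaussian policies in the other examples, example #6 uses a discrete softmax policy: assuming actions is one-hot, (torch.log(probs) * actions).sum(1) recovers the log-probability of the chosen action, and get_kl computes the categorical KL divergence sum_a p_old(a) * log(p_old(a) / p_new(a)). A minimal standalone version of that KL, with illustrative names:

import torch

def categorical_kl(old_probs, new_probs, eps=1e-8):
    # KL(old || new) per state, summed over the action dimension, mirroring
    # get_kl in example #6; eps guards the logarithm against zero probabilities.
    old = old_probs.detach() + eps
    new = new_probs + eps
    return (old * torch.log(old / new)).sum(1, keepdim=True)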