Example #1
        loss /= torch.tensor(num, device=device, dtype=torch.float32)

        policy_candidate_optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(policy_candidate.parameters(), 1.)  # Clip gradients (clip_grad_norm_ is the in-place, non-deprecated API)
        policy_candidate_optimizer.step()

    policy_net = copy.deepcopy(policy_candidate).to(device)

    # Optimize value net for a given number of steps
    # Set value net in training mode
    value_net_in.train()
    value_net_ex.train()
    ex_rtg = memory.extrinsic_discounted_rtg(
        batch_size)  # Use discounted reward-to-go to fit the value net
    in_rtg = memory.intrinsic_rtg(batch_size)
    ex_val_est = []
    in_val_est = []

    print("\n\n\tUpdate Value Net for %d steps" % (num_vn_iter))

    for i in tqdm(range(num_vn_iter)):  # Use tqdm to show progress bar
        for j in range(batch_size):
            in_val_traj = value_net_in(
                torch.cat([
                    states[j],
                    torch.ones((states[j].shape[0], 1),
                               dtype=torch.float32,
                               device=device) * j
                ],
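
Example #1 is cut off before the value-fitting objective. A minimal, self-contained sketch of the step that usually follows, regressing a value net's output onto the stored rewards-to-go with an MSE loss, is given below. All names here (value_net, value_optimizer, the dummy tensors) are illustrative assumptions, and the plain state input omits the time-index column that the original concatenates above.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Illustrative sketch only (not the original code): fit a value net to
# reward-to-go targets with a mean-squared-error loss.
value_net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))
value_optimizer = torch.optim.Adam(value_net.parameters(), lr=1e-3)

states = torch.randn(128, 4)   # dummy batch of states
rtg = torch.randn(128)         # dummy reward-to-go targets

value_est = value_net(states).squeeze(-1)
value_loss = F.mse_loss(value_est, rtg)

value_optimizer.zero_grad()
value_loss.backward()
value_optimizer.step()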
Example #2
                    finished_rendering_this_epoch = True

                break

    ###################################################################
    # Optimize the model for a given number of steps

    # Make a candidate to update parameters
    actor_critic_candidate = copy.deepcopy(actor_critic).to(device)
    actor_critic_candidate.train()

    # Initialize the optimizer
    candidate_optimizer = optim.Adam(actor_critic_candidate.parameters())

    # Get batch data
    ex_rtg = memory.extrinsic_discounted_rtg(batch_size)
    ex_gae = memory.extrinsic_gae(batch_size)
    old_act_log_prob = memory.act_log_prob(batch_size)
    states = memory.states(batch_size)
    actions = memory.actions(batch_size)

    # Proximal Policy Optimization: compute the joint loss for the actor and critic networks
    loss = 0
    critic_loss_total = 0

    print("\n\n\tUpdate Actor Critic for %d steps:" % num_updates_per_epoch)

    for i in tqdm(range(num_updates_per_epoch)):  # Use tqdm to show progress bar

        num = 0
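
The example is truncated before the loss computation itself. For reference, the clipped surrogate objective that the comment above refers to is built from the ratio of new to old action log-probabilities; the sketch below is a generic, self-contained illustration with assumed names, not the original implementation.

import torch

# Generic PPO clipped surrogate actor loss (illustrative names, not from the source).
def ppo_actor_loss(new_log_prob, old_log_prob, advantage, clip_eps=0.2):
    ratio = torch.exp(new_log_prob - old_log_prob)                     # pi_new / pi_old
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    return -torch.min(unclipped, clipped).mean()                       # negate to minimize

# Dummy usage
new_lp = torch.randn(64, requires_grad=True)
old_lp = new_lp.detach() + 0.01 * torch.randn(64)
adv = torch.randn(64)
loss = ppo_actor_loss(new_lp, old_lp, adv)
loss.backward()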