Example #1
import random

import numpy as np


def test_per(capacity):
    # test the implementation of the prioritized replay buffer
    p_buffer = PrioritizedReplayBuffer(capacity)

    # populate half of the buffer with placeholder experiences
    for _ in range(capacity // 2):
        p_buffer.add(Experience())

    # update priorities for several batches of experiences
    n_batches = 10
    batch_size = 100
    for _ in range(n_batches):
        # randomly sample batch_size leaf (tree) indices
        idx = random.sample(range(capacity - 1, 2 * capacity - 1), batch_size)

        td_errors = np.random.uniform(0, 10, batch_size)

        p_buffer.batch_update(idx, td_errors)

        assert p_buffer.tree.max_priority == np.max(
            p_buffer.tree.tree[-capacity:])

    # test sampling from the updated buffer
    for _ in range(10):
        p_buffer.sample(batch_size)

    return
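
The test above assumes a SumTree-backed PrioritizedReplayBuffer exposing add, sample, batch_update, an inner tree with a flat tree array and a max_priority attribute, and (in Example #2) increment_b. The class below is a minimal sketch of that assumed interface, not the original implementation; the alpha, b and b_increment defaults, the (|TD error| + 1e-5) ** alpha priority rule, and the weight normalization are illustrative assumptions.

import random

import numpy as np


class SumTree:
    # Flat binary sum tree: internal nodes in tree[:capacity - 1],
    # leaf priorities in tree[capacity - 1:] (i.e. tree[-capacity:]).
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.empty(capacity, dtype=object)
        self.write = 0            # next leaf slot to overwrite
        self.max_priority = 1.0   # priority assigned to fresh experiences

    def add(self, priority, data):
        tree_idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(tree_idx, priority)
        self.write = (self.write + 1) % self.capacity

    def update(self, tree_idx, priority):
        # write the new priority and propagate the change up to the root
        change = priority - self.tree[tree_idx]
        self.tree[tree_idx] = priority
        self.max_priority = max(self.max_priority, priority)
        while tree_idx != 0:
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def get(self, value):
        # descend from the root to the leaf whose cumulative priority covers `value`
        idx = 0
        while idx < self.capacity - 1:          # stop once a leaf is reached
            left, right = 2 * idx + 1, 2 * idx + 2
            if value <= self.tree[left]:
                idx = left
            else:
                value -= self.tree[left]
                idx = right
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]


class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha=0.6, b=0.4, b_increment=1e-4):
        self.tree = SumTree(capacity)
        self.alpha = alpha              # how strongly TD errors shape sampling
        self.b = b                      # importance-sampling exponent, annealed to 1
        self.b_increment = b_increment

    def add(self, experience):
        # new experiences get the current max priority so they are sampled at least once
        self.tree.add(self.tree.max_priority, experience)

    def sample(self, batch_size):
        # stratified sampling: one draw per equal slice of the total priority mass
        idx, priorities, batch = [], [], []
        segment = self.tree.tree[0] / batch_size
        for i in range(batch_size):
            value = random.uniform(segment * i, segment * (i + 1))
            tree_idx, priority, data = self.tree.get(value)
            idx.append(tree_idx)
            priorities.append(priority)
            batch.append(data)
        probs = np.array(priorities) / self.tree.tree[0]
        weights = (self.tree.capacity * probs) ** (-self.b)
        weights /= weights.max()        # normalize so the largest weight is 1
        return np.array(idx), batch, weights

    def batch_update(self, idx, td_errors):
        # priority = (|TD error| + eps) ** alpha
        priorities = (np.abs(td_errors) + 1e-5) ** self.alpha
        for tree_idx, priority in zip(idx, priorities):
            self.tree.update(tree_idx, priority)

    def increment_b(self):
        # anneal the importance-sampling exponent toward 1
        self.b = min(1.0, self.b + self.b_increment)

With this sketch, Example #1 holds: batch_update refreshes the leaf priorities in tree[-capacity:], and max_priority tracks their maximum, while Example #2 uses sample's weights and indices for the weighted loss and priority refresh.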
Example #2
        Loss = weights * MSE
    
    '''

    # compute MSE adjusted by importance sampling weights
    # and backprop
    weights = torch.tensor(weights, dtype=torch.float32)
    loss = torch.mean(weights * torch.pow(td_loss, 2))
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
    opt.step()
    opt.zero_grad()

    # update the priorities of the sampled experiences
    exp_replay.batch_update(b_idx, np.abs(td_loss.detach().cpu().numpy()))

    # gradually anneal the importance-sampling exponent b toward 1
    exp_replay.increment_b()

    if step % loss_freq == 0:
        # log the unweighted MSE (without importance-sampling weights)
        loss = torch.mean(torch.pow(td_loss, 2))
        td_loss_history.append(loss.cpu().item())

    if step % refresh_target_network_freq == 0:
        target_network.load_state_dict(agent.state_dict())

    if step % eval_freq == 0:
        mean_rw_history.append(
            evaluate(make_env(clip_rewards=True, seed=step),