Example #1
def draw_quantilles(frame_idx, batch, net, cuda=False, dir='.'):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = Variable(torch.from_numpy(states))
    actions_v = Variable(torch.from_numpy(actions))
    if cuda:
        states_v = states_v.cuda(non_blocking=True)
        actions_v = actions_v.cuda(non_blocking=True)

    quant_v = net(states_v)[range(batch_size), actions_v.data]
    quant = quant_v.data.cpu().numpy()

    for batch_idx in range(batch_size):
        if not dones[batch_idx]:
            continue
        q_val = np.mean(quant[batch_idx])
        suffix = "_%03d_%06d_%d_%.1f_%.4f.png" % (
            batch_idx, frame_idx, int(dones[batch_idx]), rewards[batch_idx], q_val)
        plt.clf()
#        plt.subplot(2, 1, 1)
        plt.plot(np.arange(0.0, 1.0, 1/QUANT_N), quant[batch_idx])
        plt.title("Inv CDF, q_val=%.3f, done=%d, reward=%.1f" % (
            q_val, int(dones[batch_idx]), rewards[batch_idx]))
#        plt.subplot(2, 1, 2)
#        plt.plot(1/np.diff(quant[batch_idx])/QUANT_N)
#        plt.title("Density")
        plt.savefig(os.path.join(dir, "quant" + suffix))
    pass
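
The mean over the quantile values is what the plot title reports as q_val. A tiny self-contained sketch of that reduction (QUANT_N and the random quantiles below are stand-ins, not values from the snippet):

import numpy as np

QUANT_N = 8                                        # stand-in for the snippet's constant
taus = np.arange(0.0, 1.0, 1.0 / QUANT_N)          # x-axis of the inverse-CDF plot above
quantiles = np.sort(np.random.randn(QUANT_N))      # one row of `quant` (inverse CDF samples)
q_value = quantiles.mean()                         # same reduction as q_val above
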
def calc_loss_double_dqn(batch,
                         net,
                         tgt_net,
                         gamma,
                         device="cpu",
                         double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            next_state_acts = net(next_states_v).max(1)[1]
            next_state_acts = next_state_acts.unsqueeze(-1)
            next_state_vals = tgt_net(next_states_v).gather(
                1, next_state_acts).squeeze(-1)
        else:
            next_state_vals = tgt_net(next_states_v).max(1)[0]
        next_state_vals[done_mask] = 0.0
        exp_sa_vals = next_state_vals.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_vals, exp_sa_vals)
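
As a minimal, self-contained illustration of the Double DQN target computed above (dummy linear networks and random tensors, not the book's models): the online net selects the argmax action, while the target net evaluates it.

import torch
import torch.nn as nn

batch_size, n_obs, n_actions, gamma = 4, 8, 3, 0.99
net = nn.Linear(n_obs, n_actions)        # stand-in for the online Q-network
tgt_net = nn.Linear(n_obs, n_actions)    # stand-in for the target Q-network

states = torch.randn(batch_size, n_obs)
next_states = torch.randn(batch_size, n_obs)
actions = torch.randint(0, n_actions, (batch_size,))
rewards = torch.randn(batch_size)
dones = torch.tensor([False, False, True, False])

q_sa = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
with torch.no_grad():
    next_acts = net(next_states).argmax(dim=1, keepdim=True)        # selection: online net
    next_q = tgt_net(next_states).gather(1, next_acts).squeeze(-1)  # evaluation: target net
    next_q[dones] = 0.0
    target = rewards + gamma * next_q
loss = nn.MSELoss()(q_sa, target)
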
Example #3
def calc_loss(batch,
              batch_weights,
              net,
              tgt_net,
              gamma,
              device="cpu",
              double=True):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    if double:
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(
            1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach(
    ) * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values)**2
    return losses_v.mean(), losses_v + 1e-5
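
The two return values serve different purposes: the weighted mean drives the gradient step, while the per-sample values (offset by 1e-5 so they stay strictly positive) go back to the replay buffer as new priorities. A self-contained sketch of that weighting on dummy numbers:

import torch

td_errors = torch.tensor([0.5, -1.2, 0.1, 2.0])   # dummy per-sample TD errors
weights = torch.tensor([0.8, 1.0, 0.6, 0.3])      # beta-corrected importance-sampling weights

losses = weights * td_errors ** 2      # one weighted squared error per sample
loss = losses.mean()                   # scalar for the backward pass
new_prios = (losses + 1e-5).detach()   # positive priorities handed back to the buffer
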
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_acts = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v)
    next_distr = next_distr.data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_acts]
    dones = dones.astype(bool)

    proj_distr = dqn_extra.distr_projection(next_best_distr, rewards, dones,
                                            gamma)

    distr_v = net(states_v)
    sa_vals = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(sa_vals, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
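
net.both() and apply_softmax() are helpers of the book's distributional (C51) network; the Q-values it returns are just the expectation of each action's distribution over the atom support. A self-contained sketch with assumed Pong-style constants (N_ATOMS=51, Vmin=-10, Vmax=10):

import torch

N_ATOMS, Vmin, Vmax = 51, -10.0, 10.0
support = torch.linspace(Vmin, Vmax, N_ATOMS)      # atom values z_i
logits = torch.randn(4, 3, N_ATOMS)                # (batch, actions, atoms)
probs = torch.softmax(logits, dim=2)
q_values = (probs * support).sum(dim=2)            # Q(s, a) = sum_i p_i(s, a) * z_i
best_actions = q_values.argmax(dim=1)
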
Example #5
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin,
                                         Vmax, N_ATOMS, gamma)

    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)

    proj_distr_v = torch.tensor(proj_distr)
    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
Example #6
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    done_mask = torch.ByteTensor(dones)
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        states_v = states_v.cuda(non_blocking=True)
        next_states_v = next_states_v.cuda(non_blocking=True)
        actions_v = actions_v.cuda(non_blocking=True)
        rewards_v = rewards_v.cuda(non_blocking=True)
        done_mask = done_mask.cuda(non_blocking=True)
        batch_weights_v = batch_weights_v.cuda(non_blocking=True)

    state_all_action_values = net(states_v)
    state_action_values = state_all_action_values.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_actions = net(next_states_v).max(1)[1]
    next_state_values = tgt_net(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    next_state_values[done_mask] = 0.0
    next_state_values.volatile = False

    # DQN Loss
    expected_state_action_values = next_state_values * gamma + rewards_v
    dq_losses = nn.SmoothL1Loss(reduce=False)(state_action_values, expected_state_action_values)
    dq_loss = (batch_weights_v * dq_losses).sum()

    last_dq_losses.append(dq_loss.data.cpu().numpy() / len(batch))

    return dq_loss, (dq_losses.data.abs() + e_prio).cpu().numpy()
def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    micro_batch = []

    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            data = TotalReward(reward=np.mean(new_rewards))
            train_queue.put(data)

        micro_batch.append(exp)
        if len(micro_batch) < MICRO_BATCH_SIZE:
            continue

        data = common.unpack_batch(micro_batch,
                                   net,
                                   device=device,
                                   last_val_gamma=GAMMA**REWARD_STEPS)
        train_queue.put(data)
        micro_batch.clear()
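
On the other side of train_queue, the training process typically separates TotalReward entries (which only feed the reward tracker) from unpacked micro-batches. A self-contained sketch of that dispatch with a plain queue and placeholder payloads (names mirror the snippet above, but the data is dummy):

import collections
import queue

TotalReward = collections.namedtuple("TotalReward", field_names="reward")

train_queue = queue.Queue()
train_queue.put(TotalReward(reward=21.0))
train_queue.put(("states", "actions", "value_refs"))   # stands in for an unpacked micro-batch
train_queue.put(None)                                  # sentinel: a worker finished

while True:
    entry = train_queue.get()
    if entry is None:
        break
    if isinstance(entry, TotalReward):
        print("episode reward:", entry.reward)
        continue
    states, actions, value_refs = entry                # would be tensors in the real loop
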
def calc_loss(batch, net, tgt_net, gamma, cuda=False, double=True):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    done_mask = torch.ByteTensor(dones)
    if cuda:
        states_v = states_v.cuda()
        next_states_v = next_states_v.cuda()
        actions_v = actions_v.cuda()
        rewards_v = rewards_v.cuda()
        done_mask = done_mask.cuda()

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    if double:
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(
            1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values.volatile = False

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
Example #9
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_actions]
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin,
                                         Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr, next_best_distr,
                               dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
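
common.distr_projection implements the C51 Bellman projection in vectorized form. Below is a readability-oriented, loop-based sketch of the same idea (not the repo's implementation): each shifted atom is spread onto its two nearest neighbours, and terminal transitions collapse onto the clipped reward.

import numpy as np

def project_distribution(next_probs, rewards, dones, Vmin, Vmax, n_atoms, gamma):
    delta_z = (Vmax - Vmin) / (n_atoms - 1)
    support = np.linspace(Vmin, Vmax, n_atoms)
    proj = np.zeros((len(rewards), n_atoms), dtype=np.float32)
    for i in range(len(rewards)):
        if dones[i]:
            # terminal: all probability mass sits on the (clipped) reward
            b = (min(Vmax, max(Vmin, rewards[i])) - Vmin) / delta_z
            lo, hi = int(np.floor(b)), int(np.ceil(b))
            proj[i, lo] += (hi - b) if lo != hi else 1.0
            if lo != hi:
                proj[i, hi] += b - lo
            continue
        for j in range(n_atoms):
            tz = min(Vmax, max(Vmin, rewards[i] + gamma * support[j]))
            b = (tz - Vmin) / delta_z
            lo, hi = int(np.floor(b)), int(np.ceil(b))
            if lo == hi:
                proj[i, lo] += next_probs[i, j]
            else:
                proj[i, lo] += next_probs[i, j] * (hi - b)
                proj[i, hi] += next_probs[i, j] * (b - lo)
    return proj
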
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    done_mask = torch.ByteTensor(dones)
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        states_v = states_v.cuda()
        next_states_v = next_states_v.cuda()
        actions_v = actions_v.cuda()
        rewards_v = rewards_v.cuda()
        done_mask = done_mask.cuda()
        batch_weights_v = batch_weights_v.cuda()

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values.volatile = False

    expected_state_action_values = next_state_values * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values)**2
    return losses_v.mean(), losses_v + 1e-5
Example #11
def calc_loss_rainbow(batch,
                      batch_weights,
                      net,
                      tgt_net,
                      gamma,
                      device="cpu",
                      double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_values = net(states_v).gather(1, actions_v)
    state_action_values = state_action_values.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            next_state_actions = net(next_states_v).max(1)[1]
            next_state_actions = next_state_actions.unsqueeze(-1)
            next_state_values = tgt_net(next_states_v).gather(
                1, next_state_actions).squeeze(-1)
        else:
            next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = \
            next_state_values.detach() * gamma + rewards_v
    losses_v = (state_action_values - expected_state_action_values)**2
    losses_v *= batch_weights_v
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_actions]
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr, next_best_distr, dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
Example #13
def grads_func(proc_name, net, cuda, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], cuda=cuda, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)

    with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA**REWARD_STEPS, cuda=cuda)
                batch.clear()

                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v, vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(GRAD_BATCH), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # gather gradients
                nn_utils.clip_grad_norm(net.parameters(), CLIP_GRAD)
                grads = [param.grad.data.cpu().numpy() if param.grad is not None else None
                         for param in net.parameters()]
                train_queue.put(grads)

    train_queue.put(None)
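
A minimal self-contained sketch (assumed names, dummy gradients) of what the central process does with each gradient list pulled from train_queue: copy the arrays into its own parameters' .grad fields and take one optimizer step.

import numpy as np
import torch
import torch.nn as nn

net = nn.Linear(4, 2)                                   # stand-in for the shared model
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

grads = [np.ones(p.shape, dtype=np.float32) for p in net.parameters()]  # as if from a worker

optimizer.zero_grad()
for param, grad in zip(net.parameters(), grads):
    if grad is not None:
        param.grad = torch.from_numpy(grad)             # shapes/dtypes match the parameters
optimizer.step()
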
Example #14
def calc_loss_qr(batch, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    done_indices = np.where(dones)[0]
    batch_size = len(batch)

    states_v = Variable(torch.from_numpy(states))
    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
    actions_v = Variable(torch.from_numpy(actions))
    rewards_v = Variable(torch.from_numpy(rewards))
    tau_hat_v = Variable(torch.range(0.0, 1.0 - 1 / QUANT_N,
                                     1 / QUANT_N)) + 0.5 / QUANT_N
    if cuda:
        states_v = states_v.cuda(non_blocking=True)
        next_states_v = next_states_v.cuda(non_blocking=True)
        actions_v = actions_v.cuda(non_blocking=True)
        rewards_v = rewards_v.cuda(non_blocking=True)
        tau_hat_v = tau_hat_v.cuda(non_blocking=True)

    next_quant_v = tgt_net(next_states_v)
    best_actions_v = tgt_net.qvals_from_quant(next_quant_v).max(1)[1]
    best_next_quant_v = next_quant_v[range(batch_size), best_actions_v.data]
    if dones.any():
        done_indices_v = torch.from_numpy(done_indices)
        if cuda:
            done_indices_v = done_indices_v.cuda()
        best_next_quant_v[done_indices_v] = 0.0
    best_next_quant_v.volatile = False
    expected_quant_v = best_next_quant_v * gamma + rewards_v.unsqueeze(-1)
    quant_v = net(states_v)[range(batch_size), actions_v.data]

    _, quant_idx = torch.sort(quant_v, dim=1, descending=False)
    tau = []
    for idx in range(batch_size):
        tau.append(tau_hat_v[quant_idx[idx]])
    tau_hat_v = torch.stack(tau)

    u = expected_quant_v - quant_v
    abs_u = u.abs()
    clamp_u = torch.clamp(abs_u, 0.0, HUBER_K)
    huber_loss = HUBER_K * (abs_u - clamp_u) + 0.5 * clamp_u**2

    # mask_small_u = (abs_u <= HUBER_K).float()
    # huber_loss = mask_small_u * 0.5 * (u ** 2)
    # huber_loss = huber_loss + (1 - mask_small_u) * HUBER_K * (abs_u - HUBER_K / 2)

    huber_mul = torch.abs(tau_hat_v - (u < 0).float())
    #    huber_mul = tau_hat_v
    #    huber_mul = 1
    final_loss = huber_mul * huber_loss
    return final_loss.sum() / QUANT_N
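
The core of the quantile regression loss above, restated as a self-contained sketch on dummy tensors (QUANT_N and HUBER_K are stand-ins): the Huber term is scaled by |tau_hat - 1{u<0}|, so over- and under-estimation are penalised asymmetrically per quantile.

import torch

QUANT_N, HUBER_K = 5, 1.0
tau_hat = (torch.arange(QUANT_N, dtype=torch.float32) + 0.5) / QUANT_N   # quantile midpoints

predicted = torch.randn(3, QUANT_N)   # theta_i(s, a) for a batch of 3
target = torch.randn(3, QUANT_N)      # r + gamma * theta_i(s', a*)

u = target - predicted                               # per-quantile TD error
abs_u = u.abs()
clamped = abs_u.clamp(max=HUBER_K)
huber = HUBER_K * (abs_u - clamped) + 0.5 * clamped ** 2
loss = (torch.abs(tau_hat - (u < 0).float()) * huber).sum(dim=1).mean()
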
def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu", cuda_async=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device, non_blocking=cuda_async)
    next_states_v = torch.tensor(next_states).to(device, non_blocking=cuda_async)
    actions_v = torch.tensor(actions).to(device, non_blocking=cuda_async)
    rewards_v = torch.tensor(rewards).to(device, non_blocking=cuda_async)
    done_mask = torch.BoolTensor(dones).to(device, non_blocking=cuda_async)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
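
The target network used by this loss is refreshed periodically. A plain-PyTorch sketch of that hard sync (the book's examples wrap the same idea in ptan's target-net helper, whose exact API is not shown here):

import torch.nn as nn

net = nn.Linear(8, 4)          # stand-in online network
tgt_net = nn.Linear(8, 4)      # stand-in target network

SYNC_EVERY_FRAMES = 1000
for frame_idx in range(1, 3001):
    # ... one environment step and one calc_loss_dqn(...) update would go here ...
    if frame_idx % SYNC_EVERY_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())   # hard copy of the online weights
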
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2
    return losses_v.mean(), losses_v + 1e-5
Example #17
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = Variable(torch.from_numpy(states))
    actions_v = Variable(torch.from_numpy(actions))
    next_states_v = Variable(torch.from_numpy(next_states))
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        states_v = states_v.cuda()
        actions_v = actions_v.cuda()
        next_states_v = next_states_v.cuda()
        batch_weights_v = batch_weights_v.cuda()

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net

    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin,
                                         Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values)
    proj_distr_v = Variable(torch.from_numpy(proj_distr))
    if cuda:
        proj_distr_v = proj_distr_v.cuda()

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    if double:
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    else:
        next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
Example #19
def calc_loss(batch, batch_weights, net, target_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    dones_mask_v = torch.ByteTensor(dones).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    predicted_Q_v = net(states_v).gather(1,
                                         actions_v.unsqueeze(-1)).squeeze(-1)

    next_state_Q_v = target_net(next_states_v).max(1)[0]
    # use the dones_mask to 0 out values where last_state is none
    next_state_Q_v[dones_mask_v] = 0.0
    expected_Q_v = next_state_Q_v.detach() * gamma + rewards_v
    losses_v = batch_weights_v * (predicted_Q_v - expected_Q_v)**2

    return losses_v.mean(), losses_v + 1e-5
def calc_loss_prio(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values.detach(
        ) * gamma + rewards_v
    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values)**2
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        next_s_vals = tgt_net(next_states_v).max(1)[0]
        next_s_vals[done_mask] = 0.0
        exp_sa_vals = next_s_vals.detach() * gamma + rewards_v
    l = (state_action_vals - exp_sa_vals)**2
    losses_v = batch_weights_v * l
    return losses_v.mean(), \
           (losses_v + 1e-5).data.cpu().numpy()
Example #22
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net

    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin,
                                         Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net

    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
Example #24
                if step_idx > CUT_DEMO_PROB_FRAMES:
                    DEMO_PROB = 0.01
                if demo_samples and random.random() < DEMO_PROB:
                    random.shuffle(demo_samples)
                    demo_batch = demo_samples[:BATCH_SIZE]
                    model_vnc.train_demo(net,
                                         optimizer,
                                         demo_batch,
                                         writer,
                                         step_idx,
                                         preprocessor=preprocessor,
                                         device=device)

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        device=device, states_preprocessor=preprocessor)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE),
                                                        actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v *
                eta_current = buffer.get_eta(ETA_INIT, ETA_FINAL, ETA_BASELINE_EPOCH, ETA_AVG_SIZE)
                ck_list = buffer.get_cks(ep_len, eta_current)

                for k in range(ep_len):
                    c_k = ck_list[k]
                    if c_k < C_MIN:
                        c_k = C_MIN

                    if withPrio:
                        batch, batch_indices, batch_weights = buffer.sample(c_k, BATCH_SIZE, beta)
                        batch_weights_v = torch.from_numpy(batch_weights).to(device)
                    else:
                        batch = buffer.sample(c_k, BATCH_SIZE)
                        batch_weights_v = torch.from_numpy(np.array(1, dtype=np.float32)).to(device)
                    states_v, actions_v, ref_q_v = \
                        common.unpack_batch(batch, tgt_twinq_net.target_model,
                                            agent, GAMMA ** REWARD_STEPS, device, munchausen=MUNCHAUSEN)

                    with torch.no_grad():
                        ref_q += ref_q_v.mean()

                    # TwinQ
                    q1_v, q2_v = twinq_net(states_v, actions_v)
                    q1_loss_v = batch_weights_v * (q1_v.squeeze() - ref_q_v.detach()).pow(2)
                    q2_loss_v = batch_weights_v * (q2_v.squeeze() - ref_q_v.detach()).pow(2)
                    if withPrio:
                        sample_prios_v = 0.5 * (q1_loss_v + q2_loss_v) + 1e-5
                    q1_loss_v = q1_loss_v.mean()
                    q2_loss_v = q2_loss_v.mean()
                    with torch.no_grad():
                        q1_loss += q1_loss_v
                        q2_loss += q2_loss_v
Example #26
                            torch.save(
                                net.state_dict(),
                                './checkpoints/' + args.name + "-best.dat")
                        if finished:
                            break
                        continue

                    step_idx += 1
                    # keep receiving data until one batch is full
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    # When a full batch, perform a policy update
                    states_v, actions_t, q_vals_v = \
                        common.unpack_batch(batch, net, last_val_gamma=GAMMA**BELLMAN_STEPS, device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    logits_v, value_v = net(states_v)

                    loss_value_v = F.mse_loss(value_v.squeeze(-1), q_vals_v)

                    log_prob_v = F.log_softmax(logits_v, dim=1)
                    adv_v = q_vals_v - value_v.detach(
                    )  # calculate advantage = Q(s,a) - V(s)
                    log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE),
                                                            actions_t]
                    loss_policy_v = -log_prob_actions_v.mean()

                    # add an entropy bonus to the loss function, it is negative so will reduce loss
Example #27
                    if mean_reward is not None:
                        if best_reward is None or mean_reward > best_reward:
                            if best_reward is not None:
                                name = "best_%.3f_%d" % (mean_reward, step_idx)
                                fname = os.path.join(saves_path, name)
                                torch.save(net.state_dict(), fname + ".dat")
                                preprocessor.save(fname + ".pre")
                                print("Best reward updated: %.3f -> %.3f" %
                                      (best_reward, mean_reward))
                            best_reward = mean_reward
                batch.append(exp)
                if len(batch) < BATCH_SIZE:
                    continue

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        cuda=args.cuda, states_preprocessor=preprocessor)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v, vals_ref_v)

                log_prob_v = F.log_softmax(logits_v)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE),
                                                        actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v)
                entropy_loss_v = ENTROPY_BETA * (prob_v *
Example #28
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon)

            # Don't train while filling memory
            if len(buffer) < rep_init:
                continue

            if frame_idx % UPDATE_FREQ == 0:
                # Generate and unpack batch
                batch = buffer.sample(batch_size)
                s, a, r, t, s2 = common.unpack_batch(batch)
                a_1h = np.zeros((batch_size, env.action_space.n), dtype=np.float32)
                a_1h[np.arange(batch_size), a] = 1.0
                # Train
                sess.run(train_step, feed_dict={state : s,
                                                action: a_1h,
                                                reward: r,
                                                done  : t,
                                                state2: s2})

            # Copy current network to target network
            if frame_idx % (UPDATE_FREQ * tgt_net_sync) == 0:
                sync_nets.run()

            # Save current network every 250,000 ATARI frames
            if frame_idx % 62500 == 0:
                            break
                        continue

                    # what's in the queue is an exp_source entry, not a reward (the episode has ended)
                    step_idx += 1
                    batch.append(train_entry)

                    # wait until the batch has filled up
                    if len(batch) < BATCH_SIZE:
                        continue

                    # once the batch is full
                    #gamma**4
                    states_v, actions_t, vals_ref_v = common.unpack_batch(
                        batch,
                        net,
                        last_val_gamma=GAMMA**REWARD_STEPS,
                        device=device)

                    batch.clear()
                    """최적화"""
                    optimizer.zero_grad()

                    logits_v, value_v = net(states_v)

                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    log_prob_v = F.log_softmax(logits_v, dim=1)

                    adv_v = vals_ref_v - value_v.detach()
Example #30
                batch.append(exp)
                if len(batch) < BATCH_SIZE:
                    continue

                if step_idx > CUT_DEMO_PROB_FRAMES:
                    DEMO_PROB = 0.01

                if demo_samples and random.random() < DEMO_PROB:
                    random.shuffle(demo_samples)
                    demo_batch = demo_samples[:BATCH_SIZE]
                    model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx,
                                         preprocessor=ptan.agent.default_states_preprocessor,
                                         cuda=args.cuda)

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        cuda=args.cuda)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v, vals_ref_v)

                log_prob_v = F.log_softmax(logits_v)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
        with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
            with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, step_idx):
                            break
                        continue

                    step_idx += 1
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    states_v, actions_t, vals_ref_v = \
                        common.unpack_batch(batch, net, last_val_gamma=GAMMA**REWARD_STEPS, device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    logits_v, value_v = net(states_v)

                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    log_prob_v = F.log_softmax(logits_v, dim=1)
                    adv_v = vals_ref_v - value_v.detach()
                    log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
                    loss_policy_v = -log_prob_actions_v.mean()

                    prob_v = F.softmax(logits_v, dim=1)
                    entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
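
A quick self-contained check of the entropy bonus convention used in these A3C snippets: (prob * log_prob).sum(dim=1) is the negative entropy, so adding ENTROPY_BETA times it to the loss pushes the policy toward higher entropy (more exploration).

import torch
import torch.nn.functional as F

ENTROPY_BETA = 0.01                              # stand-in coefficient
logits = torch.randn(4, 6)                       # batch of 4 states, 6 actions
log_prob = F.log_softmax(logits, dim=1)
prob = F.softmax(logits, dim=1)
entropy_loss = ENTROPY_BETA * (prob * log_prob).sum(dim=1).mean()   # always <= 0
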
Example #32
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards:
                    finished, save_checkpoint = tracker.reward(
                        new_rewards[0], step_idx)
                    if save_checkpoint:
                        torch.save(net.state_dict(),
                                   './checkpoints/' + args.name + "-best.dat")
                    if finished:
                        break

                if len(batch) < BATCH_SIZE:
                    continue

                states_v, actions_t, q_vals_v = common.unpack_batch(
                    batch,
                    net,
                    last_val_gamma=GAMMA**BELLMAN_STEPS,
                    device=device)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), q_vals_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = q_vals_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE),
                                                        actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
Example #33
            with ptan.common.utils.TBMeanTracker(writer,
                                                 batch_size=100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, step_idx):
                            break
                        continue

                    step_idx += 1
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    states_v, actions_t, vals_ref_v = \
                        common.unpack_batch(batch, net, last_val_gamma=GAMMA**REWARD_STEPS, device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    logits_v, value_v = net(states_v)

                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    log_prob_v = F.log_softmax(logits_v, dim=1)
                    adv_v = vals_ref_v - value_v.detach()
                    log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE),
                                                            actions_t]
                    loss_policy_v = -log_prob_actions_v.mean()

                    prob_v = F.softmax(logits_v, dim=1)
                    entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(