def calc_loss_double_dqn(batch, net, tgt_net, gamma,
                         device="cpu", double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            # Double DQN: the online net selects the action,
            # the target net evaluates it
            next_state_acts = net(next_states_v).max(1)[1]
            next_state_acts = next_state_acts.unsqueeze(-1)
            next_state_vals = tgt_net(next_states_v).gather(
                1, next_state_acts).squeeze(-1)
        else:
            # plain DQN: the target net both selects and evaluates
            next_state_vals = tgt_net(next_states_v).max(1)[0]
        next_state_vals[done_mask] = 0.0
        exp_sa_vals = next_state_vals.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_vals, exp_sa_vals)
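A minimal, self-contained illustration of the target selection above, run on dummy linear networks (the nets, sizes, and tensors here are stand-ins for illustration, not the project's models): Double DQN picks the argmax action with the online net but reads its value from the target net, while plain DQN does both with the target net.

import torch
import torch.nn as nn

obs_size, n_actions, batch_size = 4, 3, 8
net = nn.Linear(obs_size, n_actions)      # stand-in online network
tgt_net = nn.Linear(obs_size, n_actions)  # stand-in target network

next_states_v = torch.randn(batch_size, obs_size)
with torch.no_grad():
    # Double DQN: online net chooses the action ...
    next_acts = net(next_states_v).max(1)[1].unsqueeze(-1)
    # ... the target net evaluates it, reducing overestimation bias
    double_vals = tgt_net(next_states_v).gather(1, next_acts).squeeze(-1)
    # plain DQN: target net both chooses and evaluates
    plain_vals = tgt_net(next_states_v).max(1)[0]
print(double_vals.shape, plain_vals.shape)  # torch.Size([8]) torch.Size([8])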
def calc_loss_rainbow(batch, batch_weights, net, tgt_net, gamma,
                      device="cpu", double=True):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_values = net(states_v).gather(1, actions_v)
    state_action_values = state_action_values.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        if double:
            next_state_actions = net(next_states_v).max(1)[1]
            next_state_actions = next_state_actions.unsqueeze(-1)
            next_state_values = tgt_net(next_states_v).gather(
                1, next_state_actions).squeeze(-1)
        else:
            next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = \
            next_state_values.detach() * gamma + rewards_v
    losses_v = (state_action_values - expected_state_action_values) ** 2
    losses_v *= batch_weights_v
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_acts = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v)
    next_distr = next_distr.data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_acts]
    dones = dones.astype(bool)

    # project the best next-state distribution onto the support,
    # shifting and scaling it by the reward and discount factor
    proj_distr = dqn_extra.distr_projection(
        next_best_distr, rewards, dones, gamma)

    distr_v = net(states_v)
    sa_vals = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(sa_vals, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    # cross-entropy between the projected target and predicted distributions
    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
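For context on how tgt_net.both() can return both a distribution and Q-values, here is a hedged sketch (the atom count and value range are assumptions, not necessarily dqn_extra's constants): each action's Q-value is the probability-weighted sum over a fixed support of value atoms.

import torch
import torch.nn.functional as F

N_ATOMS, V_MIN, V_MAX = 51, -10.0, 10.0          # assumed support layout
support = torch.linspace(V_MIN, V_MAX, N_ATOMS)  # fixed value atoms

logits = torch.randn(8, 6, N_ATOMS)              # batch=8, 6 actions
probs = F.softmax(logits, dim=2)                 # per-action distributions
q_vals = (probs * support).sum(dim=2)            # collapse atoms -> Q-values
print(q_vals.shape)                              # torch.Size([8, 6])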
def calc_loss_dqn(batch, net, tgt_net, gamma,
                  device="cpu", cuda_async=False):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device, non_blocking=cuda_async)
    next_states_v = torch.tensor(next_states).to(
        device, non_blocking=cuda_async)
    actions_v = torch.tensor(actions).to(device, non_blocking=cuda_async)
    rewards_v = torch.tensor(rewards).to(device, non_blocking=cuda_async)
    done_mask = torch.BoolTensor(dones).to(device, non_blocking=cuda_async)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0

    expected_state_action_values = \
        next_state_values.detach() * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = \
        common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)
    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(device)
        next_s_vals = tgt_net(next_states_v).max(1)[0]
        next_s_vals[done_mask] = 0.0
        exp_sa_vals = next_s_vals.detach() * gamma + rewards_v
    sq_errors_v = (state_action_vals - exp_sa_vals) ** 2
    losses_v = batch_weights_v * sq_errors_v
    # mean weighted loss for backprop, per-sample losses as new priorities
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
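The second return value only becomes useful once it is fed back into the replay buffer. Below is a hedged usage sketch; the buffer object and its sample()/update_priorities() method names are assumptions for illustration, not a specific library API.

def train_step(buffer, net, tgt_net, optimizer, gamma, beta, device="cpu"):
    # hypothetical buffer API: sample() returns the transitions together
    # with their indices and importance-sampling weights
    batch, batch_indices, batch_weights = buffer.sample(32, beta)
    optimizer.zero_grad()
    loss_v, sample_prios = calc_loss(
        batch, batch_weights, net, tgt_net, gamma, device=device)
    loss_v.backward()
    optimizer.step()
    # transitions with large TD errors get sampled more often in the future
    buffer.update_priorities(batch_indices, sample_prios)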
def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(
        lambda x: net(x)[0], device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    micro_batch = []
    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            data = TotalReward(reward=np.mean(new_rewards))
            train_queue.put(data)

        micro_batch.append(exp)
        if len(micro_batch) < MICRO_BATCH_SIZE:
            continue

        data = common.unpack_batch(
            micro_batch, net, device=device,
            last_val_gamma=GAMMA ** REWARD_STEPS)
        train_queue.put(data)
        micro_batch.clear()
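A hedged sketch of the parent-process side that could consume train_queue: child processes run data_func, and each queue entry is either a TotalReward record or an already-unpacked micro-batch. PROCESSES_COUNT and make_net() are assumed names, and the loss/optimizer step is elided.

import torch.multiprocessing as mp

PROCESSES_COUNT = 4  # assumed number of data-gathering processes

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    net = make_net()          # assumed constructor for the shared model
    net.share_memory()
    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []
    for _ in range(PROCESSES_COUNT):
        p = mp.Process(target=data_func, args=(net, "cpu", train_queue))
        p.start()
        data_proc_list.append(p)

    while True:
        train_entry = train_queue.get()
        if isinstance(train_entry, TotalReward):
            # an episode finished: log the reward, check the solve boundary
            continue
        states_v, actions_t, vals_ref_v = train_entry
        # ... usual A2C loss computation and optimizer step go here ...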
random.shuffle(demo_samples)
demo_batch = demo_samples[:BATCH_SIZE]
model_vnc.train_demo(
    net, optimizer, demo_batch, writer, step_idx,
    preprocessor=preprocessor, device=device)

states_v, actions_t, vals_ref_v = common.unpack_batch(
    batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
    device=device, states_preprocessor=preprocessor)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()
def grads_func(proc_name, net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(
        lambda x: net(x)[0], device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)

    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue

                data = common.unpack_batch(
                    batch, net, device=device,
                    last_val_gamma=GAMMA ** REWARD_STEPS)
                states_v, actions_t, vals_ref_v = data
                batch.clear()

                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
                log_prob_actions_v = adv_v * log_p_a
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                ent = (prob_v * log_prob_v).sum(dim=1).mean()
                entropy_loss_v = ENTROPY_BETA * ent
                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # gather gradients
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                grads = [
                    param.grad.data.cpu().numpy()
                    if param.grad is not None else None
                    for param in net.parameters()
                ]
                train_queue.put(grads)

    train_queue.put(None)
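A hedged sketch of how the parent process might consume these gradient lists: packets are summed into a buffer and applied to the shared network every few entries. TRAIN_BATCH is an assumed constant; net, optimizer, device, CLIP_GRAD, nn_utils, and train_queue come from the surrounding script, and every parameter is assumed to have received a gradient.

TRAIN_BATCH = 2   # assumed: how many gradient packets to accumulate

grad_buffer = None
step_idx = 0
while True:
    train_entry = train_queue.get()
    if train_entry is None:   # a child hit the reward bound and stopped
        break
    step_idx += 1
    if grad_buffer is None:
        grad_buffer = train_entry
    else:
        for tgt_grad, grad in zip(grad_buffer, train_entry):
            tgt_grad += grad
    if step_idx % TRAIN_BATCH == 0:
        # copy the accumulated numpy gradients into the shared net and step;
        # clip again here since several packets were summed together
        for param, grad in zip(net.parameters(), grad_buffer):
            param.grad = torch.FloatTensor(grad).to(device)
        nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
        optimizer.step()
        grad_buffer = None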
if step_idx > CUT_DEMO_PROB_FRAMES:
    DEMO_PROB = 0.01

if demo_samples and random.random() < DEMO_PROB:
    random.shuffle(demo_samples)
    demo_batch = demo_samples[:BATCH_SIZE]
    model_vnc.train_demo(net, optimizer, demo_batch,
                         writer, step_idx, device=device)

states_v, actions_t, vals_ref_v = common.unpack_batch(
    batch, net, device=device,
    last_val_gamma=GAMMA ** REWARD_STEPS)
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.detach()
lpa = log_prob_v[range(BATCH_SIZE), actions_t]
log_prob_actions_v = adv_v * lpa
loss_policy_v = -log_prob_actions_v.mean()