writer.add_scalar("test_steps", steps, step_idx) if best_reward is None or best_reward < rewards: if best_reward is not None: print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) name = "best_%+.3f_%d.dat" % (rewards, step_idx) fname = os.path.join(save_path, name) torch.save(net_act.state_dict(), fname) best_reward = rewards batch.append(exp) if len(batch) < BATCH_SIZE: continue states_v, actions_v, vals_ref_v = \ common.unpack_batch_a2c(batch, net_crt, last_val_gamma=GAMMA ** REWARD_STEPS, device=device) batch.clear() opt_crt.zero_grad() value_v = net_crt(states_v) loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v) loss_value_v.backward() opt_crt.step() opt_act.zero_grad() mu_v = net_act(states_v) adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach() log_prob_v = adv_v * calc_logprob(mu_v, net_act.logstd, actions_v) loss_policy_v = -log_prob_v.mean() entropy_loss_v = ENTROPY_BETA * (
writer.add_scalar("test_steps", steps, step_idx) if best_reward is None or best_reward < rewards: if best_reward is not None: print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) name = "best_%+.3f_%d.dat" % (rewards, step_idx) fname = os.path.join(save_path, name) torch.save(net.state_dict(), fname) best_reward = rewards batch.append(exp) if len(batch) < BATCH_SIZE: continue states_v, actions_v, vals_ref_v = \ common.unpack_batch_a2c(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS, cuda=args.cuda) batch.clear() optimizer.zero_grad() mu_v, var_v, value_v = net(states_v) loss_value_v = F.mse_loss(value_v, vals_ref_v) adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach() log_prob_v = adv_v * calc_logprob(mu_v, var_v, actions_v) loss_policy_v = -log_prob_v.mean() entropy_loss_v = ENTROPY_BETA * ( -(torch.log(2 * math.pi * var_v) + 1) / 2).mean() loss_v = loss_policy_v + entropy_loss_v + loss_value_v loss_v.backward()
writer.add_scalar("test_reward", rewards, step_idx) writer.add_scalar("test_steps", steps, step_idx) if best_reward is None or best_reward < rewards: if best_reward is not None: print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) name = "best_%+.3f_%d.dat" % (rewards, step_idx) fname = os.path.join(save_path, name) torch.save(net_act.state_dict(), fname) best_reward = rewards batch.append(exp) if len(batch) < BATCH_SIZE: continue states_v, actions_v, vals_ref_v = \ common.unpack_batch_a2c(batch, net_crt, last_val_gamma=GAMMA ** REWARD_STEPS, device=device) batch.clear() opt_crt.zero_grad() value_v = net_crt(states_v) loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v) loss_value_v.backward() opt_crt.step() opt_act.zero_grad() mu_v = net_act(states_v) adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach() log_prob_v = adv_v * calc_logprob(mu_v, net_act.logstd, actions_v) loss_policy_v = -log_prob_v.mean() entropy_loss_v = ENTROPY_BETA * (-(torch.log(2*math.pi*torch.exp(net_act.logstd)) + 1)/2).mean() loss_v = loss_policy_v + entropy_loss_v
name = "best_%+.3f_%d.dat" % (rewards, step_idx) fname = os.path.join(save_path, name) torch.save(net.state_dict(), fname) best_reward = rewards batch.append(exp) if len(batch) < BATCH_SIZE: continue states_v, actions_v, vals_ref_v = common.unpack_batch_a2c( batch, net, device=device, last_val_gamma=GAMMA**REWARD_STEPS) batch.clear() optimizer.zero_grad() mu_v, var_v, value_v = net(states_v) loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v) adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach() log_prob_v = adv_v * calc_logprob(mu_v, var_v, actions_v) loss_policy_v = -log_prob_v.mean() ent_v = -(torch.log(2 * math.pi * var_v) + 1) / 2 entropy_loss_v = ENTROPY_BETA * ent_v.mean()
writer.add_scalar("test_steps", steps, step_idx) if best_reward is None or best_reward < rewards: if best_reward is not None: print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) name = "best_%+.3f_%d.dat" % (rewards, step_idx) fname = os.path.join(save_path, name) torch.save(net.state_dict(), fname) best_reward = rewards batch.append(exp) if len(batch) < BATCH_SIZE: continue optimizer.zero_grad() states_v, actions_v, vals_ref_v = common.unpack_batch_a2c( batch, net, GAMMA**REWARD_STEPS, device=device) mu_v, var_v, values_v = net(states_v) loss_value_v = F.mse_loss(values_v.squeeze(-1), vals_ref_v) adv_v = vals_ref_v.unsqueeze(dim=-1) - values_v.detach() loss_policy_v = -(adv_v * log_gaussian_policy( mu_v, var_v, actions_v)).mean() entropy_loss_v = ENTROPY_BETA * \ (- entropy_gaussian(var_v)).mean() loss_v = loss_value_v + loss_policy_v + entropy_loss_v loss_v.backward()