# Training-loop fragment: checkpoint the best model, occasionally train
# on human demonstrations, then run an A2C update on the collected batch.
fname = os.path.join(saves_path, name)
torch.save(net.state_dict(), fname)
print("Best reward updated: %.3f -> %.3f" % (
    best_reward, mean_reward))
best_reward = mean_reward

batch.append(exp)
if len(batch) < BATCH_SIZE:
    continue

# after CUT_DEMO_PROB_FRAMES steps, demonstrations are injected only rarely
if step_idx > CUT_DEMO_PROB_FRAMES:
    DEMO_PROB = 0.01

# with probability DEMO_PROB, do one supervised step on demonstration data
if demo_samples and random.random() < DEMO_PROB:
    random.shuffle(demo_samples)
    demo_batch = demo_samples[:BATCH_SIZE]
    model_vnc.train_demo(
        net, optimizer, demo_batch, writer,
        step_idx, device=device)

# convert the n-step transitions into training tensors
states_v, actions_t, vals_ref_v = \
    common.unpack_batch(
        batch, net, device=device,
        last_val_gamma=GAMMA ** REWARD_STEPS)
batch.clear()

# A2C update: MSE value loss plus policy-gradient loss
optimizer.zero_grad()
logits_v, value_v = net(states_v)
loss_value_v = F.mse_loss(
    value_v.squeeze(-1), vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.squeeze(-1).detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
print("Best reward updated: %.3f -> %.3f" % (best_reward, mean_reward)) best_reward = mean_reward batch.append(exp) if len(batch) < BATCH_SIZE: continue if step_idx > CUT_DEMO_PROB_FRAMES: DEMO_PROB = 0.01 if demo_samples and random.random() < DEMO_PROB: random.shuffle(demo_samples) demo_batch = demo_samples[:BATCH_SIZE] model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx, preprocessor=preprocessor, device=device) states_v, actions_t, vals_ref_v = \ common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS, device=device, states_preprocessor=preprocessor) batch.clear() optimizer.zero_grad() logits_v, value_v = net(states_v) loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v) log_prob_v = F.log_softmax(logits_v, dim=1)
fname = os.path.join(saves_path, name) torch.save(net.state_dict(), fname + ".dat") preprocessor.save(fname + ".pre") print("Best reward updated: %.3f -> %.3f" % (best_reward, mean_reward)) best_reward = mean_reward batch.append(exp) if len(batch) < BATCH_SIZE: continue if step_idx > CUT_DEMO_PROB_FRAMES: DEMO_PROB = 0.01 if demo_samples and random.random() < DEMO_PROB: random.shuffle(demo_samples) demo_batch = demo_samples[:BATCH_SIZE] model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx, preprocessor=preprocessor, cuda=args.cuda) states_v, actions_t, vals_ref_v = \ common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS, cuda=args.cuda, states_preprocessor=preprocessor) batch.clear() optimizer.zero_grad() logits_v, value_v = net(states_v) loss_value_v = F.mse_loss(value_v, vals_ref_v) log_prob_v = F.log_softmax(logits_v) adv_v = vals_ref_v - value_v.detach() log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
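Both variants rely on common.unpack_batch to turn the collected transitions into training tensors, bootstrapping the value of non-terminal rollout tails with GAMMA ** REWARD_STEPS. A hedged sketch of what it might look like, assuming ptan-style experience items with state, action, reward and last_state fields (last_state is None when the episode ended inside the rollout):

import numpy as np
import torch

def unpack_batch(batch, net, last_val_gamma, device="cpu",
                 states_preprocessor=None):
    states, actions, rewards = [], [], []
    not_done_idx, last_states = [], []
    for idx, exp in enumerate(batch):
        states.append(exp.state)
        actions.append(int(exp.action))
        rewards.append(exp.reward)       # already discounted over REWARD_STEPS
        if exp.last_state is not None:   # episode continued past the rollout
            not_done_idx.append(idx)
            last_states.append(exp.last_state)

    if states_preprocessor is not None:
        states_v = states_preprocessor(states).to(device)
    else:
        states_v = torch.FloatTensor(np.asarray(states)).to(device)
    actions_t = torch.LongTensor(actions).to(device)

    # bootstrap: add gamma^N * V(last_state) to the n-step returns
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        if states_preprocessor is not None:
            last_states_v = states_preprocessor(last_states).to(device)
        else:
            last_states_v = torch.FloatTensor(
                np.asarray(last_states)).to(device)
        last_vals_v = net(last_states_v)[1]
        rewards_np[not_done_idx] += last_val_gamma * \
            last_vals_v.squeeze(-1).data.cpu().numpy()
    vals_ref_v = torch.FloatTensor(rewards_np).to(device)
    return states_v, actions_t, vals_ref_v

The returned vals_ref_v then serves as both the regression target for the value head and, after subtracting the predicted values, the advantage estimate for the policy-gradient term in the loop above.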