Example #1
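Example #1 uses the cuda=args.cuda flag and preprocesses demonstration batches with ptan.agent.default_states_preprocessor; when the mean reward improves, the network weights are checkpointed:

                                # New best mean reward reached: save the network weights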
                                fname = os.path.join(saves_path, name)
                                torch.save(net.state_dict(), fname)
                                print("Best reward updated: %.3f -> %.3f" % (best_reward, mean_reward))
                            best_reward = mean_reward
                batch.append(exp)
                if len(batch) < BATCH_SIZE:
                    continue

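                # After CUT_DEMO_PROB_FRAMES environment steps, lower the probability of demo training to 1%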
                if step_idx > CUT_DEMO_PROB_FRAMES:
                    DEMO_PROB = 0.01

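                # With probability DEMO_PROB, run a supervised training step on a batch of demonstration samples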
                if demo_samples and random.random() < DEMO_PROB:
                    random.shuffle(demo_samples)
                    demo_batch = demo_samples[:BATCH_SIZE]
                    model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx,
                                         preprocessor=ptan.agent.default_states_preprocessor,
                                         cuda=args.cuda)

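                # Unpack the accumulated transitions into tensors; for chains that did not
                # finish the episode, the last state's value is discounted by GAMMA ** REWARD_STEPS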
                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        cuda=args.cuda)
                batch.clear()

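                # A2C update: MSE loss for the value head plus a policy-gradient term
                # weighted by the advantage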
                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.squeeze(-1).detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
Example #2
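Example #2 is the same loop using the device=device argument instead of the cuda flag; the value head is squeezed before the MSE loss and log_softmax() is given an explicit dim=1: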
                                print("Best reward updated: %.3f "
                                      "-> %.3f" % (
                                    best_reward, mean_reward))
                            best_reward = mean_reward
                batch.append(exp)
                if len(batch) < BATCH_SIZE:
                    continue

                if step_idx > CUT_DEMO_PROB_FRAMES:
                    DEMO_PROB = 0.01

                if demo_samples and random.random() < DEMO_PROB:
                    random.shuffle(demo_samples)
                    demo_batch = demo_samples[:BATCH_SIZE]
                    model_vnc.train_demo(
                        net, optimizer, demo_batch, writer,
                        step_idx, device=device)

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(
                        batch, net, device=device,
                        last_val_gamma=GAMMA ** REWARD_STEPS)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(
                    value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
Example #3
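Example #3 passes a state preprocessor to train_demo() and, as states_preprocessor, to unpack_batch(), together with device=device: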
                                print("Best reward updated: %.3f -> %.3f" %
                                      (best_reward, mean_reward))
                            best_reward = mean_reward
                batch.append(exp)
                if len(batch) < BATCH_SIZE:
                    continue

                if step_idx > CUT_DEMO_PROB_FRAMES:
                    DEMO_PROB = 0.01
                if demo_samples and random.random() < DEMO_PROB:
                    random.shuffle(demo_samples)
                    demo_batch = demo_samples[:BATCH_SIZE]
                    model_vnc.train_demo(net,
                                         optimizer,
                                         demo_batch,
                                         writer,
                                         step_idx,
                                         preprocessor=preprocessor,
                                         device=device)

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        device=device, states_preprocessor=preprocessor)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
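
A further variant of the same loop checkpoints both the network weights (.dat) and the preprocessor state (.pre) when the best reward is updated, and combines the preprocessor with the cuda=args.cuda flag:
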
                                fname = os.path.join(saves_path, name)
                                torch.save(net.state_dict(), fname + ".dat")
                                preprocessor.save(fname + ".pre")
                                print("Best reward updated: %.3f -> %.3f" % (best_reward, mean_reward))
                            best_reward = mean_reward
                batch.append(exp)
                if len(batch) < BATCH_SIZE:
                    continue

                if step_idx > CUT_DEMO_PROB_FRAMES:
                    DEMO_PROB = 0.01
                if demo_samples and random.random() < DEMO_PROB:
                    random.shuffle(demo_samples)
                    demo_batch = demo_samples[:BATCH_SIZE]
                    model_vnc.train_demo(net, optimizer, demo_batch, writer, step_idx,
                                         preprocessor=preprocessor,
                                         cuda=args.cuda)

                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        cuda=args.cuda, states_preprocessor=preprocessor)
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.squeeze(-1).detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]