Exemplo n.º 1
0
    def test_learning(self):
        """Smoke-test one epoch of on-policy teacher distillation.

        Builds a teacher policy (hidden sizes 200/100) and a smaller student
        policy (hidden sizes 190/90) over ``self.env``'s observation and
        action spaces, samples episodes with the student, and runs
        ``on_pol_teacher_distill.train`` once on the resulting trajectory.

        The test passes when the whole pipeline completes without raising;
        the value returned by ``train`` is not inspected.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space

        t_pol_net = PolNet(ob_space, ac_space, h1=200, h2=100)
        s_pol_net = PolNet(ob_space, ac_space, h1=190, h2=90)

        t_pol = GaussianPol(ob_space, ac_space, t_pol_net)
        s_pol = GaussianPol(ob_space, ac_space, s_pol_net)

        student_sampler = EpiSampler(self.env, s_pol, num_parallel=1)
        try:
            # Only the student is optimized; the teacher is passed to the
            # trainer without an optimizer of its own.
            optim_pol = torch.optim.Adam(s_pol.parameters(), 3e-4)

            epis = student_sampler.sample(s_pol, max_steps=32)

            traj = Traj()
            traj.add_epis(epis)

            traj = ef.compute_h_masks(traj)
            traj.register_epis()

            on_pol_teacher_distill.train(
                traj=traj,
                student_pol=s_pol,
                teacher_pol=t_pol,
                student_optim=optim_pol,
                epoch=1,
                batchsize=32)
        finally:
            # Drop the sampler even when sampling or training raises, so a
            # failing test does not skip the cleanup the original performed
            # only on success.
            # NOTE(review): presumably deleting the sampler lets EpiSampler
            # release its worker resources -- confirm against EpiSampler.
            del student_sampler
Exemplo n.º 2
0
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

# Build the state-value function: pick the LSTM-based network when
# ``args.rnn`` is set, otherwise the plain ``VNet``.
if args.rnn:
    vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(observation_space)
# ``args.rnn`` is forwarded as the third positional argument.
# NOTE(review): presumably this flags the value function as recurrent --
# confirm against DeterministicSVfunc's signature.
vf = DeterministicSVfunc(observation_space, vf_net, args.rnn)

# Only the process with rank 0 creates an episode sampler; in the code visible
# here, ``sampler`` is left undefined on every other rank.
if rank == 0:
    sampler = EpiSampler(env,
                         pol,
                         num_parallel=args.num_parallel,
                         seed=args.seed)

# Separate Adam optimizers for the policy and the value function, each with
# its own learning rate taken from the command-line arguments.
optim_pol = torch.optim.Adam(pol.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf.parameters(), args.vf_lr)

# Wrap the policy for distributed training. The helper returns a
# (model, optimizer) pair, so ``optim_pol`` is rebound to the returned
# optimizer.
# NOTE(review): the apex_* arguments presumably configure NVIDIA Apex mixed
# precision inside make_model_distributed -- confirm against its definition.
ddp_pol, optim_pol = make_model_distributed(pol,
                                            optim_pol,
                                            args.use_apex,
                                            args.apex_opt_level,
                                            args.apex_keep_batchnorm_fp32,
                                            args.apex_sync_bn,
                                            args.apex_loss_scale,
                                            device_ids=[args.local_rank],
                                            output_device=args.local_rank)
ddp_vf, optim_vf = make_model_distributed(vf,
                                          optim_vf,
                                          args.use_apex,
                                          args.apex_opt_level,