Example #1
    def test_learning_rnn(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        # Collect episodes (up to 400 environment steps) with the current policy
        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        # Preprocess the trajectory: value estimates, discounted returns (gamma=0.99),
        # GAE advantages (gamma=0.99, lambda=0.95), advantage centering, and RNN hidden-state masks
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        # One epoch each of the clipped-surrogate and adaptive-KL PPO updates
        result_dict = ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2,
                                     optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2)
        result_dict = ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
                                   optim_pol=optim_pol, optim_vf=optim_vf, epoch=1, batch_size=2, max_grad_norm=20)

        del sampler
Example #2
    def test_learning_rnn(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        vf_net = VNetLSTM(self.env.observation_space, h_size=32, cell_size=32)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net, rnn=True)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        # Single TRPO update on the collected trajectory
        result_dict = trpo.train(traj, pol, vf, optim_vf, 1, 2)

        del sampler
Example #3
env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

observation_space = env.observation_space
action_space = env.action_space

# Build the teacher (t) policy and the student (s) policy, then load the teacher policy's weights
# Note that the two policies do not have to share the same hidden architecture

if args.rnn:
    t_pol_net = PolNetLSTM(observation_space, action_space,
                           h_size=256, cell_size=256)
    s_pol_net = PolNetLSTM(observation_space, action_space,
                           h_size=256, cell_size=256)
else:
    t_pol_net = PolNet(observation_space, action_space)
    s_pol_net = PolNet(observation_space, action_space, h1=190, h2=90)
if isinstance(action_space, gym.spaces.Box):
    t_pol = GaussianPol(observation_space, action_space, t_pol_net, args.rnn)
    s_pol = GaussianPol(observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.Discrete):
    t_pol = CategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = CategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    t_pol = MultiCategoricalPol(
        observation_space, action_space, t_pol_net, args.rnn)
    s_pol = MultiCategoricalPol(
        observation_space, action_space, s_pol_net, args.rnn)
Example #4
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)
logger.add_tensorboard_output(args.log)

env = GymEnv(args.env_name, log_dir=os.path.join(
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNetLSTM(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net, rnn=True)

qf_net1 = QNetLSTM(observation_space, action_space)
qf1 = DeterministicSAVfunc(observation_space, action_space, qf_net1, rnn=True)
targ_qf_net1 = QNetLSTM(observation_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net1, rnn=True)

qf_net2 = QNetLSTM(observation_space, action_space)
qf2 = DeterministicSAVfunc(observation_space, action_space, qf_net2, rnn=True)
targ_qf_net2 = QNetLSTM(observation_space, action_space)
targ_qf_net2.load_state_dict(qf_net2.state_dict())
targ_qf2 = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net2, rnn=True)
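The excerpt above stops once the twin recurrent Q-functions and their targets are built. Below is a minimal sketch of how this setup is typically driven, reusing only calls that appear in Example #8 later in this page; the entropy temperature log_alpha, the sampler, the optimizers, and every hyperparameter value here are illustrative assumptions rather than part of the original script.

# Hypothetical continuation (mirrors Example #8; values are placeholders)
log_alpha = nn.Parameter(torch.zeros(()))   # learnable entropy temperature (log scale)

sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
optim_qfs = [optim_qf1, optim_qf2]
optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

qfs = [qf1, qf2]
targ_qfs = [targ_qf1, targ_qf2]

epis = sampler.sample(pol, max_steps=32)

traj = Traj()
traj.add_epis(epis)

# Replay preparation: next observations, initial priorities, sequence priorities,
# and RNN hidden-state bookkeeping, exactly as in Example #8
traj = ef.add_next_obs(traj)
traj = ef.set_all_pris(traj, traj.get_max_pri())
traj = ef.compute_seq_pris(traj, 4)
traj = ef.compute_h_masks(traj)
for i in range(len(qfs)):
    traj = ef.compute_hs(traj, qfs[i], hs_name='q_hs' + str(i), input_acs=True)
    traj = ef.compute_hs(traj, targ_qfs[i], hs_name='targ_q_hs' + str(i), input_acs=True)
traj.register_epis()

result_dict = r2d2_sac.train(
    traj,
    pol, qfs, targ_qfs, log_alpha,
    optim_pol, optim_qfs, optim_alpha,
    2, 32, 4, 2,
    0.01, 0.99, 2,
)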
Example #5
score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space

if args.rnn:
    pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    pol_net = PolNet(ob_space, ac_space)
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space,
                      ac_space,
                      pol_net,
                      args.rnn,
                      data_parallel=args.data_parallel,
                      parallel_dim=1 if args.rnn else 0)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space,
                         ac_space,
                         pol_net,
                         args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)
Example #6
score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space

if args.rnn:
    pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
else:
    pol_net = PolNet(ob_space, ac_space)
if isinstance(ac_space, gym.spaces.Box):
    pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.Discrete):
    pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn)
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(ob_space, ac_space, pol_net, args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.rnn:
    vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(ob_space)
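The excerpt ends right after choosing the value network. Below is a minimal sketch of the on-policy loop that typically follows, reusing the calls shown in Example #1 above; the sampler settings, learning rates, and PPO hyperparameters are illustrative assumptions, not part of the original script.

# Hypothetical continuation (mirrors Example #1; values are placeholders)
vf = DeterministicSVfunc(ob_space, vf_net, args.rnn)

sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

epis = sampler.sample(pol, max_steps=400)

traj = Traj()
traj.add_epis(epis)

traj = ef.compute_vs(traj, vf)
traj = ef.compute_rets(traj, 0.99)
traj = ef.compute_advs(traj, 0.99, 0.95)
traj = ef.centerize_advs(traj)
if args.rnn:
    traj = ef.compute_h_masks(traj)
traj.register_epis()

result_dict = ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2,
                             optim_pol=optim_pol, optim_vf=optim_vf,
                             epoch=1, batch_size=2)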
Example #7
observation_space = env.observation_space
action_space = env.action_space

if args.ddpg:
    pol_net = PolNet(observation_space,
                     action_space,
                     args.pol_h1,
                     args.pol_h2,
                     deterministic=True)
    noise = OUActionNoise(action_space)
    pol = DeterministicActionNoisePol(observation_space, action_space, pol_net,
                                      noise)
else:
    if args.rnn:
        pol_net = PolNetLSTM(observation_space,
                             action_space,
                             h_size=256,
                             cell_size=256)
    else:
        pol_net = PolNet(observation_space, action_space)
    if isinstance(action_space, gym.spaces.Box):
        pol = GaussianPol(observation_space, action_space, pol_net, args.rnn)
    elif isinstance(action_space, gym.spaces.Discrete):
        pol = CategoricalPol(observation_space, action_space, pol_net,
                             args.rnn)
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol = MultiCategoricalPol(observation_space, action_space, pol_net,
                                  args.rnn)
    else:
        raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)
Example #8
    def test_learning(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        qf_net1 = QNetLSTM(self.env.observation_space,
                           self.env.action_space, h_size=32, cell_size=32)
        qf1 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, qf_net1, rnn=True)
        targ_qf_net1 = QNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, targ_qf_net1, rnn=True)

        qf_net2 = QNetLSTM(self.env.observation_space,
                           self.env.action_space, h_size=32, cell_size=32)
        qf2 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, qf_net2, rnn=True)
        targ_qf_net2 = QNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, targ_qf_net2, rnn=True)

        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]

        log_alpha = nn.Parameter(torch.zeros(()))

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        # Prepare the replay trajectory: next observations, initial priorities,
        # sequence priorities (length 4), and RNN hidden-state masks
        traj = ef.add_next_obs(traj)
        max_pri = traj.get_max_pri()
        traj = ef.set_all_pris(traj, max_pri)
        traj = ef.compute_seq_pris(traj, 4)
        traj = ef.compute_h_masks(traj)
        for i in range(len(qfs)):
            traj = ef.compute_hs(
                traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
            traj = ef.compute_hs(
                traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
        traj.register_epis()

        result_dict = r2d2_sac.train(
            traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            2, 32, 4, 2,
            0.01, 0.99, 2,
        )

        del sampler
Example #9
    def setup_nets(self):
        ob_space = self.env.observation_space
        ac_space = self.env.action_space

        if self.args.mirror is True:
            print("Initiating a symmetric network")
            pol_net = SymmetricNet(
                *self.env.unwrapped.mirror_sizes,
                hidden_size=int(self.args.hidden_size / 4),
                num_layers=self.args.num_layers,
                varying_std=self.args.varying_std,
                tanh_finish=self.args.tanh_finish,
                log_std=self.args.log_stdev,
            )
        elif self.args.rnn:
            pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
        elif self.args.net_version == 1:
            pol_net = PolNet(ob_space, ac_space, log_std=self.args.log_stdev)
        else:
            pol_net = PolNetB(
                ob_space,
                ac_space,
                hidden_size=self.args.hidden_size,
                num_layers=self.args.num_layers,
                varying_std=self.args.varying_std,
                tanh_finish=self.args.tanh_finish,
                log_std=self.args.log_stdev,
            )

        if self.args.mirror == "new":
            print("Initiating a new symmetric network")
            # TODO: in this case the action_space for the previous pol_net is incorrect, but it isn't easy to fix ...
            # we can use this for now which just ignores some of the final indices
            pol_net = SymNet(
                pol_net,
                ob_space.shape[0],
                *self.env.unwrapped.sym_act_inds,
                varying_std=self.args.varying_std,
                log_std=self.args.log_stdev,
                deterministic=False,
            )

        if isinstance(ac_space, gym.spaces.Box):
            pol_class = GaussianPol
        elif isinstance(ac_space, gym.spaces.Discrete):
            pol_class = CategoricalPol
        elif isinstance(ac_space, gym.spaces.MultiDiscrete):
            pol_class = MultiCategoricalPol
        else:
            raise ValueError(
                "Only Box, Discrete, and MultiDiscrete are supported")

        policy = pol_class(
            ob_space,
            ac_space,
            pol_net,
            self.args.rnn,
            data_parallel=self.args.data_parallel,
            parallel_dim=1 if self.args.rnn else 0,
        )

        if self.args.mirror is True:
            vf_net = SymmetricValue(
                *self.env.unwrapped.mirror_sizes[:3],
                hidden_size=self.args.hidden_size,
                num_layers=self.args.num_layers,
            )
        elif self.args.rnn:
            vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
        elif self.args.net_version == 1:
            vf_net = VNet(ob_space)
        else:
            vf_net = VNetB(
                ob_space,
                hidden_size=self.args.hidden_size,
                num_layers=self.args.num_layers,
            )

        if self.args.mirror == "new":
            print("Initiating a new symmetric value network")
            vf_net = SymVNet(vf_net, ob_space.shape[0])

        vf = DeterministicSVfunc(
            ob_space,
            vf_net,
            self.args.rnn,
            data_parallel=self.args.data_parallel,
            parallel_dim=1 if self.args.rnn else 0,
        )

        self.pol = policy
        self.vf = vf
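setup_nets only builds and stores self.pol and self.vf. Downstream they can be fed into the same sampling and PPO pipeline as Example #1; the sketch below is hypothetical driver code, assuming the surrounding class also exposes self.env and that the pol/vf wrappers expose their networks' parameters. Every name and value introduced here is illustrative.

# Hypothetical driver code, not part of setup_nets()
sampler = EpiSampler(self.env, self.pol, num_parallel=1)

# assumption: the policy/value wrappers are nn.Modules and expose .parameters()
optim_pol = torch.optim.Adam(self.pol.parameters(), 3e-4)
optim_vf = torch.optim.Adam(self.vf.parameters(), 3e-4)

epis = sampler.sample(self.pol, max_steps=400)

traj = Traj()
traj.add_epis(epis)
traj = ef.compute_vs(traj, self.vf)
traj = ef.compute_rets(traj, 0.99)
traj = ef.compute_advs(traj, 0.99, 0.95)
traj = ef.centerize_advs(traj)
if self.args.rnn:
    traj = ef.compute_h_masks(traj)
traj.register_epis()

result_dict = ppo_clip.train(traj=traj, pol=self.pol, vf=self.vf, clip_param=0.2,
                             optim_pol=optim_pol, optim_vf=optim_vf,
                             epoch=1, batch_size=2)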