Example #1
    def test_learning(self):
        pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

        targ_pol_net = PolNet(self.env.ob_space, self.env.ac_space, 32, 32)
        targ_pol_net.load_state_dict(pol_net.state_dict())
        targ_pol = GaussianPol(
            self.env.ob_space, self.env.ac_space, targ_pol_net)

        qf_net = QNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        qf = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space, qf_net)

        targ_qf_net = QNet(self.env.ob_space, self.env.ac_space, 32, 32)
        targ_qf_net.load_state_dict(qf_net.state_dict())
        targ_qf = DeterministicSAVfunc(
            self.env.ob_space, self.env.ac_space, targ_qf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        traj.register_epis()

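        # The trailing positional arguments below are assumed to map to
        # epoch=1, batch_size=32, tau=0.01, gamma=0.9 and sampling=1 in svg.train.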
        result_dict = svg.train(
            traj, pol, targ_pol, qf, targ_qf, optim_pol, optim_qf, 1, 32, 0.01, 0.9, 1)

        del sampler
Example #2
    def test_learning(self):
        ob_space = self.env.real_observation_space
        skill_space = self.env.skill_space
        ob_skill_space = self.env.observation_space
        ac_space = self.env.action_space
        ob_dim = ob_skill_space.shape[0] - 4
        f_dim = ob_dim
        def discrim_f(x): return x

        pol_net = PolNet(ob_skill_space, ac_space)
        pol = GaussianPol(ob_skill_space, ac_space, pol_net)
        qf_net1 = QNet(ob_skill_space, ac_space)
        qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net1)
        targ_qf_net1 = QNet(ob_skill_space, ac_space)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net1)
        qf_net2 = QNet(ob_skill_space, ac_space)
        qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net2)
        targ_qf_net2 = QNet(ob_skill_space, ac_space)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net2)
        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]
        log_alpha = nn.Parameter(torch.ones(()))

        high = np.array([np.finfo(np.float32).max]*f_dim)
        f_space = gym.spaces.Box(-high, high, dtype=np.float32)
        discrim_net = DiaynDiscrimNet(
            f_space, skill_space, h_size=100, discrim_f=discrim_f)
        discrim = DeterministicSVfunc(f_space, discrim_net)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
        optim_discrim = torch.optim.SGD(discrim.parameters(),
                                        lr=0.001, momentum=0.9)

        off_traj = Traj()
        sampler = EpiSampler(self.env, pol, num_parallel=1)

        epis = sampler.sample(pol, max_steps=200)
        on_traj = Traj()
        on_traj.add_epis(epis)
        on_traj = ef.add_next_obs(on_traj)
        on_traj = ef.compute_diayn_rews(
            on_traj, lambda x: diayn_sac.calc_rewards(x, 4, discrim))
        on_traj.register_epis()
        off_traj.add_traj(on_traj)
        step = on_traj.num_step
        log_alpha = nn.Parameter(np.log(0.1)*torch.ones(()))  # fix alpha
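        # The positional arguments below are assumed to be epoch (=step),
        # batch_size=128, tau=5e-3, gamma=0.99, sampling=1, then the
        # discriminator, num_skill=4 and a reparameterization flag.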
        result_dict = diayn_sac.train(
            off_traj, pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step, 128, 5e-3, 0.99, 1, discrim, 4, True)
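        # The arguments 32, 100, 4 are assumed to be batch_size, epoch and
        # num_skill for the discriminator update.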
        discrim_losses = diayn.train(
            discrim, optim_discrim, on_traj, 32, 100, 4)

        del sampler
Example #3
    def test_learning(self):
        pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
        pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

        qf_net1 = QNet(self.env.ob_space, self.env.ac_space)
        qf1 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                   qf_net1)
        targ_qf_net1 = QNet(self.env.ob_space, self.env.ac_space)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                        targ_qf_net1)

        qf_net2 = QNet(self.env.ob_space, self.env.ac_space)
        qf2 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                   qf_net2)
        targ_qf_net2 = QNet(self.env.ob_space, self.env.ac_space)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space,
                                        targ_qf_net2)

        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]

        log_alpha = nn.Parameter(torch.zeros(()))

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        traj.register_epis()

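        # The numeric arguments below are assumed to be epoch=2, batch_size=32,
        # tau=0.01, gamma=0.99 and sampling=2.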
        result_dict = sac.train(
            traj,
            pol,
            qfs,
            targ_qfs,
            log_alpha,
            optim_pol,
            optim_qfs,
            optim_alpha,
            2,
            32,
            0.01,
            0.99,
            2,
        )

        del sampler
Example #4
    def test_learning(self):
        pol_net = PolNet(self.env.observation_space,
                         self.env.action_space,
                         h1=32,
                         h2=32,
                         deterministic=True)
        noise = OUActionNoise(self.env.action_space)
        pol = DeterministicActionNoisePol(self.env.observation_space,
                                          self.env.action_space, pol_net,
                                          noise)

        targ_pol_net = PolNet(self.env.observation_space,
                              self.env.action_space,
                              32,
                              32,
                              deterministic=True)
        targ_pol_net.load_state_dict(pol_net.state_dict())
        targ_noise = OUActionNoise(self.env.action_space)
        targ_pol = DeterministicActionNoisePol(self.env.observation_space,
                                               self.env.action_space,
                                               targ_pol_net, targ_noise)

        qf_net = QNet(self.env.observation_space,
                      self.env.action_space,
                      h1=32,
                      h2=32)
        qf = DeterministicSAVfunc(self.env.observation_space,
                                  self.env.action_space, qf_net)

        targ_qf_net = QNet(self.env.observation_space, self.env.action_space,
                           32, 32)
        targ_qf_net.load_state_dict(qf_net.state_dict())
        targ_qf = DeterministicSAVfunc(self.env.observation_space,
                                       self.env.action_space, targ_qf_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        traj.register_epis()

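        # The numeric arguments are assumed to be epoch=1, batch_size=32,
        # tau=0.01 and gamma=0.9.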
        result_dict = ddpg.train(traj, pol, targ_pol, qf, targ_qf, optim_pol,
                                 optim_qf, 1, 32, 0.01, 0.9)

        del sampler
Example #5
    def test_learning(self):
        qf_net = QNet(self.env.observation_space, self.env.action_space, 32,
                      32)
        lagged_qf_net = QNet(self.env.observation_space, self.env.action_space,
                             32, 32)
        lagged_qf_net.load_state_dict(qf_net.state_dict())
        targ_qf1_net = QNet(self.env.observation_space, self.env.action_space,
                            32, 32)
        targ_qf1_net.load_state_dict(qf_net.state_dict())
        targ_qf2_net = QNet(self.env.observation_space, self.env.action_space,
                            32, 32)
        targ_qf2_net.load_state_dict(lagged_qf_net.state_dict())
        qf = DeterministicSAVfunc(self.env.observation_space,
                                  self.env.action_space, qf_net)
        lagged_qf = DeterministicSAVfunc(self.env.observation_space,
                                         self.env.action_space, lagged_qf_net)
        targ_qf1 = CEMDeterministicSAVfunc(self.env.observation_space,
                                           self.env.action_space,
                                           targ_qf1_net,
                                           num_sampling=60,
                                           num_best_sampling=6,
                                           num_iter=2,
                                           multivari=False)
        targ_qf2 = DeterministicSAVfunc(self.env.observation_space,
                                        self.env.action_space, targ_qf2_net)

        pol = ArgmaxQfPol(self.env.observation_space,
                          self.env.action_space,
                          targ_qf1,
                          eps=0.2)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.add_next_obs(traj)
        traj.register_epis()

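        # The numeric arguments are assumed to be epoch=1000, batch_size=32 and
        # the target/lagged update coefficients, with 'mse' selecting the loss.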
        result_dict = qtopt.train(traj, qf, lagged_qf, targ_qf1, targ_qf2,
                                  optim_qf, 1000, 32, 0.9999, 0.995, 'mse')

        del sampler
Example #6
    def test_learning(self):
        pol_net = PolNet(self.env.observation_space,
                         self.env.action_space,
                         h1=32,
                         h2=32)
        pol = GaussianPol(self.env.observation_space, self.env.action_space,
                          pol_net)

        vf_net = VNet(self.env.observation_space)
        vf = DeterministicSVfunc(self.env.observation_space, vf_net)

        discrim_net = DiscrimNet(self.env.observation_space,
                                 self.env.action_space,
                                 h1=32,
                                 h2=32)
        discrim = DeterministicSAVfunc(self.env.observation_space,
                                       self.env.action_space, discrim_net)

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        optim_discrim = torch.optim.Adam(discrim_net.parameters(), 3e-4)

        with open(os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl'),
                  'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, 0.99)
        agent_traj = ef.compute_advs(agent_traj, 0.99, 0.95)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = gail.train(agent_traj,
                                 expert_traj,
                                 pol,
                                 vf,
                                 discrim,
                                 optim_vf,
                                 optim_discrim,
                                 rl_type='trpo',
                                 epoch=1,
                                 batch_size=32,
                                 discrim_batch_size=32,
                                 discrim_step=1,
                                 pol_ent_beta=1e-3,
                                 discrim_ent_beta=1e-5)

        del sampler
Example #7
    def setUpClass(cls):
        env = GymEnv('Pendulum-v0')
        random_pol = RandomPol(env.observation_space, env.action_space)
        sampler = EpiSampler(env, random_pol, num_parallel=1)
        epis = sampler.sample(random_pol, max_steps=32)
        traj = Traj()
        traj.add_epis(epis)
        traj.register_epis()

        cls.num_step = traj.num_step

        make_redis('localhost', '6379')
        cls.r = get_redis()

        cls.r.set('env', cloudpickle.dumps(env))
        cls.r.set('traj', cloudpickle.dumps(traj))

        pol_net = PolNet(env.observation_space, env.action_space)
        gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
        pol_net = PolNet(env.observation_space,
                         env.action_space, deterministic=True)
        dpol = DeterministicActionNoisePol(
            env.observation_space, env.action_space, pol_net)
        model_net = ModelNet(env.observation_space, env.action_space)
        mpcpol = MPCPol(env.observation_space,
                        env.action_space, model_net, rew_func)
        q_net = QNet(env.observation_space, env.action_space)
        qfunc = DeterministicSAVfunc(
            env.observation_space, env.action_space, q_net)
        aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
        v_net = VNet(env.observation_space)
        vfunc = DeterministicSVfunc(env.observation_space, v_net)

        cls.r.set('gpol', cloudpickle.dumps(gpol))
        cls.r.set('dpol', cloudpickle.dumps(dpol))
        cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
        cls.r.set('qfunc', cloudpickle.dumps(qfunc))
        cls.r.set('aqpol', cloudpickle.dumps(aqpol))
        cls.r.set('vfunc', cloudpickle.dumps(vfunc))

        c2d = C2DEnv(env)
        pol_net = PolNet(c2d.observation_space, c2d.action_space)
        mcpol = MultiCategoricalPol(
            c2d.observation_space, c2d.action_space, pol_net)

        cls.r.set('mcpol', cloudpickle.dumps(mcpol))
Example #8
                          h1=args.discrim_h1,
                          h2=args.discrim_h2)
    shaping_vf = DeterministicSVfunc(observation_space,
                                     shaping_vf_net,
                                     data_parallel=args.data_parallel)
    optim_discrim = torch.optim.Adam(
        list(rewf_net.parameters()) + list(shaping_vf_net.parameters()),
        args.discrim_lr)
    advf = None
elif args.rew_type == 'adv':
    advf_net = DiscrimNet(observation_space,
                          action_space,
                          h1=args.discrim_h1,
                          h2=args.discrim_h2)
    advf = DeterministicSAVfunc(observation_space,
                                action_space,
                                advf_net,
                                data_parallel=args.data_parallel)
    optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr)
    rewf = None
    shaping_vf = None
else:
    raise ValueError('Only rew and adv are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
Example #9
    rewf_net = VNet(observation_space, h1=args.discrim_h1, h2=args.discrim_h2)
    rewf = DeterministicSVfunc(observation_space, rewf_net)
    shaping_vf_net = VNet(observation_space,
                          h1=args.discrim_h1,
                          h2=args.discrim_h2)
    shaping_vf = DeterministicSVfunc(observation_space, shaping_vf_net)
    optim_discrim = torch.optim.Adam(
        list(rewf_net.parameters()) + list(shaping_vf_net.parameters()),
        args.discrim_lr)
    advf = None
elif args.rew_type == 'adv':
    advf_net = DiscrimNet(observation_space,
                          action_space,
                          h1=args.discrim_h1,
                          h2=args.discrim_h2)
    advf = DeterministicSAVfunc(observation_space, action_space, advf_net)
    optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr)
    rewf = None
    shaping_vf = None
else:
    raise ValueError('Only rew and adv are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
Example #10
    def test_learning(self):
        pol_net = PolNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        pol = GaussianPol(self.env.observation_space,
                          self.env.action_space, pol_net, rnn=True)

        qf_net1 = QNetLSTM(self.env.observation_space,
                           self.env.action_space, h_size=32, cell_size=32)
        qf1 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, qf_net1, rnn=True)
        targ_qf_net1 = QNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, targ_qf_net1, rnn=True)

        qf_net2 = QNetLSTM(self.env.observation_space,
                           self.env.action_space, h_size=32, cell_size=32)
        qf2 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, qf_net2, rnn=True)
        targ_qf_net2 = QNetLSTM(
            self.env.observation_space, self.env.action_space, h_size=32, cell_size=32)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(
            self.env.observation_space, self.env.action_space, targ_qf_net2, rnn=True)

        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]

        log_alpha = nn.Parameter(torch.zeros(()))

        sampler = EpiSampler(self.env, pol, num_parallel=1)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        max_pri = traj.get_max_pri()
        traj = ef.set_all_pris(traj, max_pri)
        traj = ef.compute_seq_pris(traj, 4)
        traj = ef.compute_h_masks(traj)
        for i in range(len(qfs)):
            traj = ef.compute_hs(
                traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
            traj = ef.compute_hs(
                traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
        traj.register_epis()

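        # The numeric arguments below are assumed to be epoch=2, batch_size=32,
        # seq_length=4, burn_in_length=2, tau=0.01, gamma=0.99 and sampling=2.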
        result_dict = r2d2_sac.train(
            traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            2, 32, 4, 2,
            0.01, 0.99, 2,
        )

        del sampler
Example #11
env = GymEnv(args.env_name,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

qf_net = QNet(observation_space, action_space, args.h1, args.h2)
lagged_qf_net = QNet(observation_space, action_space, args.h1, args.h2)
lagged_qf_net.load_state_dict(qf_net.state_dict())
targ_qf1_net = QNet(observation_space, action_space, args.h1, args.h2)
targ_qf1_net.load_state_dict(qf_net.state_dict())
targ_qf2_net = QNet(observation_space, action_space, args.h1, args.h2)
targ_qf2_net.load_state_dict(lagged_qf_net.state_dict())
qf = DeterministicSAVfunc(observation_space, action_space, qf_net)
lagged_qf = DeterministicSAVfunc(observation_space, action_space,
                                 lagged_qf_net)
targ_qf1 = CEMDeterministicSAVfunc(observation_space,
                                   action_space,
                                   targ_qf1_net,
                                   num_sampling=args.num_sampling,
                                   num_best_sampling=args.num_best_sampling,
                                   num_iter=args.num_iter,
                                   multivari=args.multivari,
                                   save_memory=args.save_memory)
targ_qf2 = DeterministicSAVfunc(observation_space, action_space, targ_qf2_net)

pol = ArgmaxQfPol(observation_space, action_space, targ_qf1, eps=args.eps)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
Example #12
pol = GaussianPol(ob_space,
                  ac_space,
                  pol_net,
                  data_parallel=args.data_parallel,
                  parallel_dim=0)

vf_net = VNet(ob_space)
vf = DeterministicSVfunc(ob_space,
                         vf_net,
                         data_parallel=args.data_parallel,
                         parallel_dim=0)

qf_net = QNet(ob_space, ac_space)
qf = DeterministicSAVfunc(ob_space,
                          ac_space,
                          qf_net,
                          data_parallel=args.data_parallel,
                          parallel_dim=0)
targ_qf_net = QNet(ob_space, ac_space)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(ob_space,
                               ac_space,
                               targ_qf_net,
                               data_parallel=args.data_parallel,
                               parallel_dim=0)

log_alpha = nn.Parameter(torch.zeros((), device=device))

sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
Example #13
                         data_parallel=args.data_parallel)

if args.rew_type == 'rew':
    rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
    rewf = DeterministicSVfunc(
        ob_space, rewf_net, data_parallel=args.data_parallel)
    shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
    shaping_vf = DeterministicSVfunc(
        ob_space, shaping_vf_net, data_parallel=args.data_parallel)
    optim_discrim = torch.optim.Adam(
        list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr)
    advf = None
elif args.rew_type == 'adv':
    advf_net = DiscrimNet(ob_space, ac_space,
                          h1=args.discrim_h1, h2=args.discrim_h2)
    advf = DeterministicSAVfunc(
        ob_space, ac_space, advf_net, data_parallel=args.data_parallel)
    optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr)
    rewf = None
    shaping_vf = None
else:
    raise ValueError('Only rew and adv are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
Example #14
def main():
    pygame.init()  # initialize pygame
    (w, h) = (480, 320)
    screen = pygame.display.set_mode((w, h), FULLSCREEN)  # window size
    pygame.display.set_caption("Sikamaru")  # window bar

    # initialization
    tx = 0
    ty = 0
    sika = Sikamaru((w / 2, h / 2))
    sleep_count = 5
    eat_mode = 100
    esa = Food()
    wait = True
    seed = 42

    # TODO define RL agent
    '''
    state : 4D (sikaposi, esaposi)
    action : 2D (-20,+20)^2
    SAC
    simple_net : 30,30
    '''
    np.random.seed(seed)
    torch.manual_seed(seed)

    low = np.zeros(4)
    high = w * np.ones(4)
    ob_space = gym.spaces.Box(low=low, high=high)
    ac_space = gym.spaces.Discrete(4)
    ac_dict = {
        0: np.array([-20, 0]),
        1: np.array([20, 0]),
        2: np.array([0, -20]),
        3: np.array([0, 20])
    }
    pol_net = PolNet(ob_space, ac_space)
    pol = CategoricalPol(ob_space, ac_space, pol_net)
    qf_net1 = QNet(ob_space, ac_space)
    qf1 = DeterministicSAVfunc(ob_space, ac_space, qf_net1)
    targ_qf_net1 = QNet(ob_space, ac_space)
    targ_qf_net1.load_state_dict(qf_net1.state_dict())
    targ_qf1 = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net1)
    qf_net2 = QNet(ob_space, ac_space)
    qf2 = DeterministicSAVfunc(ob_space, ac_space, qf_net2)
    targ_qf_net2 = QNet(ob_space, ac_space)
    targ_qf_net2.load_state_dict(qf_net2.state_dict())
    targ_qf2 = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net2)
    qfs = [qf1, qf2]
    targ_qfs = [targ_qf1, targ_qf2]
    log_alpha = nn.Parameter(torch.ones(()))

    optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
    optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
    optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
    optim_qfs = [optim_qf1, optim_qf2]
    optim_alpha = torch.optim.Adam([log_alpha], 1e-4)

    # off_traj = Traj()

    while (True):
        screen.fill((
            0,
            100,
            0,
        ))  # background color

        # my procedure
        ## env
        obs = make_obs((tx, ty), sika.posi, w, h)
        ac_real, ac, a_i = pol.deterministic_ac_real(
            torch.tensor(obs, dtype=torch.float))
        # ac_real = ac_real.reshape(pol.ac_space.shape)
        a = rule_act((tx, ty), sika.posi)
        # a = ac_dict[int(ac_real)]

        nx = sika.posi[0] + a[0]
        nx = max(min(nx, w), 0)
        ny = sika.posi[1] + a[1]
        ny = max(min(ny, h), 0)

        sika.move((nx, ny))
        screen.blit(sika.get_im(), sika.rect)

        if esa.life:  # RL
            # TODO: record as episode

            screen.blit(esa.im, esa.rect)
            # scr
            rew = esa.life_step(sika)
            if rew > 0:
                sika.bigup()
            if esa.life == 0:
                pass
                # TODO: add one episode and learn

                wait = False

        if wait:
            pygame.time.wait(500)
        wait = True
        pygame.display.update()  # update the display

        ## event
        for event in pygame.event.get():
            if event.type == MOUSEBUTTONDOWN and event.button == 1:
                tx, ty = event.pos
                esa.set((tx, ty))
            if event.type == KEYDOWN:
                if event.key == K_ESCAPE:
                    sys.exit()

            if event.type == QUIT:  # quit handling
                pygame.quit()
                sys.exit()
Example #15
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNet(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net,
                  data_parallel=args.data_parallel, parallel_dim=0)

vf_net = VNet(observation_space)
vf = DeterministicSVfunc(
    observation_space, vf_net, data_parallel=args.data_parallel, parallel_dim=0)

qf_net = QNet(observation_space, action_space)
qf = DeterministicSAVfunc(observation_space, action_space, qf_net,
                          data_parallel=args.data_parallel, parallel_dim=0)
targ_qf_net = QNet(observation_space, action_space)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net, data_parallel=args.data_parallel, parallel_dim=0)

log_alpha = nn.Parameter(torch.zeros((), device=device))

sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)
optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)
optim_alpha = torch.optim.Adam([log_alpha], args.pol_lr)

off_traj = Traj(args.max_steps_off)
Example #16
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

qf_net = QNet(observation_space, action_space, args.h1, args.h2)
lagged_qf_net = QNet(observation_space, action_space, args.h1, args.h2)
lagged_qf_net.load_state_dict(qf_net.state_dict())
targ_qf1_net = QNet(observation_space, action_space, args.h1, args.h2)
targ_qf1_net.load_state_dict(qf_net.state_dict())
targ_qf2_net = QNet(observation_space, action_space, args.h1, args.h2)
targ_qf2_net.load_state_dict(lagged_qf_net.state_dict())
qf = DeterministicSAVfunc(observation_space,
                          action_space,
                          qf_net,
                          data_parallel=args.data_parallel)
lagged_qf = DeterministicSAVfunc(observation_space,
                                 action_space,
                                 lagged_qf_net,
                                 data_parallel=args.data_parallel)
targ_qf1 = CEMDeterministicSAVfunc(observation_space,
                                   action_space,
                                   targ_qf1_net,
                                   num_sampling=args.num_sampling,
                                   num_best_sampling=args.num_best_sampling,
                                   num_iter=args.num_iter,
                                   multivari=args.multivari,
                                   data_parallel=args.data_parallel,
                                   save_memory=args.save_memory)
targ_qf2 = DeterministicSAVfunc(observation_space,
Example #17
if args.rnn:
    vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(ob_space)
vf = DeterministicSVfunc(ob_space,
                         vf_net,
                         args.rnn,
                         data_parallel=args.data_parallel,
                         parallel_dim=1 if args.rnn else 0)

discrim_net = DiscrimNet(ob_space,
                         ac_space,
                         h1=args.discrim_h1,
                         h2=args.discrim_h2)
discrim = DeterministicSAVfunc(ob_space,
                               ac_space,
                               discrim_net,
                               data_parallel=args.data_parallel)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)
optim_discrim = torch.optim.Adam(discrim_net.parameters(), args.discrim_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
expert_traj.register_epis()
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
Example #18
    pol = MultiCategoricalPol(observation_space, action_space, pol_net,
                              args.rnn)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

if args.rnn:
    vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256)
else:
    vf_net = VNet(observation_space)
vf = DeterministicSVfunc(observation_space, vf_net, args.rnn)

discrim_net = DiscrimNet(observation_space,
                         action_space,
                         h1=args.discrim_h1,
                         h2=args.discrim_h2)
discrim = DeterministicSAVfunc(observation_space, action_space, discrim_net)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)
optim_discrim = torch.optim.Adam(discrim_net.parameters(), args.discrim_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
expert_traj.register_epis()
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
Example #19
if args.rew_type == 'rew':
    rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
    rewf = DeterministicSVfunc(ob_space, rewf_net, args.rnn)
    shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2)
    shaping_vf = DeterministicSVfunc(ob_space, shaping_vf_net, args.rnn)
    optim_discrim = torch.optim.Adam(
        list(rewf_net.parameters()) + list(shaping_vf_net.parameters()),
        args.discrim_lr)
    advf = None
elif args.rew_type == 'adv':
    advf_net = DiscrimNet(ob_space,
                          ac_space,
                          h1=args.discrim_h1,
                          h2=args.discrim_h2)
    advf = DeterministicSAVfunc(ob_space, ac_space, advf_net, args.rnn)
    optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr)
    rewf = None
    shaping_vf = None
else:
    raise ValueError('Only rew and adv are supported')

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr)

with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f:
    expert_epis = pickle.load(f)
expert_traj = Traj()
expert_traj.add_epis(expert_epis)
Example #20
ob_space = env.observation_space
ac_space = env.action_space

pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True)
noise = OUActionNoise(ac_space)
pol = DeterministicActionNoisePol(
    ob_space, ac_space, pol_net, noise, data_parallel=args.data_parallel)

targ_pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True)
targ_pol_net.load_state_dict(pol_net.state_dict())
targ_noise = OUActionNoise(ac_space)
targ_pol = DeterministicActionNoisePol(
    ob_space, ac_space, targ_pol_net, targ_noise, data_parallel=args.data_parallel)

qf_net = QNet(ob_space, ac_space, args.h1, args.h2)
qf = DeterministicSAVfunc(ob_space, ac_space, qf_net,
                          data_parallel=args.data_parallel)

targ_qf_net = QNet(ob_space, ac_space, args.h1, args.h2)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(
    ob_space, ac_space, targ_qf_net, data_parallel=args.data_parallel)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)

off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
Example #21
skill_space = env.skill_space
ob_skill_space = env.observation_space
action_space = env.action_space
ob_dim = ob_skill_space.shape[0] - args.num_skill
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

# policy
pol_net = PolNet(ob_skill_space, action_space)
pol = GaussianPol(ob_skill_space, action_space, pol_net,
                  data_parallel=args.data_parallel, parallel_dim=0)

# q-function
qf_net1 = QNet(ob_skill_space, action_space)
qf1 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net1,
                           data_parallel=args.data_parallel, parallel_dim=0)
targ_qf_net1 = QNet(ob_skill_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(
    ob_skill_space, action_space, targ_qf_net1, data_parallel=args.data_parallel, parallel_dim=0)
qf_net2 = QNet(ob_skill_space, action_space)
qf2 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net2,
                           data_parallel=args.data_parallel, parallel_dim=0)
targ_qf_net2 = QNet(ob_skill_space, action_space)
targ_qf_net2.load_state_dict(qf_net2.state_dict())
targ_qf2 = DeterministicSAVfunc(
    ob_skill_space, action_space, targ_qf_net2, data_parallel=args.data_parallel, parallel_dim=0)
qfs = [qf1, qf2]
targ_qfs = [targ_qf1, targ_qf2]

log_alpha = nn.Parameter(torch.ones((), device=device))
Example #22
observation_space = env.real_observation_space
skill_space = env.skill_space
ob_skill_space = env.observation_space
action_space = env.action_space
ob_dim = ob_skill_space.shape[0] - args.num_skill
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

# policy
pol_net = PolNet(ob_skill_space, action_space)
pol = GaussianPol(ob_skill_space, action_space, pol_net)

# q-function
qf_net1 = QNet(ob_skill_space, action_space)
qf1 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net1)
targ_qf_net1 = QNet(ob_skill_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(ob_skill_space, action_space, targ_qf_net1)
qf_net2 = QNet(ob_skill_space, action_space)
qf2 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net2)
targ_qf_net2 = QNet(ob_skill_space, action_space)
targ_qf_net2.load_state_dict(qf_net2.state_dict())
targ_qf2 = DeterministicSAVfunc(ob_skill_space, action_space, targ_qf_net2)
qfs = [qf1, qf2]
targ_qfs = [targ_qf1, targ_qf2]

log_alpha = nn.Parameter(torch.ones((), device=device))

high = np.array([np.finfo(np.float32).max]*f_dim)
f_space = gym.spaces.Box(-high, high, dtype=np.float32)
Example #23
ob_space = env.observation_space
ac_space = env.action_space

pol_net = PolNetLSTM(ob_space, ac_space)
pol = GaussianPol(ob_space,
                  ac_space,
                  pol_net,
                  rnn=True,
                  data_parallel=args.data_parallel,
                  parallel_dim=1)

qf_net1 = QNetLSTM(ob_space, ac_space)
qf1 = DeterministicSAVfunc(ob_space,
                           ac_space,
                           qf_net1,
                           rnn=True,
                           data_parallel=args.data_parallel,
                           parallel_dim=1)
targ_qf_net1 = QNetLSTM(ob_space, ac_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(ob_space,
                                ac_space,
                                targ_qf_net1,
                                rnn=True,
                                data_parallel=args.data_parallel,
                                parallel_dim=1)

qf_net2 = QNetLSTM(ob_space, ac_space)
qf2 = DeterministicSAVfunc(ob_space,
                           ac_space,
                           qf_net2,
Example #24
score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)
logger.add_tensorboard_output(args.log)

env = GymEnv(args.env_name, log_dir=os.path.join(
    args.log, 'movie'), record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNetLSTM(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net, rnn=True)

qf_net1 = QNetLSTM(observation_space, action_space)
qf1 = DeterministicSAVfunc(observation_space, action_space, qf_net1, rnn=True)
targ_qf_net1 = QNetLSTM(observation_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net1, rnn=True)

qf_net2 = QNetLSTM(observation_space, action_space)
qf2 = DeterministicSAVfunc(observation_space, action_space, qf_net2, rnn=True)
targ_qf_net2 = QNetLSTM(observation_space, action_space)
targ_qf_net2.load_state_dict(qf_net2.state_dict())
targ_qf2 = DeterministicSAVfunc(
    observation_space, action_space, targ_qf_net2, rnn=True)

qfs = [qf1, qf2]
targ_qfs = [targ_qf1, targ_qf2]
Example #25
ob_space = env.observation_space
ac_space = env.action_space

pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True)
noise = OUActionNoise(ac_space)
pol = DeterministicActionNoisePol(ob_space, ac_space, pol_net, noise)

targ_pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True)
targ_pol_net.load_state_dict(pol_net.state_dict())
targ_noise = OUActionNoise(ac_space)
targ_pol = DeterministicActionNoisePol(
    ob_space, ac_space, targ_pol_net, targ_noise)

qf_net = QNet(ob_space, ac_space, args.h1, args.h2)
qf = DeterministicSAVfunc(ob_space, ac_space, qf_net)

targ_qf_net = QNet(ob_space, ac_space, args.h1, args.h2)
targ_qf_net.load_state_dict(qf_net.state_dict())
targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net)

sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)

optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr)

off_traj = Traj(args.max_steps_off)

total_epi = 0
total_step = 0
max_rew = -1e6
Example #26
device = torch.device(device_name)
set_device(device)

# policy
pol_net = PolNet(ob_skill_space, ac_space)
pol = GaussianPol(ob_skill_space,
                  ac_space,
                  pol_net,
                  data_parallel=args.data_parallel,
                  parallel_dim=0)

# q-function
qf_net1 = QNet(ob_skill_space, ac_space)
qf1 = DeterministicSAVfunc(ob_skill_space,
                           ac_space,
                           qf_net1,
                           data_parallel=args.data_parallel,
                           parallel_dim=0)
targ_qf_net1 = QNet(ob_skill_space, ac_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(ob_skill_space,
                                ac_space,
                                targ_qf_net1,
                                data_parallel=args.data_parallel,
                                parallel_dim=0)
qf_net2 = QNet(ob_skill_space, ac_space)
qf2 = DeterministicSAVfunc(ob_skill_space,
                           ac_space,
                           qf_net2,
                           data_parallel=args.data_parallel,
                           parallel_dim=0)
Example #27
env = GymEnv(env,
             log_dir=os.path.join(args.log, 'movie'),
             record_video=args.record)
env.env.seed(args.seed)

# observation and action dimensions
observation_space = env.observation_space
action_space = env.action_space
print('obs: {0}, act: {1}'.format(observation_space, action_space))

# Q-Network
print('Qnet')
qf_net = QTOptNet(observation_space, action_space)
qf = DeterministicSAVfunc(
    flattend_observation_space,
    action_space,
    qf_net,
    data_parallel=args.data_parallel)  # deterministic state-action value function; lightly reshapes the q-net output

# target Q network theta1
print('target1_net')
targ_qf1_net = QTOptNet(observation_space, action_space)
targ_qf1_net.load_state_dict(qf_net.state_dict())  # load the weights (copied from qf_net)
targ_qf1 = CEMDeterministicSAVfunc(
    flattend_observation_space,
    action_space,
    targ_qf1_net,
    num_sampling=args.num_sampling,
    num_best_sampling=args.num_best_sampling,
    num_iter=args.num_iter,
    multivari=args.multivari,
Example #28
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

pol_net = PolNet(observation_space, action_space)
pol = GaussianPol(observation_space,
                  action_space,
                  pol_net,
                  data_parallel=args.data_parallel,
                  parallel_dim=0)

qf_net1 = QNet(observation_space, action_space)
qf1 = DeterministicSAVfunc(observation_space,
                           action_space,
                           qf_net1,
                           data_parallel=args.data_parallel,
                           parallel_dim=0)
targ_qf_net1 = QNet(observation_space, action_space)
targ_qf_net1.load_state_dict(qf_net1.state_dict())
targ_qf1 = DeterministicSAVfunc(observation_space,
                                action_space,
                                targ_qf_net1,
                                data_parallel=args.data_parallel,
                                parallel_dim=0)

qf_net2 = QNet(observation_space, action_space)
qf2 = DeterministicSAVfunc(observation_space,
                           action_space,
                           qf_net2,
                           data_parallel=args.data_parallel,