Example #1
    def create_normal_dist_mlp_policy(self, env_spec, name='norm_dist_p_'):
        policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                             name_scope=name + 'mlp_policy',
                                             name=name + 'mlp_policy',
                                             mlp_config=[{
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "L1_NORM": 0.01,
                                                 "L2_NORM": 0.01,
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }, {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUPTUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }],
                                             reuse=False)
        return policy, locals()
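A minimal usage sketch of the factory above (hedged: imports are omitted as in the other examples, the env/EnvSpec setup is assumed to follow Example #2, and the init()/forward() calls mirror the test examples further below):

    # Hypothetical caller; in the source this factory is a test-class method (hence self).
    env = make('Pendulum-v0')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    policy, _ = self.create_normal_dist_mlp_policy(env_spec=env_spec)
    policy.init()  # build the underlying TensorFlow variables
    ac = policy.forward(obs=env_spec.obs_space.sample())  # sample an action for one observation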
Example #2
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

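    # Both the value function and the Gaussian policy are built from entries of the benchmark config dict.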
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          **exp_config['MLP_V'])
    policy = NormalDistributionMLPPolicy(
        env_spec=env_spec,
        name_scope=name + 'mlp_policy',
        name=name + 'mlp_policy',
        **exp_config['POLICY'],
        output_low=env_spec.action_space.low,
        output_high=env_spec.action_space.high,
        reuse=False)

    ppo = PPO(env_spec=env_spec,
              **exp_config['PPO'],
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

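    # TrainTestFlow alternates sampling, training and testing; the sample counts come from the TrainTestFlow config.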
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                               ['TEST_SAMPLES_COUNT'],
                               sample_trajectory_flag=True),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                               ['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               sample_type='trajectory',
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example #3
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=[{
                              "ACT": "RELU",
                              "B_INIT_VALUE": 0.0,
                              "NAME": "1",
                              "N_UNITS": 16,
                              "L1_NORM": 0.01,
                              "L2_NORM": 0.01,
                              "TYPE": "DENSE",
                              "W_NORMAL_STDDEV": 0.03
                          }, {
                              "ACT": "LINEAR",
                              "B_INIT_VALUE": 0.0,
                              "NAME": "OUPTUT",
                              "N_UNITS": 1,
                              "TYPE": "DENSE",
                              "W_NORMAL_STDDEV": 0.03
                          }])

    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         mlp_config=[{
                                             "ACT": "RELU",
                                             "B_INIT_VALUE": 0.0,
                                             "NAME": "1",
                                             "L1_NORM": 0.01,
                                             "L2_NORM": 0.01,
                                             "N_UNITS": 16,
                                             "TYPE": "DENSE",
                                             "W_NORMAL_STDDEV": 0.03
                                         }, {
                                             "ACT": "LINEAR",
                                             "B_INIT_VALUE": 0.0,
                                             "NAME": "OUPTUT",
                                             "N_UNITS":
                                             env_spec.flat_action_dim,
                                             "TYPE": "DENSE",
                                             "W_NORMAL_STDDEV": 0.03
                                         }],
                                         reuse=False)

    ppo = PPO(env_spec=env_spec,
              config_or_config_dict={
                  "gamma": 0.995,
                  "lam": 0.98,
                  "policy_train_iter": 10,
                  "value_func_train_iter": 10,
                  "clipping_range": None,
                  "beta": 1.0,
                  "eta": 50,
                  "log_var_init": -1.0,
                  "kl_target": 0.003,
                  "policy_lr": 0.01,
                  "value_func_lr": 0.01,
                  "value_func_train_batch_size": 10,
                  "lr_multiplier": 1.0
              },
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=ppo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + 'agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
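    # Compared with Example #2, this agent adds epsilon-greedy exploration and schedules
    # algorithm saving every 20 training samples (starting after the first 10).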
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example #4
    def test_mlp_norm_dist_policy(self):
        env = make('Pendulum-v0')
        env.reset()
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                             name='mlp_policy',
                                             name_scope='mlp_policy',
                                             mlp_config=[{
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }, {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUPTUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }],
                                             output_high=None,
                                             output_low=None,
                                             output_norm=None,
                                             input_norm=None,
                                             reuse=False)
        policy.init()
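        # get_dist_info() reports shape metadata for the mean and log-variance output tensors.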
        dist_info = policy.get_dist_info()
        self.assertTrue(
            np.equal(dist_info[0]['shape'],
                     policy.mean_output.shape.as_list()).all())
        self.assertTrue(
            np.equal(dist_info[1]['shape'],
                     policy.logvar_output.shape.as_list()).all())
        for _ in range(10):
            ac = policy.forward(obs=env.observation_space.sample())
            self.assertTrue(env.action_space.contains(ac[0]))
        p2 = policy.make_copy(name='test',
                              name_scope='mlp_policy_2',
                              reuse=False)
        p2.init()
        self.assertGreater(len(policy.parameters('tf_var_list')), 0)
        self.assertGreater(len(p2.parameters('tf_var_list')), 0)
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              p2.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertNotEqual(id(var1), id(var2))

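        # With reuse=True and the same name_scope, the copy shares the underlying TF variables with the original policy.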
        p3 = policy.make_copy(name='mlp_policy_ttt',
                              name_scope='mlp_policy',
                              reuse=True)
        p3.init()
        self.assertGreater(len(p3.parameters('tf_var_list')), 0)
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              p3.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertEqual(id(var1), id(var2))

        # policy.copy_from(p2)
        res_not_true = []
        for var1, var2, var3 in zip(policy.parameters('tf_var_list'),
                                    p2.parameters('tf_var_list'),
                                    p3.parameters('tf_var_list')):
            re1, re2, re3 = self.sess.run([var1, var2, var3])
            res_not_true.append(np.isclose(re1, re2).all())
            res_not_true.append(np.isclose(re3, re2).all())
            self.assertTrue(np.isclose(re1, re3).all())
        self.assertFalse(np.array(res_not_true).all())

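        # copy_from assigns p2's parameter values to policy (and, via the shared variables, to p3).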
        policy.copy_from(p2)

        for var1, var2, var3 in zip(policy.parameters('tf_var_list'),
                                    p2.parameters('tf_var_list'),
                                    p3.parameters('tf_var_list')):
            re1, re2, re3 = self.sess.run([var1, var2, var3])
            self.assertTrue(np.isclose(re1, re3).all())
            self.assertTrue(np.isclose(re2, re3).all())
            self.assertTrue(np.isclose(re1, re2).all())
Example #5
    def test_func(self):
        env = make('Pendulum-v0')
        env.reset()
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                             name='mlp_policy',
                                             name_scope='mlp_policy',
                                             mlp_config=[{
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }, {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUPTUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }],
                                             output_high=None,
                                             output_low=None,
                                             output_norm=None,
                                             input_norm=None,
                                             reuse=False)
        policy.init()
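        # compute_dist_info evaluates distribution quantities (here entropy and prob) for a batch built from a sampled observation.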
        print(
            policy.compute_dist_info(name='entropy',
                                     feed_dict={
                                         policy.state_input:
                                         make_batch(
                                             env_spec.obs_space.sample(),
                                             original_shape=env_spec.obs_shape)
                                     }))
        print(
            policy.compute_dist_info(name='prob',
                                     value=env_spec.action_space.sample(),
                                     feed_dict={
                                         policy.state_input:
                                         make_batch(
                                             env_spec.obs_space.sample(),
                                             original_shape=env_spec.obs_shape)
                                     }))
        new_policy = policy.make_copy(
            reuse=False,
            name='new_p',
            name_scope='mlp_policy_2',
        )
        new_policy.init()
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              new_policy.parameters('tf_var_list')):
            print(var1.name)
            print(var2.name)
            self.assertNotEqual(var1.name, var2.name)
            self.assertNotEqual(id(var1), id(var2))
        obs1 = make_batch(
            env_spec.obs_space.sample(),
            original_shape=env_spec.obs_shape,
        )
        obs2 = make_batch(env_spec.obs_space.sample(),
                          original_shape=env_spec.obs_shape)
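        # The KL divergence computed via compute_dist_info should match the value obtained from the raw kl() tensor.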
        kl1 = policy.compute_dist_info(name='kl',
                                       other=new_policy,
                                       feed_dict={
                                           policy.state_input: obs1,
                                           new_policy.state_input: obs2
                                       })
        kl2 = self.sess.run(policy.kl(other=new_policy),
                            feed_dict={
                                policy.state_input: obs1,
                                new_policy.state_input: obs2
                            })
        self.assertTrue(np.isclose(kl1, kl2).all())
Example #6
    def func():
        env = make(env_id)
        exp_config = make_config(obs_dim=env.env_spec.flat_obs_dim,
                                 action_dim=env.env_spec.flat_action_dim,
                                 policy_hid1_multi=10,
                                 value_hid3_size=5,
                                 value_hid1_multi=10,
                                 total_episode=total_episode,
                                 episode_length=1000,
                                 episode_per_sample=episode_per_sample)

        GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                           exp_config['DEFAULT_EXPERIMENT_END_POINT'])
        env.reset()
        env = StepObservationWrapper(
            env, step_limit=env.unwrapped._max_episode_steps)
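        # StepObservationWrapper presumably augments observations with the step index
        # (bounded by _max_episode_steps); note use_time_index_flag=True in the PPO below.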
        name = 'benchmark'
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_v = MLPVValueFunc(env_spec=env_spec,
                              name_scope=name + 'mlp_v',
                              name=name + 'mlp_v',
                              **exp_config['MLP_V'])
        policy = NormalDistributionMLPPolicy(
            env_spec=env_spec,
            name_scope=name + 'mlp_policy',
            name=name + 'mlp_policy',
            **exp_config['POLICY'],
            output_low=env_spec.action_space.low,
            output_high=env_spec.action_space.high,
            reuse=False)

        ppo = PPO(env_spec=env_spec,
                  **exp_config['PPO'],
                  value_func=mlp_v,
                  stochastic_policy=policy,
                  name=name + '_ppo',
                  use_time_index_flag=True)
        agent = Agent(env=env,
                      env_spec=env_spec,
                      algo=ppo,
                      exploration_strategy=None,
                      noise_adder=None,
                      name=name + '_agent')

        flow = TrainTestFlow(
            train_sample_count_func=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_FUNC_COUNT'),
            config_or_config_dict=exp_config['TrainTestFlow']
            ['config_or_config_dict'],
            func_dict={
                'test': {
                    'func': agent.test,
                    'args': list(),
                    'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                                   ['TEST_SAMPLES_COUNT']),
                },
                'train': {
                    'func': agent.train,
                    'args': list(),
                    'kwargs': dict(),
                },
                'sample': {
                    'func': agent.sample,
                    'args': list(),
                    'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                                   ['TRAIN_SAMPLES_COUNT'],
                                   env=agent.env,
                                   sample_type='trajectory',
                                   in_which_status='TRAIN',
                                   store_flag=True),
                },
            })

        experiment = Experiment(tuner=None,
                                env=env,
                                agent=agent,
                                flow=flow,
                                name=name)
        experiment.run()