Example #1
    def test_get_actions(self):
        dim = 2
        env_spec = MockEnvSpec(num_envs=5)
        sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(), name='test', dim=dim)
        sampler.reset([True] * 5)

        # scalar observations case
        obs = np.zeros((env_spec.num_envs, 3))
        latent, agent_info = sampler.get_actions(obs)
        self.assertEqual(latent.shape, (env_spec.num_envs, 2))
Example #2
    def test_get_action(self):
        dim = 3
        env_spec = MockEnvSpec()
        sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(), name='test', dim=dim)
        sampler.reset([True])
        obs = [[0, 1]]
        latent, agent_info = sampler.get_action(obs)
        self.assertTrue('latent' in agent_info.keys())

        sampler.reset([True])
        obs = [[0, 0, 1]]
        latent, agent_info = sampler.get_action(obs)
        self.assertEqual(latent.shape, (3, ))
        self.assertEqual(sum(latent), 1)
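Note: Examples #1, #2, #3, and #9 construct a MockEnvSpec test double whose definition is not included in these snippets. A minimal sketch of what those calls appear to assume (an object exposing num_envs, action_space, and observation_space) is given below; the attribute defaults are illustrative guesses, not the project's actual fixture.

# Hypothetical stand-in, inferred from the calls MockEnvSpec(),
# MockEnvSpec(num_envs=5), and MockEnvSpec(action_space=spaces.Discrete(dim))
# seen in these examples; the real test fixture may differ.
class MockEnvSpec(object):

    def __init__(self, num_envs=1, action_space=None, observation_space=None):
        self.num_envs = num_envs
        self.action_space = action_space
        self.observation_space = observation_space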
Example #3
def build_categorical_latent_sampler(base_dim=2,
                                     base_scheduler_k=np.inf,
                                     dim=3,
                                     scheduler_k=np.inf):
    base_latent_sampler = UniformlyRandomLatentSampler(
        name='test_base',
        dim=base_dim,
        scheduler=ConstantIntervalScheduler(k=base_scheduler_k))
    latent_sampler = CategoricalLatentSampler(
        name='test',
        policy_name='test',
        dim=dim,
        scheduler=ConstantIntervalScheduler(k=scheduler_k),
        env_spec=MockEnvSpec(action_space=spaces.Discrete(dim)),
        latent_sampler=base_latent_sampler)
    return latent_sampler
Example #4
def build_policy(args, env, latent_sampler=None):
    if args.use_infogail:
        if latent_sampler is None:
            latent_sampler = UniformlyRandomLatentSampler(
                scheduler=ConstantIntervalScheduler(k=args.scheduler_k),
                name='latent_sampler',
                dim=args.latent_dim
            )
        if args.policy_recurrent:
            policy = GaussianLatentVarGRUPolicy(
                name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec,
                hidden_dim=args.recurrent_hidden_dim,
            )
        else:
            print("GaussianLatentVarMLPPolicy")
            policy = GaussianLatentVarMLPPolicy(
                name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec,
                hidden_sizes=args.policy_mean_hidden_layer_dims,
                std_hidden_sizes=args.policy_std_hidden_layer_dims
            )
    else:
        if args.policy_recurrent:
            print("GaussianGRUPolicy")
            policy = GaussianGRUPolicy(
                name="policy",
                env_spec=env.spec,
                hidden_dim=args.recurrent_hidden_dim,
                output_nonlinearity=None,
                learn_std=True
            )
        else:
            print("GaussianMLPPolicy")
            policy = GaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                hidden_sizes=args.policy_mean_hidden_layer_dims,
                std_hidden_sizes=args.policy_std_hidden_layer_dims,
                adaptive_std=True,
                output_nonlinearity=None,
                learn_std=True
            )
    return policy
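A hedged usage sketch for build_policy follows. The attribute names mirror exactly what the function reads from args; the concrete values, and the use of argparse.Namespace in place of the project's real argument parser, are assumptions.

# Illustrative call only; `env` is assumed to be an already-constructed
# rllab TfEnv whose spec build_policy passes to the policy classes.
from argparse import Namespace

args = Namespace(
    use_infogail=True,
    policy_recurrent=False,
    scheduler_k=20,                           # assumed value
    latent_dim=2,                             # assumed value
    recurrent_hidden_dim=64,                  # only read when policy_recurrent
    policy_mean_hidden_layer_dims=(128, 64),  # assumed sizes
    policy_std_hidden_layer_dims=(128, 64),   # assumed sizes
)
policy = build_policy(args, env)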
Example #5
def build_hierarchy(args, env, writer=None):
    levels = []

    latent_sampler = UniformlyRandomLatentSampler(
        name='base_latent_sampler',
        dim=args.latent_dim,
        scheduler=ConstantIntervalScheduler(k=args.env_H)
    )
    for level_idx in [1, 0]:
        # wrap env in different spec depending on level
        if level_idx == 0:
            level_env = env
        else:
            level_env = SpecWrapperEnv(
                env,
                action_space=Discrete(args.latent_dim),
                observation_space=env.observation_space
            )
            
        with tf.variable_scope('level_{}'.format(level_idx)):
            # recognition_model = build_recognition_model(args, level_env, writer)
            recognition_model = None
            if level_idx == 0:
                policy = build_policy(args, env, latent_sampler=latent_sampler)
            else:
                scheduler = ConstantIntervalScheduler(k=args.scheduler_k)
                policy = latent_sampler = CategoricalLatentSampler(
                    scheduler=scheduler,
                    name='latent_sampler',
                    policy_name='latent_sampler_policy',
                    dim=args.latent_dim,
                    env_spec=level_env.spec,
                    latent_sampler=latent_sampler,
                    max_n_envs=args.n_envs
                )
            baseline = build_baseline(args, level_env)
            if args.vectorize:
                force_batch_sampler = False
                if level_idx == 0:
                    sampler_args = dict(n_envs=args.n_envs)
                else:
                    sampler_args = None
            else:
                force_batch_sampler = True
                sampler_args = None

            sampler_cls = None if level_idx == 0 else HierarchySampler
            algo = TRPO(
                env=level_env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                sampler_cls=sampler_cls,
                force_batch_sampler=force_batch_sampler,
                sampler_args=sampler_args,
                optimizer_args=dict(
                    max_backtracks=50,
                    debug_nan=True
                )
            )
            reward_handler = build_reward_handler(args, writer)
            level = Level(
                depth=level_idx,
                algo=algo,
                reward_handler=reward_handler,
                recognition_model=recognition_model,
                start_itr=0,
                end_itr=0 if level_idx == 0 else np.inf
            )
            levels.append(level)

    # by convention the levels should be ordered by increasing depth,
    # but they have to be built in the reverse order (highest level first),
    # so reverse the list before returning it
    return list(reversed(levels))
Example #6
            obs_dim=env.observation_space.flat_dim,
            act_dim=env.action_space.n,
            dataset=recognition_dataset,
            network=recognition_network,
            variable_type='categorical',
            latent_dim=latent_dim,
            optimizer=tf.train.AdamOptimizer(recognition_learning_rate,
                                             beta1=.5,
                                             beta2=.9),
            n_train_epochs=n_recognition_train_epochs,
            summary_writer=summary_writer,
            verbose=2)

        # build the policy
        latent_sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(k=scheduler_k),
            name='latent_sampler',
            dim=latent_dim)
        policy = CategoricalLatentVarMLPPolicy(policy_name="policy",
                                               latent_sampler=latent_sampler,
                                               env_spec=env.spec,
                                               hidden_sizes=(64, 64))
    else:
        # build the policy
        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        recognition_model = None

    # build gail
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    reward_handler = hgail.misc.utils.RewardHandler(
Example #7
        critic = WassersteinCritic(
            obs_dim=env.observation_space.flat_dim,
            act_dim=env.action_space.n,
            dataset=critic_dataset, 
            network=critic_network,
            gradient_penalty=10.,
            optimizer=tf.train.RMSPropOptimizer(critic_lr),
            n_train_epochs=n_critic_train_epochs,
            summary_writer=summary_writer,
            verbose=2,
        )

    # level 2
    base_latent_sampler = UniformlyRandomLatentSampler(
        name='base_latent_sampler',
        dim=latent_dim_1,
        scheduler=ConstantIntervalScheduler(k=scheduler_k_1)
    )

    # level 1
    with tf.variable_scope('level_1'):
        recog_dataset_1 = RecognitionDataset(batch_size)
        recog_network_1 = ObservationActionMLP(
            name='recog_1', 
            hidden_layer_dims=[32, 32],
            output_dim=latent_dim_1
        )
        recog_1 = RecognitionModel(
                    obs_dim=env.observation_space.flat_dim,
                    act_dim=env.action_space.n,
                    dataset=recog_dataset_1, 
Example #8
def build_hgail(env, critic_dataset, batch_size):

    # critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(name='critic',
                                              hidden_layer_dims=[32, 32])
        critic = WassersteinCritic(obs_dim=3,
                                   act_dim=2,
                                   dataset=critic_dataset,
                                   network=critic_network,
                                   gradient_penalty=.01,
                                   optimizer=tf.train.AdamOptimizer(.001,
                                                                    beta1=.5,
                                                                    beta2=.9),
                                   n_train_epochs=50)

    # base latent variable sampler
    base_latent_sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='base_latent_sampler',
        dim=3)

    with tf.variable_scope('level_1'):
        recog_dataset_1 = RecognitionDataset(batch_size=batch_size)
        recog_network_1 = ObservationActionMLP(name='recog',
                                               hidden_layer_dims=[32, 32],
                                               output_dim=3)
        recog_1 = RecognitionModel(obs_dim=3,
                                   act_dim=2,
                                   dataset=recog_dataset_1,
                                   network=recog_network_1,
                                   variable_type='categorical',
                                   latent_dim=3,
                                   name='recognition_1')

        latent_sampler = CategoricalLatentSampler(
            scheduler=ConstantIntervalScheduler(k=1),
            name='latent_sampler',
            policy_name='latent_sampler_policy',
            dim=2,
            env_spec=env.spec,
            latent_sampler=base_latent_sampler,
            max_n_envs=20)
        baseline_1 = LinearFeatureBaseline(env_spec=env.spec)

        algo_1 = TRPO(
            env=env,
            policy=latent_sampler,
            baseline=baseline_1,
            batch_size=4000,
            max_path_length=100,
            n_itr=15,
            discount=0.99,
            step_size=0.01,
            sampler_cls=HierarchySampler,
        )
        reward_handler_1 = RewardHandler(use_env_rewards=False,
                                         critic_final_scale=1.)
        level_1 = Level(depth=1,
                        algo=algo_1,
                        reward_handler=reward_handler_1,
                        recognition_model=recog_1)

    with tf.variable_scope('level_0'):

        # recognition model
        recog_dataset_0 = RecognitionDataset(batch_size=batch_size)
        recog_network_0 = ObservationActionMLP(name='recog',
                                               hidden_layer_dims=[32, 32],
                                               output_dim=2)
        recog_0 = RecognitionModel(obs_dim=3,
                                   act_dim=2,
                                   dataset=recog_dataset_0,
                                   network=recog_network_0,
                                   variable_type='categorical',
                                   latent_dim=2,
                                   name='recognition_0')

        policy = CategoricalLatentVarMLPPolicy(policy_name="policy",
                                               latent_sampler=latent_sampler,
                                               env_spec=env.spec)
        baseline_0 = LinearFeatureBaseline(env_spec=env.spec)

        algo_0 = TRPO(env=env,
                      policy=policy,
                      baseline=baseline_0,
                      batch_size=4000,
                      max_path_length=100,
                      n_itr=5,
                      discount=0.99,
                      step_size=0.1,
                      sampler_args=dict(n_envs=1))

        reward_handler_0 = RewardHandler(use_env_rewards=False,
                                         critic_final_scale=1.)
        level_0 = Level(depth=0,
                        algo=algo_0,
                        reward_handler=reward_handler_0,
                        recognition_model=recog_0)

    hierarchy = [level_0, level_1]
    algo = HGAIL(
        critic=critic,
        hierarchy=hierarchy,
    )
    return algo
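A hedged sketch of driving build_hgail, modeled on the session handling in Example #10 (variable initialization, then train). make_env and make_critic_dataset are hypothetical placeholders for whatever builds the TfEnv and CriticDataset in the real tests, and the train(sess=...) call assumes HGAIL exposes the same entry point that GAIL does in Example #10.

# Sketch only; the helper names below are placeholders, not project APIs.
with tf.Session() as session:
    env = make_env()                        # hypothetical: builds a TfEnv
    critic_dataset = make_critic_dataset()  # hypothetical: builds a CriticDataset
    algo = build_hgail(env, critic_dataset, batch_size=1000)
    session.run(tf.global_variables_initializer())
    algo.train(sess=session)                # assumed to match GAIL's interface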
Example #9
    def test_reset(self):

        # single env
        dim = 3
        env_spec = MockEnvSpec()
        sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(), name='test', dim=dim)
        dones = [True]
        sampler.reset(dones)
        action, _ = sampler.get_action(None)
        self.assertTrue(sampler.latent_values.shape == (1, 3))
        self.assertTrue(np.sum(sampler.latent_values, axis=1) == 1)

        # multi env
        env_spec = MockEnvSpec(num_envs=2)
        dim = 100
        sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(), name='test', dim=dim)
        dones = [True, True]
        sampler.reset(dones)

        self.assertTrue(sampler.latent_values.shape == (2, dim))

        actions_1, _ = sampler.get_actions([None] * 2)
        sampler.reset(dones)

        actions_2, _ = sampler.get_actions([None] * 2)
        self.assertEqual(sampler.latent_values.shape, (2, dim))
        self.assertNotEqual(tuple(np.argmax(actions_1, axis=1)),
                            tuple(np.argmax(actions_2, axis=1)))

        dones = [False, True]
        sampler.reset(dones)
        np.testing.assert_array_equal(np.sum(sampler.latent_values, axis=1),
                                      [1, 1])
Example #10
File: test_gail.py  Project: zxsted/hgail
    def test_infogail_two_round_stochastic_env(self):

        env = TfEnv(TwoRoundNondeterministicRewardEnv())

        # dataset of one-hot obs and acts
        # optimal actions: 0, 1
        # first state
        n_expert_samples = 1000
        batch_size = 1000
        half = int(n_expert_samples / 2)
        rx = np.zeros((n_expert_samples, 3))
        rx[:half, 2] = 1
        rx[half:, 0] = 1
        ra = np.zeros((n_expert_samples, 2))
        ra[:half, 0] = 1
        ra[half:, 1] = 1

        with tf.Session() as session:
            # critic
            critic_dataset = CriticDataset(dict(observations=rx, actions=ra),
                                           batch_size=batch_size)
            critic_network = ObservationActionMLP(name='critic',
                                                  hidden_layer_dims=[32, 32])
            critic = WassersteinCritic(obs_dim=3,
                                       act_dim=2,
                                       dataset=critic_dataset,
                                       network=critic_network,
                                       gradient_penalty=.01,
                                       optimizer=tf.train.AdamOptimizer(
                                           .001, beta1=.5, beta2=.9),
                                       n_train_epochs=50)

            # recognition model
            recog_dataset = RecognitionDataset(batch_size=batch_size)
            recog_network = ObservationActionMLP(name='recog',
                                                 hidden_layer_dims=[32, 32],
                                                 output_dim=2)
            recog = RecognitionModel(obs_dim=3,
                                     act_dim=2,
                                     dataset=recog_dataset,
                                     network=recog_network,
                                     variable_type='categorical',
                                     latent_dim=2)

            # policy
            env.spec.num_envs = 10
            latent_sampler = UniformlyRandomLatentSampler(
                scheduler=ConstantIntervalScheduler(),
                name='latent_sampler',
                dim=2)
            policy = CategoricalLatentVarMLPPolicy(
                policy_name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec)

            # gail
            reward_handler = RewardHandler(use_env_rewards=False,
                                           critic_final_scale=1.)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = GAIL(critic=critic,
                        recognition=recog,
                        reward_handler=reward_handler,
                        env=env,
                        policy=policy,
                        baseline=baseline,
                        batch_size=4000,
                        max_path_length=200,
                        n_itr=15,
                        discount=.99,
                        step_size=.01,
                        sampler_args=dict(n_envs=env.spec.num_envs))

            session.run(tf.global_variables_initializer())

            # run it!
            algo.train(sess=session)

            # evaluate
            l0_state_infos = dict(latent=[[1, 0]])
            l0_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l0_state_infos)['prob']
            l0_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l0_state_infos)['prob']

            l1_state_infos = dict(latent=[[0, 1]])
            l1_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l1_state_infos)['prob']
            l1_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l1_state_infos)['prob']

            np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
            np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)