def test_reset(self):
    # single env
    dim = 3
    env_spec = MockEnvSpec()
    sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='test',
        dim=dim)
    dones = [True]
    sampler.reset(dones)
    action, _ = sampler.get_action(None)
    self.assertTrue(sampler.latent_values.shape == (1, 3))
    self.assertTrue(np.sum(sampler.latent_values, axis=1) == 1)

    # multi env
    env_spec = MockEnvSpec(num_envs=2)
    dim = 100
    sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='test',
        dim=dim)
    dones = [True, True]
    sampler.reset(dones)
    self.assertTrue(sampler.latent_values.shape == (2, dim))
    actions_1, _ = sampler.get_actions([None] * 2)
    sampler.reset(dones)
    actions_2, _ = sampler.get_actions([None] * 2)
    self.assertEqual(sampler.latent_values.shape, (2, dim))
    self.assertNotEqual(
        tuple(np.argmax(actions_1, axis=1)),
        tuple(np.argmax(actions_2, axis=1)))

    dones = [False, True]
    sampler.reset(dones)
    np.testing.assert_array_equal(
        np.sum(sampler.latent_values, axis=1), [1, 1])
def build_categorical_latent_sampler(
        base_dim=2,
        base_scheduler_k=np.inf,
        dim=3,
        scheduler_k=np.inf):
    base_latent_sampler = UniformlyRandomLatentSampler(
        name='test_base',
        dim=base_dim,
        scheduler=ConstantIntervalScheduler(k=base_scheduler_k))
    latent_sampler = CategoricalLatentSampler(
        name='test',
        policy_name='test',
        dim=dim,
        scheduler=ConstantIntervalScheduler(k=scheduler_k),
        env_spec=MockEnvSpec(action_space=spaces.Discrete(dim)),
        latent_sampler=base_latent_sampler)
    return latent_sampler
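# Hypothetical usage sketch (not part of the original test file). It assumes
# the returned CategoricalLatentSampler is built inside a tf.Session, needs
# variable initialization, and exposes the same reset / get_action interface
# exercised on UniformlyRandomLatentSampler in test_reset above. The
# observation passed to get_action is a placeholder and would need to match
# MockEnvSpec's observation space.
#
# with tf.Session() as session:
#     sampler = build_categorical_latent_sampler(base_dim=2, dim=3)
#     session.run(tf.global_variables_initializer())
#     sampler.reset([True])
#     latent, agent_info = sampler.get_action([[0., 0., 1.]])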
def test_get_actions(self):
    dim = 2
    env_spec = MockEnvSpec(num_envs=5)
    sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='test',
        dim=dim)
    sampler.reset([True] * 5)

    # scalar observations case
    obs = np.zeros((env_spec.num_envs, 3))
    latent, agent_info = sampler.get_actions(obs)
    self.assertEqual(latent.shape, (env_spec.num_envs, 2))
def test_get_action(self):
    dim = 3
    env_spec = MockEnvSpec()
    sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='test',
        dim=dim)
    sampler.reset([True])
    obs = [[0, 1]]
    latent, agent_info = sampler.get_action(obs)
    self.assertTrue('latent' in agent_info.keys())

    sampler.reset([True])
    obs = [[0, 0, 1]]
    latent, agent_info = sampler.get_action(obs)
    self.assertEqual(latent.shape, (3,))
    self.assertEqual(sum(latent), 1)
def build_policy(args, env, latent_sampler=None):
    if args.use_infogail:
        if latent_sampler is None:
            latent_sampler = UniformlyRandomLatentSampler(
                scheduler=ConstantIntervalScheduler(k=args.scheduler_k),
                name='latent_sampler',
                dim=args.latent_dim
            )
        if args.policy_recurrent:
            policy = GaussianLatentVarGRUPolicy(
                name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec,
                hidden_dim=args.recurrent_hidden_dim,
            )
        else:
            print("GaussianLatentVarMLPPolicy")
            policy = GaussianLatentVarMLPPolicy(
                name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec,
                hidden_sizes=args.policy_mean_hidden_layer_dims,
                std_hidden_sizes=args.policy_std_hidden_layer_dims
            )
    else:
        if args.policy_recurrent:
            print("GaussianGRUPolicy")
            policy = GaussianGRUPolicy(
                name="policy",
                env_spec=env.spec,
                hidden_dim=args.recurrent_hidden_dim,
                output_nonlinearity=None,
                learn_std=True
            )
        else:
            print("GaussianMLPPolicy")
            policy = GaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                hidden_sizes=args.policy_mean_hidden_layer_dims,
                std_hidden_sizes=args.policy_std_hidden_layer_dims,
                adaptive_std=True,
                output_nonlinearity=None,
                learn_std=True
            )
    return policy
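# Hypothetical usage sketch (not from the original script). build_policy only
# reads the argparse fields referenced above, so a minimal args object like
# the one below is assumed to be enough for the non-recurrent infogail case;
# the field values are illustrative, and `env` is assumed to be an already
# constructed rllab TfEnv.
#
# from argparse import Namespace
#
# args = Namespace(
#     use_infogail=True,
#     policy_recurrent=False,
#     scheduler_k=20,
#     latent_dim=4,
#     recurrent_hidden_dim=64,
#     policy_mean_hidden_layer_dims=(128, 64),
#     policy_std_hidden_layer_dims=(128, 64),
# )
# policy = build_policy(args, env)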
def build_hierarchy(args, env, writer=None):
    levels = []

    latent_sampler = UniformlyRandomLatentSampler(
        name='base_latent_sampler',
        dim=args.latent_dim,
        scheduler=ConstantIntervalScheduler(k=args.env_H)
    )
    for level_idx in [1, 0]:
        # wrap env in different spec depending on level
        if level_idx == 0:
            level_env = env
        else:
            level_env = SpecWrapperEnv(
                env,
                action_space=Discrete(args.latent_dim),
                observation_space=env.observation_space
            )

        with tf.variable_scope('level_{}'.format(level_idx)):
            # recognition_model = build_recognition_model(args, level_env, writer)
            recognition_model = None
            if level_idx == 0:
                policy = build_policy(args, env, latent_sampler=latent_sampler)
            else:
                scheduler = ConstantIntervalScheduler(k=args.scheduler_k)
                policy = latent_sampler = CategoricalLatentSampler(
                    scheduler=scheduler,
                    name='latent_sampler',
                    policy_name='latent_sampler_policy',
                    dim=args.latent_dim,
                    env_spec=level_env.spec,
                    latent_sampler=latent_sampler,
                    max_n_envs=args.n_envs
                )
            baseline = build_baseline(args, level_env)

            if args.vectorize:
                force_batch_sampler = False
                if level_idx == 0:
                    sampler_args = dict(n_envs=args.n_envs)
                else:
                    sampler_args = None
            else:
                force_batch_sampler = True
                sampler_args = None

            sampler_cls = None if level_idx == 0 else HierarchySampler
            algo = TRPO(
                env=level_env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                sampler_cls=sampler_cls,
                force_batch_sampler=force_batch_sampler,
                sampler_args=sampler_args,
                optimizer_args=dict(
                    max_backtracks=50,
                    debug_nan=True
                )
            )
            reward_handler = build_reward_handler(args, writer)
            level = Level(
                depth=level_idx,
                algo=algo,
                reward_handler=reward_handler,
                recognition_model=recognition_model,
                start_itr=0,
                end_itr=0 if level_idx == 0 else np.inf
            )
            levels.append(level)

    # by convention the order of the levels should be increasing,
    # but they must be built in reverse order,
    # so reverse the list before returning it
    return list(reversed(levels))
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.n,
        dataset=recognition_dataset,
        network=recognition_network,
        variable_type='categorical',
        latent_dim=latent_dim,
        optimizer=tf.train.AdamOptimizer(
            recognition_learning_rate, beta1=.5, beta2=.9),
        n_train_epochs=n_recognition_train_epochs,
        summary_writer=summary_writer,
        verbose=2)

    # build the policy
    latent_sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(k=scheduler_k),
        name='latent_sampler',
        dim=latent_dim)
    policy = CategoricalLatentVarMLPPolicy(
        policy_name="policy",
        latent_sampler=latent_sampler,
        env_spec=env.spec,
        hidden_sizes=(64, 64))
else:
    # build the policy
    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(32, 32))
    recognition_model = None

# build gail
baseline = LinearFeatureBaseline(env_spec=env.spec)
    obs_dim=env.observation_space.flat_dim,
    act_dim=env.action_space.n,
    dataset=critic_dataset,
    network=critic_network,
    gradient_penalty=10.,
    optimizer=tf.train.RMSPropOptimizer(critc_lr),
    n_train_epochs=n_critic_train_epochs,
    summary_writer=summary_writer,
    verbose=2,
)

# level 2
base_latent_sampler = UniformlyRandomLatentSampler(
    name='base_latent_sampler',
    dim=latent_dim_1,
    scheduler=ConstantIntervalScheduler(k=scheduler_k_1)
)

# level 1
with tf.variable_scope('level_1'):
    recog_dataset_1 = RecognitionDataset(batch_size)
    recog_network_1 = ObservationActionMLP(
        name='recog_1',
        hidden_layer_dims=[32, 32],
        output_dim=latent_dim_1
    )
    recog_1 = RecognitionModel(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.n,
        dataset=recog_dataset_1,
        network=recog_network_1,
def build_hgail(env, critic_dataset, batch_size):
    # critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(
            name='critic',
            hidden_layer_dims=[32, 32])
        critic = WassersteinCritic(
            obs_dim=3,
            act_dim=2,
            dataset=critic_dataset,
            network=critic_network,
            gradient_penalty=.01,
            optimizer=tf.train.AdamOptimizer(.001, beta1=.5, beta2=.9),
            n_train_epochs=50)

    # base latent variable sampler
    base_latent_sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='base_latent_sampler',
        dim=3)

    with tf.variable_scope('level_1'):
        recog_dataset_1 = RecognitionDataset(batch_size=batch_size)
        recog_network_1 = ObservationActionMLP(
            name='recog',
            hidden_layer_dims=[32, 32],
            output_dim=3)
        recog_1 = RecognitionModel(
            obs_dim=3,
            act_dim=2,
            dataset=recog_dataset_1,
            network=recog_network_1,
            variable_type='categorical',
            latent_dim=3,
            name='recognition_1')
        latent_sampler = CategoricalLatentSampler(
            scheduler=ConstantIntervalScheduler(k=1),
            name='latent_sampler',
            policy_name='latent_sampler_policy',
            dim=2,
            env_spec=env.spec,
            latent_sampler=base_latent_sampler,
            max_n_envs=20)
        baseline_1 = LinearFeatureBaseline(env_spec=env.spec)
        algo_1 = TRPO(
            env=env,
            policy=latent_sampler,
            baseline=baseline_1,
            batch_size=4000,
            max_path_length=100,
            n_itr=15,
            discount=0.99,
            step_size=0.01,
            sampler_cls=HierarchySampler,
        )
        reward_handler_1 = RewardHandler(
            use_env_rewards=False,
            critic_final_scale=1.)
        level_1 = Level(
            depth=1,
            algo=algo_1,
            reward_handler=reward_handler_1,
            recognition_model=recog_1)

    with tf.variable_scope('level_0'):
        # recognition model
        recog_dataset_0 = RecognitionDataset(batch_size=batch_size)
        recog_network_0 = ObservationActionMLP(
            name='recog',
            hidden_layer_dims=[32, 32],
            output_dim=2)
        recog_0 = RecognitionModel(
            obs_dim=3,
            act_dim=2,
            dataset=recog_dataset_0,
            network=recog_network_0,
            variable_type='categorical',
            latent_dim=2,
            name='recognition_0')
        policy = CategoricalLatentVarMLPPolicy(
            policy_name="policy",
            latent_sampler=latent_sampler,
            env_spec=env.spec)
        baseline_0 = LinearFeatureBaseline(env_spec=env.spec)
        algo_0 = TRPO(
            env=env,
            policy=policy,
            baseline=baseline_0,
            batch_size=4000,
            max_path_length=100,
            n_itr=5,
            discount=0.99,
            step_size=0.1,
            sampler_args=dict(n_envs=1))
        reward_handler_0 = RewardHandler(
            use_env_rewards=False,
            critic_final_scale=1.)
        level_0 = Level(
            depth=0,
            algo=algo_0,
            reward_handler=reward_handler_0,
            recognition_model=recog_0)

    hierarchy = [level_0, level_1]
    algo = HGAIL(
        critic=critic,
        hierarchy=hierarchy,
    )
    return algo
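# Hypothetical usage sketch (not part of the original tests). It assumes
# build_hgail is driven the same way as the GAIL test below: `env` and
# `critic_dataset` are built from one-hot expert observations/actions, and
# training happens inside a tf.Session. The HGAIL.train(sess=...) call
# mirrors the GAIL call in test_infogail_two_round_stochastic_env and is an
# assumption here.
#
# with tf.Session() as session:
#     algo = build_hgail(env, critic_dataset, batch_size=1000)
#     session.run(tf.global_variables_initializer())
#     algo.train(sess=session)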
def test_infogail_two_round_stochastic_env(self):
    env = TfEnv(TwoRoundNondeterministicRewardEnv())

    # dataset of one-hot obs and acts
    # optimal actions: 0 in the first state, 1 in the second
    n_expert_samples = 1000
    batch_size = 1000
    half = int(n_expert_samples / 2)
    rx = np.zeros((n_expert_samples, 3))
    rx[:half, 2] = 1
    rx[half:, 0] = 1
    ra = np.zeros((n_expert_samples, 2))
    ra[:half, 0] = 1
    ra[half:, 1] = 1

    with tf.Session() as session:
        # critic
        critic_dataset = CriticDataset(
            dict(observations=rx, actions=ra),
            batch_size=batch_size)
        critic_network = ObservationActionMLP(
            name='critic',
            hidden_layer_dims=[32, 32])
        critic = WassersteinCritic(
            obs_dim=3,
            act_dim=2,
            dataset=critic_dataset,
            network=critic_network,
            gradient_penalty=.01,
            optimizer=tf.train.AdamOptimizer(.001, beta1=.5, beta2=.9),
            n_train_epochs=50)

        # recognition model
        recog_dataset = RecognitionDataset(batch_size=batch_size)
        recog_network = ObservationActionMLP(
            name='recog',
            hidden_layer_dims=[32, 32],
            output_dim=2)
        recog = RecognitionModel(
            obs_dim=3,
            act_dim=2,
            dataset=recog_dataset,
            network=recog_network,
            variable_type='categorical',
            latent_dim=2)

        # policy
        env.spec.num_envs = 10
        latent_sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(),
            name='latent_sampler',
            dim=2)
        policy = CategoricalLatentVarMLPPolicy(
            policy_name="policy",
            latent_sampler=latent_sampler,
            env_spec=env.spec)

        # gail
        reward_handler = RewardHandler(
            use_env_rewards=False,
            critic_final_scale=1.)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = GAIL(
            critic=critic,
            recognition=recog,
            reward_handler=reward_handler,
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=200,
            n_itr=15,
            discount=.99,
            step_size=.01,
            sampler_args=dict(n_envs=env.spec.num_envs))

        session.run(tf.global_variables_initializer())

        # run it!
        algo.train(sess=session)

        # evaluate
        l0_state_infos = dict(latent=[[1, 0]])
        l0_dist_2 = policy.dist_info([[0., 0., 1.]], l0_state_infos)['prob']
        l0_dist_0 = policy.dist_info([[1., 0., 0.]], l0_state_infos)['prob']

        l1_state_infos = dict(latent=[[0, 1]])
        l1_dist_2 = policy.dist_info([[0., 0., 1.]], l1_state_infos)['prob']
        l1_dist_0 = policy.dist_info([[1., 0., 0.]], l1_state_infos)['prob']

        np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
        np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)