def test_clone(self):
    """A cloned encoder keeps the original's input/output dimensions."""
    env = MetaRLEnv(DummyBoxEnv(obs_dim=(2, ), action_dim=(2, )))
    spec = InOutSpec(input_space=env.spec.observation_space,
                     output_space=env.spec.action_space)
    original = GaussianMLPEncoder(spec)
    cloned = original.clone(name='cloned')
    assert cloned.input_dim == original.input_dim
    assert cloned.output_dim == original.output_dim
def test_auxiliary(self):
    """Auxiliary spaces and distributions exposed by the policy are sized
    consistently with the environment, task count and latent dimension."""
    obs_dim, action_dim = (2, ), (2, )
    task_num, latent_dim = 2, 2
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, None, 2))
    task_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, None, 2))
    policy.build(obs_ph, task_ph)

    # Action distribution's last axis matches the flat action dimension.
    action_loc = policy.distribution.loc
    assert action_loc.get_shape().as_list()[-1] == env.action_space.flat_dim

    assert policy.encoder == encoder
    assert policy.latent_space.flat_dim == latent_dim
    assert policy.task_space.flat_dim == task_num

    # Augmented observations are plain observations with the one-hot task
    # appended.
    aug_dim = env.observation_space.flat_dim + task_num
    assert policy.augmented_observation_space.flat_dim == aug_dim

    latent_loc = policy.encoder_distribution.loc
    assert latent_loc.get_shape().as_list()[-1] == latent_dim
def test_get_action(self, obs_dim, task_num, latent_dim, action_dim):
    """Actions sampled through every entry point lie in the action space."""
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    env.reset()
    obs, _, _, _ = env.step(1)
    latent = np.random.random((latent_dim, ))
    task = np.zeros(task_num)
    task[0] = 1
    # get_action expects the observation concatenated with the one-hot task.
    aug_obs = np.concatenate([obs.flatten(), task])

    # Single-sample entry points.
    singles = (policy.get_action_given_latent(obs, latent)[0],
               policy.get_action_given_task(obs, task)[0],
               policy.get_action(aug_obs)[0])
    for action in singles:
        assert env.action_space.contains(action)

    # Batched entry points.
    obses, latents, tasks = [obs] * 3, [latent] * 3, [task] * 3
    aug_obses = [aug_obs] * 3
    batches = (policy.get_actions_given_latents(obses, latents)[0],
               policy.get_actions_given_tasks(obses, tasks)[0],
               policy.get_actions(aug_obses)[0])
    for action in chain(*batches):
        assert env.action_space.contains(action)
def test_get_vars(self):
    """Trainable and global variables agree, and the parameters actually
    drive the sampled actions.

    Bug fix: the original asserts compared the whole ``(action, info)``
    tuple returned by ``get_action_given_latent`` against ``0``.  A tuple
    compared to an int yields a single scalar bool (always True for
    ``!=``), so both assertions passed vacuously.  The action array is
    now unpacked before comparing element-wise.
    """
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec, hidden_sizes=[32, 32, 32])
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder,
                                            hidden_sizes=[32, 32, 32])

    vars1 = sorted(policy.get_trainable_vars(), key=lambda v: v.name)
    vars2 = sorted(policy.get_global_vars(), key=lambda v: v.name)
    assert vars1 == vars2
    # Two networks (policy and encoder).
    # Each with 4 layers * (1 weight + 1 bias) + 1 log_std.
    assert len(vars1) == 2 * (4 * 2 + 1)

    obs = np.random.random(obs_dim)
    latent = np.random.random((latent_dim, ))

    # All-ones parameters must produce a nonzero action.
    for var in vars1:
        var.assign(np.ones(var.shape))
    action, _ = policy.get_action_given_latent(obs, latent)
    assert np.any(action != 0)

    # All-zero parameters give a zero mean but std = exp(0) = 1, so the
    # sampled action is still (almost surely) nonzero.
    for var in vars1:
        var.assign(np.zeros(var.shape))
    action, _ = policy.get_action_given_latent(obs, latent)
    assert not np.all(action == 0)
def test_is_pickleable(self, obs_dim, embedding_dim):
    """An encoder round-trips through pickle with identical outputs.

    A hidden-layer bias is set to all ones first so the comparison is not
    trivially against freshly-initialized weights.

    Cleanup: the original rebound ``obs_dim`` to the flat observation
    dimension but never used it afterwards; the dead assignment (which
    also shadowed the parameter) is removed.

    Args:
        obs_dim (tuple): Observation space shape.
        embedding_dim (tuple): Embedding (output) space shape.
    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=embedding_dim))
    embedding_spec = InOutSpec(input_space=env.spec.observation_space,
                               output_space=env.spec.action_space)
    embedding = GaussianMLPEncoder(embedding_spec)
    task_input = tf.compat.v1.placeholder(tf.float32,
                                          shape=(None, None,
                                                 embedding.input_dim))
    embedding.build(task_input, name='default')

    env.reset()
    obs, _, _, _ = env.step(1)

    with tf.compat.v1.variable_scope('GaussianMLPEncoder/GaussianMLPModel',
                                     reuse=True):
        bias = tf.compat.v1.get_variable(
            'dist_params/mean_network/hidden_0/bias')
        # assign it to all one
        bias.load(tf.ones_like(bias).eval())

    output1 = self.sess.run(
        [embedding.distribution.loc, embedding.distribution.stddev()],
        feed_dict={embedding.model.input: [[obs.flatten()]]})

    p = pickle.dumps(embedding)
    # Unpickle into a fresh graph/session and rebuild, then compare.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        embedding_pickled = pickle.loads(p)
        task_input = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, embedding_pickled.input_dim))
        embedding_pickled.build(task_input, name='default')

        output2 = sess.run(
            [
                embedding_pickled.distribution.loc,
                embedding_pickled.distribution.stddev()
            ],
            feed_dict={embedding_pickled.model.input: [[obs.flatten()]]})
    assert np.array_equal(output1, output2)
def test_pickling(self):
    """Unpickling inside a new variable scope rebuilds the policy's
    sampling functions."""
    obs_dim, action_dim = (2, ), (2, )
    task_num, latent_dim = 5, 2
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    serialized = pickle.dumps(policy)
    with tf.compat.v1.variable_scope('resumed'):
        restored = pickle.loads(serialized)
        for attr in ('_f_dist_obs_latent', '_f_dist_obs_task'):
            assert hasattr(restored, attr)
def test_get_latent(self):
    """get_latent returns a latent sample and distribution info, all of
    shape (latent_dim,)."""
    obs_dim, action_dim = (2, ), (2, )
    task_num, latent_dim = 5, 2
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    # One-hot encoding of task id 3.
    one_hot = np.zeros(task_num)
    one_hot[3] = 1

    latent, latent_info = policy.get_latent(one_hot)
    expected_shape = (latent_dim, )
    assert latent.shape == expected_shape
    assert latent_info['mean'].shape == expected_shape
    assert latent_info['log_std'].shape == expected_shape
def test_get_embedding(self, obs_dim, embedding_dim):
    """Latents from get_latent/get_latents fall inside the output space
    (the env action space, which the encoder was specced against)."""
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=embedding_dim))
    spec = InOutSpec(input_space=env.spec.observation_space,
                     output_space=env.spec.action_space)
    encoder = GaussianMLPEncoder(spec)
    inp = tf.compat.v1.placeholder(tf.float32,
                                   shape=(None, None, encoder.input_dim))
    encoder.build(inp)

    env.reset()
    obs, _, _, _ = env.step(1)

    single, _ = encoder.get_latent(obs)
    batch, _ = encoder.get_latents([obs] * 5)
    assert env.action_space.contains(single)
    for sample in batch:
        assert env.action_space.contains(sample)
def setup_method(self):
    """Build a TE-PPO fixture: a multi-task point environment, a task
    encoder, an inference (trajectory) encoder, a task-embedding policy
    and a baseline, stored on ``self`` for the tests to use."""
    super().setup_method()

    def circle(r, n):
        """Generate n points on a circle of radius r.

        Args:
            r (float): Radius of the circle.
            n (int): Number of points to generate.

        Yields:
            tuple(float, float): Coordinate of a point.
        """
        for t in np.arange(0, 2 * np.pi, 2 * np.pi / n):
            yield r * np.sin(t), r * np.cos(t)

    # One task per goal; goals are N points evenly spaced on a circle of
    # radius 3.  Task keys are the strings '1'..'N'.
    N = 4
    goals = circle(3.0, N)
    tasks = {
        str(i + 1): {
            'args': [],
            'kwargs': {
                'goal': g,
                'never_done': False,
                'done_bonus': 0.0,
            }
        }
        for i, g in enumerate(goals)
    }

    # Hyperparameters; the self.* ones are read by the test methods.
    latent_length = 1
    inference_window = 2
    self.batch_size = 100 * len(tasks)
    self.policy_ent_coeff = 2e-2
    self.encoder_ent_coeff = 2.2e-3
    self.inference_ce_coeff = 5e-2
    self.max_path_length = 100
    embedding_init_std = 1.0
    embedding_max_std = 2.0
    embedding_min_std = 0.38
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    # Instantiate one PointEnv per task, in sorted task-name order, and
    # wrap them all in a single round-robin multi-env.
    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]
    task_envs = [
        MetaRLEnv(PointEnv(*t_args, **t_kwargs))
        for t_args, t_kwargs in zip(task_args, task_kwargs)
    ]
    self.env = env = MultiEnvWrapper(task_envs,
                                     round_robin_strategy,
                                     mode='vanilla')

    # Latent space is a unit box of dimension latent_length.
    latent_lb = np.zeros(latent_length, )
    latent_ub = np.ones(latent_length, )
    latent_space = akro.Box(latent_lb, latent_ub)

    # The inference encoder consumes a window of `inference_window`
    # stacked (flattened) observations, so its input space bounds are the
    # observation bounds stacked that many times.
    obs_lb, obs_ub = env.observation_space.bounds
    obs_lb_flat = env.observation_space.flatten(obs_lb)
    obs_ub_flat = env.observation_space.flatten(obs_ub)
    traj_lb = np.stack([obs_lb_flat] * inference_window)
    traj_ub = np.stack([obs_ub_flat] * inference_window)
    traj_space = akro.Box(traj_lb, traj_ub)

    # task one-hot -> latent, and observation window -> latent.
    task_embed_spec = InOutSpec(env.task_space, latent_space)
    traj_embed_spec = InOutSpec(traj_space, latent_space)

    self.inference = GaussianMLPEncoder(
        name='inference',
        embedding_spec=traj_embed_spec,
        hidden_sizes=[20, 10],
        std_share_network=True,
        init_std=2.0,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    task_encoder = GaussianMLPEncoder(
        name='embedding',
        embedding_spec=task_embed_spec,
        hidden_sizes=[20, 20],
        std_share_network=True,
        init_std=embedding_init_std,
        max_std=embedding_max_std,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    self.policy = GaussianMLPTaskEmbeddingPolicy(
        name='policy',
        env_spec=env.spec,
        encoder=task_encoder,
        hidden_sizes=[32, 16],
        std_share_network=True,
        max_std=policy_max_std,
        init_std=policy_init_std,
        min_std=policy_min_std,
    )

    self.baseline = LinearMultiFeatureBaseline(
        env_spec=env.spec,
        features=['observations', 'tasks', 'latents'])
def te_ppo_pointenv(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)

    # TASKS is a module-level mapping of task name -> PointEnv ctor
    # args/kwargs (defined elsewhere in this file).
    tasks = TASKS
    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * len(TASKS)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2.e-4
    inference_ce_coeff = 5e-2
    max_path_length = 100
    embedding_init_std = 0.01
    embedding_max_std = 0.02
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    # One PointEnv per task, in sorted task-name order.
    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        task_envs = [
            MetaRLEnv(PointEnv(*t_args, **t_kwargs))
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(task_envs,
                              round_robin_strategy,
                              mode='vanilla')

        # Encoder spec: task one-hot -> latent of size latent_length.
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=[20, 20],
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        # Inference spec: a window of `inference_window` observations ->
        # latent; used to infer the task from trajectories.
        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=[20, 10],
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=[32, 16],
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec,
            features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     max_path_length=max_path_length,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     entropy_method='max',
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_epochs=10,
                     ),
                     center_adv=True,
                     stop_entropy_gradient=True,
                     stop_ce_gradient=True)

        # TaskEmbeddingWorker records the extra (task, latent) sample data
        # the TE algorithm needs.
        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
def test_auxiliary(self):
    """Exercise the encoder's auxiliary API: parameter counts and shapes,
    tensor shapes, reset, and flat parameter get/set round-trips.

    Minor cleanup: ``sum(list(map(...)))`` materialized a throwaway list;
    ``sum`` consumes the map iterator directly.
    """
    input_space = akro.Box(np.array([-1, -1]), np.array([1, 1]))
    latent_space = akro.Box(np.array([-2, -2, -2]), np.array([2, 2, 2]))
    embedding_spec = InOutSpec(input_space=input_space,
                               output_space=latent_space)
    embedding = GaussianMLPEncoder(embedding_spec, hidden_sizes=[32, 32, 32])
    task_input = tf.compat.v1.placeholder(tf.float32,
                                          shape=(None, None,
                                                 embedding.input_dim))
    embedding.build(task_input, name='default')

    # 9 Layers: (3 hidden + 1 output) * (1 weight + 1 bias) + 1 log_std
    assert len(embedding.get_params()) == 9
    assert len(embedding.get_global_vars()) == 9

    assert embedding.distribution.loc.get_shape().as_list(
    )[-1] == latent_space.shape[0]
    assert embedding.input.shape.as_list() == [
        None, None, input_space.shape[0]
    ]
    assert (embedding.latent_mean.shape.as_list() == [
        None, None, latent_space.shape[0]
    ])
    assert (embedding.latent_std_param.shape.as_list() == [
        None, None, latent_space.shape[0]
    ])

    # To increase coverage in embeddings/base.py
    embedding.reset()
    assert embedding.input_dim == embedding_spec.input_space.flat_dim
    assert embedding.output_dim == embedding_spec.output_space.flat_dim

    var_shapes = [
        (2, 32), (32, ),  # input
        (32, 32), (32, ),  # hidden 0
        (32, 32), (32, ),  # hidden 1
        (32, 3), (3, ),  # hidden 2
        (3, )
    ]  # log_std
    assert sorted(embedding.get_param_shapes()) == sorted(var_shapes)

    # Setting a flat all-ones vector must round-trip through
    # get_param_values and flat_to_params.
    var_count = sum(map(np.prod, var_shapes))
    embedding.set_param_values(np.ones(var_count))
    assert (embedding.get_param_values() == np.ones(var_count)).all()

    assert (sorted(
        map(np.shape, embedding.flat_to_params(
            np.ones(var_count)))) == sorted(var_shapes))