def test_fixed_alpha():
    """Check that a fixed alpha stays constant through training.

    MTSAC is constructed with ``fixed_alpha=np.exp(0.5)``. The test asserts
    that ``_log_alpha`` equals 0.5 for every task both before and after one
    training epoch, and that automatic entropy tuning is disabled.

    """
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [MetaRLEnv(env_name=env_name) for env_name in env_names]
    # The train and eval wrappers share the same underlying env objects.
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    # Two identically configured Q-functions, created in order (qf1, qf2).
    qf1, qf2 = (ContinuousMLPQFunction(env_spec=env.spec,
                                       hidden_sizes=[32, 32],
                                       hidden_nonlinearity=F.relu)
                for _ in range(2))
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=100,
                  max_path_length=100,
                  eval_env=test_envs,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=1,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size,
                  fixed_alpha=np.exp(0.5))
    set_gpu_mode(bool(torch.cuda.is_available()))
    mtsac.to()

    # log(exp(0.5)) == 0.5, expected once per task.
    expected_log_alpha = torch.Tensor([0.5] * num_tasks)
    assert torch.allclose(expected_log_alpha, mtsac._log_alpha.to('cpu'))

    runner.setup(mtsac, env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1, batch_size=128, plot=False)

    # Training must not have moved the fixed alpha.
    assert torch.allclose(expected_log_alpha, mtsac._log_alpha.to('cpu'))
    assert not mtsac._use_automatic_entropy_tuning
def test_mtsac_get_log_alpha(monkeypatch):
    """Check that the private function _get_log_alpha functions correctly.

    ``_log_alpha`` is replaced with ``[1., 2.]`` (one entry per task). For
    the i-th reset of the round-robin wrapper, the test expects every entry
    returned by ``_get_log_alpha`` to equal ``i + 1`` and the result to hold
    ``buffer_batch_size`` entries.

    Args:
        monkeypatch (object): Fixture whose ``setattr`` is used to replace
            ``_log_alpha`` (presumably pytest's ``monkeypatch``).

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [MetaRLEnv(env_name=env_name) for env_name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    # Two identically configured Q-functions, created in order (qf1, qf2).
    qf1, qf2 = (ContinuousMLPQFunction(env_spec=env.spec,
                                       hidden_sizes=[1, 1],
                                       hidden_nonlinearity=F.relu)
                for _ in range(2))
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))

    for task_idx in range(len(env_names)):
        # One reset per iteration; the same observation fills the batch.
        first_obs = env.reset()
        obs = torch.Tensor([first_obs] * buffer_batch_size)
        log_alpha = mtsac._get_log_alpha(dict(observation=obs))
        expected = torch.Tensor([task_idx + 1, task_idx + 1])
        assert (log_alpha == expected).all().item()
        assert log_alpha.size() == torch.Size([mtsac.buffer_batch_size])
def test_to():
    """Test the torch function that moves modules to GPU.

    Test that the policy and qfunctions are moved to gpu if gpu is
    available. The expected device is whatever ``global_device()`` reports
    after ``set_gpu_mode(torch.cuda.is_available())``.

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)

    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = global_device()

    # Each module is checked exactly once, in the order qf1, qf2, policy.
    for module in (mtsac._qf1, mtsac._qf2, mtsac._policy):
        for param in module.parameters():
            assert param.device == device
    assert mtsac._log_alpha.device == device
def test_mtsac_inverted_double_pendulum():
    """Performance regression test of MTSAC on 2 InvDoublePendulum envs.

    Trains for 8 epochs with seed 0 and asserts that the value returned by
    ``runner.train`` exceeds 130.

    """
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [MetaRLEnv(env_name=env_name) for env_name in env_names]
    # The train and eval wrappers share the same underlying env objects.
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    # Two identically configured Q-functions, created in order (qf1, qf2).
    qf1, qf2 = (ContinuousMLPQFunction(env_spec=env.spec,
                                       hidden_sizes=[32, 32],
                                       hidden_nonlinearity=F.relu)
                for _ in range(2))
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=100,
                  max_path_length=100,
                  eval_env=test_envs,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    runner.setup(mtsac, env, sampler_cls=LocalSampler)
    final_return = runner.train(n_epochs=8, batch_size=128, plot=False)
    assert final_return > 130
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_worker):
    """Run PPO on the MT10 train tasks wrapped as one multi-task env.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_worker (int): The number of workers the sampler should use.

    """
    set_seed(seed)
    task_names = mwb.MT10.get_train_tasks().all_task_names
    # One normalized environment per task name.
    task_envs = [
        normalize(MetaRLEnv(mwb.MT10.from_task(task_name)))
        for task_name in task_names
    ]
    env = MultiEnvWrapper(task_envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    algo = PPO(
        env_spec=env.spec,
        policy=policy,
        value_function=value_function,
        max_path_length=128,
        discount=0.99,
        gae_lambda=0.95,
        center_adv=True,
        lr_clip_range=0.2,
    )

    runner = LocalRunner(ctxt)
    runner.setup(algo, env, n_workers=n_worker)
    runner.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with MT50 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of GPU in training.
        _gpu (int): The ID of the gpu (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    for task_name in task_names:
        # Only the training copy normalizes rewards; the evaluation copy is
        # built with normalize()'s default arguments.
        train_env = normalize(MetaRLEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(MetaRLEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    # Take the task count from the wrapper so the algorithm always agrees
    # with the environments actually being trained on, instead of relying
    # on a hard-coded number.
    num_tasks = mt50_train_envs.num_tasks

    timesteps = 100000000
    batch_size = int(150 * num_tasks)
    num_evaluation_points = 500
    # Regroup the raw epoch count so that roughly `num_evaluation_points`
    # epochs remain, each made of `epoch_cycles` sampling steps.
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
def setup_method(self):
    """Build the multi-task PointEnv fixture and its model components.

    Creates four ``PointEnv`` tasks whose goals lie on a circle of radius
    3, wraps them in a round-robin ``MultiEnvWrapper``, and stores on
    ``self`` the environment, an inference encoder, a task-embedding
    policy, a baseline, and several training hyperparameters.

    """
    super().setup_method()

    def points_on_circle(radius, count):
        """Generate points on a circle.

        Args:
            radius (float): Radius of the circle.
            count (int): Number of points to generate.

        Yields:
            tuple(float, float): Coordinate of a point.

        """
        step = 2 * np.pi / count
        for angle in np.arange(0, 2 * np.pi, step):
            yield radius * np.sin(angle), radius * np.cos(angle)

    n_goals = 4
    # Task names are the 1-based goal indices rendered as strings.
    tasks = {}
    for idx, goal in enumerate(points_on_circle(3.0, n_goals), start=1):
        tasks[str(idx)] = {
            'args': [],
            'kwargs': {
                'goal': goal,
                'never_done': False,
                'done_bonus': 0.0,
            },
        }

    latent_length = 1
    inference_window = 2
    self.batch_size = 100 * len(tasks)
    self.policy_ent_coeff = 2e-2
    self.encoder_ent_coeff = 2.2e-3
    self.inference_ce_coeff = 5e-2
    self.max_path_length = 100
    embedding_init_std = 1.0
    embedding_max_std = 2.0
    embedding_min_std = 0.38
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    # Environments are created in sorted task-name order.
    task_envs = []
    for task_name in sorted(tasks):
        task_cfg = tasks[task_name]
        task_envs.append(
            MetaRLEnv(PointEnv(*task_cfg['args'], **task_cfg['kwargs'])))
    env = MultiEnvWrapper(task_envs, round_robin_strategy, mode='vanilla')
    self.env = env

    # Latent space: box between all-zeros and all-ones bounds.
    latent_space = akro.Box(np.zeros(latent_length),
                            np.ones(latent_length))

    # Trajectory space: flattened observation bounds repeated once per
    # step of the inference window.
    obs_space = env.observation_space
    obs_lb, obs_ub = obs_space.bounds
    obs_lb_flat = obs_space.flatten(obs_lb)
    obs_ub_flat = obs_space.flatten(obs_ub)
    traj_space = akro.Box(np.stack([obs_lb_flat] * inference_window),
                          np.stack([obs_ub_flat] * inference_window))

    task_embed_spec = InOutSpec(env.task_space, latent_space)
    traj_embed_spec = InOutSpec(traj_space, latent_space)

    self.inference = GaussianMLPEncoder(
        name='inference',
        embedding_spec=traj_embed_spec,
        hidden_sizes=[20, 10],
        std_share_network=True,
        init_std=2.0,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    task_encoder = GaussianMLPEncoder(
        name='embedding',
        embedding_spec=task_embed_spec,
        hidden_sizes=[20, 20],
        std_share_network=True,
        init_std=embedding_init_std,
        max_std=embedding_max_std,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    self.policy = GaussianMLPTaskEmbeddingPolicy(
        name='policy',
        env_spec=env.spec,
        encoder=task_encoder,
        hidden_sizes=[32, 16],
        std_share_network=True,
        max_std=policy_max_std,
        init_std=policy_init_std,
        min_std=policy_min_std,
    )

    self.baseline = LinearMultiFeatureBaseline(
        env_spec=env.spec, features=['observations', 'tasks', 'latents'])
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC on 50 copies of the ML1 pick-place-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu
            machines). GPU mode is only enabled when this is not None.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)

    n_envs = 50
    env_names = ['pick_place_{}'.format(idx) for idx in range(n_envs)]
    train_envs = []
    test_envs = []
    for _ in env_names:
        train_env = MetaRLEnv(
            normalize(mwb.ML1.get_train_tasks('pick-place-v1'),
                      normalize_reward=True))
        train_envs.append(train_env)
        # The evaluation env is a pickle round-trip copy of the train env.
        test_envs.append(pickle.loads(pickle.dumps(train_env)))

    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    env_spec = ml1_train_envs.spec

    policy = TanhGaussianMLPPolicy(
        env_spec=env_spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    # Two identically configured Q-functions, created in order (qf1, qf2).
    qf1, qf2 = (ContinuousMLPQFunction(env_spec=env_spec,
                                       hidden_sizes=[400, 400, 400],
                                       hidden_nonlinearity=F.relu)
                for _ in range(2))
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    # Regroup the raw epoch count into epochs of `epoch_cycles` steps each.
    total_epochs = timesteps // batch_size
    epoch_cycles = total_epochs // num_evaluation_points
    epochs = total_epochs // epoch_cycles

    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=env_spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)