def test_fixed_alpha():
    """Check that a fixed alpha is left untouched by training.

    MTSAC is built with ``fixed_alpha=np.exp(0.5)``. The per-task log-alpha
    must equal 0.5 both before and after one training epoch, and automatic
    entropy tuning must be switched off.
    """
    task_name = 'InvertedDoublePendulum-v2'
    task_envs = [
        GymEnv(name, max_episode_length=100)
        for name in (task_name, task_name)
    ]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config=snapshot_config)

    def make_qf():
        # Both critics share the same architecture.
        return ContinuousMLPQFunction(env_spec=env.spec,
                                      hidden_sizes=[32, 32],
                                      hidden_nonlinearity=F.relu)

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = make_qf()
    qf2 = make_qf()
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    num_tasks = 2
    buffer_batch_size = 128
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    algo_kwargs = dict(policy=policy,
                       qf1=qf1,
                       qf2=qf2,
                       sampler=sampler,
                       gradient_steps_per_itr=100,
                       eval_env=[test_envs],
                       env_spec=env.spec,
                       num_tasks=num_tasks,
                       steps_per_epoch=1,
                       replay_buffer=replay_buffer,
                       min_buffer_size=1e3,
                       target_update_tau=5e-3,
                       discount=0.99,
                       buffer_batch_size=buffer_batch_size,
                       fixed_alpha=np.exp(0.5))
    mtsac = MTSAC(**algo_kwargs)

    # Use the GPU whenever one is present, otherwise stay on the CPU.
    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()

    # log(exp(0.5)) == 0.5 for every task, before training ...
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    trainer.setup(mtsac, env)
    trainer.train(n_epochs=1, batch_size=128, plot=False)
    # ... and still after one epoch of gradient steps.
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    assert not mtsac._use_automatic_entropy_tuning
def test_to():
    """Test the torch function that moves modules to GPU.

    Test that the policy, both Q-functions and the log-alpha tensor end up
    on the device reported by ``global_device()`` after ``mtsac.to()``.
    GPU mode is enabled only when CUDA is available.
    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)

    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = global_device()

    # Each module is checked once; the original looped over _qf2 twice.
    for module in (mtsac._qf1, mtsac._qf2, mtsac.policy):
        for param in module.parameters():
            assert param.device == device
    assert mtsac._log_alpha.device == device
def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch):
    """Check that a mismatched num_tasks makes _get_log_alpha raise.

    MTSAC is constructed with ``num_tasks=4`` while the wrapped environment
    only contains two tasks and ``_log_alpha`` is patched to two entries.
    Calling ``_get_log_alpha`` on observations from that environment is
    expected to raise ``ValueError`` with the message checked below.

    Args:
        monkeypatch (pytest.MonkeyPatch): Fixture used to replace
            ``mtsac._log_alpha`` for the duration of the test.

    """
    import re

    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GymEnv(name, max_episode_length=150) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=None,
                  gradient_steps_per_itr=150,
                  eval_env=[env],
                  env_spec=env.spec,
                  num_tasks=4,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
    error_string = ('The number of tasks in the environment does '
                    'not match self._num_tasks. Are you sure that you passed '
                    'The correct number of tasks?')
    obs = torch.Tensor([env.reset()[0]] * buffer_batch_size)
    # `match` is applied with re.search, so the '.' and '?' in the message
    # would otherwise be interpreted as regex metacharacters.
    with pytest.raises(ValueError, match=re.escape(error_string)):
        mtsac._get_log_alpha(dict(observation=obs))
def test_mtsac_get_log_alpha(monkeypatch):
    """Verify that the private helper _get_log_alpha selects per-task alphas.

    ``_log_alpha`` is patched to ``[1., 2.]``. For the i-th environment
    reset, the returned tensor must hold ``i + 1`` for every sample of the
    batch and have one entry per sample.

    Args:
        monkeypatch (pytest.MonkeyPatch): Fixture used to replace
            ``mtsac._log_alpha`` for the duration of the test.

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=env_name) for env_name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)

    def make_qf():
        # Both critics share the same tiny architecture.
        return ContinuousMLPQFunction(env_spec=env.spec,
                                      hidden_sizes=[1, 1],
                                      hidden_nonlinearity=F.relu)

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = make_qf()
    qf2 = make_qf()
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))

    for task_idx in range(len(env_names)):
        # Each reset advances the round-robin wrapper to the next task.
        first_obs = env.reset()
        obs = torch.Tensor([first_obs] * buffer_batch_size)
        log_alpha = mtsac._get_log_alpha({'observation': obs})
        expected = torch.Tensor([task_idx + 1, task_idx + 1])
        assert (log_alpha == expected).all().item()
        assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])
def test_mtsac_inverted_double_pendulum():
    """Regression test: MTSAC on two InvertedDoublePendulum environments.

    Trains for 8 epochs and requires the value returned by
    ``trainer.train`` to be positive.
    """
    task_name = 'InvertedDoublePendulum-v2'
    task_envs = [
        GymEnv(name, max_episode_length=100)
        for name in (task_name, task_name)
    ]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    trainer = Trainer(snapshot_config=snapshot_config)

    def make_qf():
        # Both critics share the same architecture.
        return ContinuousMLPQFunction(env_spec=env.spec,
                                      hidden_sizes=[32, 32],
                                      hidden_nonlinearity=F.relu)

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = make_qf()
    qf2 = make_qf()
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    num_tasks = 2
    buffer_batch_size = 128
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    algo_kwargs = dict(policy=policy,
                       qf1=qf1,
                       qf2=qf2,
                       sampler=sampler,
                       gradient_steps_per_itr=100,
                       eval_env=[test_envs],
                       env_spec=env.spec,
                       num_tasks=num_tasks,
                       steps_per_epoch=5,
                       replay_buffer=replay_buffer,
                       min_buffer_size=1e3,
                       target_update_tau=5e-3,
                       discount=0.99,
                       buffer_batch_size=buffer_batch_size)
    mtsac = MTSAC(**algo_kwargs)

    trainer.setup(mtsac, env)
    final_return = trainer.train(n_epochs=8, batch_size=128, plot=False)
    assert final_return > 0
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
            When None, GPU mode is not enabled.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10
            and at most 500.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    meta_batch_size = 10

    sampler = LocalSampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=env.spec.max_episode_length,
        # 1 sampler worker for each environment
        n_workers=meta_batch_size,
        worker_class=FragmentWorker,
        # increasing n_envs increases the vectorization of a sampler worker
        # which improves runtime performance, but you will need to adjust this
        # depending on your memory constraints. For reference, each worker by
        # default uses n_envs=8. Each environment is approximately ~50mb large
        # so creating 50 envs with 8 copies comes out to 20gb of memory. Many
        # users want to be able to run multiple seeds on 1 machine, so I have
        # reduced this to n_envs = 2 for 2 copies in the meantime.
        worker_args=dict(n_envs=2))

    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    # Clamp to at least one cycle: with fewer than num_evaluation_points
    # batches the integer division yields 0, which would make the next
    # line raise ZeroDivisionError.
    epoch_cycles = max(1, epochs // num_evaluation_points)
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=mt10_train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC with the ML1 pick-place-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
            When None, GPU mode is not enabled.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    train_envs = []
    test_envs = []
    env_names = []
    for i in range(50):
        # normalize_reward belongs to normalize(), not to the GymEnv
        # constructor; this matches how the other launchers wrap their envs.
        train_env = normalize(GymEnv(
            mwb.ML1.get_train_tasks('pick-place-v1')),
                              normalize_reward=True)
        # Round-trip through pickle to obtain an independent copy of the
        # locally built training environment for evaluation.
        test_env = pickle.loads(pickle.dumps(train_env))
        env_names.append('pick_place_{}'.format(i))
        train_envs.append(train_env)
        test_envs.append(test_env)
    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    policy = TanhGaussianMLPPolicy(
        env_spec=ml1_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=ml1_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with MT50 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of GPU in training.
        _gpu (int): The ID of the gpu (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    for task_name in task_names:
        # Only the training environments normalize rewards.
        train_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    timesteps = 100000000
    batch_size = int(150 * mt50_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  # Must agree with the number of wrapped tasks; the
                  # previous hard-coded 10 did not match the MT50 wrapper.
                  num_tasks=mt50_train_envs.num_tasks,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt1_pick_place(ctxt=None, *, seed, timesteps, _gpu):
    """Train MTSAC with the MT1 pick-place-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        timesteps (int): Number of timesteps to run.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
            When None, GPU mode is not enabled.

    """
    deterministic.set_seed(seed)
    mt1 = metaworld.MT1('pick-place-v1')
    mt1_test = metaworld.MT1('pick-place-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    test_task_sampler = MetaWorldTaskSampler(mt1_test, 'train',
                                             lambda env, _: normalize(env))
    n_tasks = 50
    train_envs = train_task_sampler.sample(n_tasks)
    env = train_envs[0]()
    test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    trainer = Trainer(ctxt)

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    sampler = LocalSampler(agents=policy,
                           envs=train_envs,
                           max_episode_length=env.spec.max_episode_length,
                           n_workers=n_tasks,
                           worker_class=FragmentWorker)

    batch_size = int(env.spec.max_episode_length * n_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    # Clamp to at least one cycle: with fewer than num_evaluation_points
    # batches the integer division yields 0, which would make the next
    # line raise ZeroDivisionError.
    epoch_cycles = max(1, epochs // num_evaluation_points)
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=150,
                  eval_env=test_envs,
                  env_spec=env.spec,
                  num_tasks=1,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Set up MTSAC on MT10 and print timings for its main stages.

    This variant does not call ``trainer.train``. After building the
    algorithm it prints, in order: the wall-clock time of ``mtsac.to()``,
    the time of one ``trainer.obtain_samples`` call, and the mean time of
    ten ``train_once`` calls made after the collected paths were added to
    the replay buffer.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
            When None, GPU mode is not enabled.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10
            and at most 500.
        timesteps (int): Number of timesteps used to derive the epoch
            bookkeeping passed to MTSAC.

    """
    import time

    from garage import StepType

    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    meta_batch_size = 10

    sampler = SingleVecWorkSampler(
        agents=policy,
        envs=mt10_train_envs,
        # 1 sampler worker for each environment
        n_workers=meta_batch_size,
        max_episode_length=env.spec.max_episode_length,
        worker_class=FragmentWorker,
        # increasing n_envs increases the vectorization of a sampler worker
        # which improves runtime performance, but you will need to adjust this
        # depending on your memory constraints. For reference, each worker by
        # default uses n_envs=8. Each environment is approximately ~50mb large
        # so creating 50 envs with 8 copies comes out to 20gb of memory.
        # This benchmark variant uses n_envs=10.
        worker_args=dict(n_envs=10))

    # one episode for each task between gradient steps
    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    # Clamp to at least one cycle: with fewer than num_evaluation_points
    # batches the integer division yields 0, which would make the next
    # line raise ZeroDivisionError.
    epoch_cycles = max(1, epochs // num_evaluation_points)
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    trainer.setup(algo=mtsac, env=mt10_train_envs)

    # Time moving the algorithm to the configured device.
    s = time.time()
    mtsac.to()
    print(time.time() - s)

    # Time a single sampling call.
    s = time.time()
    trainer.step_episode = trainer.obtain_samples(0, 2000, None, None)
    print((time.time() - s))

    # Feed the collected paths to the replay buffer so train_once has data.
    for path in trainer.step_episode:
        mtsac.replay_buffer.add_path(
            dict(observation=path['observations'],
                 action=path['actions'],
                 reward=path['rewards'].reshape(-1, 1),
                 next_observation=path['next_observations'],
                 terminal=np.array([
                     step_type == StepType.TERMINAL
                     for step_type in path['step_types']
                 ]).reshape(-1, 1)))

    # Report the mean duration of ten training iterations.
    s = time.time()
    for _ in range(10):
        trainer._algo.train_once()
    print((time.time() - s) / 10)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
            When None, GPU mode is not enabled.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10
            and at most 500.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()
    mt10_test = metaworld.MT10()

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             wrap,
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    meta_batch_size = 10

    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    # Clamp to at least one cycle: with fewer than num_evaluation_points
    # batches the integer division yields 0, which would make the next
    # line raise ZeroDivisionError.
    epoch_cycles = max(1, epochs // num_evaluation_points)
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac,
                  env=mt10_train_envs,
                  sampler_cls=LocalSampler,
                  n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs, batch_size=batch_size)