def test_obtain_samples():
    """Test obtain_samples method."""
    env = TfEnv(DummyBoxEnv())
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    max_path_length = 10
    max_samples = 20
    max_trajs = 20
    sampler = PEARLSampler(env, policy, max_path_length)
    sampler.start_worker()
    paths, steps = sampler.obtain_samples(max_samples=max_samples,
                                          max_trajs=max_trajs,
                                          accum_context=False)

    total_steps = 0
    obs_dim = len(paths[0]['observations'][0])
    act_dim = len(paths[0]['actions'][0])
    for path in paths:
        path_length = len(path['observations'])
        total_steps += path_length
        assert (obs_dim, ) == env.observation_space.shape
        assert (act_dim, ) == env.action_space.shape
        assert path_length == max_path_length

    assert total_steps == max_samples
    assert steps == max_samples

    sampler.shutdown_worker()

def test_get_actions(self, batch_size, hidden_sizes):
    env_spec = TfEnv(DummyBoxEnv())
    obs_dim = env_spec.observation_space.flat_dim
    act_dim = env_spec.action_space.flat_dim
    obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    hidden_nonlinearity=None,
                                    hidden_sizes=hidden_sizes,
                                    hidden_w_init=nn.init.ones_,
                                    output_w_init=nn.init.ones_)

    # With all-ones observations, all-ones weights, and no nonlinearity,
    # every layer just sums its inputs, so each action element equals
    # obs_dim * prod(hidden_sizes).
    expected_output = np.full([batch_size, act_dim],
                              fill_value=obs_dim * np.prod(hidden_sizes),
                              dtype=np.float32)
    assert np.array_equal(policy.get_actions(obs)[0], expected_output)

def test_is_pickleable(self, batch_size, hidden_sizes):
    env_spec = TfEnv(DummyBoxEnv())
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    hidden_nonlinearity=None,
                                    hidden_sizes=hidden_sizes,
                                    hidden_w_init=nn.init.ones_,
                                    output_w_init=nn.init.ones_)

    output1 = policy.get_actions(obs)[0]

    p = pickle.dumps(policy)
    policy_pickled = pickle.loads(p)
    output2 = policy_pickled.get_actions(obs)[0]
    assert np.array_equal(output1, output2)

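# The two tests above take `batch_size` and `hidden_sizes` as arguments, so
# they are presumably driven by pytest parametrization at the class or module
# level. A minimal sketch of the assumed decorator (the parameter grid below
# is illustrative, not taken from the source):
#
# @pytest.mark.parametrize('batch_size, hidden_sizes', [
#     (1, (1, )),
#     (5, (3, )),
#     (8, (4, 4)),
# ])
# def test_get_actions(self, batch_size, hidden_sizes):
#     ...
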
def test_all():
    """Test all methods."""
    env = TfEnv(DummyBoxEnv())
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    max_path_length = 10
    max_samples = 50
    max_trajs = 50
    sampler = PEARLSampler(env, policy, max_path_length)
    paths, _ = sampler.obtain_samples(max_samples=max_samples,
                                      max_trajs=max_trajs,
                                      accum_context=False)

    replay_buffer = MetaReplayBuffer(100, env.observation_space.low.size,
                                     env.action_space.low.size)

    i = 0
    for path in paths:
        replay_buffer.add_path(path)
        i += max_path_length
        assert replay_buffer.size() == i

    replay_buffer.clear()
    assert replay_buffer.size() == 0

    for path in paths:
        replay_buffer.add_path(path)

    batch_size = 3
    indices = np.random.randint(0, replay_buffer.size(), batch_size)
    out = replay_buffer.sample_data(indices)
    assert len(out['observations']) == batch_size
    assert len(out['actions']) == batch_size
    assert len(out['rewards']) == batch_size
    assert len(out['terminals']) == batch_size
    assert len(out['next_observations']) == batch_size

    batch_size = 10
    out = replay_buffer.random_batch(batch_size)
    assert len(out['observations']) == batch_size
    assert len(out['actions']) == batch_size
    assert len(out['rewards']) == batch_size
    assert len(out['terminals']) == batch_size
    assert len(out['next_observations']) == batch_size

    out = replay_buffer.random_sequence(batch_size)
    assert len(out['observations']) == batch_size
    assert len(out['actions']) == batch_size
    assert len(out['rewards']) == batch_size
    assert len(out['terminals']) == batch_size
    assert len(out['next_observations']) == batch_size

def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    runner = LocalRunner(snapshot_config)
    env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': 1e-4, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)

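# `run_task` follows the older launcher style: it is not called directly but
# handed to an experiment wrapper that supplies `snapshot_config`. A hedged
# sketch of the assumed invocation (assuming metarl mirrors garage's
# `run_experiment` helper; the import path and arguments are assumptions):
#
# from metarl.experiment import run_experiment
#
# run_experiment(run_task, snapshot_mode='last', seed=1)
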
def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.

    """
    set_seed(seed)
    runner = LocalRunner(ctxt)
    env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)

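# `ddpg_pendulum` follows the newer launcher style, where `ctxt` is supplied
# by an experiment wrapper. A hedged sketch of the assumed usage (the
# `wrap_experiment` decorator and its import path mirror garage and are an
# assumption here, not confirmed by the source):
#
# from metarl import wrap_experiment
#
# @wrap_experiment(snapshot_mode='last')
# def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
#     ...
#
# ddpg_pendulum(seed=1)
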
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment.

    This environment has a [-3, 3] action_space bound.
    """
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = MetaRLEnv(normalize(gym.make('InvertedPendulum-v2')))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 10

    env.close()

def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = MetaRLEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 45

    env.close()