def test_init_with_crashed_worker():
    max_episode_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    n_workers = 2
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)

    class CrashingPolicy:

        def reset(self, **kwargs):
            raise Exception('Intentional subprocess crash')

    bad_policy = CrashingPolicy()

    # This causes worker 2 to crash.
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, [policy, bad_policy], envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, None)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
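# FixedPolicy, used throughout these tests, replays a pre-computed action
# script. The sketch below is a simplified, single-agent illustration of that
# idea, not the real garage.sampler.FixedPolicy (which is vectorized); the
# class name and the bare-bones get_action/reset/get_param_values interface
# are assumptions chosen to mirror what the samplers call.
class FixedPolicySketch:

    def __init__(self, env_spec, scripted_actions):
        self._env_spec = env_spec
        self._scripted_actions = scripted_actions
        self._t = 0

    def reset(self, dones=None):
        # Restart the script at the start of every episode.
        self._t = 0

    def get_action(self, observation):
        # Ignore the observation and play back the next scripted action.
        action = self._scripted_actions[self._t]
        self._t += 1
        return action, {}

    def get_param_values(self):
        # The script itself doubles as the "parameters" shipped to workers.
        return self._scripted_actions

    def set_param_values(self, params):
        self._scripted_actions = params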
def test_obtain_exact_episodes():
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers,
                                                         policies,
                                                         envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker,
                                             agent_update=policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
    sampler.shutdown_worker()
    env.close()
def test_update_envs_env_update():
    max_episode_length = 16
    env = PointEnv(max_episode_length=max_episode_length)
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
    sampler.shutdown_worker()
    env.close()
def setup_method(self):
    self.env = PointEnv()
    self.base_len = len(self.env.reset())
    self.n_total_tasks = 5
    self.task_index = 1
    self.wrapped = TaskOnehotWrapper(self.env, self.task_index,
                                     self.n_total_tasks)
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalTFRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalTFRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
def test_pickle_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_path_length = 200
    env = GarageEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
        runner.setup(algo, env)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        meta_eval_pickle = cloudpickle.dumps(meta_eval)
        meta_eval2 = cloudpickle.loads(meta_eval_pickle)
        meta_eval2.evaluate(algo)
def test_pickle():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 4
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    sampler_pickled = pickle.dumps(sampler)
    sampler.shutdown_worker()
    sampler2 = pickle.loads(sampler_pickled)
    episodes = sampler2.obtain_samples(0,
                                       500,
                                       np.asarray(policy.get_param_values()),
                                       env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    sampler2.shutdown_worker()
    env.close()
class TestSingleWrappedEnv:

    def setup_method(self):
        self.env = PointEnv()
        obs, _ = self.env.reset()
        self.base_len = len(obs)
        self.n_total_tasks = 5
        self.task_index = 1
        self.wrapped = TaskOnehotWrapper(self.env, self.task_index,
                                         self.n_total_tasks)

    def test_produces_correct_onehots(self):
        obs, _ = self.wrapped.reset()
        assert len(obs) == self.base_len + self.n_total_tasks
        assert (obs[-self.n_total_tasks:] == np.array([0, 1, 0, 0, 0])).all()

    def test_spec_obs_space(self):
        obs, _ = self.wrapped.reset()
        assert self.wrapped.observation_space.contains(obs)
        assert self.wrapped.spec.observation_space.contains(obs)
        assert (self.wrapped.spec.observation_space ==
                self.wrapped.observation_space)

    def test_visualization(self):
        assert self.env.render_modes == self.wrapped.render_modes
        mode = self.env.render_modes[0]
        assert self.env.render(mode) == self.wrapped.render(mode)
def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_path_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = GarageEnv(PointEnv())
        algo = OptimalActionInference(env=env,
                                      max_path_length=max_path_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(
            rows[0]['MetaTest/__unnamed_task__/TerminationRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >= float(
            rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_path_length = 200
    env = GarageEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with LocalTFRunner(ctxt) as runner:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      max_path_length=max_path_length,
                                      n_test_tasks=10,
                                      n_exploration_traj=n_traj)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
            runner.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
        tf.compat.v1.reset_default_graph()
        with LocalTFRunner(ctxt) as runner:
            algo2 = cloudpickle.loads(algo_pickle)
            runner.setup(algo2, env)
            runner.train(10, 0)
def test_observation_dimension_with_max_obs_dim(self):
    env = PointEnv()
    wrapped_env = RL2Env(PointEnv(), max_obs_dim=10)
    assert (wrapped_env.spec.observation_space.shape[0] ==
            10 + env.action_space.shape[0] + 2)
    obs = wrapped_env.reset()
    assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
    obs, _, _, _ = wrapped_env.step(env.action_space.sample())
    assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
def test_visualization(self):
    inner_env = PointEnv(goal=(1., 2.))
    env = NormalizedEnv(inner_env)
    env.visualize()
    env.reset()
    assert inner_env.render_modes == env.render_modes
    mode = inner_env.render_modes[0]
    assert inner_env.render(mode) == env.render(mode)
def test_observation_dimension(self):
    env = PointEnv()
    wrapped_env = RL2Env(PointEnv())
    assert wrapped_env.spec.observation_space.shape[0] == (
        env.observation_space.shape[0] + env.action_space.shape[0] + 2)
    obs = env.reset()
    obs2 = wrapped_env.reset()
    assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
    obs, _, _, _ = env.step(env.action_space.sample())
    obs2, _, _, _ = wrapped_env.step(env.action_space.sample())
    assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
def test_visualization(self):
    env = PointEnv()
    wrapped_env = RL2Env(env)
    assert env.render_modes == wrapped_env.render_modes
    mode = env.render_modes[0]
    assert env.render(mode) == wrapped_env.render(mode)
    wrapped_env.reset()
    wrapped_env.visualize()
    wrapped_env.step(wrapped_env.action_space.sample())
    wrapped_env.close()
def test_wrapped_env_list_produces_correct_onehots():
    envs = [PointEnv(), PointEnv(), PointEnv(), PointEnv()]
    base_len = len(envs[0].reset())
    n_total_tasks = len(envs)
    wrapped = TaskOnehotWrapper.wrap_env_list(envs)
    assert len(wrapped) == n_total_tasks
    for i, env in enumerate(wrapped):
        obs = env.reset()
        assert len(obs) == base_len + n_total_tasks
        onehot = np.zeros(n_total_tasks)
        onehot[i] = 1.
        assert (obs[-n_total_tasks:] == onehot).all()
        next_obs, _, _, _ = env.step(env.action_space.sample())
        assert (next_obs[-n_total_tasks:] == onehot).all()
def test_obtain_exact_episodes(ray_local_session_fixture):
    del ray_local_session_fixture
    assert ray.is_initialized()
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = RaySampler.from_worker_factory(workers, policies, envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker, policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
def bc_point(ctxt=None, loss='log_prob'):
    """Run Behavioral Cloning on garage.envs.PointEnv.

    Args:
        ctxt (ExperimentContext): Provided by wrap_experiment.
        loss (str): Either 'log_prob' or 'mse'.

    """
    trainer = Trainer(ctxt)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal, max_episode_length=200)
    expert = OptimalPolicy(env.spec, goal=goal)
    policy = GaussianMLPPolicy(env.spec, [8, 8])
    batch_size = 1000
    sampler = RaySampler(agents=expert,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=expert,
              sampler=sampler,
              policy_lr=1e-2,
              loss=loss)
    trainer.setup(algo, env)
    trainer.train(100, batch_size=batch_size)
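# `OptimalPolicy` above is the hand-coded expert that BC clones; it is defined
# alongside this example in the garage source rather than in this file. Below
# is a minimal sketch of such an expert, assuming PointEnv observations begin
# with the agent's (x, y) position: it simply steps toward the goal. The class
# name and method signatures here are illustrative, not the canonical code.
class OptimalPolicySketch:

    def __init__(self, env_spec, goal):
        self._env_spec = env_spec
        self._goal = goal

    def get_action(self, observation):
        # Move directly along the vector from the current position to the
        # goal.
        return self._goal - observation[:2], {}

    def get_actions(self, observations):
        # Vectorized variant for batched sampling.
        return self._goal[None] - observations[:, :2], {}

    def reset(self, dones=None):
        # The expert is stateless, so there is nothing to reset.
        pass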
def test_update_envs_env_update():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    rollouts = sampler.obtain_samples(0,
                                      160,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for rollout in rollouts.split():
        mean_rewards.append(rollout.rewards.mean())
        goals.append(rollout.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = TfEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
def test_init_with_env_updates():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers,
                                               policy,
                                               envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
def test_does_not_modify_action(self):
    inner_env = PointEnv(goal=(1., 2.))
    env = NormalizedEnv(inner_env, scale_reward=10.)
    a = env.action_space.high + 1.
    # Copy the array; a plain `a_copy = a` would alias it and make the
    # assertion below pass trivially even if `step` modified the action.
    a_copy = a.copy()
    env.reset()
    env.step(a)
    assert np.array_equal(a, a_copy)
    env.close()
def test_init_with_env_updates():
    max_episode_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, policy, envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
def test_pickleable(self):
    inner_env = PointEnv(goal=(1., 2.))
    env = NormalizedEnv(inner_env, scale_reward=10.)
    round_trip = pickle.loads(pickle.dumps(env))
    assert round_trip
    assert round_trip._scale_reward == env._scale_reward
    assert np.array_equal(round_trip.env._goal, env.env._goal)
    step_env(round_trip)
    round_trip.close()
    env.close()
def test_init_with_env_updates(ray_local_session_fixture):
    del ray_local_session_fixture
    assert ray.is_initialized()
    max_path_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = RaySampler.from_worker_factory(workers,
                                             policy,
                                             envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
def test_init_without_worker_factory():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    sampler = MultiprocessingSampler(agents=policy,
                                     envs=env,
                                     seed=100,
                                     max_episode_length=max_episode_length)
    worker_factory = WorkerFactory(seed=100,
                                   max_episode_length=max_episode_length)
    assert sampler._factory._seed == worker_factory._seed
    assert (sampler._factory._max_episode_length ==
            worker_factory._max_episode_length)
    with pytest.raises(TypeError, match='Must construct a sampler from'):
        MultiprocessingSampler(agents=policy, envs=env)
    sampler.shutdown_worker()
    env.close()
def tutorial_vpg(ctxt=None):
    """Train VPG with PointEnv environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.

    """
    set_seed(100)
    runner = LocalRunner(ctxt)
    env = PointEnv()
    policy = GaussianMLPPolicy(env.spec)
    algo = SimpleVPG(env.spec, policy)
    runner.setup(algo, env)
    runner.train(n_epochs=200, batch_size=4000)
def tutorial_vpg(ctxt=None):
    """Train VPG with PointEnv environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalTFRunner to create the snapshotter.

    """
    set_seed(100)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(PointEnv())
        policy = GaussianMLPPolicy(env.spec)
        algo = SimpleVPG(env.spec, policy)
        runner.setup(algo, env)
        runner.train(n_epochs=200, batch_size=4000)
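# Both tutorial variants above rely on a SimpleVPG algorithm that is defined
# in the garage tutorial rather than in this file. Its core update is the
# REINFORCE policy-gradient loss; the function below is a framework-free
# sketch of that loss for a single episode, assuming per-step action
# log-probabilities and rewards given as torch tensors. Illustrative only.
import torch


def vpg_loss(log_probs, rewards, discount=0.99):
    # Discounted return-to-go for every timestep of the episode.
    returns = torch.zeros_like(rewards)
    running = 0.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    # REINFORCE maximizes E[log pi(a|s) * R]; return the negation so an
    # optimizer can minimize it.
    return -(log_probs * returns).mean()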
def test_no_seed():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    n_workers = 8
    workers = WorkerFactory(seed=None,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0, 160, policy)
    assert sum(episodes.lengths) >= 160
def test_bc_point_sample_batches():
    deterministic.set_seed(100)
    runner = LocalRunner(snapshot_config)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal)
    max_episode_length = 100
    source = list(expert_source(env, goal, max_episode_length, 5))
    policy = DeterministicMLPPolicy(env.spec, hidden_sizes=[8, 8])
    batch_size = 600
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=source,
              policy_lr=1e-2,
              loss='mse')
    runner.setup(algo, env)
    run_bc(runner, algo, batch_size)