def test_vectorized_rollout():
    concurrency = 3
    vectorized_env = SequentialVectorEnv([build_dummy_structured_env] * concurrency)
    standard_env = build_dummy_structured_env()

    assert vectorized_env.observation_spaces_dict == standard_env.observation_spaces_dict
    assert vectorized_env.action_spaces_dict == standard_env.action_spaces_dict

    policy = DistributedRandomPolicy(vectorized_env.action_spaces_dict, concurrency=concurrency)
    observation = vectorized_env.reset()
    for _ in range(3):
        action = policy.compute_action(observation, actor_id=vectorized_env.actor_id(), maze_state=None)
        observation, reward, done, info = vectorized_env.step(action)

def _get_alg_config(env_name: str, runner_type: str) -> A2CAlgorithmConfig:
    """
    Returns algorithm config used in tests.
    :param env_name: Env name for rollout evaluator.
    :param runner_type: Runner type. "dev" or "local".
    :return: A2CAlgorithmConfig instance.
    """
    env_factory = lambda: GymMazeEnv(env_name)

    return A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SubprocVectorEnv([env_factory]) if runner_type == "local" else SequentialVectorEnv([env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

def _algorithm_config():
    """Returns the IMPALA algorithm config used in tests."""
    eval_env = SequentialVectorEnv([_env_factory for _ in range(2)], logging_prefix='eval')

    return ImpalaAlgorithmConfig(
        n_epochs=2,
        epoch_length=2,
        queue_out_of_sync_factor=2,
        patience=15,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        vtrace_clip_pg_rho_threshold=1,
        vtrace_clip_rho_threshold=1,
        num_actors=1,
        actors_batch_size=5,
        critic_burn_in_epochs=0,
        rollout_evaluator=RolloutEvaluator(
            eval_env=eval_env,
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

def create_distributed_eval_env(
        cls,
        env_factory: Callable[[], Union[StructuredEnv, StructuredEnvSpacesMixin]],
        eval_concurrency: int,
        logging_prefix: str
) -> SequentialVectorEnv:
    """create single-threaded env distribution"""
    return SequentialVectorEnv([env_factory for _ in range(eval_concurrency)], logging_prefix=logging_prefix)

def create_distributed_eval_env(
        self,
        env_factory: Callable[[], Union[StructuredEnv, MazeEnv]],
        eval_concurrency: int,
        logging_prefix: str
) -> SequentialVectorEnv:
    """create single-threaded env distribution"""
    # fall back to a fixed number of pseudo-concurrent environments to avoid making this sequential execution
    # unnecessarily slow on machines with a higher core count
    return SequentialVectorEnv([env_factory for _ in range(eval_concurrency)], logging_prefix=logging_prefix)

def _generate_inconsistency_type_2_configs() -> Tuple[Dict, Dict, Dict, A2CAlgorithmConfig, Dict]:
    """
    Returns configs for tests of inconsistencies of type 2.
    :return: es_dev_runner_config, a2c_dev_runner_config, invalid_a2c_dev_runner_config, a2c_alg_config,
        default_overrides.
    """
    gym_env_name = "CartPole-v0"

    es_dev_runner_config = {
        'state_dict_dump_file': 'state_dict.pt',
        'spaces_config_dump_file': 'spaces_config.pkl',
        'normalization_samples': 1,
        '_target_': 'maze.train.trainers.es.ESDevRunner',
        'n_eval_rollouts': 1,
        'shared_noise_table_size': 10,
        'dump_interval': None
    }
    a2c_dev_runner_config = {
        'state_dict_dump_file': 'state_dict.pt',
        'spaces_config_dump_file': 'spaces_config.pkl',
        'normalization_samples': 1,
        '_target_': 'maze.train.trainers.common.actor_critic.actor_critic_runners.ACDevRunner',
        'trainer_class': 'maze.train.trainers.a2c.a2c_trainer.A2C',
        'concurrency': 1,
        'dump_interval': None,
        'eval_concurrency': 1
    }
    invalid_a2c_dev_runner_config = copy.deepcopy(a2c_dev_runner_config)
    invalid_a2c_dev_runner_config["trainer_class"] = "maze.train.trainers.es.es_trainer.ESTrainer"

    a2c_alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv(gym_env_name)]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )
    default_overrides = {"env.name": gym_env_name}

    return es_dev_runner_config, a2c_dev_runner_config, invalid_a2c_dev_runner_config, a2c_alg_config, default_overrides

def test_autoresolving_proxy_attribute():
    """
    Tests auto-resolving proxy attributes like critic (see :py:class:`maze.api.utils._ATTRIBUTE_PROXIES` for more
    info).
    """
    cartpole_env_factory = lambda: GymMazeEnv(env=gym.make("CartPole-v0"))
    _, _, critic_composer, _, _ = _get_cartpole_setup_components()

    alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )
    default_overrides = {"runner.normalization_samples": 1, "runner.concurrency": 1}

    rc = run_context.RunContext(
        env=cartpole_env_factory,
        silent=True,
        algorithm=alg_config,
        critic=critic_composer,
        runner="dev",
        overrides=default_overrides
    )
    rc.train(n_epochs=1)
    assert isinstance(rc._runners[RunMode.TRAINING][0].model_composer.critic, TorchSharedStateCritic)

    rc = run_context.RunContext(
        env=cartpole_env_factory,
        silent=True,
        algorithm=alg_config,
        critic="template_state",
        runner="dev",
        overrides=default_overrides
    )
    rc.train(n_epochs=1)
    assert isinstance(rc._runners[RunMode.TRAINING][0].model_composer.critic, TorchStepStateCritic)

def test_evaluation():
    """ Tests evaluation. """
    # Test with ES: No rollout evaluator in config.
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        configuration="test",
        overrides={"runner.normalization_samples": 1, "runner.shared_noise_table_size": 10}
    )
    rc.train(1)
    stats = rc.evaluate(n_episodes=5)
    assert len(stats) == 1
    assert stats[0][(BaseEnvEvents.reward, "episode_count", None)] in (5, 6)

    # Test with A2C: Partially specified rollout evaluator in config.
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        algorithm="a2c",
        configuration="test",
        overrides={"runner.concurrency": 1}
    )
    rc.train(1)
    stats = rc.evaluate(n_episodes=2)
    assert len(stats) == 1
    assert stats[0][(BaseEnvEvents.reward, "episode_count", None)] in (2, 3)

    # Test with A2C and an instantiated RolloutEvaluator.
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        algorithm="a2c",
        configuration="test",
        overrides={
            "runner.concurrency": 1,
            "algorithm.rollout_evaluator": RolloutEvaluator(
                eval_env=SequentialVectorEnv([lambda: GymMazeEnv("CartPole-v0")]),
                n_episodes=1,
                model_selection=None,
                deterministic=True
            )
        }
    )
    rc.train(1)
    stats = rc.evaluate(n_episodes=5)
    assert len(stats) == 1
    assert stats[0][(BaseEnvEvents.reward, "episode_count", None)] in (1, 2)

def test_inconsistency_identification_type_3() -> None:
    """ Tests identification of inconsistency due to derived config group. """
    es_dev_runner_config = {
        'state_dict_dump_file': 'state_dict.pt',
        'spaces_config_dump_file': 'spaces_config.pkl',
        'normalization_samples': 10000,
        '_target_': 'maze.train.trainers.es.ESDevRunner',
        'n_eval_rollouts': 10,
        'shared_noise_table_size': 100000000
    }
    a2c_alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv(env="CartPole-v0")]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )
    default_overrides = {"runner.normalization_samples": 1, "runner.concurrency": 1}

    rc = run_context.RunContext(
        algorithm=a2c_alg_config,
        env=lambda: GymMazeEnv(env="CartPole-v0"),
        silent=True,
        runner="dev",
        overrides=default_overrides
    )
    rc.train(1)

    run_context.RunContext(
        env=lambda: GymMazeEnv(env="CartPole-v0"),
        runner=es_dev_runner_config,
        silent=True,
        overrides=default_overrides
    )
    rc.train(1)

def test_rollout_evaluator():
    """Tests the rollout evaluator on a vectorized, time-limited dummy env."""
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env(), max_episode_steps=2)] * 2)
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())
    model_selection = _MockModelSelection()

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=3, model_selection=model_selection)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    assert model_selection.update_count == 2
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="total_episode_count"
    ) >= 2 * 3

def test_does_not_carry_over_stats_from_unfinished_episodes():
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())

    # Wrap envs in a time-limit wrapper
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env())] * 2)

    # Make one env slower than the other
    env.envs[0].set_max_episode_steps(2)
    env.envs[1].set_max_episode_steps(10)

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=1, model_selection=None)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    # We should get just one episode counted in stats
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="episode_count"
    ) == 1

def test_vectorized_rollout():
    """Rollout with a vector env."""
    concurrency = 3
    env = SequentialVectorEnv([build_dummy_structured_env] * concurrency)
    rollout_generator = RolloutGenerator(env=env)
    policy = DistributedRandomPolicy(env.action_spaces_dict, concurrency=concurrency)

    trajectory = rollout_generator.rollout(policy, n_steps=10)
    assert len(trajectory) == 10

    sub_step_keys = env.action_spaces_dict.keys()
    for record in trajectory.step_records:
        assert sub_step_keys == record.actions_dict.keys()
        assert sub_step_keys == record.observations_dict.keys()
        assert sub_step_keys == record.rewards_dict.keys()
        assert record.batch_shape == [concurrency]

        # The first dimension of the observations should correspond to the distributed env concurrency
        # (we just check the very first array present in the first observation)
        first_sub_step_obs: Dict = list(record.observations_dict.values())[0]
        first_obs_value = list(first_sub_step_obs.values())[0]
        assert first_obs_value.shape[0] == concurrency

def main(n_epochs: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    # initialize distributed training env
    envs = SequentialVectorEnv([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)], logging_prefix="train")

    # initialize the evaluation env and enable statistics collection
    eval_env = SequentialVectorEnv([lambda: GymMazeEnv(env="CartPole-v0") for _ in range(8)], logging_prefix="eval")

    # instantiate a single env to access action and observation spaces
    env = GymMazeEnv(env="CartPole-v0")

    # init default distribution mapper
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # initialize policies
    policies = {0: PolicyNet({'observation': (4,)}, {'action': (2,)}, non_lin=nn.Tanh)}

    # initialize critic
    critics = {0: ValueNet({'observation': (4,)})}

    # initialize algorithm config
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(
            eval_env=eval_env,
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # initialize actor critic model
    model = TorchActorCritic(
        policy=TorchPolicy(
            networks=policies,
            distribution_mapper=distribution_mapper,
            device=algorithm_config.device),
        critic=TorchSharedStateCritic(
            networks=critics,
            obs_spaces_dict=env.observation_spaces_dict,
            device=algorithm_config.device,
            stack_observations=False),
        device=algorithm_config.device)

    a2c = A2C(
        rollout_generator=RolloutGenerator(envs),
        evaluator=algorithm_config.rollout_evaluator,
        algorithm_config=algorithm_config,
        model=model,
        model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()

def main(n_epochs: int, rnn_steps: int) -> None:
    """Trains the cart pole environment with the multi-step a2c implementation.
    """
    env_name = "CartPole-v0"

    # initialize distributed env
    envs = SequentialVectorEnv(
        [lambda: to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps) for _ in range(4)],
        logging_prefix="train")

    # initialize the env and enable statistics collection
    eval_env = SequentialVectorEnv(
        [lambda: to_rnn_dict_space_environment(env=env_name, rnn_steps=rnn_steps) for _ in range(4)],
        logging_prefix="eval")

    # map observations to a modality
    obs_modalities_mappings = {"observation": "feature"}

    # define how to process a modality
    modality_config = dict()
    modality_config["feature"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [32, 32],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["hidden"] = {
        "block_type": "maze.perception.blocks.DenseBlock",
        "block_params": {
            "hidden_units": [64],
            "non_lin": "torch.nn.Tanh"
        }
    }
    modality_config["recurrence"] = {}
    if rnn_steps > 0:
        modality_config["recurrence"] = {
            "block_type": "maze.perception.blocks.LSTMLastStepBlock",
            "block_params": {
                "hidden_size": 8,
                "num_layers": 1,
                "bidirectional": False,
                "non_lin": "torch.nn.Tanh"
            }
        }

    template_builder = TemplateModelComposer(
        action_spaces_dict=envs.action_spaces_dict,
        observation_spaces_dict=envs.observation_spaces_dict,
        agent_counts_dict=envs.agent_counts_dict,
        distribution_mapper_config={},
        model_builder=ConcatModelBuilder(modality_config, obs_modalities_mappings, None),
        policy={'_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer'},
        critic={'_target_': 'maze.perception.models.critics.StateCriticComposer'})

    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=10,
        patience=10,
        critic_burn_in_epochs=0,
        n_rollout_steps=20,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.0,
        max_grad_norm=0.0,
        device="cpu",
        rollout_evaluator=RolloutEvaluator(
            eval_env=eval_env,
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    model = TorchActorCritic(
        policy=TorchPolicy(
            networks=template_builder.policy.networks,
            distribution_mapper=template_builder.distribution_mapper,
            device=algorithm_config.device),
        critic=template_builder.critic,
        device=algorithm_config.device)

    a2c = A2C(
        rollout_generator=RolloutGenerator(envs),
        evaluator=algorithm_config.rollout_evaluator,
        algorithm_config=algorithm_config,
        model=model,
        model_selection=None)

    setup_logging(job_config=None)

    # train agent
    a2c.train()

    # final evaluation run
    print("Final Evaluation Run:")
    a2c.evaluate()

def test_concepts_and_structures_run_context_overview():
    """
    Tests snippets in docs/source/concepts_and_structure/run_context_overview.rst.
    """
    # Default overrides for faster tests. Shouldn't change functionality.
    ac_overrides = {"runner.concurrency": 1}
    es_overrides = {"algorithm.n_epochs": 1, "algorithm.n_rollouts_per_update": 1}

    # Training
    # --------

    rc = RunContext(
        algorithm="a2c",
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv("CartPole-v0")]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    rc = RunContext(
        algorithm=alg_config,
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'), overrides=es_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    policy_composer_config = {
        '_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer',
        'networks': [{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [256, 256]
        }],
        "substeps_with_separate_agent_nets": [],
        "agent_counts_dict": {0: 1}
    }
    rc = RunContext(
        overrides={"model.policy": policy_composer_config, **es_overrides},
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    env = GymMazeEnv('CartPole-v0')
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=DistributionMapper(action_space=env.action_space, distribution_mapper_config={}),
        networks=[{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1}
    )
    rc = RunContext(overrides={"model.policy": policy_composer, **es_overrides}, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    rc = RunContext(algorithm=alg_config, overrides=ac_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)
    rc.train()

    # Rollout
    # -------

    obs = env.reset()
    for i in range(10):
        action = rc.compute_action(obs)
        obs, rewards, dones, info = env.step(action)

    # Evaluation
    # ----------

    env.reset()
    evaluator = RolloutEvaluator(
        # Environment has to have statistics logging capabilities for the RolloutEvaluator.
        eval_env=LogStatsWrapper.wrap(env, logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)

def train(n_epochs: int) -> int:
    """
    Trains agent in pure Python.

    :param n_epochs: Number of epochs to train.
    :return: 0 if successful.
    """
    # Environment setup
    # -----------------
    env = cartpole_env_factory()

    # Algorithm setup
    # ---------------
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=5,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Custom model setup
    # ------------------

    # Policy customization
    # ^^^^^^^^^^^^^^^^^^^^

    # Policy network.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': env.observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (env.action_space.spaces['action'].n,)}
    )
    policy_networks = [policy_net]

    # Policy distribution.
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # Policy composer.
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        # Derive distribution from environment's action space.
        distribution_mapper=distribution_mapper,
        networks=policy_networks,
        # We have only one agent and network, thus this is an empty list.
        substeps_with_separate_agent_nets=[],
        # We have only one step and one agent.
        agent_counts_dict={0: 1}
    )

    # Critic customization
    # ^^^^^^^^^^^^^^^^^^^^

    # Value networks.
    value_networks = {
        0: TorchModelBlock(
            in_keys='observation',
            out_keys='value',
            in_shapes=env.observation_space.spaces['observation'].shape,
            in_num_dims=[2],
            out_num_dims=2,
            net=CartpoleValueNet({'observation': env.observation_space.spaces['observation'].shape})
        )
    }

    # Critic composer.
    critic_composer = SharedStateCriticComposer(
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict={0: 1},
        networks=value_networks,
        stack_observations=True
    )

    # Training
    # ^^^^^^^^
    rc = run_context.RunContext(
        env=cartpole_env_factory,
        algorithm=algorithm_config,
        policy=policy_composer,
        critic=critic_composer,
        runner="dev"
    )
    rc.train(n_epochs=n_epochs)

    # Distributed training
    # ^^^^^^^^^^^^^^^^^^^^
    algorithm_config.rollout_evaluator.eval_env = SubprocVectorEnv([cartpole_env_factory])
    rc = run_context.RunContext(
        env=cartpole_env_factory,
        algorithm=algorithm_config,
        policy=policy_composer,
        critic=critic_composer,
        runner="local"
    )
    rc.train(n_epochs=n_epochs)

    # Evaluation
    # ^^^^^^^^^^
    print("-----------------")
    evaluator = RolloutEvaluator(
        eval_env=LogStatsWrapper.wrap(cartpole_env_factory(), logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)

    return 0

def evaluate(self, **eval_kwargs) -> Union[LogStats, List[LogStats]]:
    """
    Evaluates the trained/loaded policy with a RolloutEvaluator.

    By default, 8 episodes are evaluated sequentially.

    :param eval_kwargs: kwargs to overwrite set (or default) initialization parameters for the RolloutEvaluator.
        Note that these arguments are ignored if the RolloutEvaluator was passed as an instance in the
        AlgorithmConfig.
    :return: Logged statistics. One LogStats object if RunContext doesn't operate in multi-run mode, otherwise a
        list thereof.
    """
    # Collect env factories and policies, wrap them in lists if they aren't already.
    env_factories = self.env_factory
    policies = self.policy
    if not isinstance(env_factories, List):
        env_factories = [env_factories]
        policies = [policies]

    # Generate rollout evaluators.
    rollout_evaluators: List[RolloutEvaluator] = []
    for runner, env_fn in zip(self._runners[RunMode.TRAINING], env_factories):
        # If rollout evaluator is not specified at all, create incomplete config with target.
        try:
            ro_eval = runner.cfg.algorithm.rollout_evaluator
        except omegaconf.errors.ConfigAttributeError:
            ro_eval = {"_target_": "maze.train.trainers.common.evaluators.rollout_evaluator.RolloutEvaluator"}

        # Override with specified arguments.
        if isinstance(ro_eval, DictConfig):
            ro_eval = omegaconf.OmegaConf.to_object(ro_eval)
        if isinstance(ro_eval, dict):
            ro_eval = {**ro_eval, **eval_kwargs}

        # Try to instantiate the rollout evaluator directly from config. Works if it is completely specified in
        # config or already present as an instance of RolloutEvaluator.
        try:
            ro_eval = Factory(RolloutEvaluator).instantiate(ro_eval)
        # Merge with default values in case of incomplete RolloutEvaluator config.
        except TypeError:
            default_params = {
                "eval_env": SequentialVectorEnv(env_factories=[env_fn]),
                "n_episodes": 8,
                "model_selection": None,
                "deterministic": False
            }
            ro_eval = Factory(RolloutEvaluator).instantiate({**default_params, **ro_eval})
        finally:
            rollout_evaluators.append(ro_eval)

    # Evaluate policies.
    stats = [
        self._silence(lambda: [
            ro_eval.evaluate(policy),
            ro_eval.eval_env.get_stats(LogStatsLevel.EPOCH).last_stats
        ][-1])
        for env_factory, policy, ro_eval in zip(env_factories, policies, rollout_evaluators)
    ]

    return stats[0] if len(stats) == 0 else stats

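# Usage sketch (illustrative only, mirroring test_evaluation above; the helper name is hypothetical): evaluate()
# is typically driven from a trained RunContext. Keyword arguments such as n_episodes are merged into the (possibly
# incomplete) RolloutEvaluator config, unless a fully instantiated RolloutEvaluator was supplied via the algorithm
# config, in which case they are ignored.
def _evaluate_usage_sketch() -> None:
    rc = run_context.RunContext(
        env=lambda: GymMazeEnv(env=gym.make("CartPole-v0")),
        silent=True,
        configuration="test",
        overrides={"runner.normalization_samples": 1, "runner.shared_noise_table_size": 10}
    )
    rc.train(1)

    # n_episodes overrides the default of 8 episodes evaluated sequentially.
    stats = rc.evaluate(n_episodes=2)
    print(stats[0][(BaseEnvEvents.reward, "episode_count", None)])
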
def train(n_epochs):
    """Trains the cart pole environment with A2C, composing policy, critic and trainer in plain Python."""
    # Instantiate one environment. This will be used for convenient access to observation
    # and action spaces.
    env = cartpole_env_factory()
    observation_space = env.observation_space
    action_space = env.action_space

    # Policy Setup
    # ------------

    # Policy Network
    # ^^^^^^^^^^^^^^
    # Instantiate policy with the correct shapes of observation and action spaces.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (action_space.spaces['action'].n,)})

    maze_wrapped_policy_net = TorchModelBlock(
        in_keys='observation',
        out_keys='action',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2],
        out_num_dims=2,
        net=policy_net)

    policy_networks = {0: maze_wrapped_policy_net}

    # Policy Distribution
    # ^^^^^^^^^^^^^^^^^^^
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config={})

    # Optionally, you can specify a different distribution with the distribution_mapper_config argument. Using a
    # Categorical distribution for a discrete action space would be done via:
    distribution_mapper = DistributionMapper(
        action_space=action_space,
        distribution_mapper_config=[{
            "action_space": gym.spaces.Discrete,
            "distribution": "maze.distributions.categorical.CategoricalProbabilityDistribution"}])

    # Instantiating the Policy
    # ^^^^^^^^^^^^^^^^^^^^^^^^
    torch_policy = TorchPolicy(networks=policy_networks, distribution_mapper=distribution_mapper, device='cpu')

    # Value Function Setup
    # --------------------

    # Value Network
    # ^^^^^^^^^^^^^
    value_net = CartpoleValueNet(obs_shapes={'observation': observation_space.spaces['observation'].shape})

    maze_wrapped_value_net = TorchModelBlock(
        in_keys='observation',
        out_keys='value',
        in_shapes=observation_space.spaces['observation'].shape,
        in_num_dims=[2],
        out_num_dims=2,
        net=value_net)

    value_networks = {0: maze_wrapped_value_net}

    # Instantiate the Value Function
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    torch_critic = TorchSharedStateCritic(
        networks=value_networks,
        obs_spaces_dict=env.observation_spaces_dict,
        device='cpu',
        stack_observations=False)

    # Initializing the ActorCritic Model
    # ----------------------------------
    actor_critic_model = TorchActorCritic(policy=torch_policy, critic=torch_critic, device='cpu')

    # Instantiating the Trainer
    # =========================
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=n_epochs,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Distributed Environments
    # ------------------------
    # In order to use the distributed trainers, the previously created env factory is supplied to one of Maze's
    # distribution classes:
    train_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="train")
    eval_envs = SequentialVectorEnv([cartpole_env_factory for _ in range(2)], logging_prefix="eval")

    # Initialize best model selection.
    model_selection = BestModelSelection(dump_file="params.pt", model=actor_critic_model)

    a2c_trainer = A2C(
        rollout_generator=RolloutGenerator(train_envs),
        evaluator=algorithm_config.rollout_evaluator,
        algorithm_config=algorithm_config,
        model=actor_critic_model,
        model_selection=model_selection)

    # Train the Agent
    # ===============
    # Before starting the training, we will enable logging by calling:
    log_dir = '.'
    setup_logging(job_config=None, log_dir=log_dir)

    # Now, we can train the agent.
    a2c_trainer.train()

    return 0