def test_rollout_evaluator():
    # Two vectorized dummy envs, each truncated after two steps.
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env(), max_episode_steps=2)] * 2)
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())
    model_selection = _MockModelSelection()

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=3, model_selection=model_selection)
    for _ in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    # Model selection should have been notified once per evaluation run.
    assert model_selection.update_count == 2

    # Each evaluation run rolls out at least three episodes.
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="total_episode_count"
    ) >= 2 * 3
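# For reference, a minimal sketch of what the `_MockModelSelection` helper used
# above might look like. This is an assumption about its shape, not maze's actual
# test fixture: all the test needs is a counter of how often the evaluator
# reports a new evaluation value via `update()`.
class _MockModelSelectionSketch:
    """Hypothetical stand-in for _MockModelSelection: counts update() calls."""

    def __init__(self):
        self.update_count = 0

    def update(self, value: float) -> None:
        # The evaluator reports one aggregate value per evaluate() run.
        self.update_count += 1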
def test_does_not_carry_over_stats_from_unfinished_episodes():
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())

    # Wrap envs in a time-limit wrapper
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env())] * 2)

    # Make one env slower than the other
    env.envs[0].set_max_episode_steps(2)
    env.envs[1].set_max_episode_steps(10)

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=1, model_selection=None)
    for _ in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    # We should get just one episode counted in stats
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="episode_count"
    ) == 1
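# Illustrative sketch (not maze's TimeLimitWrapper implementation) of the
# behavior the test above relies on: a time-limit wrapper ends an episode once
# a configurable step budget is exhausted, so two wrapped envs with different
# budgets finish their episodes at different times and only completed episodes
# should enter the epoch statistics.
class _TimeLimitSketch:
    """Hypothetical gym-style time limit: truncates after max_episode_steps."""

    def __init__(self, env, max_episode_steps=None):
        self.env = env
        self.max_episode_steps = max_episode_steps
        self._elapsed_steps = 0

    def set_max_episode_steps(self, max_episode_steps: int) -> None:
        self.max_episode_steps = max_episode_steps

    def reset(self):
        self._elapsed_steps = 0
        return self.env.reset()

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._elapsed_steps += 1
        if self.max_episode_steps is not None and self._elapsed_steps >= self.max_episode_steps:
            done = True
        return obs, reward, done, info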
def test_concepts_and_structures_run_context_overview():
    """
    Tests snippets in docs/source/concepts_and_structure/run_context_overview.rst.
    """
    # Default overrides for faster tests. Shouldn't change functionality.
    ac_overrides = {"runner.concurrency": 1}
    es_overrides = {"algorithm.n_epochs": 1, "algorithm.n_rollouts_per_update": 1}

    # Training
    # --------

    rc = RunContext(
        algorithm="a2c",
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv("CartPole-v0")]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    rc = RunContext(
        algorithm=alg_config,
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'), overrides=es_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    policy_composer_config = {
        '_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer',
        'networks': [{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [256, 256]
        }],
        "substeps_with_separate_agent_nets": [],
        "agent_counts_dict": {0: 1}
    }
    rc = RunContext(
        overrides={"model.policy": policy_composer_config, **es_overrides},
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    env = GymMazeEnv('CartPole-v0')
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=DistributionMapper(action_space=env.action_space, distribution_mapper_config={}),
        networks=[{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1}
    )
    rc = RunContext(overrides={"model.policy": policy_composer, **es_overrides}, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    rc = RunContext(algorithm=alg_config, overrides=ac_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)
    rc.train()

    # Rollout
    # -------

    obs = env.reset()
    for _ in range(10):
        action = rc.compute_action(obs)
        obs, rewards, dones, info = env.step(action)

    # Evaluation
    # ----------

    env.reset()
    evaluator = RolloutEvaluator(
        # Environment has to have statistics logging capabilities for RolloutEvaluator.
        eval_env=LogStatsWrapper.wrap(env, logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)
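# The docs snippets above assume imports roughly along these lines. The exact
# module paths are assumptions based on maze's public API (the
# ProbabilisticPolicyComposer path matches the '_target_' string used above)
# and may differ between versions -- treat this as a sketch, not an
# authoritative list.
#
# from maze.api.run_context import RunContext
# from maze.core.wrappers.maze_gym_env_wrapper import GymMazeEnv
# from maze.core.wrappers.log_stats_wrapper import LogStatsWrapper
# from maze.distributions.distribution_mapper import DistributionMapper
# from maze.perception.models.policies import ProbabilisticPolicyComposer
# from maze.train.parallelization.vector_env.sequential_vector_env import SequentialVectorEnv
# from maze.train.trainers.a2c.a2c_algorithm_config import A2CAlgorithmConfig
# from maze.train.trainers.common.evaluators.rollout_evaluator import RolloutEvaluator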
def train(n_epochs: int) -> int:
    """
    Trains agent in pure Python.

    :param n_epochs: Number of epochs to train.
    :return: 0 if successful.
    """
    # Environment setup
    # -----------------
    env = cartpole_env_factory()

    # Algorithm setup
    # ---------------
    algorithm_config = A2CAlgorithmConfig(
        n_epochs=5,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([cartpole_env_factory]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    # Custom model setup
    # ------------------

    # Policy customization
    # ^^^^^^^^^^^^^^^^^^^^

    # Policy network.
    policy_net = CartpolePolicyNet(
        obs_shapes={'observation': env.observation_space.spaces['observation'].shape},
        action_logit_shapes={'action': (env.action_space.spaces['action'].n,)}
    )
    policy_networks = [policy_net]

    # Policy distribution.
    distribution_mapper = DistributionMapper(action_space=env.action_space, distribution_mapper_config={})

    # Policy composer.
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        # Derive distribution from environment's action space.
        distribution_mapper=distribution_mapper,
        networks=policy_networks,
        # We have only one agent and network, thus this is an empty list.
        substeps_with_separate_agent_nets=[],
        # We have only one step and one agent.
        agent_counts_dict={0: 1}
    )

    # Critic customization
    # ^^^^^^^^^^^^^^^^^^^^

    # Value networks.
    value_networks = {
        0: TorchModelBlock(
            in_keys='observation',
            out_keys='value',
            in_shapes=env.observation_space.spaces['observation'].shape,
            in_num_dims=[2],
            out_num_dims=2,
            net=CartpoleValueNet({'observation': env.observation_space.spaces['observation'].shape})
        )
    }

    # Critic composer.
    critic_composer = SharedStateCriticComposer(
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict={0: 1},
        networks=value_networks,
        stack_observations=True
    )

    # Training
    # ^^^^^^^^
    rc = run_context.RunContext(
        env=cartpole_env_factory,
        algorithm=algorithm_config,
        policy=policy_composer,
        critic=critic_composer,
        runner="dev"
    )
    rc.train(n_epochs=n_epochs)

    # Distributed training
    # ^^^^^^^^^^^^^^^^^^^^
    algorithm_config.rollout_evaluator.eval_env = SubprocVectorEnv([cartpole_env_factory])
    rc = run_context.RunContext(
        env=cartpole_env_factory,
        algorithm=algorithm_config,
        policy=policy_composer,
        critic=critic_composer,
        runner="local"
    )
    rc.train(n_epochs=n_epochs)

    # Evaluation
    # ^^^^^^^^^^
    print("-----------------")
    evaluator = RolloutEvaluator(
        eval_env=LogStatsWrapper.wrap(cartpole_env_factory(), logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)

    return 0
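# Hypothetical usage sketch: run a single-epoch training pass when this file is
# executed directly. `train` returns 0 on success (per its docstring), so the
# return value maps straight through to the process exit code.
if __name__ == "__main__":
    import sys

    sys.exit(train(n_epochs=1))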