def test_observation_monitoring():
    """ Observation logging unit test """
    env = GymMazeEnv(env="CartPole-v0")
    env = ObservationVisualizationWrapper.wrap(env, plot_function=None)
    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    with SimpleStatsLoggingSetup(env, log_dir="."):
        env.reset()
        done = False
        while not done:
            obs, rew, done, info = env.step(env.action_space.sample())
def test_heuristic_lunar_lander_policy():
    """ Heuristic LunarLander policy unit test """
    policy = HeuristicLunarLanderPolicy()
    env = GymMazeEnv("LunarLander-v2")

    obs = env.reset()
    action = policy.compute_action(obs)
    obs, _, _, _ = env.step(action)
def test_maze_gym_env_wrapper():
    """ gym env wrapper unit test """
    env = GymMazeEnv(env="CartPole-v0")
    env.seed(1234)

    obs = env.reset()
    env.observation_conversion.space_to_maze(obs)

    assert not env.is_actor_done()
    assert env.get_serializable_components() == {}

    for _ in range(10):
        env.step(env.action_space.sample())

    env.close()
def test_observation_skipping_wrapper_sticky_flat():
    """ Step skipping unit test """
    n_steps = 3

    # instantiate env
    env = GymMazeEnv("CartPole-v0")
    env = StepSkipWrapper.wrap(env, n_steps=n_steps, skip_mode='sticky')
    env = LogStatsWrapper.wrap(env)  # for accessing events from previous steps

    # reset environment and run interaction loop
    env.reset()
    cum_rew = 0
    for i in range(2):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        cum_rew += reward

        assert len(env.get_last_step_events(query=RewardEvents.reward_original)) == 1

    assert cum_rew == 6
def test_readme():
    """ Tests snippets in readme.md. """
    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'))
    rc.train(n_epochs=1)

    # Run trained policy.
    env = GymMazeEnv('CartPole-v0')
    obs = env.reset()
    done = False

    while not done:
        action = rc.compute_action(obs)
        obs, reward, done, info = env.step(action)
        break
def test_cartpole_policy_model():
    """ Custom CartPole policy and critic network unit test """
    env = GymMazeEnv(env='CartPole-v0')
    observation_spaces_dict = env.observation_spaces_dict
    action_spaces_dict = env.action_spaces_dict

    # map the flat action space to probability distributions
    flat_action_space = flat_structured_space(action_spaces_dict)
    distribution_mapper = DistributionMapper(action_space=flat_action_space,
                                             distribution_mapper_config={})

    # infer the required action logits shapes per sub-step and action head
    action_logits_shapes = {
        step_key: {
            action_head: distribution_mapper.required_logits_shape(action_head)
            for action_head in action_spaces_dict[step_key].spaces.keys()
        }
        for step_key in action_spaces_dict.keys()
    }

    obs_shapes = observation_spaces_to_in_shapes(observation_spaces_dict)

    # instantiate the custom policy and critic networks
    policy = CustomPlainCartpolePolicyNet(obs_shapes[0], action_logits_shapes[0],
                                          hidden_layer_0=16, hidden_layer_1=32, use_bias=True)
    critic = CustomPlainCartpoleCriticNet(obs_shapes[0],
                                          hidden_layer_0=16, hidden_layer_1=32, use_bias=True)

    # convert the observation to torch tensors and run a forward pass
    obs_np = env.reset()
    obs = {k: torch.from_numpy(v) for k, v in obs_np.items()}

    actions = policy(obs)
    values = critic(obs)

    assert 'action' in actions
    assert 'value' in values
"""Contains an example showing how to use observation pre-processing directly from python.""" from maze.core.wrappers.maze_gym_env_wrapper import GymMazeEnv from maze.core.wrappers.observation_preprocessing.preprocessing_wrapper import PreProcessingWrapper # this is the pre-processor config as a python dict config = { "pre_processor_mapping": [ { "observation": "observation", "_target_": "maze.preprocessors.Rgb2GrayPreProcessor", "keep_original": False, "config": { "rgb_dim": -1 } }, ] } # instantiate a maze environment env = GymMazeEnv("CarRacing-v0") # wrap the environment for observation pre-processing env = PreProcessingWrapper.wrap( env, pre_processor_mapping=config["pre_processor_mapping"]) # after this step the training env yields pre-processed observations pre_processed_obs = env.reset()
def test_concepts_and_structures_run_context_overview():
    """
    Tests snippets in docs/source/concepts_and_structure/run_context_overview.rst.
    """

    # Default overrides for faster tests. Shouldn't change functionality.
    ac_overrides = {"runner.concurrency": 1}
    es_overrides = {"algorithm.n_epochs": 1, "algorithm.n_rollouts_per_update": 1}

    # Training
    # --------

    rc = RunContext(
        algorithm="a2c",
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv("CartPole-v0")]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    rc = RunContext(
        algorithm=alg_config,
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'),
                    overrides=es_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    policy_composer_config = {
        '_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer',
        'networks': [{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [256, 256]
        }],
        "substeps_with_separate_agent_nets": [],
        "agent_counts_dict": {0: 1}
    }
    rc = RunContext(
        overrides={"model.policy": policy_composer_config, **es_overrides},
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    env = GymMazeEnv('CartPole-v0')
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=DistributionMapper(action_space=env.action_space,
                                               distribution_mapper_config={}),
        networks=[{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1}
    )
    rc = RunContext(overrides={"model.policy": policy_composer, **es_overrides},
                    runner="dev", configuration="test")
    rc.train(n_epochs=1)

    rc = RunContext(algorithm=alg_config, overrides=ac_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)
    rc.train()

    # Rollout
    # -------

    obs = env.reset()
    for i in range(10):
        action = rc.compute_action(obs)
        obs, rewards, dones, info = env.step(action)

    # Evaluation
    # ----------

    env.reset()
    evaluator = RolloutEvaluator(
        # Environment has to have statistics logging capabilities for RolloutEvaluator.
        eval_env=LogStatsWrapper.wrap(env, logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)
# wrap the environment for observation normalization
env = ObservationNormalizationWrapper.wrap(
    env,
    default_strategy="maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
    default_strategy_config={"clip_range": (None, None), "axis": 0},
    default_statistics=None,
    statistics_dump="statistics.pkl",
    sampling_policy=RandomPolicy(env.action_spaces_dict),
    exclude=None,
    manual_config=None)

# next we estimate the normalization statistics by
# (1) collecting observations by randomly sampling 1000 transitions from the environment
# (2) computing the statistics according to the defined normalization strategy
normalization_statistics = obtain_normalization_statistics(env, n_samples=1000)
env.set_normalization_statistics(normalization_statistics)
# after this step all observations returned by the environment will be normalized

# stable-baselines does not support dict spaces so we have to remove them
env = NoDictSpacesWrapper(env)

# TRAINING AND ROLLOUT (remains unchanged)
# ----------------------------------------

model = A2C('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
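# Hedged sketch (not part of the original snippet): after NoDictSpacesWrapper the
# observation and action spaces should be plain gym spaces rather than gym.spaces.Dict,
# which is the format stable-baselines expects. Printing them is a cheap way to verify
# the conversion before handing the env to the trainer.
print(env.observation_space)
print(env.action_space)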
"statistics_dump": "statistics.pkl", "sampling_policy": RandomPolicy(env.action_spaces_dict), "exclude": None, "manual_config": None } # 1. PREPARATION: first we estimate normalization statistics # ---------------------------------------------------------- # wrap the environment for observation normalization env = ObservationNormalizationWrapper.wrap(env, **normalization_config) # before we can start working with normalized observations # we need to estimate the normalization statistics normalization_statistics = obtain_normalization_statistics(env, n_samples=1000) # 2. APPLICATION (training, rollout, deployment) # ---------------------------------------------- # instantiate a maze environment training_env = GymMazeEnv("CartPole-v0") # wrap the environment for observation normalization training_env = ObservationNormalizationWrapper.wrap(training_env, **normalization_config) # reuse the estimated the statistics in our training environment(s) training_env.set_normalization_statistics(normalization_statistics) # after this step the training env yields normalized observations normalized_obs = training_env.reset()
def test_observation_statistics_logging():
    """ observation normalization logging test """

    # normalization config
    normalization_config = {
        "default_strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
        "default_strategy_config": {"clip_range": (None, None), "axis": 0},
        "default_statistics": None,
        "statistics_dump": "statistics.pkl",
        "exclude": None,
        "manual_config": {
            "observation": {
                "strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
                "strategy_config": {"clip_range": (0, 1)},
                "statistics": {"mean": [0, 0, 0, 0], "std": [1, 1, 1, 1]}
            }
        }
    }

    writer = LogStatsWriterTensorboard(log_dir='test_log', tensorboard_render_figure=True)
    register_log_stats_writer(writer)
    # attach a console writer as well for immediate console feedback
    register_log_stats_writer(LogStatsWriterConsole())

    # init environment
    env = GymMazeEnv("CartPole-v0")

    # wrap env with observation normalization
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=RandomPolicy(env.action_spaces_dict),
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])

    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    n_episodes = 10
    n_steps_per_episode = 100
    for episode in range(n_episodes):
        _ = env.reset()
        for step in range(n_steps_per_episode):
            # take random action
            action = env.action_space.sample()

            # take step in env and trigger log stats writing
            _, _, done, _ = env.step(action)
            if done:
                break

        increment_log_step()