Example #1
def test_observation_monitoring():
    """ Observation logging unit test """
    env = GymMazeEnv(env="CartPole-v0")

    env = ObservationVisualizationWrapper.wrap(env, plot_function=None)
    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    with SimpleStatsLoggingSetup(env, log_dir="."):
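        # the statistics collected by LogStatsWrapper are written out by the logging setup (to log_dir)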
        env.reset()
        done = False
        while not done:
            obs, rew, done, info = env.step(env.action_space.sample())
Example #2
def test_heuristic_lunar_lander_policy():
    """Unit test for the heuristic LunarLander policy."""
    policy = HeuristicLunarLanderPolicy()
    env = GymMazeEnv("LunarLander-v2")

    obs = env.reset()
    action = policy.compute_action(obs)
    obs, _, _, _ = env.step(action)
Example #3
def test_maze_gym_env_wrapper():
    """ gym env wrapper unit test """
    env = GymMazeEnv(env="CartPole-v0")
    env.seed(1234)
    obs = env.reset()
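    # convert the dict (gym space) observation back into the corresponding maze state representation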
    env.observation_conversion.space_to_maze(obs)
    assert not env.is_actor_done()
    assert env.get_serializable_components() == {}
    for _ in range(10):
        env.step(env.action_space.sample())
    env.close()
Example #4
def test_observation_skipping_wrapper_sticky_flat():
    """ Step skipping unit test """

    n_steps = 3

    # instantiate env
    env = GymMazeEnv("CartPole-v0")
    env = StepSkipWrapper.wrap(env, n_steps=n_steps, skip_mode='sticky')
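    # with skip_mode='sticky' the sampled action is repeated for n_steps env steps
    # and the rewards of the skipped steps are summed into a single return value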
    env = LogStatsWrapper.wrap(env)  # for accessing events from previous steps

    # reset environment and run interaction loop
    env.reset()
    cum_rew = 0
    for i in range(2):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        cum_rew += reward

        assert len(env.get_last_step_events(query=RewardEvents.reward_original)) == 1

    assert cum_rew == 6
Example #5
def test_readme():
    """
    Tests snippets in readme.md.
    """

    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'))
    rc.train(n_epochs=1)

    # Run trained policy.
    env = GymMazeEnv('CartPole-v0')
    obs = env.reset()
    done = False

    while not done:
        action = rc.compute_action(obs)
        obs, reward, done, info = env.step(action)
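        # the readme snippet runs a full episode; for the test a single interaction step is enough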
        break
Example #6
def test_cartpole_policy_model():
    env = GymMazeEnv(env='CartPole-v0')
    observation_spaces_dict = env.observation_spaces_dict
    action_spaces_dict = env.action_spaces_dict

    flat_action_space = flat_structured_space(action_spaces_dict)
    distribution_mapper = DistributionMapper(action_space=flat_action_space,
                                             distribution_mapper_config={})

    action_logits_shapes = {
        step_key: {
            action_head: distribution_mapper.required_logits_shape(action_head)
            for action_head in action_spaces_dict[step_key].spaces.keys()
        }
        for step_key in action_spaces_dict.keys()
    }

    obs_shapes = observation_spaces_to_in_shapes(observation_spaces_dict)
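    # a GymMazeEnv has a single sub-step, so both dicts are indexed with the sub-step key 0 below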

    policy = CustomPlainCartpolePolicyNet(obs_shapes[0],
                                          action_logits_shapes[0],
                                          hidden_layer_0=16,
                                          hidden_layer_1=32,
                                          use_bias=True)

    critic = CustomPlainCartpoleCriticNet(obs_shapes[0],
                                          hidden_layer_0=16,
                                          hidden_layer_1=32,
                                          use_bias=True)

    obs_np = env.reset()
    obs = {k: torch.from_numpy(v) for k, v in obs_np.items()}

    actions = policy(obs)
    values = critic(obs)

    assert 'action' in actions
    assert 'value' in values
Example #7
"""Contains an example showing how to use observation pre-processing directly from python."""
from maze.core.wrappers.maze_gym_env_wrapper import GymMazeEnv
from maze.core.wrappers.observation_preprocessing.preprocessing_wrapper import PreProcessingWrapper

# this is the pre-processor config as a python dict
config = {
    "pre_processor_mapping": [
        {
            "observation": "observation",
            "_target_": "maze.preprocessors.Rgb2GrayPreProcessor",
            "keep_original": False,
            "config": {
                "rgb_dim": -1
            }
        },
    ]
}

# instantiate a maze environment
env = GymMazeEnv("CarRacing-v0")

# wrap the environment for observation pre-processing
env = PreProcessingWrapper.wrap(
    env, pre_processor_mapping=config["pre_processor_mapping"])

# after this step the wrapped env yields pre-processed observations
pre_processed_obs = env.reset()
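
# quick sanity check (a sketch, not part of the original snippet): observations are
# returned as a dict of numpy arrays, so we can inspect their shapes; after
# gray-scaling, the trailing RGB channel dimension should be gone
print({key: value.shape for key, value in pre_processed_obs.items()})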
Example #8
def test_concepts_and_structures_run_context_overview():
    """
    Tests snippets in docs/source/concepts_and_structure/run_context_overview.rst.
    """

    # Default overrides for faster tests. Shouldn't change functionality.
    ac_overrides = {"runner.concurrency": 1}
    es_overrides = {"algorithm.n_epochs": 1, "algorithm.n_rollouts_per_update": 1}

    # Training
    # --------

    rc = RunContext(
        algorithm="a2c",
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    alg_config = A2CAlgorithmConfig(
        n_epochs=1,
        epoch_length=25,
        patience=15,
        critic_burn_in_epochs=0,
        n_rollout_steps=100,
        lr=0.0005,
        gamma=0.98,
        gae_lambda=1.0,
        policy_loss_coef=1.0,
        value_loss_coef=0.5,
        entropy_coef=0.00025,
        max_grad_norm=0.0,
        device='cpu',
        rollout_evaluator=RolloutEvaluator(
            eval_env=SequentialVectorEnv([lambda: GymMazeEnv("CartPole-v0")]),
            n_episodes=1,
            model_selection=None,
            deterministic=True
        )
    )

    rc = RunContext(
        algorithm=alg_config,
        overrides={"env.name": "CartPole-v0", **ac_overrides},
        model="vector_obs",
        critic="template_state",
        runner="dev",
        configuration="test"
    )
    rc.train(n_epochs=1)

    rc = RunContext(env=lambda: GymMazeEnv('CartPole-v0'), overrides=es_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    policy_composer_config = {
        '_target_': 'maze.perception.models.policies.ProbabilisticPolicyComposer',
        'networks': [{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [256, 256]
        }],
        "substeps_with_separate_agent_nets": [],
        "agent_counts_dict": {0: 1}
    }
    rc = RunContext(
        overrides={"model.policy": policy_composer_config, **es_overrides}, runner="dev", configuration="test"
    )
    rc.train(n_epochs=1)

    env = GymMazeEnv('CartPole-v0')
    policy_composer = ProbabilisticPolicyComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        distribution_mapper=DistributionMapper(action_space=env.action_space, distribution_mapper_config={}),
        networks=[{
            '_target_': 'maze.perception.models.built_in.flatten_concat.FlattenConcatPolicyNet',
            'non_lin': 'torch.nn.Tanh',
            'hidden_units': [222, 222]
        }],
        substeps_with_separate_agent_nets=[],
        agent_counts_dict={0: 1}
    )
    rc = RunContext(overrides={"model.policy": policy_composer, **es_overrides}, runner="dev", configuration="test")
    rc.train(n_epochs=1)

    rc = RunContext(algorithm=alg_config, overrides=ac_overrides, runner="dev", configuration="test")
    rc.train(n_epochs=1)
    rc.train()

    # Rollout
    # -------

    obs = env.reset()
    for i in range(10):
        action = rc.compute_action(obs)
        obs, rewards, dones, info = env.step(action)

    # Evaluation
    # ----------

    env.reset()
    evaluator = RolloutEvaluator(
        # Environment has to have statistics logging capabilities for the RolloutEvaluator.
        eval_env=LogStatsWrapper.wrap(env, logging_prefix="eval"),
        n_episodes=1,
        model_selection=None
    )
    evaluator.evaluate(rc.policy)
Example #9
    default_statistics=None,
    statistics_dump="statistics.pkl",
    sampling_policy=RandomPolicy(env.action_spaces_dict),
    exclude=None,
    manual_config=None)

# next we estimate the normalization statistics by
# (1) collecting observations by randomly sampling 1000 transitions from the environment
# (2) computing the statistics according to the defined normalization strategy
normalization_statistics = obtain_normalization_statistics(env, n_samples=1000)
env.set_normalization_statistics(normalization_statistics)

# after this step all observations returned by the environment will be normalized

# stable-baselines does not support dict spaces so we have to remove them
env = NoDictSpacesWrapper(env)

# TRAINING AND ROLLOUT (remains unchanged)
# ----------------------------------------

model = A2C('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
Example #10
    "statistics_dump": "statistics.pkl",
    "sampling_policy": RandomPolicy(env.action_spaces_dict),
    "exclude": None,
    "manual_config": None
}

# 1. PREPARATION: first we estimate normalization statistics
# ----------------------------------------------------------

# wrap the environment for observation normalization
env = ObservationNormalizationWrapper.wrap(env, **normalization_config)

# before we can start working with normalized observations
# we need to estimate the normalization statistics
normalization_statistics = obtain_normalization_statistics(env, n_samples=1000)

# 2. APPLICATION (training, rollout, deployment)
# ----------------------------------------------

# instantiate a maze environment
training_env = GymMazeEnv("CartPole-v0")
# wrap the environment for observation normalization
training_env = ObservationNormalizationWrapper.wrap(training_env,
                                                    **normalization_config)

# reuse the estimated statistics in our training environment(s)
training_env.set_normalization_statistics(normalization_statistics)

# after this step the training env yields normalized observations
normalized_obs = training_env.reset()
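
# deployment sketch: assuming the estimated statistics were dumped to the configured
# statistics_dump file ("statistics.pkl") as a plain pickle of the statistics dict,
# they can be reloaded and reused for a deployment environment
# (check the wrapper documentation if the dump format differs)
import pickle

deployment_env = GymMazeEnv("CartPole-v0")
deployment_env = ObservationNormalizationWrapper.wrap(deployment_env, **normalization_config)
with open("statistics.pkl", "rb") as fp:
    deployment_env.set_normalization_statistics(pickle.load(fp))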
Example #11
def test_observation_statistics_logging():
    """ observation normalization logging test """

    # normalization config
    normalization_config = {
        "default_strategy":
        "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
        "default_strategy_config": {
            "clip_range": (None, None),
            "axis": 0
        },
        "default_statistics": None,
        "statistics_dump": "statistics.pkl",
        "exclude": None,
        "manual_config": {
            "observation": {
                "strategy":
                "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
                "strategy_config": {
                    "clip_range": (0, 1)
                },
                "statistics": {
                    "mean": [0, 0, 0, 0],
                    "std": [1, 1, 1, 1]
                }
            }
        }
    }
    writer = LogStatsWriterTensorboard(log_dir='test_log',
                                       tensorboard_render_figure=True)
    register_log_stats_writer(writer)
    # attach a console writer as well for immediate console feedback
    register_log_stats_writer(LogStatsWriterConsole())

    # init environment
    env = GymMazeEnv("CartPole-v0")

    # wrap env with observation normalization
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config[
            "default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=RandomPolicy(env.action_spaces_dict),
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])

    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    n_episodes = 10
    n_steps_per_episode = 100
    for episode in range(n_episodes):
        _ = env.reset()
        for step in range(n_steps_per_episode):
            # take random action
            action = env.action_space.sample()

            # take step in env and trigger log stats writing
            _, _, done, _ = env.step(action)

            if done:
                break

        increment_log_step()
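    # incrementing the log step once per episode triggers writing of the collected statistics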