def dummy_config():
    return attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
def dummy_config():
    return sac_dummy_config()
import attr
import numpy as np

from mlagents.trainers.settings import (
    EncoderType,
    FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config

PPO_TF_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
SAC_TF_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)

BRAIN_NAME = "1D"


# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for
# different algorithms. Custom reward processors should be built within the
# test function and passed to _check_environment_trains (see the sketch below).
# Default is the average over the last 5 final rewards.
def default_reward_processor(rewards, last_n_rewards=5):
    rewards_to_use = rewards[-last_n_rewards:]
    # For debugging tests
    print(f"Last {last_n_rewards} rewards:", rewards_to_use)
    return np.array(rewards_to_use, dtype=np.float32).mean()
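# A minimal sketch of a custom reward processor, assuming the same
# (rewards, last_n_rewards) signature as default_reward_processor above.
# The name median_reward_processor and the reward_processor keyword in the
# usage comment are illustrative assumptions, not taken from the source.
def median_reward_processor(rewards, last_n_rewards=5):
    # Median is more robust than the mean to a single diverging final reward.
    return float(np.median(np.array(rewards[-last_n_rewards:], dtype=np.float32)))


# Usage inside a test (keyword name assumed):
# _check_environment_trains(env, config, reward_processor=median_reward_processor)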
import attr
import pytest

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
)
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains

BRAIN_NAME = "1D"

PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=10000,
    )
    # Train on the environment and assert it reaches the success threshold.
    check_environment_trains(env, {BRAIN_NAME: config})
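# SAC_TORCH_CONFIG is defined above but not exercised by test_hybrid_ppo.
# Below is a hedged sketch of the matching SAC case, assuming SimpleEnvironment
# and check_environment_trains behave as in test_hybrid_ppo; the hyperparameter
# values here are illustrative assumptions, not taken from the source.
def test_hybrid_sac_sketch():
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.8)
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters, buffer_size=50000, batch_size=128
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=2000
    )
    check_environment_trains(env, {BRAIN_NAME: config})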
def dummy_config():
    return attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests.dummy_config import sac_dummy_config


def sac_config():
    return RunOptions(behaviors={"test_brain": sac_dummy_config()})
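# Hypothetical usage sketch: how a helper like sac_config above is typically
# consumed. The test name and assertion are assumptions, not from the source;
# behaviors is the dict passed to RunOptions above.
def test_sac_config_has_test_brain():
    config = sac_config()
    assert "test_brain" in config.behaviors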
assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)


def reward_signal_update(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2
    )
    out = optimizer.policy._execute_model(
        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
    )
    assert type(out) is dict


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):  # noqa: F811
    trainer_config.behavioral_cloning = BehavioralCloningSettings(
        demo_path=CONTINUOUS_DEMO_PATH
    )
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config, False, False, False
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)