def test_recurrent_sac(use_discrete):
    """Train TF SAC with an LSTM on the memory environment and check it trains."""
    # Discrete control is easier on this task, so it gets a larger step size.
    step = 0.5 if use_discrete else 0.2
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=step)
    memory_net_settings = attr.evolve(
        SAC_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=hyperparams,
        network_settings=memory_net_settings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    """Run a full PPO policy update with recurrent nets and curiosity rewards."""
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )
    # Exercise the curiosity reward signal as well as the extrinsic one.
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Sequence length (16) is smaller than the batch size, exercising sequencing.
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward-signal evaluation: reuse the environment rewards for every
    # derived field the updater reads.
    for field in (
        "extrinsic_rewards",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_rewards",
        "curiosity_returns",
        "curiosity_value_estimates",
        "advantages",
    ):
        buffer[field] = buffer["environment_rewards"]
    trainer.update_buffer = buffer
    trainer._update_policy()
def test_networkbody_lstm():
    """A NetworkBody with an LSTM should be trainable toward a constant target."""
    torch.manual_seed(0)
    obs_dim = 4
    seq_len = 6
    settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )
    obs_specs = create_observation_specs_with_shapes([(obs_dim,)])
    body = NetworkBody(obs_specs, settings)
    opt = torch.optim.Adam(body.parameters(), lr=3e-4)
    sample_obs = torch.ones((seq_len, obs_dim))
    for _ in range(300):
        encoded, _ = body(
            [sample_obs], memories=torch.ones(1, 1, 12), sequence_length=seq_len
        )
        # Drive every element of the encoding toward 1.
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        opt.zero_grad()
        loss.backward()
        opt.step()
    # After training, the last step's outputs should be close to the target.
    for value in encoded.flatten().tolist():
        assert value == pytest.approx(1.0, abs=0.1)
def test_recurrent_poca(action_sizes, is_multiagent):
    """POCA with an LSTM: smoke-test multi-agent, actually train single-agent."""
    if is_multiagent:
        # Not a recurrent environment; just verify the LSTM path doesn't crash.
        env = MultiAgentEnvironment(
            [BRAIN_NAME], action_sizes=action_sizes, num_agents=2
        )
    else:
        # A genuine memory task, so the LSTM must actually learn here.
        env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    net_settings = attr.evolve(
        POCA_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    hyperparams = attr.evolve(
        POCA_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        POCA_TORCH_CONFIG,
        hyperparameters=hyperparams,
        network_settings=net_settings,
        max_steps=500 if is_multiagent else 6000,
    )
    threshold = None if is_multiagent else 0.9
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=threshold)
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual):
    """Build a TF optimizer (SAC or PPO, per the trainer config) around a mock policy."""
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    settings = trainer_config
    settings.reward_signals = reward_signal_config
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(0, mock_specs, settings, False, "test", False, create_tf_graph=False)
    # Pick the optimizer class matching the trainer type under test.
    if settings.trainer_type == TrainerType.SAC:
        return SACOptimizer(policy, settings)
    return PPOOptimizer(policy, settings)
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    """Construct a TFPolicy over mock behavior specs for checkpoint/load tests."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    settings = dummy_config
    settings.keep_checkpoints = 3
    # Default memory settings are enough to exercise the recurrent code path.
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    return TFPolicy(seed, behavior_spec, settings, model_path=model_path, load=load)
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    """Create a TF behavioral-cloning module wired to a fresh NNPolicy."""
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = NNPolicy(
        0,
        mock_behavior_specs,
        trainer_config,
        False,
        "test",
        False,
        tanhresample,
        tanhresample,
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    # Normally the optimizer calls this after the BCModule is created.
    policy.initialize_or_load()
    return bc_module
def test_actor_critic(ac_type, lstm):
    """Shape checks for critic_pass and get_action_stats_and_value, with/without LSTM."""
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True
    )
    obs_spec = create_observation_specs_with_shapes([(obs_size,)])
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # Hybrid action space: `act_size` continuous dims plus `act_size` discrete branches.
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    actor = ac_type(obs_spec, network_settings, action_spec, stream_names)
    if lstm:
        seq_len = network_settings.memory.sequence_length
        sample_obs = torch.ones((1, seq_len, obs_size))
        memories = torch.ones((1, seq_len, actor.memory_size))
    else:
        sample_obs = torch.ones((1, obs_size))
        # memories isn't always set to None; the network must also accept an
        # empty tensor.
        memories = torch.tensor([])

    # Critic pass: one value head per reward stream.
    value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (network_settings.memory.sequence_length,)
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Action stats and value in a single forward call.
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], memories=memories, masks=mask
    )
    # 64 presumably matches MemorySettings' default sequence length — TODO confirm.
    expected_batch = 64 if lstm else 1
    assert action.continuous_tensor.shape == (expected_batch, 2)
    assert len(action.discrete_list) == 2
    for branch in action.discrete_list:
        assert branch.shape == (expected_batch, 1)
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (network_settings.memory.sequence_length,)
        else:
            assert value_out[stream].shape == (1,)
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    """Run a torch PPO policy update with recurrent nets and curiosity rewards."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )
    # Exercise the curiosity reward signal as well as the extrinsic one.
    trainer_params.reward_signals = curiosity_dummy_config
    behavior_id = BehaviorIdentifiers.from_name_behavior_id("MockBrain")
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Sequence length (16) is smaller than the batch size, exercising sequencing.
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward-signal evaluation by reusing the environment rewards for
    # every derived field the updater reads.
    for field in (
        "extrinsic_rewards",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_rewards",
        "curiosity_returns",
        "curiosity_value_estimates",
        "advantages",
    ):
        buffer[field] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas
    # PyTorch does not, so rebuild the log-prob fields at full per-action width.
    if use_discrete:
        branch_total = int(sum(behavior_spec.action_spec.discrete_branches))
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(np.ones(branch_total, dtype=np.float32))
    else:
        cont_size = behavior_spec.action_spec.continuous_size
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(np.ones(cont_size, dtype=np.float32))
    trainer.update_buffer = buffer
    trainer._update_policy()
def test_bad_config(): brain_params = make_brain_parameters(discrete_action=False, visual_inputs=0, vec_obs_size=6) # Test that we throw an error if we have sequence length greater than batch size with pytest.raises(TrainerConfigError): TrainerSettings( network_settings=NetworkSettings( memory=NetworkSettings.MemorySettings(sequence_length=64)), hyperparameters=PPOSettings(batch_size=32), ) _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
def test_multinetworkbody_lstm(with_actions):
    """MultiAgentNetworkBody with an LSTM should train toward a constant target."""
    torch.manual_seed(0)
    obs_size = 4
    act_size = 2
    seq_len = 16
    n_agents = 3
    settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    body = MultiAgentNetworkBody(
        create_observation_specs_with_shapes([(obs_size,)]), settings, action_spec
    )
    opt = torch.optim.Adam(body.parameters(), lr=3e-4)
    sample_obs = [[0.1 * torch.ones((seq_len, obs_size))] for _ in range(n_agents)]
    # Simulate the baseline input used in POCA: actions for all but one agent.
    sample_act = [
        AgentAction(
            0.1 * torch.ones((seq_len, 2)),
            [0.1 * torch.ones(seq_len) for _ in range(act_size)],
        )
        for _ in range(n_agents - 1)
    ]
    for _ in range(300):
        if with_actions:
            encoded, _ = body(
                obs_only=sample_obs[:1],
                obs=sample_obs[1:],
                actions=sample_act,
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        else:
            encoded, _ = body(
                obs_only=sample_obs,
                obs=[],
                actions=[],
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        # Drive every element of the encoding toward 1.
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        opt.zero_grad()
        loss.backward()
        opt.step()
    # After training, the last step's outputs should be close to the target.
    for value in encoded.flatten().tolist():
        assert value == pytest.approx(1.0, abs=0.1)
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """Build a torch SAC optimizer around a policy over mock behavior specs."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    settings = dummy_config
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=12)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, behavior_spec, settings)
    return TorchSACOptimizer(policy, settings)
def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]:
    """Convert an old-style per-behavior trainer config dict into TrainerSettings.

    Each non-"default" behavior is merged over the "default" section, then split
    into trainer_type, hyperparameters, network_settings (including recurrent
    memory), and the remaining base TrainerSettings fields.

    :param old_trainer_config: Old-format config mapping behavior name -> dict.
    :return: Mapping behavior name -> structured TrainerSettings.
    :raises TrainerConfigError: if "trainer" or "use_recurrent" is missing.
    """
    converted: Dict[str, Any] = {}
    defaults = old_trainer_config.get("default", {})
    for behavior_name in old_trainer_config:
        if behavior_name == "default":
            continue
        # Shallow-merge behavior-specific settings over the defaults.
        merged = {**defaults, **old_trainer_config[behavior_name]}
        # Set trainer_type and get appropriate hyperparameter settings.
        try:
            trainer_type = merged["trainer"]
        except KeyError:
            raise TrainerConfigError(
                "Config doesn't specify a trainer type. "
                "Please specify trainer: in your config."
            )
        new_config = {"trainer_type": trainer_type}
        hyperparam_cls = TrainerType(trainer_type).to_settings()
        # Absorb as much as possible into the hyperparameter class, then the
        # network settings; cattr ignores keys the target class doesn't define.
        new_config["hyperparameters"] = cattr.structure(merged, hyperparam_cls)
        new_config["network_settings"] = cattr.structure(merged, NetworkSettings)
        # Deal with recurrent settings. NOTE: the except also fires if
        # sequence_length/memory_size are missing while use_recurrent is true.
        try:
            if merged["use_recurrent"]:
                new_config["network_settings"].memory = NetworkSettings.MemorySettings(
                    sequence_length=merged["sequence_length"],
                    memory_size=merged["memory_size"],
                )
        except KeyError:
            raise TrainerConfigError(
                "Config doesn't specify use_recurrent. "
                "Please specify true or false for use_recurrent in your config."
            )
        # Absorb the rest into the base TrainerSettings.
        for key, val in merged.items():
            if key in attr.fields_dict(TrainerSettings):
                new_config[key] = val
        # Structure the whole thing into a TrainerSettings instance.
        converted[behavior_name] = cattr.structure(new_config, TrainerSettings)
    return converted
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    """Build a torch PPO optimizer around a policy over mock behavior specs."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    # Copy the config so the shared fixture isn't mutated across tests.
    settings = attr.evolve(dummy_config)
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, behavior_spec, settings, "test", False)
    return TorchPPOOptimizer(policy, settings)
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    """Create a torch behavioral-cloning module wired to a fresh TorchPolicy."""
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TorchPolicy(
        0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
    )
    return BCModule(
        policy,
        settings=bc_settings,
        policy_learning_rate=trainer_config.hyperparameters.learning_rate,
        default_batch_size=trainer_config.hyperparameters.batch_size,
        default_num_epoch=3,
    )
def test_recurrent_ppo(use_discrete):
    """Train PPO with an LSTM on the memory environment and check it learns."""
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    net_settings = attr.evolve(
        PPO_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    hyperparams = attr.evolve(
        PPO_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        PPO_CONFIG,
        hyperparameters=hyperparams,
        network_settings=net_settings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def test_actor_critic(ac_type, lstm):
    """Shape checks for critic_pass and get_dist_and_value, with/without LSTM."""
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None
    )
    obs_shapes = [(obs_size,)]
    act_size = [2]
    stream_names = [f"stream_name{n}" for n in range(4)]
    actor = ac_type(
        obs_shapes, network_settings, ActionType.CONTINUOUS, act_size, stream_names
    )
    if lstm:
        seq_len = network_settings.memory.sequence_length
        sample_obs = torch.ones((1, seq_len, obs_size))
        memories = torch.ones((1, seq_len, network_settings.memory.memory_size))
    else:
        sample_obs = torch.ones((1, obs_size))
        # memories isn't always set to None; the network must also accept an
        # empty tensor.
        memories = torch.tensor([])

    # Critic pass: one value head per reward stream.
    value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (network_settings.memory.sequence_length,)
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Distributions and value in a single forward call.
    dists, value_out, mem_out = actor.get_dist_and_value(
        [sample_obs], [], memories=memories
    )
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for dist in dists:
        assert isinstance(dist, GaussianDistInstance)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (network_settings.memory.sequence_length,)
        else:
            assert value_out[stream].shape == (1,)
def test_hybrid_recurrent_ppo():
    """Hybrid (1 continuous + 1 discrete) actions with an LSTM should train."""
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    net_settings = attr.evolve(
        PPO_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=512,
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=hyperparams,
        network_settings=net_settings,
        max_steps=3000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """Build a TF SAC optimizer around an NNPolicy over a mock brain."""
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    settings = dummy_config
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(
        0, mock_brain, settings, False, "test", False, create_tf_graph=False
    )
    return SACOptimizer(policy, settings)
def test_recurrent_ppo(action_sizes):
    """Train TF PPO with an LSTM on the memory environment and check it learns."""
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    net_settings = attr.evolve(
        PPO_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    hyperparams = attr.evolve(
        PPO_TF_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        hyperparameters=hyperparams,
        network_settings=net_settings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def test_hybrid_recurrent_sac():
    """Hybrid (1 continuous + 1 discrete) actions with an LSTM should train under SAC."""
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    memory_net_settings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=hyperparams,
        network_settings=memory_net_settings,
        max_steps=4000,
    )
    check_environment_trains(env, {BRAIN_NAME: config})
def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    """Build a torch POCA optimizer around a policy over mock behavior specs."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    # Copy the config so the shared fixture isn't mutated across tests.
    settings = attr.evolve(dummy_config)
    settings.reward_signals = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
    }
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=8, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, behavior_spec, settings, "test", False)
    return TorchPOCAOptimizer(policy, settings)
def test_recurrent_sac(use_discrete):
    """Train SAC with an LSTM on the memory environment and check it trains."""
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    memory_net_settings = attr.evolve(
        SAC_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=32),
    )
    hyperparams = attr.evolve(
        SAC_CONFIG.hyperparameters,
        batch_size=64,
        learning_rate=1e-3,
        buffer_init_steps=500,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_CONFIG,
        hyperparameters=hyperparams,
        network_settings=memory_net_settings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """Build and initialize a TF PPO optimizer over mock behavior specs."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    # Copy the config (forcing the TF framework) so the fixture isn't mutated.
    settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, behavior_spec, settings, "test", False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, settings)
    policy.initialize()
    return optimizer
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    load: bool = False,
    seed: int = 0,
) -> NNPolicy:
    """Construct an NNPolicy over a mock brain for checkpoint/load tests."""
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    settings = dummy_config
    settings.keep_checkpoints = 3
    # Default memory settings are enough to exercise the recurrent code path.
    settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    return NNPolicy(seed, mock_brain, settings, False, load)
def test_recurrent_sac(action_sizes):
    """Train torch SAC with an LSTM on the memory environment (seeded run)."""
    # The (0, 1) single-discrete case is easier, so it gets a smaller step size.
    step = 0.2 if action_sizes == (0, 1) else 0.5
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=step)
    memory_net_settings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=3e-4,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=hyperparams,
        network_settings=memory_net_settings,
        max_steps=4000,
    )
    # Fixed seed keeps this flaky-prone recurrent run reproducible.
    check_environment_trains(env, {BRAIN_NAME: config}, training_seed=1337)
def test_recurrent_sac(action_sizes):
    """Train TF SAC with an LSTM on the memory environment and check it trains."""
    # The (0, 1) single-discrete case is easier, so it gets a smaller step size.
    step = 0.2 if action_sizes == (0, 1) else 0.5
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=step)
    memory_net_settings = attr.evolve(
        SAC_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=hyperparams,
        network_settings=memory_net_settings,
        max_steps=4000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
        # Tail of an ONNX export call whose head is above this chunk —
        # presumably torch.onnx.export writing to EXPORT_FILE with opset 9.
        EXPORT_FILE,
        opset_version=9,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes)


if __name__ == '__main__':
    # Observation spec: a single 16-dim vector observation.
    obs_spec = [
        ObservationSpec(shape=(16, ),
                        dimension_property=(DimensionProperty.UNSPECIFIED, ),
                        observation_type=ObservationType.DEFAULT)
    ]
    # Purely continuous action space of size 4.
    act_spec = ActionSpec(continuous_size=4, discrete_branches=())
    net_settings = NetworkSettings(normalize=False,
                                   hidden_units=256,
                                   num_layers=2,
                                   vis_encode_type=EncoderType.SIMPLE,
                                   memory=NetworkSettings.MemorySettings(
                                       sequence_length=64, memory_size=256))
    network = SerializableSimpleActor(obs_spec, net_settings, act_spec)
    # Load on CPU; keep only policy weights, dropping critic entries, since
    # inference/export does not need the critic.
    state_dict = torch.load(MODEL_FILE, map_location=torch.device('cpu'))
    filtered_sd = {
        i: j
        for i, j in state_dict['Policy'].items() if 'critic' not in i
    }
    network.load_state_dict(filtered_sd)
    export_model(network)
def test_memory_settings_validation():
    """Invalid memory sizes must raise TrainerConfigError at construction."""
    # 63 and 0 are rejected — presumably memory_size must be a positive
    # multiple of some divisor; confirm against the settings validator.
    for bad_memory_size in (63, 0):
        with pytest.raises(TrainerConfigError):
            NetworkSettings.MemorySettings(
                sequence_length=128, memory_size=bad_memory_size
            )