Example #1
import torch as th

from stable_baselines3 import SAC


def test_save_load_pytorch_var(tmp_path):
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    log_ent_coef_before = model.log_ent_coef

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(log_ent_coef_before, model.log_ent_coef)
    model.learn(200)
    log_ent_coef_after = model.log_ent_coef
    # Check that the entropy coefficient is still optimized
    assert not th.allclose(log_ent_coef_before, log_ent_coef_after)

    # With a fixed entropy coef
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    assert model.log_ent_coef is None
    ent_coef_before = model.ent_coef_tensor

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(ent_coef_before, model.ent_coef_tensor)
    model.learn(200)
    ent_coef_after = model.ent_coef_tensor
    assert model.log_ent_coef is None
    # Check that the entropy coefficient is still the same
    assert th.allclose(ent_coef_before, ent_coef_after)
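
The test above hinges on log_ent_coef (the learnable log entropy coefficient that SAC optimizes when ent_coef="auto") surviving a save/load round trip. A minimal standalone sketch of the same check outside of pytest; the file name and step counts are arbitrary:

import torch as th

from stable_baselines3 import SAC

# Short training run so the automatic entropy coefficient is updated at least once.
model = SAC("MlpPolicy", "Pendulum-v1", policy_kwargs=dict(net_arch=[64]), verbose=0)
model.learn(500)

log_ent_coef_before = model.log_ent_coef.detach().clone()
model.save("sac_ent_coef_check")

# Reload and verify the coefficient was restored exactly.
restored = SAC.load("sac_ent_coef_check")
assert th.allclose(log_ent_coef_before, restored.log_ent_coef)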
Example #2
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.sac.policies import MlpPolicy


def get_perf(i):
    model = SAC("MlpPolicy",
                "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=5e3,
                verbose=1,
                create_eval_env=True,
                buffer_size=1000000,
                ent_coef=0.2,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                seed=42)
    saved_policy = MlpPolicy.load(
        "Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    mean_reward, std_reward = evaluate_policy(saved_policy,
                                              model.get_env(),
                                              n_eval_episodes=900)
    return mean_reward, std_reward
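
A sketch of how a helper like get_perf might be driven; the run indices and their count are assumptions, and the numbered policy files referenced inside get_perf must already exist on disk:

# Evaluate a few saved runs and report the best mean reward.
results = {}
for run_idx in range(3):  # assumed number of saved runs
    mean_reward, std_reward = get_perf(run_idx)
    results[run_idx] = (mean_reward, std_reward)
    print(f"run {run_idx}: {mean_reward:.2f} +/- {std_reward:.2f}")

best_run = max(results, key=lambda run_idx: results[run_idx][0])
print(f"best run: {best_run}")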
Example #3
from stable_baselines3 import SAC


def test_train_freq(tmp_path, train_freq):

    model = SAC(
        "MlpPolicy",
        "Pendulum-v1",
        policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
        learning_starts=100,
        buffer_size=10000,
        verbose=1,
        train_freq=train_freq,
    )
    model.learn(total_timesteps=150)
    model.save(tmp_path / "test_save.zip")
    env = model.get_env()
    model = SAC.load(tmp_path / "test_save.zip", env=env)
    model.learn(total_timesteps=150)
    model = SAC.load(tmp_path / "test_save.zip", train_freq=train_freq, env=env)
    model.learn(total_timesteps=150)
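
A test like this is normally parametrized over the formats that train_freq accepts (a plain step count or a (frequency, unit) tuple). A sketch of the decorator that would drive it; the listed values are illustrative, not taken from the original test file:

import pytest

@pytest.mark.parametrize("train_freq", [4, (4, "step"), (1, "episode")])
def test_train_freq(tmp_path, train_freq):
    ...  # body as in Example #3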
Example #4
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy


def advanced_saving_and_loading_example():
    # Advanced Saving and Loading.

    from stable_baselines3.sac.policies import MlpPolicy

    # Create the model, the training environment and the test environment (for evaluation).
    model = SAC('MlpPolicy',
                'Pendulum-v1',
                verbose=1,
                learning_rate=1e-3,
                create_eval_env=True)

    # Evaluate the model every 1000 steps on 5 test episodes and save the evaluation to the "logs/" folder.
    model.learn(6000,
                eval_freq=1000,
                n_eval_episodes=5,
                eval_log_path="./logs/")

    # Save the model.
    model.save("sac_pendulum")

    # The saved model does not contain the replay buffer.
    loaded_model = SAC.load("sac_pendulum")
    print(
        f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer"
    )

    # Now save the replay buffer too.
    model.save_replay_buffer("sac_replay_buffer")

    # Load it into the loaded_model.
    loaded_model.load_replay_buffer("sac_replay_buffer")

    # Now the loaded replay is not empty anymore.
    print(
        f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer"
    )

    # Save the policy independently from the model.
    # Note: if you don't save the complete model with 'model.save()'
    # you cannot continue training afterward.
    policy = model.policy
    policy.save("sac_policy_pendulum")

    # Retrieve the environment.
    env = model.get_env()

    # Evaluate the policy.
    mean_reward, std_reward = evaluate_policy(policy,
                                              env,
                                              n_eval_episodes=10,
                                              deterministic=True)

    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # Load the policy independently from the model.
    saved_policy = MlpPolicy.load("sac_policy_pendulum")

    # Evaluate the loaded policy.
    mean_reward, std_reward = evaluate_policy(saved_policy,
                                              env,
                                              n_eval_episodes=10,
                                              deterministic=True)

    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
Example #5
import gym
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.sac import MlpPolicy

if __name__ == '__main__':
    env_id = 'gym_spm:spm-v0'
    num_cpu = 4  # Number of processes to use

    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.75 * np.ones(n_actions))

    # model = SAC(MlpPolicy, env, action_noise=action_noise, verbose=1)
    model = SAC(MlpPolicy, env, verbose=1)

    model.learn(total_timesteps=25000)

    # model.load('DDPG_test_2_SOC_point5_two_states')
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    
    print("Mean Reward = ", mean_reward)
    
    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    
    obs = env.reset()
    for _ in range(3600):
    
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = env.step(action)
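
The snippet ends before the pre-allocated lists are filled. One plausible way to complete the rollout loop and record the trajectory is sketched below; which observation index corresponds to which physical quantity is an assumption about gym_spm's observation layout, not something taken from the original code:

# Hypothetical completion of the rollout: store the action and selected
# observation components at every step (index layout is assumed).
obs = env.reset()
for _ in range(3600):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)

    action_list.append(action)
    epsi_sp_list.append(obs[0])   # assumed index
    soc_list.append(obs[-1])      # assumed index
    if done:
        obs = env.reset()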
    