import numpy as np
import pytest

from stable_baselines3.common.noise import (
    ActionNoise,
    OrnsteinUhlenbeckActionNoise,
    VectorizedActionNoise,
)


def test_vec_noise():
    num_envs = 4
    num_actions = 10
    mu = np.zeros(num_actions)
    sigma = np.ones(num_actions) * 0.4
    base: ActionNoise = OrnsteinUhlenbeckActionNoise(mu, sigma)
    # Invalid n_envs values should raise ValueError
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(base, -1)
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(base, None)
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(base, "whatever")

    # Valid construction: one independent noise copy per environment
    vec = VectorizedActionNoise(base, num_envs)
    assert vec.n_envs == num_envs
    assert vec().shape == (num_envs, num_actions)
    # Samples from the copies should not all coincide with a base sample
    assert not (vec() == base()).all()
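    # Invalid base noise, or invalid assignments to .noises, should raise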
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(None, num_envs)
    with pytest.raises(TypeError):
        vec = VectorizedActionNoise(12, num_envs)
    with pytest.raises(AssertionError):
        vec.noises = []
    with pytest.raises(TypeError):
        vec.noises = None
    with pytest.raises(ValueError):
        vec.noises = [None] * vec.n_envs
    with pytest.raises(AssertionError):
        vec.noises = [base] * (num_envs - 1)
    assert all(isinstance(noise, type(base)) for noise in vec.noises)
    assert len(vec.noises) == num_envs
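
For context, a minimal usage sketch of the API exercised above (the action dimension and sigma below are illustrative assumptions, not values from the test):

import numpy as np
from stable_baselines3.common.noise import NormalActionNoise, VectorizedActionNoise

base = NormalActionNoise(mean=np.zeros(2), sigma=0.1 * np.ones(2))
vec = VectorizedActionNoise(base, n_envs=4)
sample = vec()  # shape (4, 2): one independent noise sample per environment
vec.reset()     # resets every per-environment noise copy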
Example #2
import os
import subprocess

import gym
import numpy as np
import stable_baselines3 as sb3
from stable_baselines3 import TD3
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.noise import (
    NormalActionNoise,
    OrnsteinUhlenbeckActionNoise,
    VectorizedActionNoise,
)
from stable_baselines3.common.vec_env import SubprocVecEnv

# LoggerCallback is a project-specific callback assumed to be defined elsewhere


def run(train_freq,
        gradient_steps,
        batch_size,
        envname,
        n_envs,
        log_interval,
        learning_rate,
        buffer_size,
        tau,
        gamma,
        target_policy_noise,
        target_noise_clip,
        learning_starts,
        total_timesteps,
        policy_kwargs,
        action_noise_mean,
        action_noise_sigma,
        noise_type,
        eval_freq,
        n_eval_episodes,
        verbose=True,
        tensorboard_log="logs/"):

    # Scale evaluation frequency and buffer size by the number of parallel envs
    eval_freq = max(eval_freq // n_envs, 1)
    buffer_size = max(buffer_size // n_envs, 1)

    # Capture all (already scaled) arguments for logging
    all_args = locals()

    # Locate the installed stable-baselines3 checkout and record its git commit
    path = "/" + os.path.join(*sb3.__file__.split("/")[:-2])
    commit_num = subprocess.check_output(["git", "describe", "--always"],
                                         cwd=path).strip().decode()

    # Single env only used to query the action space; training uses the vec env
    env = gym.make(envname)
    vecenv = make_vec_env(envname, vec_env_cls=SubprocVecEnv, n_envs=n_envs)

    # Exploration noise for TD3: one independent copy of the base noise per env
    n_actions = env.action_space.shape[-1]
    env.close()
    if noise_type == "OU":
        base_noise_class = OrnsteinUhlenbeckActionNoise
    elif noise_type == "Normal":
        base_noise_class = NormalActionNoise
    else:
        raise ValueError(f"Unexpected noise_type: {noise_type}")
    base_noise = base_noise_class(mean=np.ones(n_actions) * action_noise_mean,
                                  sigma=action_noise_sigma * np.ones(n_actions))
    action_noise = VectorizedActionNoise(base_noise, vecenv.num_envs)

    # Callbacks
    loggercallback = LoggerCallback("json", [("arguments", all_args),
                                             ("git", commit_num)])
    evalcallback = EvalCallback(make_vec_env(envname,
                                             vec_env_cls=SubprocVecEnv),
                                n_eval_episodes=n_eval_episodes,
                                eval_freq=eval_freq)

    # Initiate the model and start learning
    model = TD3("MlpPolicy",
                vecenv,
                action_noise=action_noise,
                batch_size=batch_size,
                train_freq=train_freq,
                gradient_steps=gradient_steps,
                learning_starts=learning_starts,
                n_episodes_rollout=-1,
                learning_rate=learning_rate,
                buffer_size=buffer_size,
                tau=tau,
                gamma=gamma,
                create_eval_env=True,
                target_policy_noise=target_policy_noise,
                target_noise_clip=target_noise_clip,
                verbose=verbose,
                policy_kwargs=policy_kwargs,
                tensorboard_log=tensorboard_log,
                device="cuda")
    model.learn(
        total_timesteps=total_timesteps,
        log_interval=log_interval,
        callback=[loggercallback, evalcallback],
        tb_log_name=envname,
    )
    model.env.close()
    evalcallback.eval_env.close()

    return evalcallback.best_mean_reward
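
A hypothetical invocation for illustration; every hyperparameter value below is an assumption, not a recommended or tuned setting:

if __name__ == "__main__":
    best_reward = run(train_freq=1,
                      gradient_steps=1,
                      batch_size=256,
                      envname="Pendulum-v0",
                      n_envs=4,
                      log_interval=10,
                      learning_rate=1e-3,
                      buffer_size=100_000,
                      tau=0.005,
                      gamma=0.99,
                      target_policy_noise=0.2,
                      target_noise_clip=0.5,
                      learning_starts=1000,
                      total_timesteps=100_000,
                      policy_kwargs=dict(net_arch=[256, 256]),
                      action_noise_mean=0.0,
                      action_noise_sigma=0.1,
                      noise_type="Normal",
                      eval_freq=5000,
                      n_eval_episodes=5)
    print(f"Best mean reward: {best_reward}")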