def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)
    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)
    v = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getVelocity().toArray(), hist_sc_state)))
    h = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getMomentum().toArray(), hist_sc_state)))
    angle_f_v = list(map(
        lambda q: np.degrees(np.arccos(
            np.dot(q[0], q[1]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1]) + 1e-10)
        )),
        zip(v, hist_action)))
    hist_action_plane = list(map(
        lambda q: q[1] - np.dot(q[1], q[0]) * q[0] / (np.linalg.norm(q[0]) ** 2),
        zip(h, hist_action)))
    angle_fp_v = list(map(
        lambda q: np.degrees(np.arccos(
            np.dot(q[0], q[1] * [1, 1, 0]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1] * [1, 1, 0]) + 1e-10)
        )),
        zip(v, hist_action_plane)))

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    axs.plot(time, ra, "k")
    plt.tight_layout()
    fig.savefig("plan_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    axs.plot(time, rp, "k")
    plt.tight_layout()
    fig.savefig("plan_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    axs.plot(time, mass, "k")
    plt.tight_layout()
    fig.savefig("plan_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action, "k")
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("plan_action.pdf", format="pdf")
    plt.close(fig)
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("fishing-v0")
model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=100000)

## simulate and plot results
df = env.simulate(model, reps=10)
env.plot(df, "ppo.png")

## Evaluate model
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=50)
print("mean reward:", mean_reward, "std:", std_reward)

# save trained agent for future use, if desired
# model.save("ppo")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/1/18 17:52
# @Author : lucky3721
# @Version: V 1.0
# @File : lunar_land_example.py
# @desc :
import gym
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

model = DQN('MlpPolicy', 'LunarLander-v2', verbose=1,
            exploration_final_eps=0.1, target_update_interval=250)
model.learn(total_timesteps=int(1e5))

# Separate env for evaluation
eval_env = gym.make('LunarLander-v2')

# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
time_stamp = datetime.datetime.now().strftime("-%Y%m%d-%H%M%S")
model_log = LOG_DIR + model_name + time_stamp

dqn_model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=model_log, buffer_size=100000)

max_steps = 100
dqn_model.learn(total_timesteps=max_steps)

# Commented out IPython magic to ensure Python compatibility.
# %tensorboard --logdir {LOG_DIR}

"""## Performance evaluation"""

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dqn_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")

"""## Saving the trained model"""

dqn_model.save("dqn_pong")

"""The model will be saved as a zip file; you can download it locally from the left sidebar of Colab under "Files" and then the ellipsis menu next to the filename.

## Loading a trained model

From the left sidebar of Colab, under "Files", upload the zip file of the trained model. Here we will upload an A2C model that we trained earlier. You can download it from [here](https://drive.google.com/uc?export=download&id=1COsaNOH8SjbpxxIYc5lOF-QUiUJGU5ZB). If necessary, rename the file to a2c_pong.zip
"""
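"""The loading step described above is not shown in this excerpt; the sketch below is a minimal illustration, assuming the uploaded file has been renamed to a2c_pong.zip and that the `test_env` created earlier matches the observation and action spaces the A2C model was trained on (both are assumptions, not part of the original notebook)."""

from stable_baselines3 import A2C

# Sketch only: load the uploaded A2C model and evaluate it the same way as the DQN above.
# "a2c_pong" is the renamed upload; test_env is assumed to be compatible with the model.
a2c_model = A2C.load("a2c_pong")
mean_reward, std_reward = evaluate_policy(a2c_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")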
url = hash_url(file)  # get hash URL at start of execution
tensorboard_log = "/var/log/tensorboard/leaderboard"
seed = 0

ENV = "fishing-v1"
env = gym.make(ENV, sigma=0.1)
vec_env = make_vec_env(ENV, n_envs=4, seed=seed, sigma=0.1)  # parallel workers for PPO, A2C

## Constant Escapement ######################################################
model = escapement(env)
df = env.simulate(model, reps=10)
env.plot(df, "results/escapement.png")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)
leaderboard("ESC", ENV, mean_reward, std_reward, url)
print("algo:", "ESC", "env:", ENV, "mean reward:", mean_reward, "std:", std_reward)

## MSY ######################################################################
model = msy(env)
df = env.simulate(model, reps=10)
env.plot(df, "results/msy.png")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)

# Rescale score against optimum solution in this environment
opt = escapement(env)
opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward
leaderboard("MSY", ENV, mean_reward, std_reward, url)
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))
    os.makedirs(_log_dir, exist_ok=True)
    DoTraining = True
    StartFresh = True
    num_cpu = 8

    if DoTraining:
        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)  # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)
        if StartFresh:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=2, tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                             tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        for _ in range(50):
            model.learn(total_timesteps=100000, tb_log_name=env_id, reset_num_timesteps=False)  # , callback=callback
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) + '.mdl')
            env.save(_log_dir + 'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()
        eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
                         tensorboard_log=tb_log)
        model.set_env(eval_env)
        # record_video(env_id, model, video_length=500, prefix='ppo_'+env_id)

        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='')
        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
def main():
    set_random_seed(RANDOM_SEED)
    t_start = time()

    name = "LargeFinalLayer"
    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    # Creates a gym environment for an atari game using the specified seed and number of environments.
    # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
    # for improved performance.
    # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args)
    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **env_args)
        return env

    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    train_env = make_env(0, N_ENVS)
    eval_env = make_env(1, 1)

    # required by models in baselines
    train_env = VecTransposeImage(train_env)
    eval_env = VecTransposeImage(eval_env)

    # setup callback to save model at fixed intervals
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")
    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    # model.learn(TRAIN_STEPS, tb_log_name="run")
    model.save(zip_path)

    del train_env
    # del eval_env

    time_taken = time() - t_start

    print("Beginning evaluation...")
    # score of the game, standard deviation of multiple runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
env = gym.make("IntelligentPantry-v1") #env = gym.make("Reacher-v2") observation = env.reset() print(env.action_space) a = 0.45 b = 0.45 f = 1200 log_path = os.path.join('training', 'Logs') #env = DummyVecEnv([lambda: env]) model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path) model3 = TD3("MlpPolicy", env, verbose=1, tensorboard_log=log_path) model2 = DDPG('MlpPolicy', env, verbose=1, tensorboard_log=log_path) model3.learn(total_timesteps=500000, log_interval=100) eval = evaluate_policy(model3, env, n_eval_episodes=20, render=True) # episodes = 5 # for episode in range(1, episodes+1): # state = env.reset() # done = False # score = 0 # # while not done: # env.render() # action = env.action_space.sample() # n_state, reward, done, info = env.step(action) # score += reward # print("Episode:{} Score:{}".format(episode, score)) # env.close() # while f > 0:
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env.unwrapped._ref_sv[2] = 0.0
    env.unwrapped._ref_sv[3] = 0.0
    env.unwrapped._ref_sv[4] = 0.0
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)
    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    x = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getX(),
                          hist_sc_state))) / 1000.0  # Convert to km
    y = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getY(),
                          hist_sc_state))) / 1000.0  # Convert to km

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2.unwrapped._ref_sv[0] = 11000000.0 / 1.05
    env2.unwrapped._ref_sv[1] = 0.05
    env2.unwrapped._ref_sv[2] = 0.0
    env2.unwrapped._ref_sv[3] = 0.0
    env2.unwrapped._ref_sv[4] = 0.0
    env2 = NormalizeObservationSpace(env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)
    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action

    x2 = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getX(),
                           hist_sc_state2))) / 1000.0  # Convert to km
    y2 = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getY(),
                           hist_sc_state2))) / 1000.0  # Convert to km

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.set_xlim(-12000, 12000)
    axs.set_ylim(-12000, 12000)
    axs.grid(False)
    axs.plot(x, y, "k", zorder=2)
    l2, = axs.plot(x2, y2, zorder=1)
    l2.set_color("#777777")
    axs.legend(["Before", "After"], loc='upper right', frameon=False, bbox_to_anchor=(0.0, 1.0))
    im = mpimg.imread('earth.png')
    plt.imshow(im, extent=[-6400, 6400, -6400, 6400], interpolation="none")
    axs.set_aspect('equal')
    plt.text(11000, 0, "Pericenter")
    plt.text(-18500, 0, "Apocenter")
    plt.axis('off')
    plt.tight_layout()
    fig.savefig("orbit.pdf", format="pdf")
    plt.close(fig)
"C:\\Users\\tsbau\\git\\tum-adlr-ws21-04\\runs\\MLP_S64_P64_V64_N1000_B64_lr3e-4_AntCpLeftBackBulletEnv-v0_14-02_23-42-32" ) model_dir = Path('') model = PPO.load(model_dir / "model2.zip", device='cpu') env_name = 'AntBulletEnv-v0' eval_env = gym.make(env_name) eval_env.render( ) # call this before env.reset, if you want a window showing the environment def logging_callback(local_args, globals): if local_args["done"]: i = len(local_args["episode_rewards"]) episode_reward = local_args["episode_reward"] episode_length = local_args["episode_length"] print(f"Finished {i} episode with reward {episode_reward}") mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=109, render=True, deterministic=True, return_episode_rewards=False, callback=logging_callback) print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
from stable_baselines3 import ppo
from stable_baselines3.ppo.ppo import PPO
from stable_baselines3.common.evaluation import evaluate_policy
import gym
import Humanoid_Basic_Env

env = gym.make('HumanoidTinyEnv-v0')
# model = PPO.load('test_ppo', env=env)
model = PPO('MlpPolicy', env=env)
results = evaluate_policy(model=model, env=env, render=False)
print(results)
def evaluate(individual: Individual, device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return its mean score after the training time has elapsed.
    Models are trained and evaluated for a number of timesteps as parameterized in the constants at the
    top of the file.

    :param individual: The individual to evaluate.
    :return:
    """
    t_start = time()
    layers = individual.weights
    name = individual.encode()
    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)

    if os.path.exists(checkpoint_path):
        return (random.randint(MIN_SCORE, MAX_SCORE), )

    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Creates a gym environment for an atari game using the specified seed and number of environments.
        # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
        # for improved performance.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)

        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # setup callback to save model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList([save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")
        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env

        time_taken = time() - t_start

        print("Beginning evaluation...")
        # score of the game, standard deviation of multiple runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        reward_mean, reward_std, time_taken = json.load(open(results_path, "r"))

    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )

    print(f"Evaluated {name} with a score of {value} in {(time_taken):.2f}s")
    return value
def train_and_evaluate(config, train_env, eval_env, eval_callback, tb_log_name):
    """Trains and evaluates on separate environments with config parameters.

    Args:
        config (Dict): Relevant parameters.
        train_env (RampupEnv2): Environment to train with.
        eval_env (RampupEnv2): Environment to evaluate with.
        eval_callback (EvalCallback): Callback wrapper for evaluation callbacks during training.
        tb_log_name (str): Log name to identify metrics on Tensorboard.
    """
    best_model = None
    best_mean = -np.inf
    # sched_LR = LinearSchedule(config["TIMESTEPS"], 0.005, 0.00001)

    for i in range(config["REPETITIONS"]):
        print(f"\nRunning repetition {i+1}/{config['REPETITIONS']}...")
        model = A2C(
            "MlpPolicy",
            train_env,
            learning_rate=config["LEARNING_RATE"],
            policy_kwargs=config["POLICY_KWARGS"],
            tensorboard_log=config["TENSORBOARD_LOG"],
            verbose=0,
        )
        model.learn(
            total_timesteps=config["TIMESTEPS"],
            callback=[eval_callback, TensorboardCallback(tb_log_name)],
            tb_log_name=tb_log_name,
        )

        if config["SHOW_TABLE"]:
            eval_env.fill_table = True
            obs = eval_env._set_initial_state(initial_state_status=3)
            while not eval_env.done:
                action, _states = model.predict(obs, deterministic=False)
                obs, reward, done, info = eval_env.step(action)

        mean_reward, std_reward = evaluate_policy(
            model, eval_env, n_eval_episodes=config["EVAL_EPISODES"]
        )
        if mean_reward > best_mean:
            best_mean = mean_reward
            best_model = model

        if config["PUNISH_ILLEGAL"]:
            economic_potential = eval_env.demand.economic_potential_no_illegal()
        else:
            economic_potential = eval_env.demand.economic_potential()
        lost_potential = economic_potential - max(mean_reward, 0)
        lost_potential_perc = round(lost_potential / economic_potential * 100, 4)

        summary = "POLICY EVALUATION RESULTS"
        summary += f"\nEvaluated episodes:\t{config['EVAL_EPISODES']}"
        summary += f"\nMean reward:\t\t{mean_reward}"
        summary += f"\nStandard deviation:\t{std_reward}"
        summary += f"\nEconomic potential:\t{economic_potential}"
        summary += f"\nLost potential:\t\t{lost_potential} ({lost_potential_perc}%)"
        print(summary)

        if config["SHOW_TABLE"]:
            print("Sample Episode Table")
            display(eval_env.episode_table)

    return best_model, train_env, eval_env
def test_agent(agent):
    env = Monitor(get_env(42))  # We always test with seed 42 because it's the answer to... :-)
    mean_reward, std_reward = evaluate_policy(agent, env, n_eval_episodes=10)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1,
             tensorboard_log="./ddpg_pendulum_tensorboard/")

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("ddpg_pendulum")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")
def test_discrete(model_class):
    env = IdentityEnv(10)
    model = model_class('MlpPolicy', env, gamma=0.5, seed=0).learn(3000)
    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
def train(env_function,
          name="model",
          n_processes: int = 6,
          seed: int = 0,
          load_checkpoint: Optional[str] = None,
          from_index=0,
          to_index=12,
          steps_per_episode=125 * 1000):
    """
    Trains a model with a given environment

    :param env_function: Function that creates a gym.Env
    :param name: name for saving
    :param n_processes: number of processes used for training
    :param seed:
    :param load_checkpoint: if None: Create new model. Else: Load model from file
    :param steps_per_episode: Number of steps for model.learn()
    :param from_index: starting with this episode (for continuing training later than 0)
    :param to_index: last index of episode
    :return:
    """

    def make_env(rank: int):
        """
        Utility function for multiprocessed env.

        :param rank: index of the subprocess (needed to update seed)
        """

        def _init():
            env = env_function()
            # Important: use a different seed for each environment
            env.seed(seed + rank)
            return env

        return _init

    # Create the vectorized environment
    env_vector = SubprocVecEnv([make_env(i) for i in range(n_processes)])

    # Create model
    if load_checkpoint is None:
        model = PPO(
            "MlpPolicy",
            env_vector,
            tensorboard_log="./ppo_trafficgym_tensorboard/",
            verbose=2,
            learning_rate=1e-2,
            # gamma=0.95,
            batch_size=256,
            policy_kwargs=dict(net_arch=[64, 64]),
        )
    else:
        model = PPO.load(load_checkpoint)

    # Evaluate before training
    env = Monitor(env_function())
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval1:", evaluation)

    # Actual training
    t1 = time.time()
    for i in range(from_index, to_index + 1):
        try:
            model.learn(steps_per_episode)
            print(f"Save model {i}")
            model.save(f"{name}{i:02d}.stable_baselines")
        except KeyboardInterrupt:
            print("Interrupted by KeyBoard")
            break
    t2 = time.time()
    print(f"Learning took {t2 - t1} seconds")

    # Evaluate after training
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval2:", evaluation)
max_dist = get_max_dist(args.env_size)
steps_per_dist = args.n_steps // (max_dist + 1)
goal_distance = 1
for i in range(1, max_dist):
    cur_envs.append(create_env(goal_distance=i, args=args, max_steps=args.max_steps))

# env.close()
# del env
# env = create_env(goal_distance=goal_distance, args=args)
# model.set_env(env)

total_steps = 0
for eval in range(n_evals):
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=args.n_eval_episodes)
    eval_data[eval, 0] = mean_reward
    eval_data[eval, 1] = std_reward
    print("Total Steps: {}".format(total_steps))
    print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")
    print("")

    if args.save_periodically:
        np.savez(fname + '.npz', eval_data=eval_data)
        # model.save(fname + '_steps_{}'.format(total_steps))
        model.save(fname + '_partial')

    model.learn(total_timesteps=steps_per_eval)
    total_steps += steps_per_eval

    # potentially update curriculum if there is one
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0', use_perturbations=True, perturb_action=True)
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)
    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2 = NormalizeObservationSpace(env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)
    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action

    time2 = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state2[0].getDate()),
                              hist_sc_state2))) / 3600.0  # Convert to hours
    a2 = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state2))) / 1000.0  # Convert to km
    e2 = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state2)))
    mass2 = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state2)))
    ra2 = a2 * (1.0 + e2)
    rp2 = a2 * (1.0 - e2)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    l2, = axs.plot(time2, ra2, "--")
    l2.set_color("#777777")
    axs.plot(time, ra, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    l2, = axs.plot(time2, rp2, "--")
    l2.set_color("#777777")
    axs.plot(time, rp, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    l2, = axs.plot(time2, mass2, "--")
    l2.set_color("#777777")
    axs.plot(time, mass, "k")
    axs.legend(["Planned", "Real"], loc='upper right')
    plt.tight_layout()
    fig.savefig("real_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action)
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_action.pdf", format="pdf")
    plt.close(fig)
print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

# Save the policy independently from the model
# Note: if you don't save the complete model with `model.save()`
# you cannot continue training afterward
policy = model.policy
policy.save("sac_policy_pendulum")

# Retrieve the environment
env = model.get_env()

# Evaluate the policy
mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

# Load the policy independently from the model
saved_policy = MlpPolicy.load("sac_policy_pendulum")

# Evaluate the loaded policy
mean_reward, std_reward = evaluate_policy(saved_policy, env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        if self.model.get_vec_normalize_env() is not None:
            try:
                sync_envs_normalization(self.training_env, self.eval_env)
            except AttributeError:
                raise AssertionError(
                    "Training and eval env are not wrapped the same way, "
                    "see https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html#evalcallback "
                    "and warning above.")

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        # Dump log so the evaluation results are printed with the correct timestep
        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
        self.logger.dump(self.num_timesteps)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
def test_goal_env(model_class):
    env = BitFlippingEnv(n_bits=4)
    # check that goal env works for PPO/A2C that cannot use HER replay buffer
    model = model_class("MultiInputPolicy", env, n_steps=64).learn(250)
    evaluate_policy(model, model.get_env())
                             log_path=log_dir,
                             eval_freq=500)

'''
# run baseline algorithm
baseline_score = 0
done = False
observation = env.reset(NO_logging=0)
while not done:
    action = 'baseline'
    observation_, reward, done, info = env.step(action)
    # if done:
    #     env.plot()
    observation = observation_
    baseline_score += reward
print('baseline score: %.3f' % baseline_score)'''

# Train the agent
timesteps = 1 * 5000  # 1e5
# model.learn(total_timesteps=int(timesteps), callback=ransim_callback)
model.learn(total_timesteps=int(timesteps))

# fig = plt.figure()
plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "A2C ran-sim")
plt.savefig(log_dir + 'A2C_ran-sim_rewards_plot.png', format="png")
plt.show()

episode_rewards, episode_lengths = evaluate_policy(model,
                                                   env,
                                                   n_eval_episodes=10,
                                                   return_episode_rewards=True)
import gym
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC('MlpPolicy', env, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("sac_pendulum")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import TD3
from stable_baselines3.common.env_checker import check_env

env = gym.make('fishing-v1')
check_env(env)

load = False
if load:
    model = TD3.load("td3")
else:
    model = TD3('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=200)

## Simulate a run with the trained model, visualize result
df = env.simulate(model)
env.plot(df, "td3.png")

## Evaluate model
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print("mean reward:", mean_reward, "std:", std_reward)

## Save and reload the model
if not load:
    model.save("td3")
    model = TD3.load("td3")
def train_adril(env, n=0, balanced=False):
    num_trajs = 20
    expert_data = make_sa_dataset(env, max_trajs=num_trajs)
    n_expert = len(expert_data["obs"])
    expert_sa = np.concatenate((expert_data["obs"],
                                np.reshape(expert_data["acts"], (n_expert, -1))), axis=1)

    for i in range(0, n):
        venv = AdRILWrapper(gym.make(env))
        mean_rewards = []
        std_rewards = []
        # Create model
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy, venv, verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]), learning_starts=1)
        else:
            model = SAC('MlpPolicy', venv, verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4),
                        train_freq=64,
                        gradient_steps=64,
                        gamma=0.98,
                        tau=0.02)
        model.replay_buffer = AdRILReplayBuffer(model.buffer_size,
                                                model.observation_space,
                                                model.action_space,
                                                model.device,
                                                1,
                                                model.optimize_memory_usage,
                                                expert_data=expert_data,
                                                N_expert=num_trajs,
                                                balanced=balanced)
        if not balanced:
            for j in range(len(expert_sa)):
                obs = expert_data["obs"][j]
                act = expert_data["acts"][j]
                next_obs = expert_data["next_obs"][j]
                done = expert_data["dones"][j]
                model.replay_buffer.add(obs, next_obs, act, -1, done)

        for train_steps in range(400):
            # Train policy
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=1250, log_interval=1000)
                else:
                    model.learn(total_timesteps=25000, log_interval=1000)
                if train_steps % 1 == 0:  # written to support more complex update schemes
                    model.replay_buffer.set_iter(train_steps)
                    model.replay_buffer.set_n_learner(venv.num_trajs)

            # Evaluate policy
            if train_steps % 20 == 0:
                model.set_env(gym.make(env))
                mean_reward, std_reward = evaluate_policy(model, model.env, n_eval_episodes=10)
                mean_rewards.append(mean_reward)
                std_rewards.append(std_reward)
                print("{0} Steps: {1}".format(int(train_steps * 1250), mean_reward))
                np.savez(os.path.join("learners", env, "adril_rewards_{0}".format(i)),
                         means=mean_rewards,
                         stds=std_rewards)

            # Update env
            if train_steps > 0:
                if train_steps % 1 == 0:
                    venv.set_iter(train_steps + 1)
            model.set_env(venv)
def test_evaluate_policy_monitors(vec_env_class):
    # Make numpy warnings throw exception
    np.seterr(all="raise")
    # Test that results are correct with monitor environments.
    # Also test VecEnvs
    n_eval_episodes = 3
    n_envs = 2
    env_id = "CartPole-v0"
    model = A2C("MlpPolicy", env_id, seed=0)

    def make_eval_env(with_monitor, wrapper_class=gym.Wrapper):
        # Make eval environment with or without monitor in root,
        # and additionally wrapped with another wrapper (after Monitor).
        env = None
        if vec_env_class is None:
            # No vecenv, traditional env
            env = gym.make(env_id)
            if with_monitor:
                env = Monitor(env)
            env = wrapper_class(env)
        else:
            if with_monitor:
                env = vec_env_class([lambda: wrapper_class(Monitor(gym.make(env_id)))] * n_envs)
            else:
                env = vec_env_class([lambda: wrapper_class(gym.make(env_id))] * n_envs)
        return env

    # Test that evaluation with VecEnvs works as expected
    eval_env = make_eval_env(with_monitor=True)
    _ = evaluate_policy(model, eval_env, n_eval_episodes)
    eval_env.close()

    # Warning without Monitor
    eval_env = make_eval_env(with_monitor=False)
    with pytest.warns(UserWarning):
        _ = evaluate_policy(model, eval_env, n_eval_episodes)
    eval_env.close()

    # Test that we gather correct reward with Monitor wrapper
    # Sanity check that we get zero-reward without Monitor
    eval_env = make_eval_env(with_monitor=False, wrapper_class=ZeroRewardWrapper)
    average_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes, warn=False)
    assert average_reward == 0.0, "ZeroRewardWrapper wrapper for testing did not work"
    eval_env.close()

    # Should get non-zero-rewards with Monitor (true reward)
    eval_env = make_eval_env(with_monitor=True, wrapper_class=ZeroRewardWrapper)
    average_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes)
    assert average_reward > 0.0, "evaluate_policy did not get reward from Monitor"
    eval_env.close()

    # Test that we also track correct episode dones, not the wrapped ones.
    # Sanity check that we get only one step per episode.
    eval_env = make_eval_env(with_monitor=False, wrapper_class=AlwaysDoneWrapper)
    episode_rewards, episode_lengths = evaluate_policy(
        model, eval_env, n_eval_episodes, return_episode_rewards=True, warn=False)
    assert all(map(lambda l: l == 1, episode_lengths)), "AlwaysDoneWrapper did not fix episode lengths to one"
    eval_env.close()

    # Should get longer episodes with Monitor (true episodes)
    eval_env = make_eval_env(with_monitor=True, wrapper_class=AlwaysDoneWrapper)
    episode_rewards, episode_lengths = evaluate_policy(
        model, eval_env, n_eval_episodes, return_episode_rewards=True)
    assert all(map(lambda l: l > 1, episode_lengths)), "evaluate_policy did not get episode lengths from Monitor"
    eval_env.close()
import config
from environments import utils as env_utils
from helpers import cli
# Missing in the excerpt: `evaluation` is used below; stable_baselines3.common.evaluation
# provides evaluate_policy, which matches the call signature used here.
from stable_baselines3.common import evaluation

if __name__ == '__main__':
    # Extract command line arguments
    parser = cli.get_parser()
    args = parser.parse_args()
    scenario = args.scenario
    load_from = args.load
    config_path = args.config

    # Load config for session
    conf = config.load(config_path)

    # Create environments.
    eval_env = env_utils.get_evaluation_env(conf.environment_config)

    # Load agent
    agent = conf.get_agent(env=eval_env, load_from=load_from)

    mean_reward, std_reward = evaluation.evaluate_policy(agent, eval_env, n_eval_episodes=100)
    print(f'Mean reward {mean_reward} +/- {std_reward}')

    eval_env.close()
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward
        self.last_std = std_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                print("best model saving deactivated")
                # self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
def _on_step(self) -> bool:
    if self.n_calls > 0 and (self.n_calls - 1) % self.eval_freq == 0:
        self.old_params = [param.clone() for param in self.model.policy.parameters()]

    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        # Dump log so the evaluation results are printed with the correct timestep
        self.logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
        self.logger.dump(self.num_timesteps)

        if mean_reward >= self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "checkpoint"))
            self.best_mean_reward = mean_reward

            os.makedirs(self.log_path, exist_ok=True)
            save_path = os.path.join(self.log_path, "checkpoint")
            self.save_folders.append(save_path)
            model_parameters = self.model.policy.state_dict()
            grads = OrderedDict([(name, param.grad) for name, param in model_parameters.items()])
            torch.save(model_parameters, os.path.join(self.log_path, "parameters.th"))
            torch.save(grads, os.path.join(self.log_path, "grads.th"))
            if self.old_params is not None:
                delta = OrderedDict([(name, param - old_param)
                                     for old_param, (name, param) in zip(self.old_params,
                                                                         model_parameters.items())])
                torch.save(delta, os.path.join(self.log_path, "prev_step.th"))

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True