Example #1
def create_eval_envs(all_envs, time_limit=400, seed=0, discrete=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    eval_envs = []
    for eid, (ob_rms_fname, env) in all_envs.items():
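        # the saved ob_rms (observation running statistics) is stored as a
        # one-element list, hence the unpacking on the next line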
        [ob_rms] = torch.load(ob_rms_fname)
        eval_env = env(discrete=discrete)
        # eval_env.debug['show_reasons'] = True
        eval_env = utils.env.wrap_env(eval_env,
                                      action_normalize=not discrete,
                                      time_limit=time_limit,
                                      deterministic=True,
                                      seed=seed)
        env_fn = lambda: eval_env
        assert ob_rms is not None
        envs = utils.env.vectorize_env([env_fn],
                                       state_normalize=True,
                                       device=device,
                                       train=False,
                                       ob_rms=ob_rms)
        eval_envs += [envs]

    return eval_envs
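
A minimal usage sketch (the craft env classes and ob_rms checkpoint paths are taken from Example #2; the exact entries are illustrative): build one vectorized evaluation env per saved ob_rms checkpoint.

all_envs = {
    0: ("models/ob_rms/osco.pt", craft.OneStoppedCarOEnv),
    1: ("models/ob_rms/osc.pt", craft.OneStoppedCarEnv),
}
eval_envs = create_eval_envs(all_envs, time_limit=400, seed=0, discrete=False)
# one single-env vectorized wrapper per checkpoint, each carrying its own ob_rms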
Example #2
import torch
import json
import numpy as np
import matplotlib.pyplot as plt
import os

# project-local modules used below (exact import paths are assumed)
import craft
import learn
import utils.torch

all_envs = {
    0: ("models/ob_rms/osco.pt", craft.OneStoppedCarOEnv),
    1: ("models/ob_rms/osc.pt", craft.OneStoppedCarEnv),
    2: ("models/ob_rms/2sc.pt", craft.TwoStoppedCarsEnv),
    3: ("models/ob_rms/3sc.pt", craft.ThreeStoppedCarsSSO),
}
num_episodes = 10
render = False
discrete = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
intervals = 10
all_buckets = {}
bmax = 0
plt.figure(figsize=(10, 6))

for env_id, (model_name, env) in all_envs.items():
    eval_env = env(discrete=discrete)
    eval_env.debug['action_buckets'] = True
    if not os.path.isfile(model_name):
        print("Not trained, run with -train")
        exit(0)
    policy = utils.torch.load_empty_policy(learn.PolicyPPO,
                                           "models/gym_spaces.pt",
                                           hidden=64)
    ob_rms = policy.load_model(model_name)
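
The loop body above ends once the policy is loaded. A minimal sketch of how the rest of the loop body could roll out the loaded policy and ob_rms, reusing utils.env.wrap_env / vectorize_env as in Example #1 and utils.env.evaluate_ppo as it appears in Example #3 (the bookkeeping that fills all_buckets is not shown in the snippet and is left out here):

    wrapped = utils.env.wrap_env(eval_env,
                                 action_normalize=not discrete,
                                 time_limit=400,
                                 deterministic=True,
                                 seed=0)
    vec_env = utils.env.vectorize_env([lambda: wrapped],
                                      state_normalize=True,
                                      device=device,
                                      train=False,
                                      ob_rms=ob_rms)
    mean_r = utils.env.evaluate_ppo(policy, None, vec_env, device,
                                    num_episodes=num_episodes,
                                    wrap=False, silent=True)
    print(env_id, round(mean_r, 2))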
Example #3
def train_ppo(env_class,
              steps,
              track_eps=25,
              log_interval=1,
              solved_at=90.0,
              continual_solved_at=90.0,
              care_about=None,
              num_processes=8,
              gamma=0.99,
              MaxT=400,
              num_steps=128,
              clip_param=0.3,
              linear_schedule=True,
              policy=None,
              ob_rms=None,
              eval_envs=None,
              eval_eps=-1,
              hidden=-1,
              entropy_coef=0,
              linear_schedule_mode=0,
              lr=3e-4,
              training_seed=0,
              verbosity=1,
              training_method=learn.PPO,
              log_extras={},
              policy_class=learn.PolicyPPO,
              discrete=False):

    assert verbosity in [1, 2]
    is_continual = training_method.__name__ in ["PPO_EWC", "PPO_DM"]
    if is_continual:
        assert care_about is not None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_env_steps = int(steps)
    if eval_envs is not None:
        assert eval_eps > 0

    def env_fn(i):
        env = env_class(discrete=discrete)
        # env.debug['show_reasons'] = True
        env = utils.env.wrap_env(
            env,
            action_normalize=not discrete,
            time_limit=MaxT,
            deterministic=True,
            seed=i,
        )
        return lambda: env

    envs = utils.env.vectorize_env(
        [env_fn(i) for i in range(num_processes)],
        state_normalize=True,
        device=device,
        train=True,
    )
    if ob_rms is not None:
        envs.ob_rms = ob_rms

    obs_space, action_space = envs.observation_space, envs.action_space
    init_obs = envs.reset()

    torch.manual_seed(training_seed)
    print("training_method = %s" % training_method.__name__)
    agent = training_method(obs_space,
                            action_space,
                            init_obs,
                            clip_param=clip_param,
                            num_steps=num_steps,
                            lr=lr,
                            num_processes=num_processes,
                            gamma=gamma,
                            policy=policy,
                            hidden=hidden,
                            linear_schedule=linear_schedule,
                            entropy_coef=entropy_coef,
                            linear_schedule_mode=linear_schedule_mode,
                            policy_class=policy_class)

    num_updates = agent.compute_updates_needed(num_env_steps, num_processes)
    episode_rewards = collections.deque(maxlen=track_eps)
    s = collections.deque(maxlen=track_eps)
    log_dict = {
        'r': episode_rewards,
        'eps_done': 0,
        'satisfactions': s,
        **log_extras
    }
    start = utils.timer()
    ret_steps = -1

    for j in range(num_updates):

        agent.pre_step(j, num_updates)
        agent.step(envs, log=log_dict)
        vloss, piloss, ent = agent.train()

        if (j + 1) % log_interval == 0 and len(log_dict['r']) > 1:

            total_num_steps = (j + 1) * num_processes * num_steps
            elapsed = "Elapsed %s" % utils.timer_done(start)

            MeanR = np.mean(log_dict['r'])
            MedR = np.median(log_dict['r'])
            MinR = np.min(log_dict['r'])
            MaxR = np.max(log_dict['r'])
            if verbosity == 1:
                reward_stats = "MeanR:%.2f" % (MeanR)
                extra_stats = [reward_stats]
            elif verbosity == 2:
                reward_stats1 = "MeanR,MedR:%.2f,%.2f" % (MeanR, MedR)
                reward_stats2 = "MinR,MaxR:%.2f,%.2f" % (MinR, MaxR)
                reg_loss = None
                if isinstance(ent, list):
                    ent, reg_loss = ent
                loss_stats = "Ent:%f, VLoss:%f, PiLoss:%f" % (ent, vloss,
                                                              piloss)
                if reg_loss is not None: loss_stats += ", Reg:%f" % (reg_loss)
                extra_stats = [
                    reward_stats1,
                    reward_stats2,
                    loss_stats,
                ]
            reasons = "Reasons: %s" % set(s)
            stats = [
                "Steps:%g" % total_num_steps,
                "Eps:%d" % log_dict['eps_done'],
                elapsed,
                *extra_stats,
            ]
            print(" ".join(stats))
            print(reasons)
            if eval_envs is not None:
                eval_rews = []
                for eval_env in eval_envs:
                    eval_rews += [
                        utils.env.evaluate_ppo(agent.actor_critic,
                                               None,
                                               eval_env,
                                               device,
                                               num_episodes=eval_eps,
                                               wrap=False,
                                               silent=True)
                    ]
                    eval_rews[-1] = round(eval_rews[-1], 2)
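                # continual setting: average the clipped rewards over the first
                # `care_about` eval envs; otherwise only the care_about-th env is tracked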
                if is_continual:
                    eval_MeanR = np.mean(
                        np.clip(eval_rews[:care_about], -100., 100.))
                if not is_continual and care_about is not None:
                    eval_relevant_R = np.clip(eval_rews[care_about - 1], -100.,
                                              100.)
                print(eval_rews)
                # print("")
            sys.stdout.flush()

            if MeanR >= solved_at:
                if eval_envs is not None:
                    if is_continual:
                        if eval_MeanR < continual_solved_at:
                            continue
                    if not is_continual and care_about != None:
                        if eval_relevant_R < solved_at:
                            continue

                print("Model solved! Continue")
                ret_steps = total_num_steps
                break

    if ret_steps == -1:
        print("Not solved.")
    ob_rms = utils.env.get_ob_rms(envs)
    assert ob_rms is not None
    envs.close()
    return agent.actor_critic, ob_rms, ret_steps
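
A minimal usage sketch (the craft task class, step budget, and eval settings are illustrative; eval_envs can come from create_eval_envs in Example #1):

eval_envs = create_eval_envs(all_envs, time_limit=400, seed=0, discrete=False)
policy, ob_rms, solved_steps = train_ppo(craft.OneStoppedCarEnv,
                                         steps=2e6,
                                         eval_envs=eval_envs,
                                         eval_eps=5,
                                         care_about=1,
                                         verbosity=2)
# solved_steps is -1 if the solved_at threshold was never reached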