import torch

import utils  # project-local helpers (utils.env), assumed importable as in the repo


def create_eval_envs(all_envs, time_limit=400, seed=0, discrete=False):
    """Build one vectorized, observation-normalized evaluation env per task in `all_envs`."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    eval_envs = []
    for eid, (ob_rms_fname, env) in all_envs.items():
        # Each checkpoint stores the observation-normalization stats as a one-element list.
        [ob_rms] = torch.load(ob_rms_fname)
        eval_env = env(discrete=discrete)
        # eval_env.debug['show_reasons'] = True
        eval_env = utils.env.wrap_env(eval_env,
                                      action_normalize=not discrete,
                                      time_limit=time_limit,
                                      deterministic=True,
                                      seed=seed)
        # Bind the current env as a default argument so the factory does not pick up
        # a later iteration's env through late binding.
        env_fn = lambda e=eval_env: e
        assert ob_rms is not None
        envs = utils.env.vectorize_env([env_fn],
                                       state_normalize=True,
                                       device=device,
                                       train=False,
                                       ob_rms=ob_rms)
        eval_envs += [envs]
    return eval_envs
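# Usage sketch (illustrative, not part of the original script): evaluate an
# already-loaded policy on every task. It assumes `all_envs`, `policy`, and
# `device` are defined as in the surrounding listings, and reuses
# utils.env.evaluate_ppo with the same call signature as in train_ppo below.
eval_envs = create_eval_envs(all_envs, time_limit=400, seed=0, discrete=False)
for envs in eval_envs:
    mean_r = utils.env.evaluate_ppo(policy, None, envs, device,
                                    num_episodes=10, wrap=False, silent=True)
    print(round(mean_r, 2))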
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import torch

import craft  # project-local environments, assumed importable as in the repo
import learn  # project-local PPO implementation
import utils  # project-local helpers (utils.env, utils.torch)

# Per-task checkpoint path and environment class.
all_envs = {
    0: ("models/ob_rms/osco.pt", craft.OneStoppedCarOEnv),
    1: ("models/ob_rms/osc.pt", craft.OneStoppedCarEnv),
    2: ("models/ob_rms/2sc.pt", craft.TwoStoppedCarsEnv),
    3: ("models/ob_rms/3sc.pt", craft.ThreeStoppedCarsSSO),
}

num_episodes = 10
render = False
discrete = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

intervals = 10
all_buckets = {}
bmax = 0
plt.figure(figsize=(10, 6))

for env_id, (model_name, env) in all_envs.items():
    eval_env = env(discrete=discrete)
    eval_env.debug['action_buckets'] = True

    if not os.path.isfile(model_name):
        print("Not trained, run with -train")
        exit(0)

    policy = utils.torch.load_empty_policy(learn.PolicyPPO,
                                           "models/gym_spaces.pt",
                                           hidden=64)
    ob_rms = policy.load_model(model_name)
import collections
import sys

import numpy as np
import torch

import learn  # project-local PPO implementation (PPO, PPO_EWC, PPO_DM, PolicyPPO)
import utils  # project-local helpers (utils.env, utils.timer)


def train_ppo(env_class, steps, track_eps=25, log_interval=1, solved_at=90.0,
              continual_solved_at=90.0, care_about=None, num_processes=8,
              gamma=0.99, MaxT=400, num_steps=128, clip_param=0.3,
              linear_schedule=True, policy=None, ob_rms=None, eval_envs=None,
              eval_eps=-1, hidden=-1, entropy_coef=0, linear_schedule_mode=0,
              lr=3e-4, training_seed=0, verbosity=1, training_method=learn.PPO,
              log_extras={}, policy_class=learn.PolicyPPO, discrete=False):
    """Train `env_class` with PPO (or a continual-learning variant) and return
    (actor_critic, ob_rms, steps_until_solved)."""
    assert verbosity in [1, 2]
    is_continual = training_method.__name__ in ["PPO_EWC", "PPO_DM"]
    if is_continual:
        assert care_about is not None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_env_steps = int(steps)
    if eval_envs is not None:
        assert eval_eps > 0

    def env_fn(i):
        # Each worker gets its own wrapped environment, seeded by its index.
        env = env_class(discrete=discrete)
        # env.debug['show_reasons'] = True
        env = utils.env.wrap_env(env,
                                 action_normalize=not discrete,
                                 time_limit=MaxT,
                                 deterministic=True,
                                 seed=i)
        return lambda: env

    envs = utils.env.vectorize_env([env_fn(i) for i in range(num_processes)],
                                   state_normalize=True,
                                   device=device,
                                   train=True)
    if ob_rms is not None:
        # Resume with previously collected observation-normalization statistics.
        envs.ob_rms = ob_rms
    obs_space, action_space = envs.observation_space, envs.action_space
    init_obs = envs.reset()
    torch.manual_seed(training_seed)
    print("training_method = %s" % training_method.__name__)

    agent = training_method(obs_space, action_space, init_obs,
                            clip_param=clip_param, num_steps=num_steps, lr=lr,
                            num_processes=num_processes, gamma=gamma,
                            policy=policy, hidden=hidden,
                            linear_schedule=linear_schedule,
                            entropy_coef=entropy_coef,
                            linear_schedule_mode=linear_schedule_mode,
                            policy_class=policy_class)
    num_updates = agent.compute_updates_needed(num_env_steps, num_processes)

    episode_rewards = collections.deque(maxlen=track_eps)
    s = collections.deque(maxlen=track_eps)
    log_dict = {'r': episode_rewards,
                'eps_done': 0,
                'satisfactions': s,
                **log_extras}
    start = utils.timer()
    ret_steps = -1

    for j in range(num_updates):
        # One update: collect num_steps transitions per worker, then optimize.
        agent.pre_step(j, num_updates)
        agent.step(envs, log=log_dict)
        vloss, piloss, ent = agent.train()

        if (j + 1) % log_interval == 0 and len(log_dict['r']) > 1:
            total_num_steps = (j + 1) * num_processes * num_steps
            elapsed = "Elapsed %s" % utils.timer_done(start)
            MeanR = np.mean(log_dict['r'])
            MedR = np.median(log_dict['r'])
            MinR = np.min(log_dict['r'])
            MaxR = np.max(log_dict['r'])
            if verbosity == 1:
                reward_stats = "MeanR:%.2f" % (MeanR)
                extra_stats = [reward_stats]
            elif verbosity == 2:
                reward_stats1 = "MeanR,MedR:%.2f,%.2f" % (MeanR, MedR)
                reward_stats2 = "MinR,MaxR:%.2f,%.2f" % (MinR, MaxR)
                # Continual-learning variants return (entropy, regularization loss).
                reg_loss = None
                if isinstance(ent, list):
                    ent, reg_loss = ent
                loss_stats = "Ent:%f, VLoss:%f, PiLoss:%f" % (ent, vloss, piloss)
                if reg_loss is not None:
                    loss_stats += ", Reg:%f" % (reg_loss)
                extra_stats = [reward_stats1, reward_stats2, loss_stats]
            reasons = "Reasons: %s" % (set(list(s)))
            stats = ["Steps:%g" % total_num_steps,
                     "Eps:%d" % log_dict['eps_done'],
                     elapsed,
                     *extra_stats]
            print(" ".join(stats))
            print(reasons)

            if eval_envs is not None:
                # Periodically evaluate the current policy on the held-out eval envs.
                eval_rews = []
                for eval_env in eval_envs:
                    eval_rews += [utils.env.evaluate_ppo(agent.actor_critic, None,
                                                         eval_env, device,
                                                         num_episodes=eval_eps,
                                                         wrap=False, silent=True)]
                    eval_rews[-1] = round(eval_rews[-1], 2)
                if is_continual:
                    # Average clipped reward over the first `care_about` tasks.
                    eval_MeanR = np.mean(np.clip(eval_rews[:care_about], -100., 100.))
                if not is_continual and care_about is not None:
                    eval_relevant_R = np.clip(eval_rews[care_about - 1], -100., 100.)
                print(eval_rews)
            # print("")
            sys.stdout.flush()

            if MeanR >= solved_at:
                # Only stop early if the evaluation criteria (when given) are also met.
                if eval_envs is not None:
                    if is_continual:
                        if eval_MeanR < continual_solved_at:
                            continue
                    if not is_continual and care_about is not None:
                        if eval_relevant_R < solved_at:
                            continue
                print("Model solved! Continue")
                ret_steps = total_num_steps
                break

    if ret_steps == -1:
        print("Not solved.")
    # Return the normalization statistics so evaluation can reuse them.
    ob_rms = utils.env.get_ob_rms(envs)
    assert ob_rms is not None
    envs.close()
    return agent.actor_critic, ob_rms, ret_steps
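# Usage sketch (illustrative, not part of the original script): train one task
# while monitoring the evaluation suite built by create_eval_envs. The step
# budget, care_about index, and hidden size are assumptions for this example.
eval_envs = create_eval_envs(all_envs, time_limit=400, seed=0, discrete=False)
actor_critic, ob_rms, ret_steps = train_ppo(craft.TwoStoppedCarsEnv,
                                            steps=2e6,
                                            eval_envs=eval_envs,
                                            eval_eps=10,
                                            care_about=3,
                                            hidden=64,
                                            verbosity=2)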