def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)
    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)
    v = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getVelocity().toArray(), hist_sc_state)))
    h = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getMomentum().toArray(), hist_sc_state)))
    angle_f_v = list(map(
        lambda q: np.degrees(np.arccos(
            np.dot(q[0], q[1]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1]) + 1e-10)
        )),
        zip(v, hist_action)))
    hist_action_plane = list(map(
        lambda q: q[1] - np.dot(q[1], q[0]) * q[0] / (np.linalg.norm(q[0]) ** 2),
        zip(h, hist_action)))
    angle_fp_v = list(map(
        lambda q: np.degrees(np.arccos(
            np.dot(q[0], q[1] * [1, 1, 0]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1] * [1, 1, 0]) + 1e-10)
        )),
        zip(v, hist_action_plane)))

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    axs.plot(time, ra, "k")
    plt.tight_layout()
    fig.savefig("plan_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    axs.plot(time, rp, "k")
    plt.tight_layout()
    fig.savefig("plan_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    axs.plot(time, mass, "k")
    plt.tight_layout()
    fig.savefig("plan_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action, "k")
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("plan_action.pdf", format="pdf")
    plt.close(fig)
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("fishing-v0")
model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=100000)

## simulate and plot results
df = env.simulate(model, reps=10)
env.plot(df, "ppo.png")

## Evaluate model
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=50)
print("mean reward:", mean_reward, "std:", std_reward)

# save trained agent for future use, if desired
# model.save("ppo")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/1/18 17:52
# @Author : lucky3721
# @Version: V 1.0
# @File : lunar_land_example.py
# @desc :
import gym
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

model = DQN('MlpPolicy', 'LunarLander-v2', verbose=1,
            exploration_final_eps=0.1, target_update_interval=250)
model.learn(total_timesteps=int(1e5))

# Separate env for evaluation
eval_env = gym.make('LunarLander-v2')

# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
time_stamp = datetime.datetime.now().strftime("-%Y%m%d-%H%M%S")
model_log = LOG_DIR + model_name + time_stamp

dqn_model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=model_log, buffer_size=100000)

max_steps = 100
dqn_model.learn(total_timesteps=max_steps)

# Commented out IPython magic to ensure Python compatibility.
# %tensorboard --logdir {LOG_DIR}

"""## Performance evaluation"""

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dqn_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")

"""## Saving the trained model"""

dqn_model.save("dqn_pong")

"""The model will be saved as a zip file; you can download it locally from the left sidebar of Colab under "Files" and then the ellipsis menu next to the filename.

## Loading a trained model

From the left sidebar of Colab, under "Files", upload the zip file of the trained model. Here we will upload an A2C model that we trained earlier. You can download it from [here](https://drive.google.com/uc?export=download&id=1COsaNOH8SjbpxxIYc5lOF-QUiUJGU5ZB). If necessary, rename the file to a2c_pong.zip
"""
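"""The loading step described above is not shown in this excerpt; the sketch below is a minimal illustration, assuming the uploaded file has been renamed to a2c_pong.zip and that the `test_env` created earlier matches the observation and action spaces the A2C model was trained on (both are assumptions, not part of the original notebook)."""

from stable_baselines3 import A2C

# Sketch only: load the uploaded A2C model and evaluate it the same way as the DQN above.
# "a2c_pong" is the renamed upload; test_env is assumed to be compatible with the model.
a2c_model = A2C.load("a2c_pong")
mean_reward, std_reward = evaluate_policy(a2c_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")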
url = hash_url(file)  # get hash URL at start of execution
tensorboard_log = "/var/log/tensorboard/leaderboard"
seed = 0

ENV = "fishing-v1"
env = gym.make(ENV, sigma=0.1)
vec_env = make_vec_env(ENV, n_envs=4, seed=seed, sigma=0.1)  # parallel workers for PPO, A2C

## Constant Escapement ######################################################
model = escapement(env)
df = env.simulate(model, reps=10)
env.plot(df, "results/escapement.png")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)
leaderboard("ESC", ENV, mean_reward, std_reward, url)
print("algo:", "ESC", "env:", ENV, "mean reward:", mean_reward, "std:", std_reward)

## MSY ######################################################################
model = msy(env)
df = env.simulate(model, reps=10)
env.plot(df, "results/msy.png")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1000)

# Rescale score against optimum solution in this environment
opt = escapement(env)
opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward
leaderboard("MSY", ENV, mean_reward, std_reward, url)
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))
    os.makedirs(_log_dir, exist_ok=True)
    DoTraining = True
    StartFresh = True
    num_cpu = 8

    if DoTraining:
        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)  # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)
        if StartFresh:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=2, tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                             tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        for _ in range(50):
            model.learn(total_timesteps=100000, tb_log_name=env_id, reset_num_timesteps=False)  # , callback=callback
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) + '.mdl')
            env.save(_log_dir + 'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()
        eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
                         tensorboard_log=tb_log)
        model.set_env(eval_env)
        # record_video(env_id, model, video_length=500, prefix='ppo_'+env_id)

        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='')
        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
def main():
    set_random_seed(RANDOM_SEED)
    t_start = time()

    name = "LargeFinalLayer"
    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    # Creates a gym environment for an atari game using the specified seed and number of environments.
    # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
    # for improved performance.
    # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args)
    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **env_args)
        return env

    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    train_env = make_env(0, N_ENVS)
    eval_env = make_env(1, 1)

    # required by models in baselines
    train_env = VecTransposeImage(train_env)
    eval_env = VecTransposeImage(eval_env)

    # setup callback to save model at fixed intervals
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")
    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    # model.learn(TRAIN_STEPS, tb_log_name="run")
    model.save(zip_path)

    del train_env
    # del eval_env

    time_taken = time() - t_start

    print("Beginning evaluation...")
    # score of the game, standard deviation of multiple runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
env = gym.make("IntelligentPantry-v1") #env = gym.make("Reacher-v2") observation = env.reset() print(env.action_space) a = 0.45 b = 0.45 f = 1200 log_path = os.path.join('training', 'Logs') #env = DummyVecEnv([lambda: env]) model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path) model3 = TD3("MlpPolicy", env, verbose=1, tensorboard_log=log_path) model2 = DDPG('MlpPolicy', env, verbose=1, tensorboard_log=log_path) model3.learn(total_timesteps=500000, log_interval=100) eval = evaluate_policy(model3, env, n_eval_episodes=20, render=True) # episodes = 5 # for episode in range(1, episodes+1): # state = env.reset() # done = False # score = 0 # # while not done: # env.render() # action = env.action_space.sample() # n_state, reward, done, info = env.step(action) # score += reward # print("Episode:{} Score:{}".format(episode, score)) # env.close() # while f > 0:
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env.unwrapped._ref_sv[2] = 0.0
    env.unwrapped._ref_sv[3] = 0.0
    env.unwrapped._ref_sv[4] = 0.0
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)
    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    x = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getX(),
                          hist_sc_state))) / 1000.0  # Convert to km
    y = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getY(),
                          hist_sc_state))) / 1000.0  # Convert to km

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2.unwrapped._ref_sv[0] = 11000000.0 / 1.05
    env2.unwrapped._ref_sv[1] = 0.05
    env2.unwrapped._ref_sv[2] = 0.0
    env2.unwrapped._ref_sv[3] = 0.0
    env2.unwrapped._ref_sv[4] = 0.0
    env2 = NormalizeObservationSpace(env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)
    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action

    x2 = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getX(),
                           hist_sc_state2))) / 1000.0  # Convert to km
    y2 = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getY(),
                           hist_sc_state2))) / 1000.0  # Convert to km

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.set_xlim(-12000, 12000)
    axs.set_ylim(-12000, 12000)
    axs.grid(False)
    axs.plot(x, y, "k", zorder=2)
    l2, = axs.plot(x2, y2, zorder=1)
    l2.set_color("#777777")
    axs.legend(["Before", "After"], loc='upper right', frameon=False, bbox_to_anchor=(0.0, 1.0))
    im = mpimg.imread('earth.png')
    plt.imshow(im, extent=[-6400, 6400, -6400, 6400], interpolation="none")
    axs.set_aspect('equal')
    plt.text(11000, 0, "Pericenter")
    plt.text(-18500, 0, "Apocenter")
    plt.axis('off')
    plt.tight_layout()
    fig.savefig("orbit.pdf", format="pdf")
    plt.close(fig)
"C:\\Users\\tsbau\\git\\tum-adlr-ws21-04\\runs\\MLP_S64_P64_V64_N1000_B64_lr3e-4_AntCpLeftBackBulletEnv-v0_14-02_23-42-32" ) model_dir = Path('') model = PPO.load(model_dir / "model2.zip", device='cpu') env_name = 'AntBulletEnv-v0' eval_env = gym.make(env_name) eval_env.render( ) # call this before env.reset, if you want a window showing the environment def logging_callback(local_args, globals): if local_args["done"]: i = len(local_args["episode_rewards"]) episode_reward = local_args["episode_reward"] episode_length = local_args["episode_length"] print(f"Finished {i} episode with reward {episode_reward}") mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=109, render=True, deterministic=True, return_episode_rewards=False, callback=logging_callback) print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
from stable_baselines3 import ppo
from stable_baselines3.ppo.ppo import PPO
from stable_baselines3.common.evaluation import evaluate_policy
import gym
import Humanoid_Basic_Env

env = gym.make('HumanoidTinyEnv-v0')
# model = PPO.load('test_ppo', env=env)
model = PPO('MlpPolicy', env=env)
results = evaluate_policy(model=model, env=env, render=False)
print(results)
def evaluate(individual: Individual, device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return its mean score after the training time has elapsed.
    Models are trained and evaluated for a number of timesteps as parameterized in the constants at the
    top of the file.

    :param individual: The individual to evaluate.
    :return:
    """
    t_start = time()
    layers = individual.weights
    name = individual.encode()
    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)

    if os.path.exists(checkpoint_path):
        return (random.randint(MIN_SCORE, MAX_SCORE), )

    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Creates a gym environment for an atari game using the specified seed and number of environments.
        # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
        # for improved performance.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)

        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # setup callback to save model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList([save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")
        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env

        time_taken = time() - t_start

        print("Beginning evaluation...")
        # score of the game, standard deviation of multiple runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        reward_mean, reward_std, time_taken = json.load(open(results_path, "r"))

    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )

    print(f"Evaluated {name} with a score of {value} in {(time_taken):.2f}s")
    return value
def train_and_evaluate(config, train_env, eval_env, eval_callback, tb_log_name):
    """Trains and evaluates on separate environments with config parameters.

    Args:
        config (Dict): Relevant parameters.
        train_env (RampupEnv2): Environment to train with.
        eval_env (RampupEnv2): Environment to evaluate with.
        eval_callback (EvalCallback): Callback wrapper for evaluation callbacks during training.
        tb_log_name (str): Log name to identify metrics on Tensorboard.
    """
    best_model = None
    best_mean = -np.inf
    # sched_LR = LinearSchedule(config["TIMESTEPS"], 0.005, 0.00001)

    for i in range(config["REPETITIONS"]):
        print(f"\nRunning repetition {i+1}/{config['REPETITIONS']}...")
        model = A2C(
            "MlpPolicy",
            train_env,
            learning_rate=config["LEARNING_RATE"],
            policy_kwargs=config["POLICY_KWARGS"],
            tensorboard_log=config["TENSORBOARD_LOG"],
            verbose=0,
        )
        model.learn(
            total_timesteps=config["TIMESTEPS"],
            callback=[eval_callback, TensorboardCallback(tb_log_name)],
            tb_log_name=tb_log_name,
        )

        if config["SHOW_TABLE"]:
            eval_env.fill_table = True
            obs = eval_env._set_initial_state(initial_state_status=3)
            while not eval_env.done:
                action, _states = model.predict(obs, deterministic=False)
                obs, reward, done, info = eval_env.step(action)

        mean_reward, std_reward = evaluate_policy(
            model, eval_env, n_eval_episodes=config["EVAL_EPISODES"]
        )
        if mean_reward > best_mean:
            best_mean = mean_reward
            best_model = model

        if config["PUNISH_ILLEGAL"]:
            economic_potential = eval_env.demand.economic_potential_no_illegal()
        else:
            economic_potential = eval_env.demand.economic_potential()
        lost_potential = economic_potential - max(mean_reward, 0)
        lost_potential_perc = round(lost_potential / economic_potential * 100, 4)

        summary = "POLICY EVALUATION RESULTS"
        summary += f"\nEvaluated episodes:\t{config['EVAL_EPISODES']}"
        summary += f"\nMean reward:\t\t{mean_reward}"
        summary += f"\nStandard deviation:\t{std_reward}"
        summary += f"\nEconomic potential:\t{economic_potential}"
        summary += f"\nLost potential:\t\t{lost_potential} ({lost_potential_perc}%)"
        print(summary)

        if config["SHOW_TABLE"]:
            print("Sample Episode Table")
            display(eval_env.episode_table)

    return best_model, train_env, eval_env
def test_agent(agent):
    env = Monitor(get_env(42))  # We always test with seed 42 because it's the answer to... :-)
    mean_reward, std_reward = evaluate_policy(agent, env, n_eval_episodes=10)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1,
             tensorboard_log="./ddpg_pendulum_tensorboard/")

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("ddpg_pendulum")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")
def test_discrete(model_class):
    env = IdentityEnv(10)
    model = model_class('MlpPolicy', env, gamma=0.5, seed=0).learn(3000)
    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
def train(env_function,
          name="model",
          n_processes: int = 6,
          seed: int = 0,
          load_checkpoint: Optional[str] = None,
          from_index=0,
          to_index=12,
          steps_per_episode=125 * 1000):
    """
    Trains a model with a given environment

    :param env_function: Function that creates a gym.Env
    :param name: name for saving
    :param n_processes: number of processes used for training
    :param seed:
    :param load_checkpoint: if None: Create new model. Else: Load model from file
    :param steps_per_episode: Number of steps for model.learn()
    :param from_index: starting with this episode (for continuing training later than 0)
    :param to_index: last index of episode
    :return:
    """

    def make_env(rank: int):
        """
        Utility function for multiprocessed env.

        :param rank: index of the subprocess (needed to update seed)
        """

        def _init():
            env = env_function()
            # Important: use a different seed for each environment
            env.seed(seed + rank)
            return env

        return _init

    # Create the vectorized environment
    env_vector = SubprocVecEnv([make_env(i) for i in range(n_processes)])

    # Create model
    if load_checkpoint is None:
        model = PPO(
            "MlpPolicy",
            env_vector,
            tensorboard_log="./ppo_trafficgym_tensorboard/",
            verbose=2,
            learning_rate=1e-2,
            # gamma=0.95,
            batch_size=256,
            policy_kwargs=dict(net_arch=[64, 64]),
        )
    else:
        model = PPO.load(load_checkpoint)

    # Evaluate before training
    env = Monitor(env_function())
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval1:", evaluation)

    # Actual training
    t1 = time.time()
    for i in range(from_index, to_index + 1):
        try:
            model.learn(steps_per_episode)
            print(f"Save model {i}")
            model.save(f"{name}{i:02d}.stable_baselines")
        except KeyboardInterrupt:
            print("Interrupted by KeyBoard")
            break
    t2 = time.time()
    print(f"Learning took {t2 - t1} seconds")

    # Evaluate after training
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval2:", evaluation)
max_dist = get_max_dist(args.env_size)
steps_per_dist = args.n_steps // (max_dist + 1)
goal_distance = 1
for i in range(1, max_dist):
    cur_envs.append(create_env(goal_distance=i, args=args, max_steps=args.max_steps))

# env.close()
# del env
# env = create_env(goal_distance=goal_distance, args=args)
# model.set_env(env)

total_steps = 0
for eval in range(n_evals):
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=args.n_eval_episodes)
    eval_data[eval, 0] = mean_reward
    eval_data[eval, 1] = std_reward
    print("Total Steps: {}".format(total_steps))
    print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")
    print("")

    if args.save_periodically:
        np.savez(fname + '.npz', eval_data=eval_data)
        # model.save(fname + '_steps_{}'.format(total_steps))
        model.save(fname + '_partial')

    model.learn(total_timesteps=steps_per_eval)
    total_steps += steps_per_eval

    # potentially update curriculum if there is one
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0', use_perturbations=True, perturb_action=True)
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)
    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2 = NormalizeObservationSpace(env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)
    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action

    time2 = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state2[0].getDate()),
                              hist_sc_state2))) / 3600.0  # Convert to hours
    a2 = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state2))) / 1000.0  # Convert to km
    e2 = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state2)))
    mass2 = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state2)))
    ra2 = a2 * (1.0 + e2)
    rp2 = a2 * (1.0 - e2)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    l2, = axs.plot(time2, ra2, "--")
    l2.set_color("#777777")
    axs.plot(time, ra, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    l2, = axs.plot(time2, rp2, "--")
    l2.set_color("#777777")
    axs.plot(time, rp, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    l2, = axs.plot(time2, mass2, "--")
    l2.set_color("#777777")
    axs.plot(time, mass, "k")
    axs.legend(["Planned", "Real"], loc='upper right')
    plt.tight_layout()
    fig.savefig("real_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action)
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_action.pdf", format="pdf")
    plt.close(fig)
print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

# Save the policy independently from the model
# Note: if you don't save the complete model with `model.save()`
# you cannot continue training afterward
policy = model.policy
policy.save("sac_policy_pendulum")

# Retrieve the environment
env = model.get_env()

# Evaluate the policy
mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

# Load the policy independently from the model
saved_policy = MlpPolicy.load("sac_policy_pendulum")

# Evaluate the loaded policy
mean_reward, std_reward = evaluate_policy(saved_policy, env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        if self.model.get_vec_normalize_env() is not None:
            try:
                sync_envs_normalization(self.training_env, self.eval_env)
            except AttributeError:
                raise AssertionError(
                    "Training and eval env are not wrapped the same way, "
                    "see https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html#evalcallback "
                    "and warning above.")

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        # Dump log so the evaluation results are printed with the correct timestep
        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
        self.logger.dump(self.num_timesteps)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
def test_goal_env(model_class):
    env = BitFlippingEnv(n_bits=4)
    # check that goal env works for PPO/A2C that cannot use HER replay buffer
    model = model_class("MultiInputPolicy", env, n_steps=64).learn(250)
    evaluate_policy(model, model.get_env())
                             log_path=log_dir,
                             eval_freq=500)

'''
# run baseline algorithm
baseline_score = 0
done = False
observation = env.reset(NO_logging=0)
while not done:
    action = 'baseline'
    observation_, reward, done, info = env.step(action)
    # if done:
    #     env.plot()
    observation = observation_
    baseline_score += reward
print('baseline score: %.3f' % baseline_score)'''

# Train the agent
timesteps = 1 * 5000  # 1e5
# model.learn(total_timesteps=int(timesteps), callback=ransim_callback)
model.learn(total_timesteps=int(timesteps))

# fig = plt.figure()
plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "A2C ran-sim")
plt.savefig(log_dir + 'A2C_ran-sim_rewards_plot.png', format="png")
plt.show()

episode_rewards, episode_lengths = evaluate_policy(model,
                                                   env,
                                                   n_eval_episodes=10,
                                                   return_episode_rewards=True)
import gym
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('Pendulum-v0')
env = DummyVecEnv([lambda: env])

model = SAC('MlpPolicy', env, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("sac_pendulum")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import TD3
from stable_baselines3.common.env_checker import check_env

env = gym.make('fishing-v1')
check_env(env)

load = False
if load:
    model = TD3.load("td3")
else:
    model = TD3('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=200)

## Simulate a run with the trained model, visualize result
df = env.simulate(model)
env.plot(df, "td3.png")

## Evaluate model
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print("mean reward:", mean_reward, "std:", std_reward)

## Save and reload the model
if not load:
    model.save("td3")
    model = TD3.load("td3")
def train_adril(env, n=0, balanced=False):
    num_trajs = 20
    expert_data = make_sa_dataset(env, max_trajs=num_trajs)
    n_expert = len(expert_data["obs"])
    expert_sa = np.concatenate((expert_data["obs"],
                                np.reshape(expert_data["acts"], (n_expert, -1))), axis=1)

    for i in range(0, n):
        venv = AdRILWrapper(gym.make(env))
        mean_rewards = []
        std_rewards = []
        # Create model
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy, venv, verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]), learning_starts=1)
        else:
            model = SAC('MlpPolicy', venv, verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4),
                        train_freq=64,
                        gradient_steps=64,
                        gamma=0.98,
                        tau=0.02)
        model.replay_buffer = AdRILReplayBuffer(model.buffer_size,
                                                model.observation_space,
                                                model.action_space,
                                                model.device,
                                                1,
                                                model.optimize_memory_usage,
                                                expert_data=expert_data,
                                                N_expert=num_trajs,
                                                balanced=balanced)
        if not balanced:
            for j in range(len(expert_sa)):
                obs = expert_data["obs"][j]
                act = expert_data["acts"][j]
                next_obs = expert_data["next_obs"][j]
                done = expert_data["dones"][j]
                model.replay_buffer.add(obs, next_obs, act, -1, done)

        for train_steps in range(400):
            # Train policy
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=1250, log_interval=1000)
                else:
                    model.learn(total_timesteps=25000, log_interval=1000)
                if train_steps % 1 == 0:  # written to support more complex update schemes
                    model.replay_buffer.set_iter(train_steps)
                    model.replay_buffer.set_n_learner(venv.num_trajs)

            # Evaluate policy
            if train_steps % 20 == 0:
                model.set_env(gym.make(env))
                mean_reward, std_reward = evaluate_policy(model, model.env, n_eval_episodes=10)
                mean_rewards.append(mean_reward)
                std_rewards.append(std_reward)
                print("{0} Steps: {1}".format(int(train_steps * 1250), mean_reward))
                np.savez(os.path.join("learners", env, "adril_rewards_{0}".format(i)),
                         means=mean_rewards,
                         stds=std_rewards)

            # Update env
            if train_steps > 0:
                if train_steps % 1 == 0:
                    venv.set_iter(train_steps + 1)
            model.set_env(venv)
def test_evaluate_policy_monitors(vec_env_class):
    # Make numpy warnings throw exception
    np.seterr(all="raise")
    # Test that results are correct with monitor environments.
    # Also test VecEnvs
    n_eval_episodes = 3
    n_envs = 2
    env_id = "CartPole-v0"
    model = A2C("MlpPolicy", env_id, seed=0)

    def make_eval_env(with_monitor, wrapper_class=gym.Wrapper):
        # Make eval environment with or without monitor in root,
        # and additionally wrapped with another wrapper (after Monitor).
        env = None
        if vec_env_class is None:
            # No vecenv, traditional env
            env = gym.make(env_id)
            if with_monitor:
                env = Monitor(env)
            env = wrapper_class(env)
        else:
            if with_monitor:
                env = vec_env_class([lambda: wrapper_class(Monitor(gym.make(env_id)))] * n_envs)
            else:
                env = vec_env_class([lambda: wrapper_class(gym.make(env_id))] * n_envs)
        return env

    # Test that evaluation with VecEnvs works as expected
    eval_env = make_eval_env(with_monitor=True)
    _ = evaluate_policy(model, eval_env, n_eval_episodes)
    eval_env.close()

    # Warning without Monitor
    eval_env = make_eval_env(with_monitor=False)
    with pytest.warns(UserWarning):
        _ = evaluate_policy(model, eval_env, n_eval_episodes)
    eval_env.close()

    # Test that we gather correct reward with Monitor wrapper
    # Sanity check that we get zero-reward without Monitor
    eval_env = make_eval_env(with_monitor=False, wrapper_class=ZeroRewardWrapper)
    average_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes, warn=False)
    assert average_reward == 0.0, "ZeroRewardWrapper wrapper for testing did not work"
    eval_env.close()

    # Should get non-zero-rewards with Monitor (true reward)
    eval_env = make_eval_env(with_monitor=True, wrapper_class=ZeroRewardWrapper)
    average_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes)
    assert average_reward > 0.0, "evaluate_policy did not get reward from Monitor"
    eval_env.close()

    # Test that we also track correct episode dones, not the wrapped ones.
    # Sanity check that we get only one step per episode.
    eval_env = make_eval_env(with_monitor=False, wrapper_class=AlwaysDoneWrapper)
    episode_rewards, episode_lengths = evaluate_policy(
        model, eval_env, n_eval_episodes, return_episode_rewards=True, warn=False)
    assert all(map(lambda l: l == 1, episode_lengths)), "AlwaysDoneWrapper did not fix episode lengths to one"
    eval_env.close()

    # Should get longer episodes with Monitor (true episodes)
    eval_env = make_eval_env(with_monitor=True, wrapper_class=AlwaysDoneWrapper)
    episode_rewards, episode_lengths = evaluate_policy(
        model, eval_env, n_eval_episodes, return_episode_rewards=True)
    assert all(map(lambda l: l > 1, episode_lengths)), "evaluate_policy did not get episode lengths from Monitor"
    eval_env.close()
import config
from environments import utils as env_utils
from helpers import cli
# Missing in the excerpt: `evaluation` is used below; stable_baselines3.common.evaluation
# provides evaluate_policy, which matches the call signature used here.
from stable_baselines3.common import evaluation

if __name__ == '__main__':
    # Extract command line arguments
    parser = cli.get_parser()
    args = parser.parse_args()
    scenario = args.scenario
    load_from = args.load
    config_path = args.config

    # Load config for session
    conf = config.load(config_path)

    # Create environments.
    eval_env = env_utils.get_evaluation_env(conf.environment_config)

    # Load agent
    agent = conf.get_agent(env=eval_env, load_from=load_from)

    mean_reward, std_reward = evaluation.evaluate_policy(agent, eval_env, n_eval_episodes=100)
    print(f'Mean reward {mean_reward} +/- {std_reward}')

    eval_env.close()
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward
        self.last_std = std_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                print("best model saving deactivated")
                # self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
def _on_step(self) -> bool:
    if self.n_calls > 0 and (self.n_calls - 1) % self.eval_freq == 0:
        self.old_params = [param.clone() for param in self.model.policy.parameters()]

    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        # Dump log so the evaluation results are printed with the correct timestep
        self.logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
        self.logger.dump(self.num_timesteps)

        if mean_reward >= self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "checkpoint"))
            self.best_mean_reward = mean_reward

            os.makedirs(self.log_path, exist_ok=True)
            save_path = os.path.join(self.log_path, "checkpoint")
            self.save_folders.append(save_path)
            model_parameters = self.model.policy.state_dict()
            grads = OrderedDict([(name, param.grad) for name, param in model_parameters.items()])
            torch.save(model_parameters, os.path.join(self.log_path, "parameters.th"))
            torch.save(grads, os.path.join(self.log_path, "grads.th"))
            if self.old_params is not None:
                delta = OrderedDict([(name, param - old_param)
                                     for old_param, (name, param) in zip(self.old_params,
                                                                         model_parameters.items())])
                torch.save(delta, os.path.join(self.log_path, "prev_step.th"))

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True