def train_SAC(env, title="Stand Up Task Learning Curve"):
    print(f"action space shape -1: {env.action_space.shape[-1]}")
    # Dimensionality of the action space (informational only: SAC is trained
    # here with action_noise=None, so no noise object is built from it)
    n_actions = env.action_space.shape[-1]
    callback = Logger(log_dir=log_dir)
    timesteps = 20000
    model = SAC('MlpPolicy', env,
                learning_rate=0.001,
                learning_starts=10000,
                ent_coef='auto_1.1',  # automatic entropy tuning, initial coefficient 1.1
                train_freq=1,
                n_episodes_rollout=-1,
                target_entropy=-21,
                buffer_size=1000000,
                action_noise=None,
                batch_size=64,
                verbose=1,
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=timesteps, callback=callback)
    model.save("SAC_pkl")
    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, title)
    plt.savefig("{}/learn_curve.png".format(log_dir))
    plt.show()
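# A minimal sketch (an addition, not from the original) of reloading the model
# saved above and running one greedy episode; "SAC_pkl" matches the save call
# in train_SAC, and the env is assumed to follow the old gym 4-tuple step API
# used throughout these snippets.
def test_SAC(env):
    model = SAC.load("SAC_pkl")
    obs = env.reset()
    total_reward = 0.0
    done = False
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print("episode reward:", total_reward)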
def test_ddpg():
    log_dir = f"model_save/best_model_ddpg_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = DDPG.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", i, "action:", action, "now profit:", env.profit)
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=', env.buy_hold)
                break
def test_ppo():
    log_dir = f"model_save/best_model_ppo"
    env = ENV_CONTINUE(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = PPO.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)
            day += 1
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=', env.buy_hold)
                break
def test_td3():
    log_dir = f"model_save/best_model_td3_sp2"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = TD3.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)
            day += 1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad))
                break
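# Illustrative only: the env computes sp (Sharpe ratio), mdd (maximum drawdown)
# and romad (return over max drawdown) internally. A hypothetical helper showing
# the standard formulas, assuming `equity` is a 1-D array of portfolio values:
def drawdown_metrics(equity):
    import numpy as np
    equity = np.asarray(equity, dtype=float)
    running_max = np.maximum.accumulate(equity)
    drawdowns = (running_max - equity) / running_max
    mdd = drawdowns.max()                       # maximum drawdown
    total_return = equity[-1] / equity[0] - 1   # overall return of the curve
    romad = total_return / mdd if mdd > 0 else np.inf
    return mdd, romad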
def train(args, data):
    # extract data from dictionary
    observations = torch.tensor(data['observations'])
    actions = torch.tensor(data['actions'])
    next_observations = torch.tensor(data['next_observations'])
    rewards = torch.tensor(data['rewards'])
    dones = data['dones']

    in_dim = torch.cat([observations[0], actions[0]]).size(0)
    out_dim = torch.cat([next_observations[0], rewards[0]]).size(0)

    dynamics = DynamicsEnsemble(args.ensemble_size,
                                in_dim,
                                out_dim,
                                args.encoder_hidden_dim,
                                args.decoder_hidden_dim,
                                args.latent_dim,
                                args.n_hidden)

    # load saved model into dynamics ensemble
    checkpoint = torch.load(args.CHECKPOINTPATH)
    dynamics.load_state_dict(checkpoint['model_state_dict'])
    dynamics.opt.load_state_dict(checkpoint['optimizer_state_dict'])

    # define SAC agent (the dynamics ensemble serves as the training environment)
    agent = SAC('MlpPolicy', dynamics)

    # set up callback
    date_ = str(date.today())
    exp_code = date_ + '_' + str(np.random.randint(1000000))
    log_dir = args.write_to + '_' + exp_code
    callback = TensorboardCallback(exp_code, check_freq=100, log_dir=log_dir)

    timesteps = 1e5
    agent.learn(total_timesteps=int(timesteps), callback=callback)
    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS,
                 "SAC: Generative Ensembles")
    plt.show()
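# Passing `dynamics` as the env to SAC only works if DynamicsEnsemble exposes a
# Gym-style interface. A hypothetical sketch of the minimum surface assumed here
# (class and method names are illustrative, not the original implementation):
import gym

class LearnedDynamicsEnv(gym.Env):
    """Wraps a learned dynamics model so SB3 can roll out imagined transitions."""

    def __init__(self, model, observation_space, action_space):
        self.model = model
        self.observation_space = observation_space
        self.action_space = action_space

    def reset(self):
        self._obs = self.observation_space.sample()
        return self._obs

    def step(self, action):
        # the model predicts (next_obs, reward) from (obs, action);
        # episode termination is omitted in this sketch
        self._obs, reward = self.model.predict(self._obs, action)
        return self._obs, float(reward), False, {}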
def plot_results(log_folder, title='Learning Curve'):
    """
    Plot the results.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    # Truncate x to match the smoothed y
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    # built-in plotter
    results_plotter.plot_results([log_folder], 3e5, results_plotter.X_TIMESTEPS, "TD3 LunarLander")
    plt.show()
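# `moving_average` is used above but not defined in this snippet. The usual
# definition (as in the Stable-Baselines3 docs example) is a convolution-based
# rolling mean:
def moving_average(values, window):
    """Smooth values by doing a moving average over `window` entries."""
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')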
model = DDPG(CustomPolicy, env, verbose=1, batch_size=64, action_noise=action_noise)

for i in range(step_iters):  # run for step_iters * training_timesteps
    model.learn(total_timesteps=training_timesteps)
    model.save("./models/ddpg" + str((i + 1) * training_timesteps))
    model.save_replay_buffer("./experiences/ddpg_experience" + str((i + 1) * training_timesteps))

#### Show (and record a video of) the model's performance ##########################################
env_test = RLTetherAviary(gui=False, record=True)
obs = env_test.reset()
start = time.time()
for i in range(10 * env_test.SIM_FREQ):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env_test.step(action)
    if done:
        break
env_test.close()
env.close()

results_plotter.plot_results([os.path.join(os.getcwd(), log_dir)],
                             step_iters * training_timesteps,
                             results_plotter.X_TIMESTEPS, "DDPG")
plot_results(os.path.join(os.getcwd(), log_dir), "DDPG")
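# `action_noise` above is constructed elsewhere. A minimal sketch of the usual
# Gaussian setup for DDPG (an assumption, not the original code):
from stable_baselines3.common.noise import NormalActionNoise

n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))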
env_kwargs={"t_final": 5000}, n_envs=1, monitor_dir=log_dir) # Parallel environments eval_env = gym.make(env_string, t_final=5000) # ------------------------------------------------------------------------- # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=200, deterministic=True, render=False) ransim_callback = CustomRansimCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=10 * 5 * 1e2, deterministic=True, render=False, plot_results=True) # ------------------------------------------------------------------------- model = A2C('MlpPolicy', env, verbose=1) timesteps = 2 * 100 * 5 * 1e2 # k*n_envs*T_Final/t_c model.learn(total_timesteps=int(timesteps), callback=ransim_callback) # ------------------------------------------------------------------------- plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "A2C ran-sim") plt.savefig(log_dir + 'A2C_ran-sim_rewards_plot.png', format="png") plt.show() msa = 1
env = AsistEnvGym(portal_data, room_data, victim_data, "as")
env = Monitor(env, log_dir)

n_actions = env.action_space.shape
# Note: PPO does not take action noise; this is leftover from an off-policy setup
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = PPO(MlpPolicy, env, verbose=1)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# model = DQN(MlpPolicy, env, verbose=1)

timeSteps = 80000
model.learn(total_timesteps=timeSteps, callback=callback)
plot_results([log_dir], timeSteps, results_plotter.X_TIMESTEPS, "PPO AsistEnv")
plt.show()

# obs = env.reset()
# score = 0
# done = False
# while not done:
#     action, _states = model.predict(obs, deterministic=True)
#     obs, rewards, done, info = env.step(action)
#     score += rewards
#     env.render()
# with open("tmp.txt", 'w') as ff:
#     g = ff.write(str(env.visit_node_sequence))
# print(env.visit_node_sequence)
# print("Victim_saved:", len(env.graph.safe_victim_list))
def using_callback_example():
    # Using Callback: Monitoring Training.
    class SaveOnBestTrainingRewardCallback(BaseCallback):
        """
        Callback for saving a model (the check is done every 'check_freq' steps)
        based on the training reward (in practice, we recommend using 'EvalCallback').

        :param check_freq:
        :param log_dir: Path to the folder where the model will be saved.
            It must contain the file created by the 'Monitor' wrapper.
        :param verbose: Verbosity level.
        """

        def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
            super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
            self.check_freq = check_freq
            self.log_dir = log_dir
            self.save_path = os.path.join(log_dir, "best_model")
            self.best_mean_reward = -np.inf

        def _init_callback(self) -> None:
            # Create folder if needed.
            if self.save_path is not None:
                os.makedirs(self.save_path, exist_ok=True)

        def _on_step(self) -> bool:
            if self.n_calls % self.check_freq == 0:
                # Retrieve training reward.
                x, y = ts2xy(load_results(self.log_dir), "timesteps")
                if len(x) > 0:
                    # Mean training reward over the last 100 episodes.
                    mean_reward = np.mean(y[-100:])
                    if self.verbose > 0:
                        print(f"Num timesteps: {self.num_timesteps}")
                        print(
                            f"Best mean reward: {self.best_mean_reward:.2f} "
                            f"- Last mean reward per episode: {mean_reward:.2f}"
                        )
                    # New best model, you could save the agent here.
                    if mean_reward > self.best_mean_reward:
                        self.best_mean_reward = mean_reward
                        # Example for saving best model.
                        if self.verbose > 0:
                            print(f"Saving new best model to {self.save_path}")
                        self.model.save(self.save_path)
            return True

    # Create log dir.
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment.
    env = gym.make("LunarLanderContinuous-v2")
    env = Monitor(env, log_dir)

    # Add some action noise for exploration.
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    # TD3 with Gaussian action noise on the continuous LunarLander task.
    model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0)

    # Create the callback: check every 1000 steps.
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    # Train the agent.
    timesteps = 1e5
    model.learn(total_timesteps=int(timesteps), callback=callback)

    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "TD3 LunarLander")
    plt.show()
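# Optional follow-up (an addition, not from the original): after model.learn(...)
# above, a quick deterministic evaluation could use SB3's built-in helper:
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")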
# (the head of this fragment is cut off; the class and __enter__ opening are
#  reconstructed below following the standard tqdm-based progress-bar pattern)
from tqdm.auto import tqdm

class ProgressBarManager(object):
    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        # create the progress bar and callback, return the callback
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # close the callback
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = make_vec_env('foo-v0', n_envs=1, monitor_dir=log_dir)

tic = time.perf_counter()

# Create callbacks
save_callback = SaveOnBestTrainingRewardCallback(check_freq=10000, log_dir=log_dir)

model = stable_baselines3.DQN('MlpPolicy', env, verbose=0, learning_rate=1e-4)
model = model.load('AAA', env)

steps = 10e6
with ProgressBarManager(steps) as progress_callback:
    # This is equivalent to callback=CallbackList([progress_callback, save_callback])
    model = model.learn(steps, callback=[progress_callback, save_callback])
model.save('AAA')

results_plotter.plot_results([log_dir], steps, results_plotter.X_TIMESTEPS, "DQN foo-v0")
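# ProgressBarCallback is referenced above but not defined in this fragment.
# A minimal sketch following the usual tqdm pattern (an assumption, not the
# original definition):
from stable_baselines3.common.callbacks import BaseCallback

class ProgressBarCallback(BaseCallback):
    def __init__(self, pbar):
        super(ProgressBarCallback, self).__init__()
        self._pbar = pbar

    def _on_step(self) -> bool:
        # advance the progress bar to the current number of environment steps
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True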
# (tail of the environment construction — the opening of this call, likely a
#  gym.make(...) for an offworld-gym env, is cut off in this fragment)
               channel_type=Channels.RGB_ONLY)

time_steps = 200000
name = "Offworld_DQN4"

env = Monitor(env, log_dir)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
model = DQN("MlpPolicy", env, gamma=0.95, learning_rate=1e-3, verbose=0,
            buffer_size=1000, batch_size=16,
            exploration_fraction=0.9, exploration_final_eps=0.1,
            exploration_initial_eps=1.0, train_freq=1)
print(type(callback))
# alternative: exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1

model.learn(total_timesteps=int(time_steps), callback=callback)
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, name)
plt.savefig(name + '.png')

model.save(name)
model = DQN.load(name)
mean_reward = evaluate(model, num_steps=100)
env.close()
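# `evaluate` is called above but not defined in this snippet. A hypothetical
# sketch of such a helper (the signature matches the call; the use of the
# module-level `env` and the return value are assumptions):
def evaluate(model, num_steps=1000):
    """Run the model for `num_steps` steps and return the mean episode reward."""
    obs = env.reset()
    episode_rewards = [0.0]
    for _ in range(num_steps):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = float(np.mean(episode_rewards))
    print("Mean reward:", round(mean_reward, 1), "Num episodes:", len(episode_rewards))
    return mean_reward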
default="CartPole-v1") args = parser.parse_args() log_dir = args.log_folder # Load results W = load_results(log_dir) # print("Results seed: ", W) # Save walltime to stats.csv df = pd.read_csv(log_dir + 'stats.csv') df["Train walltime (s)"] = W["t"].max() df.to_csv(log_dir + "stats.csv", index=False) # print(df) # Plot training rewards TIMESTEPS = 1e10 plot_results([log_dir], TIMESTEPS, X_TIMESTEPS, args.env) plt.savefig(log_dir + "reward_vs_timesteps.png") # plt.show() plot_results([log_dir], TIMESTEPS, X_EPISODES, args.env) plt.savefig(log_dir + "reward_vs_episodes.png") # plt.show() plot_results([log_dir], TIMESTEPS, X_WALLTIME, args.env) plt.savefig(log_dir + "reward_vs_walltime.png") # plt.show()
# For tensorflow imported with tensorboard
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import argparse
import os

import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3.common.results_plotter import X_TIMESTEPS, plot_results

parser = argparse.ArgumentParser()
parser.add_argument('algo', type=str)
parser.add_argument('env', type=str)
parser.add_argument('exp_folder', type=str)
args = parser.parse_args()

algo = args.algo
env = args.env
log_path = os.path.join(args.exp_folder, algo)

dirs = [
    os.path.join(log_path, folder)
    for folder in os.listdir(log_path)
    if env in folder and os.path.isdir(os.path.join(log_path, folder))
]

try:
    plot_results(dirs, 2e6, X_TIMESTEPS, env)
except Exception as e:
    print(e)

plt.show()
# For tensorflow imported with tensorboard
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
import argparse
import os

import matplotlib.pyplot as plt
from stable_baselines3.common.results_plotter import X_EPISODES, X_TIMESTEPS, X_WALLTIME, plot_results

parser = argparse.ArgumentParser()
parser.add_argument("algo", type=str)
parser.add_argument("env", type=str)
parser.add_argument("exp_folder", type=str)
parser.add_argument("axis", choices=["steps", "episodes", "time"], type=str)
args = parser.parse_args()

algo = args.algo
env = args.env
log_path = os.path.join(args.exp_folder, algo)
x_axis = {"steps": X_TIMESTEPS, "episodes": X_EPISODES, "time": X_WALLTIME}[args.axis]

dirs = [
    os.path.join(log_path, folder)
    for folder in os.listdir(log_path)
    if (env in folder and os.path.isdir(os.path.join(log_path, folder)))
]

try:
    plot_results(dirs, 2e6, x_axis, env)
except Exception as e:
    print(e)

plt.show()
from stable_baselines3.common import results_plotter

log_dir = "/Users/daniel/repos/CitadelsAI/logs"
# results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "CitadelsAI")
results_plotter.plot_results([log_dir], num_timesteps=200000,
                             x_axis=results_plotter.X_TIMESTEPS,
                             task_name="CitadelsAI")
print('debug')
# (tail of SaveOnBestTrainingRewardCallback._on_step — see the full definition
#  of this callback earlier in the file)
                    )
                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Example for saving best model
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}.zip")
                    self.model.save(self.save_path)
        return True


log_dir = "./tmp/a2c-log"
os.makedirs(log_dir, exist_ok=True)
tb_logs = "./tmp/a2c-tb-log"
os.makedirs(tb_logs, exist_ok=True)

env = RCT(settings_path='configs/settings.yml')
env = Monitor(env, log_dir)

n_actions = env.action_space.shape[-1]
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

model = A2C(MlpPolicy, env, tensorboard_log=tb_logs, verbose=0)
model.learn(total_timesteps=int(5e4), callback=callback)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "A2C MlpPolicy RCT")
# Save trained model
print(f"Saving to {save_path}")
model.save(f"{save_path}/{env_id}")

if hasattr(model, 'save_replay_buffer') and args.save_replay_buffer:
    print("Saving replay buffer")
    model.save_replay_buffer(save_path)

if normalize:
    # Important: save the running average; for testing the agent we need that normalization
    model.get_vec_normalize_env().save(os.path.join(params_path, 'vecnormalize.pkl'))
    # Deprecated saving:
    # env.save_running_average(params_path)

# Plot training
plot_results([save_path], n_timesteps, results_plotter.X_TIMESTEPS, "A2C ran-sim")
plt.savefig(save_path + 'A2C_ransim_rewards_plot.png', format="png")
plt.show()

# Plot evaluation
file_path = os.path.join(save_path, 'evaluations.npz')
# file_path = 'logs/a2c/ransim-v0_3/evaluations.npz'
# np.load(file_path)
plot_evaluation_results(file_path, n_timesteps, results_plotter.X_TIMESTEPS, "A2C_eval_ran-sim")
plt.savefig(save_path + 'A2C_ransim_rewards_eval_plot.png', format="png")
plt.show()
# (the opening of this TD3 construction is cut off in the fragment; the call
#  shape is reconstructed here and the "MlpPolicy" argument is an assumption)
model = TD3("MlpPolicy",
            policy_kwargs=policy_kwargs,
            env=env,
            verbose=0,
            action_noise=action_noise,
            learning_starts=interval,
            tensorboard_log=parent_dir + "tensorboard/")
print()

model.learn(total_timesteps=interval * icount,
            log_interval=log_interval,
            tb_log_name="TD3_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

obs = env.reset()
dones = False
counter = []
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)
env.close()

print("\nFinal rewards:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS, "TD3 CityLearn")
    plt.savefig(log_dir + "/rewards.pdf")
def sb3_plot():
    results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, exp_name)
# Callback for model training:
# saves a checkpoint if the current version of the model is better than all before it
callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                            log_dir=CHECKPOINT_DIR,
                                            save_path=CHECKPOINT_DIR)

# Training
total_timesteps = 150000
model.learn(total_timesteps=total_timesteps, callback=callback)
# We don't have to save manually when we use the callback in the model.learn call
# model.save(os.path.join(CHECKPOINT_DIR, "mlp_dqn_cartpole"))

plot_results([CHECKPOINT_DIR], num_timesteps=total_timesteps,
             x_axis=results_plotter.X_TIMESTEPS,
             task_name="{} DQN on {}".format(policy_name, env_name),
             figsize=(8, 4))
plt.savefig(os.path.join(CHECKPOINT_DIR, "training_progress.png"))
plt.show()

# Restore model from saved checkpoint
del model
model = DQN.load(os.path.join(CHECKPOINT_DIR, "best_model"), env=env)

# Testing
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        # reset so the loop keeps stepping a valid episode
        obs = env.reset()