def save_results(arg_dict, model_name, env, model_logdir=None, show=False):
    if model_logdir is None:
        model_logdir = arg_dict["logdir"]
    print(f"model_logdir: {model_logdir}")

    results_plotter.EPISODES_WINDOW = 100
    results_plotter.plot_results([model_logdir], arg_dict["steps"], results_plotter.X_TIMESTEPS,
                                 arg_dict["algo"] + " " + arg_dict["env_name"] + " reward")
    plt.gcf().set_size_inches(8, 6)
    plt.savefig(os.path.join(model_logdir, model_name) + '_reward_results.png')
    # plot_extended_results(model_logdir, 'd', results_plotter.X_TIMESTEPS, arg_dict["algo"] + " " + arg_dict["env_name"] + " distance", "Episode Distances")
    plt.gcf().set_size_inches(8, 6)
    plt.savefig(os.path.join(model_logdir, model_name) + '_distance_results.png')
    plt.close()
    plt.close()

    if isinstance(env, HERGoalEnvWrapper):
        results_plotter.plot_curves(
            [(np.arange(len(env.env.episode_final_distance)), np.asarray(env.env.episode_final_distance))],
            'episodes', arg_dict["algo"] + " " + arg_dict["env_name"] + ' final step distance')
    else:
        results_plotter.plot_curves(
            [(np.arange(len(env.unwrapped.episode_final_distance)), np.asarray(env.unwrapped.episode_final_distance))],
            'episodes', arg_dict["algo"] + " " + arg_dict["env_name"] + ' final step distance')
    plt.gcf().set_size_inches(8, 6)
    plt.ylabel("Step Distances")
    plt.savefig(os.path.join(model_logdir, model_name) + "_final_distance_results.png")
    plt.close()

    print("Congratulations! Training with {} timesteps succeeded!".format(arg_dict["steps"]))
    if show:
        plt.show()
def train(self, symbol='JPM', sd=dt.datetime(2009, 1, 1), ed=dt.datetime(2010, 12, 31),
          time_steps=int(1e5), savepath=None, should_plot=False):
    # load data and indicators
    df = self._load_data([symbol], sd, ed)
    df_met = self._get_indicators(symbol, df)

    # set environment
    self.env = Monitor(LoanEnv(df_met), self.log_dir, allow_early_resets=True)

    # train model
    self.model = DQN(MlpPolicy, self.env, prioritized_replay=True, verbose=1)
    self.model.learn(total_timesteps=time_steps, callback=self.debugcb)

    # save and plot
    if savepath is not None:
        self.model.save(savepath)
    if should_plot:
        results_plotter.plot_results([self.log_dir], time_steps,
                                     results_plotter.X_TIMESTEPS, f'DQN {symbol}')
        plt.show()
def main():
    """
    Example usage in jupyter-notebook

    .. code-block:: python

        from stable_baselines import results_plotter
        %matplotlib inline
        results_plotter.plot_results(["./log"], 10e6, results_plotter.X_TIMESTEPS, "Breakout")

    Here ./log is a directory containing the monitor.csv files
    """
    import argparse
    import os
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log'])
    parser.add_argument('--num_timesteps', type=int, default=int(10e6))
    parser.add_argument('--xaxis', help='Variable on X-axis', default=X_TIMESTEPS)
    parser.add_argument('--task_name', help='Title of plot', default='Title')
    args = parser.parse_args()
    args.dirs = [os.path.abspath(folder) for folder in args.dirs]
    results_plotter.plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name)
    plt.show()
def start_the_game():
    cyber = DDoS_env.cyberEnv(defense_list=global_settings.defense_space,
                              obsv_list=global_settings.observation_space)
    cyber = Monitor(cyber, log_dir)
    defense_agent_decision_model = PPO2(MlpPolicy, cyber, verbose=1, gamma=0.95, ent_coef=0.1)
    callback = training_result.SaveOnBestTrainingRewardCallback(
        check_freq=global_settings.SAVE_RESULT_FREQ_DDoS, log_dir=log_dir)
    defense_agent_decision_model.learn(
        total_timesteps=global_settings.TOTAL_TRAIN_STEPS, callback=callback)

    # # Evaluate the agent
    # print("Evaluating the agent")
    # from stable_baselines.common.evaluation import evaluate_policy
    # mean_reward, std_reward = evaluate_policy(defense_agent_decision_model, defense_agent_decision_model.get_env(), n_eval_episodes=2)
    # print("Mean Reward %s Std Reward %s" % (mean_reward, std_reward))

    # Plot the performance
    results_plotter.plot_results([log_dir], global_settings.TOTAL_TRAIN_STEPS,
                                 results_plotter.X_TIMESTEPS, "DDoS Reward Results")
    plt.show()
from stable_baselines import results_plotter
import matplotlib.pyplot as plt

log_dir = '/home/yliu2/blimp_ws/exp_log/SAC/HOVER/4act/exp1'

SLEEP_RATE = 2
N_EPISODE = 5000
EPISODE_LENGTH = SLEEP_RATE * 30  # 30 sec
TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

results_plotter.plot_results([log_dir], TOTAL_TIMESTEPS, results_plotter.X_TIMESTEPS, "SAC BLIMP")
plt.show()
gazebo.unpauseSim()

# Create the vectorized environment
env = DummyVecEnv([
    make_env(env_id, i, num_cpu - 1, log_dir, gazebo,
             os.getpgid(gazebo_process.pid), os.getpgid(cf_process.pid))
    for i in range(num_cpu)
])
env = VecNormalize(env)

# Save best model every n steps and monitor performance
# save_best_callback = SaveOnBestTrainingRewardCallback(check_freq=250, log_dir=log_dir)

# Save model every n steps
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./' + log_dir, name_prefix='ppo2')

# Train from scratch
model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=200000, callback=checkpoint_callback)

# Load trained params and continue training
# model = PPO2.load(log_dir + '/best_model')
# model.set_env(env)
# model.learn(total_timesteps=200000, callback=save_best_callback, reset_num_timesteps=False)

results_plotter.plot_results([log_dir], 200000, results_plotter.X_TIMESTEPS, "PPO Crazyflie")
plt.show()
env.close()
    if (n_steps + 1) % 1000 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True


################ TRAINING
model.learn(total_timesteps=time_steps, callback=auto_save_callback, seed=args.random_seed)
# print('save model')
# savemodel(model, MODEL, ENVIRONMENT, DATE)

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "RGBD Observation")
plt.show()
print('total time', time.time() - start)
import sys
import numpy as np
import tensorflow as tf
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
import time
import os

from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter

time_steps = 5000
ll = 5e-5
log_dir = "./logFiles"

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS,
                             "IEEE 39 Bus load shedding w/SAC")
plt.savefig(log_dir + '/IEEE_39Bus_loadshedding_SAC {}_{}.png'.format(str(time_steps), str(ll)))
plt.show()
        if self.verbose > 0:
            print("Saving new best model to {}".format(self.save_path))
        self.model.save(self.save_path)
        return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('SatelliteEnvironment-v0')
env = Monitor(env, log_dir)

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Train the agent
time_steps = 1e5
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "DDPG Satellite")
plt.show()
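# Several snippets above and below only show the tail of SaveOnBestTrainingRewardCallback.
# A minimal sketch of the full callback, adapted from the Stable Baselines callback example;
# details such as the 100-episode averaging window and the save path are assumptions and may
# differ from the actual implementations used in these projects.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean training reward improves (checked every check_freq steps)."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self):
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Retrieve the episode statistics written by the Monitor wrapper
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])  # mean reward over the last 100 episodes (assumed window)
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print("Saving new best model to {}".format(self.save_path))
                    self.model.save(self.save_path)
        return True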
def ttest_env(modelpath, modelname):
    for name in modelpath:
        os.makedirs(name, exist_ok=True)
        env = IdentityEnv(18, 18, 60)
        env = Monitor(env, name)
        e = DummyVecEnv([lambda: env])

        if name == log_dir_a2c:
            model = A2C(policy="MlpPolicy", env=e, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "a2c Monitor")
            plt.show()

        if name == log_dir_acer:
            model = ACER(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "acer Monitor")
            plt.show()

        if name == log_dir_acktr:
            model = ACKTR(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "ACKTR Monitor")
            plt.show()

        if name == log_dir_dqn:
            model = DQN(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "DQN Monitor")
            plt.show()

        if name == log_dir_ppo1:
            model = PPO1(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "PPO1 Monitor")
            plt.show()

        if name == log_dir_poo2:
            model = PPO2(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "PPO2 Monitor")
            plt.show()

        if name == log_dir_trpo:
            model = TRPO(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps, results_plotter.X_EPISODES, "TRPO Monitor")
            plt.show()
def moving_average(values, window):
    """
    Smooth values by doing a moving average

    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def plot_results(log_folder, title='Learning Curve'):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    # Truncate x
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()


from stable_baselines import results_plotter

# Helper from the library
results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "Hexworld Coverage")
plot_results(log_dir)
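# Both helpers above read episode statistics from the monitor.csv written by the Monitor
# wrapper. A minimal sketch of the setup they rely on; the environment id and directory
# are illustrative, not taken from the snippet above.
import os
import gym
from stable_baselines.bench import Monitor

log_dir = "tmp/hexworld/"  # illustrative path
os.makedirs(log_dir, exist_ok=True)

# Monitor records episode reward, length and time into <log_dir>/monitor.csv,
# which load_results() and results_plotter.plot_results() later parse.
env = Monitor(gym.make("CartPole-v1"), log_dir)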
        self.model.save(self.save_path)
        return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('LunarLanderContinuous-v2')
# print('before wrapping with Monitor', env)
env = Monitor(env, log_dir)
# print('after wrapping with Monitor', env)

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Train the agent
time_steps = 1e5
model.learn(total_timesteps=int(time_steps), callback=callback)

# print(results_plotter.X_TIMESTEPS)
# print(results_plotter.X_EPISODES)
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_EPISODES, "DDPG LunarLander")
plt.show()
envname = "Pendulum-v0" envname = "LunarLanderContinuous-v2" envname = "BipedalWalker-v2" env = gym.make(envname) exp_name = env.spec._env_name + '-DPDQN2' log_dir = 'logs_dev/' + exp_name env = Monitor(env, log_dir, allow_early_resets=True) model = DPDQN2(env, verbose=1) print("time_steps_todo: " + str(time_steps)) model.learn(total_timesteps=int(time_steps)) copyfile(log_dir + ".monitor.csv", "logs_tmp/tmp.monitor.csv") results_plotter.plot_results(["logs_tmp"], time_steps, results_plotter.X_TIMESTEPS, log_dir.split("/")[1]) plt.show() os.makedirs("models", exist_ok=True) model.save("models/" + log_dir.split("/")[1]) # test env = gym.make(envname) log_dir = 'logs_test/' + exp_name env = Monitor(env, log_dir, allow_early_resets=True) model = DPDQN2.load("models/" + log_dir.split("/")[1], env) obs = env.reset() for i in range(time_steps_test):
# rewarda2c, eps_step = test_identity('a2c')
# Episode = np.arange(1, len(rewarda2c) + 1, 1)
# ResultTrain(Episode, Episode_reward=rewarda2c)

# reward_acer, eps_step = test_identity('acer')
# Episode = np.arange(1, len(reward_acer) + 1, 1)
# ResultTrain(Episode, Episode_reward=reward_acer)

# reward_acktr, eps_step = test_identity('acktr')
# Episode = np.arange(1, len(reward_acktr) + 1, 1)
# ResultTrain(Episode, Episode_reward=reward_acktr)

# reward_dqn, eps_step = test_identity('dqn')
time_steps = 10000
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_EPISODES, "DQN RESULT")
plt.show()
# Episode = np.arange(1, len(reward_dqn) + 1, 1)
# print('DQN per-episode training rewards:', reward_dqn)
# ResultTrain(Episode, Episode_reward=reward_dqn)

# reward_ppo1, eps_step = test_identity('ppo1')
# Episode = np.arange(1, len(reward_ppo1) + 1, 1)
# ResultTrain(Episode, Episode_reward=reward_ppo1)

# reward_ppo2, eps_step = test_identity('ppo2')
# Episode = np.arange(1, len(reward_ppo2) + 1, 1)
# ResultTrain(Episode, Episode_reward=reward_ppo2)

# reward_trpo, eps_step = test_identity('trpo')
# Episode = np.arange(1, len(reward_trpo) + 1, 1)
print("Saving new best model") _locals['self'].save(log_dir + 'best_model.pkl') n_steps += 1 return True # Create log dir log_dir = "tmp/" os.makedirs(log_dir, exist_ok=True) # Create and wrap the environment env = gym.make('CartPole-v1') env = Monitor(env, log_dir, allow_early_resets=True) # Add some param noise for exploration # param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1) # Because we use parameter noise, we should use a MlpPolicy with layer normalization model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, param_noise=True, verbose=1) # Train the agent time_steps = 1e5 model.learn(total_timesteps=int(time_steps), callback=callback) results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "DQN CartPole") plt.show()
from stable_baselines import results_plotter
import matplotlib.pyplot as plt

# Helper from the library
results_plotter.plot_results(
    ['/home/constantin/Desktop/projects/disertation/rl_logs_1_1-20200120T201830Z-001/rl_logs_1_1/'],
    1e5, results_plotter.X_TIMESTEPS, "Test")
plt.show()
from stable_baselines import results_plotter
from matplotlib import pyplot as plt
import time

while True:
    results_plotter.plot_results(["./log2"], 10e6, results_plotter.X_TIMESTEPS, "Breakout")
    plt.pause(10)
    plt.close()
        }
    else:
        items = {
            "policy": MLP,
            batchsize: args.timesteps_per_batch,
        }

    model = algo(env=env, verbose=1, seed=args.SEED, **items)
    model.set_env(env)

    print("Training for ", args.total_timesteps)
    model.learn(total_timesteps=int(args.total_timesteps))

    # library helper
    plot_results(
        [log_dir],
        int(args.total_timesteps),
        results_plotter.X_TIMESTEPS,
        str(args.algo_name) + "_" + identifer,
    )
    plt.savefig("convergence_plot" + identifer + ".png")
    model.save("policy-" + identifer)
else:
    model = algo.load("policy-" + identifer)

    obs = env.reset()
    done = False
    score = 0
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
             batch_size=1024,
             buffer_size=int(5e5),
             verbose=0,
             param_noise=param_noise,
             action_noise=action_noise,
             tensorboard_log=parent_dir + "tensorboard/",
             n_cpu_tf_sess=multiprocessing.cpu_count())

model.learn(total_timesteps=interval * icount,
            log_interval=interval,
            tb_log_name="DDPG_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

obs = env.reset()
dones = False
counter = []
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)

env.close()

print("\nFinal costs:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS, "DDPG CityLearn")
    plt.savefig(log_dir + "/rewards.pdf")
# full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None):
model = PPO2(MlpPolicy, env, gamma=0.9, learning_rate=0.0086, nminibatches=64,
             verbose=1, tensorboard_log="./ppo2_filter_tensorboard/")

callback = SaveOnBestTrainingRewardCallback(check_freq=40, log_dir=log_dir)

time_steps = 3e4
start_time = time.time()
model.learn(total_timesteps=int(time_steps), callback=callback)
print("--- %s seconds ---" % (time.time() - start_time))

# save the last model
model.save("ppo2_LastModel")

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO2 LunarLander")
plt.show()

# done = False
# while not done:
#     # action = ...  # Your agent code here
#     obs, reward, done, info = env.step(env.action_space.sample())
#     env.render()

# for _ in range(10):
#     action = env.action_type.actions_indexes["IDLE"]
#     obs, reward, done, info = env.step(env.action_space.sample())
#     env.render()

# env = gym.make("overtaking-v0")
# env = CustomEnv(3, 6, "tcp://*:5556")

# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

# Create log dir
log_dir = "Logs/Custom_env/"
os.makedirs(log_dir, exist_ok=True)

# Create the callback: check every 500 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=500, log_dir=log_dir)

# env = Monitor(env, log_dir)
model = ACKTR(MlpPolicy, env, verbose=2)
# model.load("DQN_agent")
model.learn(total_timesteps=20000, callback=callback)
model.save("temp_agent")

a = input("Training completed")

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    probs = model.action_probability(obs)
    obs, rewards, dones, info = env.step(action)
    print("Observation:", obs, rewards, probs)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "Lane Manager")
plt.show()
# Instantiate the agent
# model = DQN(EgoAttentionNetwork, env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
model = DQN("MlpPolicy", env, learning_rate=1e-3, prioritized_replay=True, verbose=1,
            tensorboard_log="./test_results/DQN_overtaking_tensorboard/" + TIMESTAMP)

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Train the agent
time_steps = 1000
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "DQN OvertakingEnv")
plt.show()

# # Save the agent
# model.save("dqn_overtaking")
# del model  # delete trained model to demonstrate loading
#
# # Load the trained agent
# model = DQN.load("dqn_overtaking", env=env)
#
# # Evaluate the agent
# mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
#
# # Enjoy trained agent
# obs = env.reset()
# episode_reward = 0
def main():
    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if int(env_kwargs['rew_land']) in [500, 1000, 10000]:
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError('Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None

    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL:  # Begin training here (location of this condition also decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key]) for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  # To avoid error

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo == 'ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id, i, args.seed, log_dir=monitor_path,
                             wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                    for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id, 0, args.seed, log_dir=monitor_path,
                             wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                ])
                env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
                if env_wrapper is not None:
                    env = env_wrapper(env)
        elif (algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1:
            raise ValueError("Error: {} does not support multiprocessing!".format(algo))
        elif (algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1:
            raise ValueError("Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id, n_envs=n_envs, seed=args.seed, monitor_dir=monitor_path,
                               wrapper_class=env_wrapper, env_kwargs=env_kwargs)

        if args.normalize:
            # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env

    # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
    # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])

    env = create_env(n_envs)

    # if args.train_RL:  # checking impact of the if-condition position on experiment reproducibility
    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq, eval_freq = 100 * episode_len[env_index], 100 * episode_len[env_index]
    save_freq, eval_freq = max(save_freq // n_envs, 1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)

    if args.check_callback:
        callback.append(CheckpointCallback(save_freq=save_freq, save_path=callback_path,
                                           name_prefix='rl_model', verbose=1))
    if args.eval_callback:
        callback.append(EvalCallback(create_env(1, eval_env=True),
                                     best_model_save_path=callback_path,
                                     log_path=callback_path, eval_freq=eval_freq, verbose=1))

    model = (algo_list[args.algo])(env=env, seed=args.seed, tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1, verbose=args.verbose, **hyperparams)

    print('\nTraining {} on {} now... \n'.format(algo, env_id))

    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])
    if args.normalize:
        env = VecNormalize.load(os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
    env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps, results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        obs = env.reset()
        # test counters (initialised here so the loop below runs standalone)
        episode_reward, total_reward = 0, 0
        done_count, success_count = 0, 0
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low, env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success, success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))

    env.close()
env = KukaCamGymEnv(renders=True, isDiscrete=True)  # pybullet envs can be instantiated directly
env.cid = p.connect(p.DIRECT)
env = Monitor(env, log_dir)

# Add basic noise parameters
# # Add some param noise for exploration
# param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# # Because we use parameter noise, we should use a MlpPolicy with layer normalization
# model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)

# Set hyperparameters
model = PPO2(MlpPolicy, env, verbose=1,
             tensorboard_log="./ppo2_kukaWithCam_tboard/")  # verbose controls how much run status is printed

# Check the callback every 1000 steps
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Start training
time_steps = 1e8
model.learn(total_timesteps=int(time_steps), callback=callback)
env.close()
p.disconnect()

# Plot the training results
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO2 Kuka no Cam")
plt.show()
"buffer_size": int(args.timesteps_per_batch) } else: items = { "policy": MLP, batchsize: args.timesteps_per_batch, } model = algo(env=env, verbose=1, seed=args.SEED, **items) model.set_env(env) model.learn(total_timesteps=int(args.total_timesteps)) # library helper plot_results( [log_dir], int(args.total_timesteps), results_plotter.X_TIMESTEPS, "TRPO muscle" + identifer, ) plt.savefig("convergence_plot" + identifer + ".png") model.save("policy-" + identifer) else: # Use trained policy for the simulation. model = TRPO.load("trpo_" + identifer) obs = env.reset() done = False score = 0 while not done: action, _states = model.predict(obs) obs, rewards, done, info = env.step(action)
R = X['results']
E = X['ep_lengths']

av_reward = []
for i in range(len(R)):
    av_reward.append(np.mean(R[i, :]))

plt.plot(T, av_reward)
plt.xlabel('Number of Timesteps')
plt.ylabel('Rewards')
plt.savefig(log_dir + "evaluations.png")
# plt.show()

# plot all training rewards
results_plotter.plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "")
plt.savefig(log_dir + "reward_vs_timesteps.png")
# plt.show()

results_plotter.plot_results([log_dir], timesteps, results_plotter.X_EPISODES, "")
plt.savefig(log_dir + "reward_vs_episodes.png")
# plt.show()

results_plotter.plot_results([log_dir], timesteps, results_plotter.X_WALLTIME, "")
plt.savefig(log_dir + "reward_vs_walltime.png")
# plt.show()

#### smoothed training rewards
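# The arrays X and T used above are presumably loaded earlier in the script. A minimal
# sketch of that step, assuming the file was produced by Stable Baselines' EvalCallback
# (the "evaluations.npz" filename is the callback's default; the location is illustrative).
import numpy as np

X = np.load(log_dir + "evaluations.npz")  # archive with 'timesteps', 'results', 'ep_lengths'
T = X['timesteps']                        # timesteps at which each periodic evaluation was run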
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

env = gym.make('DeepRMSCA-v0', **env_args)

# logs will be saved in log_dir/training.monitor.csv
# in this case, on top of the usual monitored things, we also monitor service and bit rate blocking probabilities
env = Monitor(env, log_dir + 'training',
              info_keywords=('service_blocking_rate_since_reset', 'bit_rate_blocking_rate_since_reset'))

policy_args = dict(net_arch=5 * [128], act_fun=tf.nn.elu)  # the neural network has five layers with 128 neurons each

agent = TRPO(MlpPolicy, env, verbose=0, tensorboard_log="./tb/TRPO-DeepRMSCA-v0/",
             policy_kwargs=policy_args, gamma=.95, learning_rate=10e-5)
agent.learn(total_timesteps=100000, callback=callback)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "DeepRMSCA TRPO")

import matplotlib.pyplot as plt


def moving_average(values, window):
    """
    Smooth values by doing a moving average

    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def plot_results(log_folder, title='Learning Curve'):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    # Truncate x
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()
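# Because Monitor is given extra info_keywords above, training.monitor.csv gains columns
# beyond the standard r/l/t ones. A short sketch (assuming pandas is available) of how those
# extra columns could be read back for custom analysis.
import pandas as pd

# The first line of a monitor file is a JSON header, so it is skipped.
df = pd.read_csv(log_dir + 'training.monitor.csv', skiprows=1)
print(df[['r', 'l', 'service_blocking_rate_since_reset',
          'bit_rate_blocking_rate_since_reset']].tail())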
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True


################ TRAINING
model.learn(total_timesteps=time_steps, seed=args.random_seed)
# print('save model')
# savemodel(model, MODEL, ENVIRONMENT, DATE)

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS,
                             "RGBD Observation with a Sparse Reward Function")
plt.show()
print('total time', time.time() - start)
import matplotlib.pyplot as plt
from stable_baselines import results_plotter

from config import TIME_STEPS

log_dir = "./monitor_logs/"

results_plotter.plot_results([log_dir], TIME_STEPS, results_plotter.X_TIMESTEPS, "Rewards over episodes")
plt.show()
# for key, value in baselines_mlp_model.get_parameters().items():
#     print(key, value.shape)
#
# th_model = copy_mlp_weights(baselines_mlp_model)

# obs = env.reset()
# while True:
#     action, states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     print(rewards, dones)
#     env.render()

print(log_dir)
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "Witches")
plt.show()

## Output PPO2
## https://stable-baselines.readthedocs.io/en/master/modules/ppo2.html
# -------------------------------------
# | approxkl           | 8.841733e-05 |
# | clipfrac           | 0.0          |
# | ep_len_mean        | 1.4          |   mean episode length
# | ep_reward_mean     | 0.2          |   mean reward per episode
# | explained_variance | -0.0164      |
# | fps                | 2831         |
# | n_updates          | 99           |   number of gradient updates