def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50):
    """Run the model for num_steps if provided.

    The total rollout length is rollout_size.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO2('MlpPolicy', env, verbose=1, n_steps=rollout_size)
    train_model.learn(total_timesteps=num_steps)
    return train_model
def run_model_stablebaseline3(flow_params,
                              num_cpus=1,
                              rollout_size=5,
                              num_steps=5):
    """Run the model for num_steps if provided, using stable-baselines3.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        number of epochs per PPO update
    num_steps : int
        number of training iterations

    Returns
    -------
    stable_baselines3.*
        the trained model
    """
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines3 import PPO
    from stable_baselines3.ppo import MlpPolicy

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO(MlpPolicy,
                      env=env,
                      verbose=1,
                      n_epochs=rollout_size,
                      tensorboard_log="./PPO_tensorboard/",
                      device="cuda")  # device selection: "cpu", "cuda", or "auto"
    train_model.learn(total_timesteps=num_steps * rollout_size)
    return train_model
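# A minimal usage sketch for ``run_model_stablebaseline3`` above. It assumes
# ``flow_params`` comes from a Flow experiment config module; the save name
# and hyperparameter values are illustrative only:
def example_train_ppo_sb3(flow_params):
    """Train a short PPO run and save it; a sketch, not a benchmark setup."""
    model = run_model_stablebaseline3(flow_params,
                                      num_cpus=1,
                                      rollout_size=10,
                                      num_steps=100)
    model.save('ppo_sb3_example')
    return model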
def run_model(num_cpus=1, rollout_size=50, num_steps=50):
    """Run the model for num_steps if provided.

    The total rollout length is rollout_size.
    """
    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    model = PPO2('MlpPolicy', env, verbose=1, n_steps=rollout_size)
    model.learn(total_timesteps=num_steps)
    return model
def run_model(params, rollout_size=50, num_steps=50):
    """Perform the training operation.

    Parameters
    ----------
    params : dict
        flow-specific parameters (see flow/utils/registry.py)
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params, version=0)()
    env = DummyVecEnv([lambda: constructor])

    model = TRPO(
        'MlpPolicy',
        env,
        verbose=2,
        timesteps_per_batch=rollout_size,
        gamma=0.999,
        policy_kwargs={"net_arch": [100, 50, 25]},
    )
    model.learn(total_timesteps=num_steps)
    return model
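# A hedged call sketch for the TRPO variant of ``run_model`` above;
# ``flow_params`` stands in for any Flow experiment config dict, and the
# rollout/step counts are illustrative:
def example_train_trpo(flow_params):
    model = run_model(flow_params, rollout_size=1024, num_steps=50000)
    model.save('trpo_example')
    return model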
def run_model(num_cpus=1, rollout_size=50, num_steps=50, use_inflows=False):
    """Run the model for num_steps if provided.

    The total rollout length is rollout_size.
    """
    initial_config, net_params = setup_exps(use_inflows)

    # add the new parameters to flow_params
    flow_params['initial'] = initial_config
    flow_params['net'] = net_params

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    model = PPO2('MlpPolicy', env, verbose=1, n_steps=rollout_size)
    model.learn(total_timesteps=num_steps)
    return model
def train_stable_baselines(submodule, flags):
    """Train policies using PPO or DDPG in stable-baselines3."""
    from stable_baselines3.common.vec_env import DummyVecEnv

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    start_time = timeit.default_timer()
    # print experiment information
    print("=========================================")
    print('Beginning training.')
    print('Algorithm :', flags.algorithm)
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps,
                                     flags.algorithm, flags.exp_config)
    stop_time = timeit.default_timer()
    run_time = stop_time - start_time
    print("Training is finished.")
    print("total runtime: ", run_time)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    if flags.exp_config.lower() == "ppo":
        from stable_baselines3 import PPO
        model = PPO.load(save_path)
    elif flags.exp_config.lower() == "ddpg":
        from stable_baselines3 import DDPG
        model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
def train_stable_baselines3(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines3."""
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3 import PPO
    import torch

    start_time = timeit.default_timer()
    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    model = run_model_stablebaseline3(flow_params, flags.num_cpus,
                                      flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # stop the timer here so GPU and CPU runs can be compared
    stop_time = timeit.default_timer()
    run_time = stop_time - start_time

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = PPO.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = False
    flow_params['env'].horizon = 1500  # 1500 steps, i.e. 150 seconds of simulation
    env = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print("--------------------------------------------------------")
    flow_params['sim'].render = True
    simulation = Experiment(flow_params)
    simulation.run(num_runs=1)
    print('the final reward is {}'.format(reward))
    print("total run_time:", run_time, "s")
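# Sketch of launching ``train_stable_baselines3`` above: ``flags`` only needs
# the attributes the function reads (num_cpus, rollout_size, num_steps), so a
# plain ``argparse.Namespace`` suffices; ``submodule`` is any module that
# exposes ``flow_params``. Values below are illustrative:
def example_launch_sb3(submodule):
    from argparse import Namespace
    flags = Namespace(num_cpus=1, rollout_size=10, num_steps=100)
    train_stable_baselines3(submodule, flags)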
def run_model_stablebaseline(flow_params: Dict,
                             num_cpus: int = 1,
                             rollout_size: int = 50,
                             num_steps: int = 50):
    """Run the model with stable_baselines for num_steps if provided.

    The total rollout length is rollout_size.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines import PPO2

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    print_box('Initialising MlpPolicy in Stable Baselines')
    train_model = PPO2('MlpPolicy', env, verbose=1, n_steps=rollout_size,
                       tensorboard_log='delete_ppo')
    train_model.learn(total_timesteps=num_steps)
    return train_model
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50):
    """Run the model for num_steps if provided.

    The total rollout length is rollout_size.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    import numpy as np
    from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines import DDPG
    from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    # exploration noise for DDPG: no parameter noise, Ornstein-Uhlenbeck
    # noise on the actions
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    train_model = DDPG('MlpPolicy', env, verbose=1,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       tensorboard_log="./DDPG_cartpole_tensorboard/")
    train_model.learn(total_timesteps=num_steps)
    return train_model
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import PPO2

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    exp_folder_name = os.path.join(os.getcwd(), exp_tag)
    if not os.path.exists(exp_folder_name):
        os.makedirs(exp_folder_name)
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print_box('Beginning training.')
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    # NOTE: results are saved under the current working directory here.
    print_box('Saving the trained model!')
    path = os.getcwd()
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print_box('Loading the trained model and testing it out!')
    model = PPO2.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = False
    env = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('The final reward is {}'.format(reward))
def play_results(path, result_name):
    """Load a trained DQN model and replay it in the simulator."""
    print('Loading the trained model and testing it out!')
    save_path = os.path.join(path, result_name)
    model = DQN.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env_con = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env_con])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
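# A hedged driver for ``play_results`` above: it assumes the model and its
# flow-params JSON were saved side by side under ``~/baseline_results``; the
# result name below is purely illustrative:
def example_replay_dqn():
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    play_results(path, 'dqn_grid/2021-01-01-00:00:00')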
def train_stable_baselines(submodule, flags):
    """Train policies using the DDPG algorithm in stable-baselines."""
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import DDPG

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print('Beginning training.')
    model = run_model_stablebaseline(
        flow_params, flags.num_cpus, flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model; no exploration noise is
    # needed at evaluation time
    print('Loading the trained model and testing it out!')
    model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
print('Saving the trained model!')
model.save(save_path)

# dump the flow params
with open(os.path.join(path, args.result_name) + '.json', 'w') as outfile:
    json.dump(flow_params, outfile,
              cls=FlowParamsEncoder, sort_keys=True, indent=4)

del model
del flow_params

# Replay the result by loading the model
print('Loading the trained model and testing it out!')
model = PPO2.load(save_path)
flow_params = get_flow_params(
    os.path.join(path, args.result_name) + '.json')
flow_params['sim'].render = True
# use a separate name for the instantiated environment so the
# ``env_constructor`` helper itself is not shadowed
constructor = env_constructor(params=flow_params, version=0)()
# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: constructor])
obs = env.reset()
reward = 0
for i in range(flow_params['env'].horizon):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    reward += rewards
print('the final reward is {}'.format(reward))
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50,
                             algorithm="ppo",
                             exp_config=None):
    """Run the model for num_steps if provided.

    The total rollout length is rollout_size.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps
    algorithm : str
        the RL algorithm to use, "ppo" or "ddpg" (case-insensitive)
    exp_config : str
        name of the experiment configuration

    Returns
    -------
    stable_baselines3.*
        the trained model
    """
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    if algorithm.upper() == "PPO":
        from stable_baselines3 import PPO
        train_model = PPO('MlpPolicy', env, verbose=1, n_steps=rollout_size)
        train_model.learn(total_timesteps=num_steps)
        print("Learning process is done.")
        return train_model
    elif algorithm.upper() == "DDPG":
        import numpy as np
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

        if exp_config == 'singleagent_figure_eight':
            train_model = DDPG(
                'MlpPolicy',
                env,
                verbose=1,
                n_episodes_rollout=rollout_size,
                learning_starts=3000,
                learning_rate=0.0001,
                action_noise=OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(1),
                    sigma=0.15 * np.ones(1),
                    initial_noise=0.7 * np.ones(1)),
                tau=0.005,
                batch_size=128,
                tensorboard_log='tensorboard_ddpg',
                device='cuda',
            )
        else:
            train_model = DDPG(
                'MlpPolicy',
                env,
                verbose=1,
                n_episodes_rollout=rollout_size,
                learning_starts=1200,
                tensorboard_log='tensorboard_ddpg',
                learning_rate=0.0001,
                action_noise=OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(1),
                    sigma=0.15 * np.ones(1),
                    initial_noise=0.7 * np.ones(1)),
                tau=0.005,
                batch_size=512,
                device='cpu',
            )

        # from tensorboard_baselines.callbacks_ddpg import TensorboardCallback
        train_model.learn(
            total_timesteps=num_steps,
            log_interval=2,
            eval_log_path='ddpg_log',
            eval_freq=10,
            # callback=[TensorboardCallback],
        )
        print("Learning process is done.")
        return train_model
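# Dispatch sketch for the PPO/DDPG trainer above. 'singleagent_figure_eight'
# is the config name the function itself special-cases; the remaining values
# are illustrative:
def example_train_ddpg(flow_params):
    return run_model_stablebaseline(flow_params,
                                    num_cpus=1,
                                    rollout_size=5,
                                    num_steps=10000,
                                    algorithm="DDPG",
                                    exp_config='singleagent_figure_eight')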
def run_model_stablebaseline(flow_params, args, model_params=None):
    """Run the model for args.num_steps.

    Parameters
    ----------
    flow_params : dict
        flow-related parameters from the config
    args : argparse.Namespace
        training arguments from the parser
    model_params : dict, optional
        explicit DQN hyperparameters; when given, these override the
        values read from ``args``

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    constructor = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: constructor])

    if model_params is None:
        if args.policy == 0:
            policy = MlpPolicy
        elif args.policy == 1:
            policy = LnMlpPolicy
        else:
            warnings.warn("Invalid policy type! Policy set to MlpPolicy.")
            policy = MlpPolicy

        # None keeps the default (dueling) network; dict(dueling=False)
        # disables the dueling architecture
        dueling = None if args.dueling else dict(dueling=False)

        train_model = DQN(
            policy=policy,
            env=env,
            gamma=args.gamma,
            learning_rate=args.learning_rate,
            buffer_size=args.buffer_size,
            exploration_fraction=args.exploration_fraction,
            exploration_final_eps=args.exploration_final_eps,
            exploration_initial_eps=args.exploration_initial_eps,
            train_freq=args.train_freq,
            batch_size=args.batch_size,
            double_q=args.double_q,
            learning_starts=args.learning_starts,
            target_network_update_freq=args.target_network_update_freq,
            prioritized_replay=args.prioritized_replay,
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            prioritized_replay_beta0=args.prioritized_replay_beta0,
            prioritized_replay_beta_iters=args.prioritized_replay_beta_iters,
            prioritized_replay_eps=args.prioritized_replay_eps,
            param_noise=args.param_noise,
            policy_kwargs=dueling,
            verbose=args.verbose,
            tensorboard_log=args.tensorboard_log,
            full_tensorboard_log=args.full_tensorboard_log)
    else:
        train_model = DQN(
            policy=model_params["policy"],
            env=env,
            gamma=model_params["gamma"],
            learning_rate=model_params["learning_rate"],
            buffer_size=model_params["buffer_size"],
            exploration_fraction=model_params["exploration_fraction"],
            exploration_final_eps=model_params["exploration_final_eps"],
            exploration_initial_eps=model_params["exploration_initial_eps"],
            train_freq=model_params["train_freq"],
            batch_size=model_params["batch_size"],
            double_q=model_params["double_q"],
            learning_starts=model_params["learning_starts"],
            target_network_update_freq=model_params[
                "target_network_update_freq"],
            prioritized_replay=model_params["prioritized_replay"],
            prioritized_replay_alpha=model_params["prioritized_replay_alpha"],
            prioritized_replay_beta0=model_params["prioritized_replay_beta0"],
            prioritized_replay_beta_iters=model_params[
                "prioritized_replay_beta_iters"],
            prioritized_replay_eps=model_params["prioritized_replay_eps"],
            param_noise=model_params["param_noise"],
            policy_kwargs=model_params["policy_kwargs"],
            verbose=model_params["verbose"],
            tensorboard_log=model_params["tensorboard_log"],
            full_tensorboard_log=model_params["full_tensorboard_log"])

    train_model.learn(total_timesteps=args.num_steps)
    return train_model
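# The ``model_params`` overload above indexes every key it needs; a sketch of
# such a dict follows. The values are illustrative only (they mirror common
# stable-baselines DQN defaults rather than anything prescribed by the
# snippet):
EXAMPLE_DQN_MODEL_PARAMS = {
    "policy": "MlpPolicy",
    "gamma": 0.99,
    "learning_rate": 5e-4,
    "buffer_size": 50000,
    "exploration_fraction": 0.1,
    "exploration_final_eps": 0.02,
    "exploration_initial_eps": 1.0,
    "train_freq": 1,
    "batch_size": 32,
    "double_q": True,
    "learning_starts": 1000,
    "target_network_update_freq": 500,
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta0": 0.4,
    "prioritized_replay_beta_iters": None,
    "prioritized_replay_eps": 1e-6,
    "param_noise": False,
    "policy_kwargs": None,
    "verbose": 1,
    "tensorboard_log": None,
    "full_tensorboard_log": False,
}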