def train_model(self):
    # Save the best model seen so far (based on the Monitor logs), checked every 1000 steps
    auto_save_callback = SaveOnBestTrainingRewardCallback(log_dir=self.log_dir)
    auto_save_callback_every_1000_steps = EveryNTimesteps(n_steps=1000, callback=auto_save_callback)

    self.environment = Monitor(self.environment, self.log_dir)
    self.model = self.algorithm('MlpPolicy', self.environment, verbose=1, tensorboard_log=self.log_dir)

    # Record checkpoint performance every 1000 steps
    name = self.model_name + "_full_model"
    checkpoint_callback = SavePerformanceOnCheckpoints(resource_manager=self, name=name,
                                                       checkpoint_results=self.checkpoint_results)
    checkpoint_callback_every_1000_steps = EveryNTimesteps(n_steps=1000, callback=checkpoint_callback)

    with ProgressBarManager(self.training_steps) as progress_callback:
        self.model.learn(total_timesteps=self.training_steps,
                         callback=[progress_callback,
                                   auto_save_callback_every_1000_steps,
                                   checkpoint_callback_every_1000_steps])

    self.save_episode_rewards_as_csv()
    model_path = os.path.abspath("models/" + name)
    self.model.save(model_path)
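# SaveOnBestTrainingRewardCallback and SavePerformanceOnCheckpoints above are
# project-specific helpers, not stable-baselines3 classes. Below is a minimal
# sketch of the former, assuming it follows the usual Monitor-log pattern from
# the stable-baselines3 callback examples: only the log_dir argument is taken
# from the call above; the save path and the 100-episode window are assumptions.
import os

import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the current model whenever the mean training reward improves."""

    def __init__(self, log_dir, verbose=0):
        super().__init__(verbose)
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        # Mean return over the last 100 episodes recorded by the Monitor wrapper
        x, y = ts2xy(load_results(self.log_dir), "timesteps")
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                self.model.save(self.save_path)
        return True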
def train_stage2_model(self, environment_kwargs=None, policy_kwargs=None, hyperparams=None,
                       training_steps=20000, model_name="", stage1_time=0):
    config = {
        "verbose": 1,
        "tensorboard_log": self.log_dir,
        "policy_kwargs": policy_kwargs
    }
    if hyperparams is not None:
        config.update(hyperparams)

    print(environment_kwargs)
    # Guard against the None default so the environment can be built with no extra kwargs
    if environment_kwargs is None:
        environment_kwargs = {}
    environment = ResourceAllocationEnvironment(self.ra_problem, **environment_kwargs)
    environment = Monitor(environment, self.log_dir)
    self.environment = environment

    model = PPO(MultiStageActorCritic, environment, **config)

    with ProgressBarManager(training_steps) as progress_callback:
        # Save the best model and record checkpoint performance every 1000 steps
        auto_save_callback = SaveOnBestTrainingRewardCallback(log_dir=self.log_dir)
        auto_save_callback_every_1000_steps = EveryNTimesteps(n_steps=1000, callback=auto_save_callback)
        checkpoint_callback = SavePerformanceOnCheckpoints(stage1_time=stage1_time, resource_manager=self,
                                                           name=model_name,
                                                           checkpoint_results=self.checkpoint_results)
        checkpoint_callback_every_1000_steps = EveryNTimesteps(n_steps=1000, callback=checkpoint_callback)
        callbacks = [
            progress_callback,
            auto_save_callback_every_1000_steps,
            checkpoint_callback_every_1000_steps
        ]
        model.learn(total_timesteps=training_steps, callback=callbacks)

    self.save_episode_rewards_as_csv()
    full_model_path = os.path.abspath("models/" + self.model_name + "_full_model")
    model.save(full_model_path)

    return model
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)

    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100
    )

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])
    model.learn(500, callback=callback)
    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)
    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
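# select_env is a helper from the test module that is not shown here. A
# plausible minimal sketch, assuming it only switches between a discrete-action
# task for DQN and a continuous-action task for the other algorithms; the exact
# environment IDs are assumptions (the negative reward_threshold above suggests
# a Pendulum-style task).
from stable_baselines3 import DQN


def select_env(model_class) -> str:
    # DQN requires a discrete action space
    if model_class is DQN:
        return "CartPole-v1"
    return "Pendulum-v0"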
def train_adp_model(self, regional_policies=None, setup_start=0.):
    regions = self.regions
    environment = ADPResourceAllocationEnvironment(
        self.ra_problem, regions, regional_policies,
        abstract_action_to_direction=self.abstract_action_to_direction,
        n_locked_tasks=self.n_locked_tasks,
        n_abstract_actions=self.n_abstract_actions)
    environment = Monitor(environment, self.log_dir)
    self.environment = environment

    adp_model = self.algorithm('MlpPolicy', environment, verbose=1, tensorboard_log=self.log_dir)
    self.model = adp_model

    auto_save_callback = SaveOnBestTrainingRewardCallback(log_dir=self.log_dir)
    auto_save_callback_every_1000_steps = EveryNTimesteps(n_steps=1000, callback=auto_save_callback)

    name = self.model_name + "_full_model_multi"
    setup_time = time.time() - setup_start
    checkpoint_callback = SavePerformanceOnCheckpoints(stage1_time=setup_time, resource_manager=self,
                                                       name=name,
                                                       checkpoint_results=self.checkpoint_results)
    checkpoint_callback_every_1000_steps = EveryNTimesteps(n_steps=1000, callback=checkpoint_callback)

    training_steps = self.training_config["stage2_training_steps"]
    with ProgressBarManager(training_steps) as progress_callback:
        adp_model.learn(total_timesteps=training_steps,
                        callback=[progress_callback,
                                  auto_save_callback_every_1000_steps,
                                  checkpoint_callback_every_1000_steps])

    self.save_episode_rewards_as_csv()
    full_model_path = os.path.abspath("models/" + self.model_name + "_full_model")
    adp_model.save(full_model_path)
def evaluate_objective(config):
    tune_env = deepcopy(base_env)
    tune_monitor = OptimizationCallback(tune_env, EVAL_EPISODES, True)
    monitor_callback = EveryNTimesteps(n_steps=args.report_interval, callback=tune_monitor)
    tune_agent = agent("MlpPolicy", tune_env, **config)
    tune_agent.learn(total_timesteps=args.sample_timesteps, callback=monitor_callback)
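# OptimizationCallback above is not an SB3 class. Judging by the call site it
# wraps an evaluation environment and an episode budget and is fired through
# EveryNTimesteps. A minimal sketch under that reading: the constructor
# arguments mirror the call above (the third positional argument is read as a
# deterministic flag), and how the score is reported back to the tuner is left
# out as an assumption.
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy


class OptimizationCallback(BaseCallback):
    """Periodically evaluate the agent being tuned and keep the scores."""

    def __init__(self, eval_env, eval_episodes, deterministic=True, verbose=0):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.eval_episodes = eval_episodes
        self.deterministic = deterministic
        self.history = []

    def _on_step(self) -> bool:
        mean_reward, std_reward = evaluate_policy(
            self.model, self.eval_env,
            n_eval_episodes=self.eval_episodes,
            deterministic=self.deterministic)
        self.history.append((self.num_timesteps, mean_reward, std_reward))
        # A real implementation would report mean_reward back to the tuner here
        # (e.g. Ray Tune); that part is outside the scope of this sketch.
        return True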
def fit(self, env, episodes, verbose, episode_steps, callbacks, log_interval, agent_id=-1):
    """Mask the agent's fit function to train the agent."""
    logger.info("Training agent")
    # self.model.learn(total_timesteps=100, log_interval=10)
    # FIXME: use a meaningful tb_log_name!
    # TODO: Write callback functions here. List of callbacks:

    # CheckpointCallback: save the model every save_freq (96) steps.
    checkpoint_callback = CheckpointCallback(
        save_freq=96, save_path=self.agent_helper.config_dir, name_prefix='rl_model')

    # EvalCallback: evaluate every eval_freq steps and save the best model to best_model_save_path.
    eval_env = env
    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/',
                                 eval_freq=500, deterministic=True, render=False)

    # StopTrainingOnRewardThreshold: stop training once the reward threshold is reached,
    # i.e. the policy is considered good enough.
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70, verbose=1)
    eval_callback_reward_threshold = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)

    # EveryNTimesteps: trigger the wrapped checkpoint callback every n steps to save the model.
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
    event_callback_after_n_steps = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # StopTrainingOnMaxEpisodes: stop training when the model reaches the maximum number of episodes.
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1)

    # CallbackList: chain several callbacks together.
    callbacklist = CallbackList([checkpoint_callback, eval_callback])

    logger.info(f"Model: {self.model.get_env()}")
    with ProgressBarManager(log_interval) as progress_callback:
        self.model.learn(total_timesteps=log_interval, callback=[progress_callback, checkpoint_callback])
    # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
    # self.eval_writer(mean_reward, std_reward)
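# ProgressBarManager, used as a context manager in several of these snippets,
# is also not part of stable-baselines3. A minimal sketch along the lines of
# the tqdm-based example from the stable-baselines3 tutorials; the class and
# usage match the snippets above, while the tqdm details are assumptions.
from tqdm.auto import tqdm
from stable_baselines3.common.callbacks import BaseCallback


class ProgressBarCallback(BaseCallback):
    """Update an externally managed tqdm progress bar every step."""

    def __init__(self, pbar):
        super().__init__()
        self._pbar = pbar

    def _on_step(self) -> bool:
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True


class ProgressBarManager:
    """Context manager that owns the tqdm bar and hands out the callback."""

    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()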
def main(cfg: DictConfig):
    env = get_env(None, cfg.env)
    model = DQN(MlpPolicy, env, **cfg.model, tensorboard_log='logs/', verbose=1)

    callbacks = [TensorboardCallback()]
    if cfg.self_play:
        self_play = EveryNTimesteps(cfg.n_update_selfplay, callback=SelfPlay('ckpts/', cfg.env))
        callbacks.append(self_play)
    if cfg.ckpt_freq:
        ckpt_cb = CheckpointCallback(save_freq=cfg.ckpt_freq, save_path='ckpts/')
        callbacks.append(ckpt_cb)

    model.learn(total_timesteps=cfg.n_total_steps, callback=callbacks, tb_log_name=cfg.log_name)
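# TensorboardCallback and SelfPlay above are project-specific. A minimal sketch
# of a custom TensorBoard logging callback, assuming it only records an extra
# scalar through the SB3 logger; the metric name ("win_rate") and the way the
# value is read from the step infos are hypothetical.
from stable_baselines3.common.callbacks import BaseCallback


class TensorboardCallback(BaseCallback):
    """Log an extra scalar to TensorBoard alongside the built-in metrics."""

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        # Hypothetical example metric pulled from the env's info dict
        infos = self.locals.get("infos", [])
        if infos and "win_rate" in infos[0]:
            self.logger.record("rollout/win_rate", infos[0]["win_rate"])
        return True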
def runner(agent, episode, checkpoint, env):
    # scores = np.genfromtxt(checkpoint+'/data.csv', delimiter=',')
    # checkpoint2 = checkpoint+'2'
    custom_callback = LoggerCallback(episode, checkpoint=checkpoint)
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=checkpoint, name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=episode, verbose=1)
    event_callback = EveryNTimesteps(n_steps=1, callback=custom_callback)
    # load = os.path.abspath(checkpoint+'/rl_model_676000_steps')
    # print(load)
    # agent = DDPG.load(load, env)
    callback_list = CallbackList([event_callback, checkpoint_callback, callback_max_episodes])
    # agent.learn(total_timesteps=100000000, callback=callback_list, reward_function=reward)
    agent.learn(total_timesteps=100000000, callback=callback_list)

    scores = custom_callback.rewards
    np.savetxt(checkpoint + '/data.csv', scores, delimiter=',')

    return scores
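# LoggerCallback above is a custom class; the runner only relies on it exposing
# a .rewards list after training. A minimal sketch under that assumption: the
# constructor arguments mirror the call above, and the way per-episode returns
# are reconstructed from the step locals is an assumption.
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback


class LoggerCallback(BaseCallback):
    """Collect per-episode returns so the runner can dump them to CSV."""

    def __init__(self, max_episodes, checkpoint=None, verbose=0):
        super().__init__(verbose)
        self.max_episodes = max_episodes
        self.checkpoint = checkpoint
        self.rewards = []
        self._current_return = 0.0

    def _on_step(self) -> bool:
        # Variable names differ across SB3 versions ("reward"/"done" vs "rewards"/"dones")
        rewards = self.locals.get("rewards", self.locals.get("reward"))
        dones = self.locals.get("dones", self.locals.get("done"))
        if rewards is None or dones is None:
            return True
        # Works for a single (non-vectorized) environment
        self._current_return += float(np.asarray(rewards).flatten()[0])
        if np.asarray(dones).flatten()[0]:
            self.rewards.append(self._current_return)
            self._current_return = 0.0
        return True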
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)

    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if the max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    callback = CallbackList([
        checkpoint_callback,
        eval_callback,
        event_callback,
        callback_max_episodes
    ])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]

    # Check that internal callback counters match the model's counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a time limit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
def main():
    if StartFresh:
        # Create Environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()

        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # Create Model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[dict(pi=[256, 256], vf=[256, 256])])
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=3e-5,
                    n_steps=512,
                    batch_size=128,
                    n_epochs=20,
                    gamma=0.99,
                    gae_lambda=0.9,
                    clip_range=0.4,
                    vf_coef=0.5,
                    use_sde=True,
                    sde_sample_freq=4,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')

        # # Load Environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()

        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()

        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)

        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path, log_path=best_path,
                                     eval_freq=eval_freq, deterministic=True, render=False)

        # Video Update Callback
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)

        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False,
                    callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
                 policy_kwargs=policy_kwargs, verbose=1)
else:
    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    # action_noise = NormalActionNoise(mean=np.zeros(
    #     n_actions), sigma=args.action_noise * np.ones(n_actions))
    # model = DDPG('MlpPolicy', env, action_noise=action_noise, batch_size=args.batch_size,
    #              buffer_size=args.buffer_size, gamma=args.gamma, policy_kwargs=policy_kwargs, verbose=1)

checkpoint_on_event = CheckpointCallback(save_freq=1,
                                         name_prefix=get_params_str(args.seed),
                                         save_path='./checkpoints/')
event_callback = EveryNTimesteps(n_steps=args.checkpoint_every, callback=checkpoint_on_event)

model.learn(total_timesteps=args.total_timesteps, log_interval=1, callback=event_callback)

if args.save_to:
    model.save(args.save_to)
else:
    model.save("saved_models/" + get_params_str(f"envSeed-{args.seed}"))

if args.evaluate_for:
    evaluate(model, env)
def main():
    if StartFresh:
        # Create Environment
        env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()

        # Separate evaluation env
        eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # Create Model
        model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log)
    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')

        # # Load Environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()

        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()

        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)

        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path, log_path=best_path,
                                     eval_freq=eval_freq, deterministic=True, render=False)

        # Video Update Callback
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)

        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False,
                    callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)