def _init_callback(
    self,
    callback: MaybeCallback,
    eval_env: Optional[VecEnv] = None,
    eval_freq: int = 10000,
    n_eval_episodes: int = 5,
    log_path: Optional[str] = None,
) -> BaseCallback:
    """
    :param callback: Callback(s) called at every step with state of the algorithm.
    :param eval_env: Environment to use for periodic evaluation of the agent.
    :param eval_freq: How many steps between evaluations; if None, do not evaluate.
    :param n_eval_episodes: Number of episodes to rollout during evaluation.
    :param log_path: Path to a folder where the evaluations will be saved
    :return: A hybrid callback calling `callback` and performing evaluation.
    """
    # Convert a list of callbacks into a callback
    if isinstance(callback, list):
        callback = CallbackList(callback)
    # Convert functional callback to object
    if not isinstance(callback, BaseCallback):
        callback = ConvertCallback(callback)
    # Create eval callback in charge of the evaluation
    if eval_env is not None:
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=log_path,
                                     log_path=log_path,
                                     eval_freq=eval_freq,
                                     n_eval_episodes=n_eval_episodes)
        callback = CallbackList([callback, eval_callback])
    callback.init_callback(self)
    return callback
def _init_callback(self,
                   callback: Union[None, Callable, List[BaseCallback], BaseCallback],
                   eval_env: Optional[VecEnv] = None,
                   eval_freq: int = 10000,
                   n_eval_episodes: int = 5,
                   log_path: Optional[str] = None) -> BaseCallback:
    """
    :param callback: (Union[callable, List[BaseCallback], BaseCallback, None])
    :return: (BaseCallback)
    """
    # Convert a list of callbacks into a callback
    if isinstance(callback, list):
        callback = CallbackList(callback)
    # Convert functional callback to object
    if not isinstance(callback, BaseCallback):
        callback = ConvertCallback(callback)
    # Create eval callback in charge of the evaluation
    if eval_env is not None:
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=log_path,
                                     log_path=log_path,
                                     eval_freq=eval_freq,
                                     n_eval_episodes=n_eval_episodes)
        callback = CallbackList([callback, eval_callback])
    callback.init_callback(self)
    return callback
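# Both _init_callback versions above normalize whatever the caller passes in (None, a plain
# function, a single BaseCallback, or a list of callbacks) into one BaseCallback. A minimal
# usage sketch of the accepted forms, assuming a standard CartPole setup with hypothetical
# hyperparameters:
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

model = PPO("MlpPolicy", "CartPole-v1", verbose=0)

# 1. A single BaseCallback instance
checkpoint_cb = CheckpointCallback(save_freq=1000, save_path="./logs/")
model.learn(2000, callback=checkpoint_cb)

# 2. A list of callbacks (wrapped into a CallbackList internally)
eval_cb = EvalCallback(gym.make("CartPole-v1"), eval_freq=500)
model.learn(2000, callback=[checkpoint_cb, eval_cb])

# 3. A plain function (wrapped by ConvertCallback); returning False stops training
model.learn(2000, callback=lambda _locals, _globals: True)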
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf, norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack([envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device, target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
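# WAndBEvalCallback is project-specific, not an SB3 built-in. Judging by its constructor
# (a render/eval env, an evaluation interval, and the training envs), it periodically
# evaluates the policy and logs the result to Weights & Biases. A hedged sketch of such a
# callback, using only standard SB3 and wandb calls; the attribute names are assumptions:
import wandb
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy


class WAndBEvalCallback(BaseCallback):
    """Every `eval_every` steps, evaluate the current policy and log the result to wandb."""

    def __init__(self, eval_env, eval_every, train_envs, n_eval_episodes=5, verbose=0):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.eval_every = eval_every
        self.train_envs = train_envs  # kept around so VecNormalize statistics could be saved too
        self.n_eval_episodes = n_eval_episodes

    def _on_step(self) -> bool:
        if self.n_calls % self.eval_every == 0:
            mean_reward, std_reward = evaluate_policy(
                self.model, self.eval_env, n_eval_episodes=self.n_eval_episodes)
            wandb.log({"eval/mean_reward": mean_reward,
                       "eval/std_reward": std_reward},
                      step=self.num_timesteps)
        return True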
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"
    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)
    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
    )
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])
    model.learn(500, callback=callback)
    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)
    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
def fit(self, env, episodes, verbose, episode_steps, callbacks, log_interval, agent_id=-1):
    """Mask the agent's fit function to train the agent."""
    logger.info("Setting up training callbacks")
    # self.model.learn(total_timesteps=100, log_interval=10)
    # FIXME: use a meaningful tb log name!
    # TODO: Write callback funcs here:
    # List of callbacks:
    # CheckpointCallback: save the model every 10 episodes.
    checkpoint_callback = CheckpointCallback(
        save_freq=96, save_path=self.agent_helper.config_dir, name_prefix='rl_model')
    # EvalCallback: evaluate every eval_freq, save the best model to best_model_save_path.
    eval_env = env
    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/',
                                 eval_freq=500, deterministic=True, render=False)
    # StopTrainingOnRewardThreshold: stop the training once the reward threshold is reached,
    # i.e. the agent is considered good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70, verbose=1)
    eval_callback_reward_threshold = EvalCallback(
        eval_env, callback_on_new_best=callback_on_best, verbose=1)
    # EveryNTimesteps: call the child callback every n timesteps to save the model.
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
    event_callback_after_n_steps = EveryNTimesteps(
        n_steps=500, callback=checkpoint_on_event)
    # StopTrainingOnMaxEpisodes:
    # Stops training when the model reaches the maximum number of episodes
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1)
    # CallbackList: to call several callbacks together.
    callbacklist = CallbackList([checkpoint_callback, eval_callback])
    logger.info(f"Model: {self.model.get_env()}")
    with ProgressBarManager(log_interval) as progress_callback:
        self.model.learn(total_timesteps=log_interval,
                         callback=[progress_callback, checkpoint_callback])
    # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
    # self.eval_writer(mean_reward, std_reward)
    pass
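# ProgressBarManager is not part of Stable Baselines3 itself; the usage above follows a
# pattern similar to the SB3 callback examples. A minimal sketch, assuming tqdm is installed,
# of a context manager that yields a progress-bar callback like the one used above:
from tqdm.auto import tqdm

from stable_baselines3.common.callbacks import BaseCallback


class ProgressBarCallback(BaseCallback):
    """Updates a tqdm progress bar with the number of timesteps taken so far."""

    def __init__(self, pbar):
        super().__init__()
        self._pbar = pbar

    def _on_step(self) -> bool:
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True


class ProgressBarManager:
    """Context manager that owns the tqdm bar and hands out the callback."""

    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()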
def lean(
    self,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
):
    callback = CallbackList([self.checkpoint_cb, callback])
    self.model.learn(total_timesteps=self.args.time_steps,
                     log_interval=self.config.sac_log_interval(),
                     tb_log_name="racer_learnig_log",
                     callback=callback)
    return self.model
def runner(agent, episode, checkpoint, env):
    # scores = np.genfromtxt(checkpoint+'/data.csv', delimiter=',')
    # checkpoint2 = checkpoint+'2'
    custom_callback = LoggerCallback(episode, checkpoint=checkpoint)
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=checkpoint, name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=episode, verbose=1)
    event_callback = EveryNTimesteps(n_steps=1, callback=custom_callback)
    # load = os.path.abspath(checkpoint+'/rl_model_676000_steps')
    # print(load)
    # agent = DDPG.load(load, env)
    callback_list = CallbackList([event_callback, checkpoint_callback, callback_max_episodes])
    # agent.learn(total_timesteps=100000000, callback=callback_list, reward_function=reward)
    agent.learn(total_timesteps=100000000, callback=callback_list)
    scores = custom_callback.rewards
    np.savetxt(checkpoint+'/data.csv', scores, delimiter=',')
    return scores
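# LoggerCallback above is project-specific; judging by the .rewards attribute read after
# training, it accumulates episode rewards. A hedged sketch of such a callback, assuming the
# environment is wrapped in a Monitor so episode statistics appear in the step infos:
from stable_baselines3.common.callbacks import BaseCallback


class LoggerCallback(BaseCallback):
    """Collects episode rewards reported by the Monitor wrapper; hypothetical reconstruction."""

    def __init__(self, max_episodes, checkpoint=None, verbose=0):
        super().__init__(verbose)
        self.max_episodes = max_episodes
        self.checkpoint = checkpoint
        self.rewards = []

    def _on_step(self) -> bool:
        # Monitor adds an "episode" dict to info at the end of each episode.
        for info in self.locals.get("infos", []):
            episode = info.get("episode")
            if episode is not None:
                self.rewards.append(episode["r"])
        return True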
def __init__(self, path, env_cls, env_kwargs, agent_kwargs, steps_per_rollout, num_envs, callbacks=[]):
    self.folder = ExperimentFolder(path)
    self.agent, self.env = self.folder.get(env_cls, env_kwargs, agent_kwargs)
    self.steps_per_rollout = steps_per_rollout
    self.num_envs = num_envs
    store = lambda _: self.folder.store(self.agent, env_kwargs, agent_kwargs)
    self.get_callback = lambda save_freq: CallbackList(callbacks + [
        EveryNRolloutsPlusStartFinishFunctionCallback(save_freq, store)
    ])
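# EveryNRolloutsPlusStartFinishFunctionCallback is not an SB3 built-in. Based on its name and
# how it is constructed above (an interval and a one-argument function), a hedged sketch using
# the standard BaseCallback hooks; the exact behavior is an assumption:
from stable_baselines3.common.callbacks import BaseCallback


class EveryNRolloutsPlusStartFinishFunctionCallback(BaseCallback):
    """Calls `func(self)` at training start, at training end, and every `n_rollouts` rollouts."""

    def __init__(self, n_rollouts, func, verbose=0):
        super().__init__(verbose)
        self.n_rollouts = n_rollouts
        self.func = func
        self.rollout_count = 0

    def _on_training_start(self) -> None:
        self.func(self)

    def _on_rollout_end(self) -> None:
        self.rollout_count += 1
        if self.rollout_count % self.n_rollouts == 0:
            self.func(self)

    def _on_step(self) -> bool:
        return True

    def _on_training_end(self) -> None:
        self.func(self)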
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1, policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def lean(
    self,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
):
    # NOTE: filter out None so the CallbackList never receives a NoneType callback
    # when running a simulation; this is a problem of subcommand.py.
    callback = CallbackList(
        [c for c in [self.checkpoint_cb, callback] if c is not None])
    self.model.learn(total_timesteps=self.args.time_steps,
                     log_interval=self.config.sac_log_interval(),
                     tb_log_name="racer_learnig_log",
                     callback=callback)
    return self.model
def train(
    model: BaseAlgorithm, timesteps: int, eval_env: GymEnv, model_path: Path
) -> None:
    """
    Train the agent in its environment. Learning finishes when the agent has performed
    the given number of timesteps or when the mean reward over 10 gameplays reaches 1.

    :param model: RL agent
    :param timesteps: total number of steps to take (through all episodes)
    :param eval_env: evaluation environment
    :param model_path: location where the model will be saved
    """
    mlflow_callback = MlflowCallback(model_path)
    reward_threshold_callback = StopTrainingOnRewardThreshold(
        reward_threshold=1
    )
    eval_callback = MlflowEvalCallback(
        eval_env=eval_env, callback_on_new_best=reward_threshold_callback
    )
    callbacks = CallbackList([mlflow_callback, eval_callback])
    model.learn(total_timesteps=timesteps, callback=callbacks)
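# MlflowCallback and MlflowEvalCallback are project-specific. A hedged sketch of what the
# former might look like, assuming an MLflow run is already active and using only standard
# mlflow and SB3 calls; the metric name and save behavior are assumptions:
import mlflow
from stable_baselines3.common.callbacks import BaseCallback


class MlflowCallback(BaseCallback):
    """Logs training statistics to MLflow and saves the final model; hypothetical sketch."""

    def __init__(self, model_path, verbose=0):
        super().__init__(verbose)
        self.model_path = model_path

    def _on_step(self) -> bool:
        return True

    def _on_training_end(self) -> None:
        # ep_info_buffer holds the most recent episode statistics collected by SB3
        # (populated when the env is wrapped in a Monitor).
        rewards = [info["r"] for info in self.model.ep_info_buffer]
        if rewards:
            mlflow.log_metric("mean_episode_reward", sum(rewards) / len(rewards),
                              step=self.num_timesteps)
        self.model.save(self.model_path)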
def setup_train(config, setup_dirs=True): T.set_num_threads(1) if setup_dirs: for s in ["agents", "agents_cp", "tb"]: if not os.path.exists(s): os.makedirs(s) # Random ID of this session if config["default_session_ID"] is None: config["session_ID"] = ''.join( random.choices('ABCDEFGHJKLMNPQRSTUVWXYZ', k=3)) else: config["session_ID"] = config["default_session_ID"] stats_path = "agents/{}_vecnorm.pkl".format(config["session_ID"]) # Import correct env by name env_fun = my_utils.import_env(config["env_name"]) env = env_fun(config) model = make_model(config, env) checkpoint_callback = CheckpointCallback(save_freq=100000, save_path='agents_cp/', name_prefix=config["session_ID"], verbose=1) # Separate evaluation env config_eval = deepcopy(config) config_eval["animate"] = False eval_env = env_fun(config_eval) # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, eval_freq=10000, deterministic=True, render=False) callback_list = CallbackList([checkpoint_callback, eval_callback]) return env, model, callback_list, stats_path
def train_alg(model_alg, reset_optimizers_between_envs, reset_optimizers_every_iter, buffer_size, subsave, iteration, last_round_no_mer, is_evolving, seed): seed_all(seed) training_timesteps = META_TRAINING_TIMESTEPS params = params_list if not is_evolving: params = [params[-1]] start_time = time() env = gym.make(env_name) eval_env = gym.make(env_name) final_eval_env = gym.make(env_name) final_parameters_dict = params_sampler.sample1_means() change_env_parameters(final_eval_env, parameter_dict=final_parameters_dict) tensorboard_path = subsave + '/tb_' + str(iteration) optimizer_kwargs = {} policy_kwargs = { 'optimizer_class': th.optim.Adam, 'optimizer_kwargs': optimizer_kwargs, } model = model_alg( MlpPolicy, env, verbose=0, buffer_size=buffer_size, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, learning_starts=LEARNING_STARTS, gradient_steps=GRADIENT_STEPS, policy_kwargs=policy_kwargs, mer_s=MER_S, mer_gamma=MER_GAMMA, monitor_wrapper=True, tensorboard_log=tensorboard_path, reset_optimizers_during_training=reset_optimizers_every_iter, seed=seed) for i_param, param in enumerate(params): log_name = 'run_' + str(i_param) if i_param == (len(params) - 1): if not is_evolving: training_timesteps = FINAL_TRAINING_TIMESTEPS + NUM_TRAINING_ENVS * META_TRAINING_TIMESTEPS else: training_timesteps = FINAL_TRAINING_TIMESTEPS log_name += '_final' change_env_parameters(env, eval_env, parameter_dict=param) if model_alg.__name__ == 'SACMER' and last_round_no_mer and ( i_param == (len(params) - 1)): is_reservoir = False is_mer = False else: # This will not have any effect on regular SAC is_reservoir = True is_mer = True model.update_env(env, monitor_wrapper=False, is_reservoir=is_reservoir, reset_optimizers=reset_optimizers_between_envs ) # environment already wrapped so # monitor_wrapper=False eval_callback = EvalCallback(eval_env, best_model_save_path=None, log_path=tensorboard_path + '/' + log_name + '/running_eval', eval_freq=EVAL_FREQ, n_eval_episodes=N_EVAL_EPISODES, deterministic=True, render=False) if is_evolving: final_eval_callback = EvalCallback(final_eval_env, best_model_save_path=None, log_path=tensorboard_path + '/' + log_name + '/final_eval', eval_freq=EVAL_FREQ, n_eval_episodes=N_EVAL_EPISODES, deterministic=True, render=False) else: final_eval_callback = EventCallback() model.learn(total_timesteps=training_timesteps, log_interval=1, reset_num_timesteps=False, tb_log_name=log_name, is_mer=is_mer, callback=CallbackList([eval_callback, final_eval_callback])) env.reset() eval_env.reset() if iteration == 0: # saving models fills up storage, so we only save one (which we will also probably not use) model.save(subsave + 'model_' + str(iteration)) print(f"Done. Total time = {time() - start_time} seconds.")
eval_freq=n_timesteps_episode * args.eval_freq, deterministic=True, render=False, n_eval_episodes=args.eval_length) callbacks.append(eval_callback) # Set up tensorboard logger if args.tensorboard: log_callback = LoggerCallback(sinergym_logger=bool(args.logger)) callbacks.append(log_callback) # lets change default dir for TensorboardFormatLogger only tb_path = args.tensorboard + '/' + name new_logger = configure(tb_path, ["tensorboard"]) model.set_logger(new_logger) callback = CallbackList(callbacks) # ---------------------------------------------------------------------------- # # TRAINING # # ---------------------------------------------------------------------------- # model.learn(total_timesteps=timesteps, callback=callback, log_interval=args.log_interval) model.save(env.simulator._env_working_dir_parent + '/' + name) # If the algorithm doesn't reset or close the environment, this script will do it in # order to correctly log all the simulation data (Energyplus + Sinergym # logs) if env.simulator._episode_existed: env.close()
def test_callbacks(tmp_path, model_class): log_folder = tmp_path / "logs/callbacks/" # DQN only support discrete actions env_name = select_env(model_class) # Create RL model # Small network for fast test model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32])) checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder) eval_env = gym.make(env_name) # Stop training if the performance is good enough callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1) eval_callback = EvalCallback( eval_env, callback_on_new_best=callback_on_best, best_model_save_path=log_folder, log_path=log_folder, eval_freq=100, warn=False, ) # Equivalent to the `checkpoint_callback` # but here in an event-driven manner checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event") event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event) # Stop training if max number of episodes is reached callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1) callback = CallbackList([ checkpoint_callback, eval_callback, event_callback, callback_max_episodes ]) model.learn(500, callback=callback) # Check access to local variables assert model.env.observation_space.contains(callback.locals["new_obs"][0]) # Check that the child callback was called assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"] assert event_callback.locals["new_obs"] is callback.locals["new_obs"] assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"] # Check that internal callback counters match models' counters assert event_callback.num_timesteps == model.num_timesteps assert event_callback.n_calls == model.num_timesteps model.learn(500, callback=None) # Transform callback into a callback list automatically model.learn(500, callback=[checkpoint_callback, eval_callback]) # Automatic wrapping, old way of doing callbacks model.learn(500, callback=lambda _locals, _globals: True) # Testing models that support multiple envs if model_class in [A2C, PPO]: max_episodes = 1 n_envs = 2 # Pendulum-v0 has a timelimit of 200 timesteps max_episode_length = 200 envs = make_vec_env(env_name, n_envs=n_envs, seed=0) model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32])) callback_max_episodes = StopTrainingOnMaxEpisodes( max_episodes=max_episodes, verbose=1) callback = CallbackList([callback_max_episodes]) model.learn(1000, callback=callback) # Check that the actual number of episodes and timesteps per env matches the expected one episodes_per_env = callback_max_episodes.n_episodes // n_envs assert episodes_per_env == max_episodes timesteps_per_env = model.num_timesteps // n_envs assert timesteps_per_env == max_episode_length if os.path.exists(log_folder): shutil.rmtree(log_folder)
save_path=logger.output_dir, name_prefix='rl_model') savestats_callback = SaveNormalization(save_path=osp.join( logger.output_dir, "vec_normalization.pkl")) # If using normalize, must create this callback eval_callback = EvalCallback(eval_env=eval_env, n_eval_episodes=5, callback_on_new_best=savestats_callback, eval_freq=1000, best_model_save_path=osp.join( logger.output_dir, "best_model"), log_path=osp.join(logger.output_dir, "results")) callback = CallbackList([checkpoint_callback, eval_callback]) if custom_params['algo'] == 'sac': model = SAC(policy=custom_params['policy'], env=env, verbose=1, **custom_params['sac_parameters'], tensorboard_log=logger.output_dir) elif custom_params['algo'] == 'dqn': model = DQN(policy=custom_params['policy'], env=env, verbose=1, **custom_params['dqn_parameters'], tensorboard_log=logger.output_dir) elif custom_params['algo'] == 'a2c': model = A2C(policy=custom_params['policy'],
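# SaveNormalization above is not an SB3 built-in; it is passed as callback_on_new_best, so it
# fires each time the EvalCallback finds a new best model. A minimal sketch, assuming the
# training env is wrapped in VecNormalize (the class and attribute names mirror the usage above):
from stable_baselines3.common.callbacks import BaseCallback


class SaveNormalization(BaseCallback):
    """Saves the VecNormalize statistics whenever it is triggered; hypothetical sketch."""

    def __init__(self, save_path, verbose=0):
        super().__init__(verbose)
        self.save_path = save_path

    def _on_step(self) -> bool:
        vec_normalize = self.model.get_vec_normalize_env()
        if vec_normalize is not None:
            vec_normalize.save(self.save_path)
        return True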
def run(config): log.info(f'Beginning run for experiment {config["EXPERIMENT_ID"]}') # TODO: clean up RESULTS_PATH = config['RESULTS_PATH'] EXPERIMENTS_PREFIX = f'{RESULTS_PATH}{config["EXPERIMENT_ID"]}{os.sep}' ARTIFACTS_PATH = f'{EXPERIMENTS_PREFIX}artifact{os.sep}' VIS_RESULTS_PATH = f'{EXPERIMENTS_PREFIX}vis{os.sep}' SAVE_GIF_PATH = f'{EXPERIMENTS_PREFIX}gif{os.sep}' WANN_OUT_PREFIX = f'{ARTIFACTS_PATH}wann{os.sep}' ALG_OUT_PREFIX = f'{ARTIFACTS_PATH}alg{os.sep}' NUM_WORKERS = config['NUM_WORKERS'] GAME_CONFIG = config['GAME_CONFIG'] AGENT_CONFIG = config['AGENT'] log.info('RUN CONFIG:') log.info(config) log.info('Experiment description:') log.info(config['DESCRIPTION']) paths = [ ARTIFACTS_PATH, VIS_RESULTS_PATH, SAVE_GIF_PATH, WANN_OUT_PREFIX, f'{ALG_OUT_PREFIX}checkpoint{os.sep}checkpoint-alg{os.sep}' ] for p in paths: if not os.path.isdir(p): os.makedirs(p) ENV_NAME = GAME_CONFIG.env_name games = {ENV_NAME: GAME_CONFIG} wtrain.init_games_config(games) if config['TRAIN_WANN']: if "parent" == mpi_fork(NUM_WORKERS + 1): os._exit(0) wann_param_config = config['WANN_PARAM_CONFIG'] wann_args = dict(hyperparam=wann_param_config, outPrefix=WANN_OUT_PREFIX, rank=rank, num_workers=NUM_WORKERS, games=games) device = config['DEVICE'] alg = None use_wann = None for i in range(1, config['NUM_EPOCHS'] + 1): if config['TRAIN_WANN']: wtrain.set_device(device) wtrain.run( wann_args, use_checkpoint=True if i > 1 or config['USE_PREV_EXPERIMENT'] else False, alg_critic=None if alg is None else alg.critic, alg_policy=None if alg is None else alg.policy, mem=None if alg is None else alg.replay_buffer, wann_batch_size=AGENT_CONFIG['wann_batch_size'], wann_bootstrap_default=AGENT_CONFIG['wann_bootstrap_default']) if rank == 0: if i <= 1: env = Monitor(task.make_env(ENV_NAME), f'{EXPERIMENTS_PREFIX}log') learn_params = AGENT_CONFIG['learn_params'] checkpoint_callback = CheckpointCallback( save_freq=learn_params['alg_checkpoint_interval'], save_path= f'{ALG_OUT_PREFIX}checkpoint{os.sep}checkpoint-alg') eval_env = task.make_env(ENV_NAME) eval_callback = EvalCallback( eval_env, best_model_save_path= f'{ALG_OUT_PREFIX}checkpoint{os.sep}eval-best-alg', log_path=f'{EXPERIMENTS_PREFIX}log{os.sep}checkpoint', eval_freq=learn_params['eval_interval']) cb = CallbackList([checkpoint_callback, eval_callback]) use_wann = config['USE_WANN'] if use_wann: wVec, aVec, _ = wnet.importNet( f'{WANN_OUT_PREFIX}_best.out') else: wVec, aVec = None, None # TODO: save/load if on wann or SAC optimize step for prev experiment starts if GAME_CONFIG.alg_type == task.ALG.SAC: if config['USE_PREV_EXPERIMENT']: alg = SAC.load( f'{config["PREV_EXPERIMENT_PATH"]}{os.sep}alg' ) # TODO: load SAC model here else: alg = SAC( AGENT_CONFIG['policy'], env, verbose=learn_params['log_verbose'], tensorboard_log= f'{EXPERIMENTS_PREFIX}log{os.sep}tb-log', buffer_size=learn_params['mem_size'], learning_rate=learn_params['learn_rate'], learning_starts=learn_params['start_steps'], batch_size=learn_params['train_batch_size'], tau=learn_params['tau'], gamma=learn_params['gamma'], train_freq=learn_params['n_trains_per_step'], target_update_interval=learn_params[ 'replay_sample_ratio'], gradient_steps=learn_params[ 'gradient_steps_per_step'], n_episodes_rollout=learn_params['episode_len'], target_entropy=learn_params['target_entropy'], device=device, use_wann=use_wann, wVec=wVec, aVec=aVec) else: raise Exception( f'Algorithm configured is not currently supported') # if alg is not None and use_wann: # alg.sync_buffer() if i > 1: alg.learning_starts = 0 if i % 
LOG_INTERVAL == 0: log.info( f'performing learning step {i}/{config["NUM_EPOCHS"]} complete...' ) log.info('PERFORMING ALG TRAIN STEP') alg.learn(total_timesteps=learn_params['timesteps'], log_interval=learn_params['log_interval'], callback=cb) alg.save( f'{ALG_OUT_PREFIX}checkpoint{os.sep}full-run-checkpoint{os.sep}checkpoint-step-{i}' ) else: return # return if subprocess if i % LOG_INTERVAL == 0: log.info(f'step {i}/{config["NUM_EPOCHS"]} complete') if rank == 0: # if main process if config["RENDER_TEST_GIFS"]: vid_len = config['VIDEO_LENGTH'] render_agent(alg, ENV_NAME, vid_len, SAVE_GIF_PATH, filename=f'{config["EXPERIMENT_ID"]}-agent.gif') render_agent(alg, ENV_NAME, vid_len, SAVE_GIF_PATH, filename='random.gif') if use_wann: wtrain.run(None, kill_slaves=True)
def main(): set_random_seed(RANDOM_SEED) t_start = time() name = "LargeFinalLayer" checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name) os.makedirs(checkpoint_path, exist_ok=True) log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name) os.makedirs(log_path, exist_ok=True) results_path = os.path.join(checkpoint_path, "results.json") env_args = dict( frame_skip=4, screen_size=84, terminal_on_life_loss=True, clip_reward=True, ) # Creates a gym environment for an atari game using the specified seed and number of environments # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors # for improved performance.. # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args) def atari_wrapper(env: gym.Env) -> gym.Env: env = AtariWrapper(env, **env_args) return env def make_env(rank: int, count: int) -> VecEnv: return make_vec_env( ENV_NAME, n_envs=count, seed=RANDOM_SEED + rank, start_index=0, monitor_dir=None, wrapper_class=atari_wrapper, env_kwargs=None, vec_env_cls=None, vec_env_kwargs=None, monitor_kwargs=None, ) train_env = make_env(0, N_ENVS) eval_env = make_env(1, 1) # required by models in baselines train_env = VecTransposeImage(train_env) eval_env = VecTransposeImage(eval_env) # setup callback to save model at fixed intervals save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name) stop_callback = StopTrainingOnRewardThreshold( reward_threshold=EVAL_THRESHOLD) time_callback = TimeLimitCallback(max_time=TIME_LIMIT) best_callback = EvalCallback( eval_env, eval_freq=EVAL_FREQ, best_model_save_path=checkpoint_path, callback_on_new_best=stop_callback, ) list_callback = CallbackList([save_callback, best_callback, time_callback]) model = PPO( CnnPolicy, train_env, verbose=VERBOSE, batch_size=BATCH_SIZE, seed=RANDOM_SEED, tensorboard_log=log_path, learning_rate=LEARNING_RATE, n_steps=UPDATE_STEPS, n_epochs=N_EPOCHS, ent_coef=ENT_COEF, vf_coef=VF_COEF, clip_range=CLIP_RANGE, device=DEVICE_TYPE, policy_kwargs=dict(features_extractor_class=FeatureExtractor), ) config_path = os.path.join(checkpoint_path, "cnn_config") zip_path = os.path.join(checkpoint_path, "model.zip") # output the model config to a file for easier viewing with open(config_path, "w") as file: file.write(f"{name}\n") file.write(str(model.policy.features_extractor.cnn)) print("Beginning training...") model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run") # model.learn(TRAIN_STEPS, tb_log_name="run") model.save(zip_path) del train_env # del eval_env time_taken = time() - t_start print("Beginning evaluation...") # score of the game, standard deviation of multiple runs reward_mean, reward_std = evaluate_policy(model, make_env(2, 1)) with open(results_path, "w") as handle: handle.write(json.dumps((reward_mean, reward_std, time_taken)))
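# TimeLimitCallback is not part of SB3; from its use above it stops training after a wall-clock
# budget. A hedged sketch (the class name is kept from the snippet, the implementation is an
# assumption); returning False from _on_step makes the algorithm stop training:
from time import time

from stable_baselines3.common.callbacks import BaseCallback


class TimeLimitCallback(BaseCallback):
    """Stops training once `max_time` seconds of wall-clock time have elapsed."""

    def __init__(self, max_time, verbose=0):
        super().__init__(verbose)
        self.max_time = max_time
        self.start_time = None

    def _on_training_start(self) -> None:
        self.start_time = time()

    def _on_step(self) -> bool:
        # Returning False tells the algorithm to stop collecting experience.
        return (time() - self.start_time) < self.max_time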
def evaluate(individual: Individual, device: Union[torch.device, str] = "auto") -> Tuple[int]: """ Evaluate a single individual model and return it's mean score after the training time is elapsed. Models are trained and evaluated for a number of timestamps as parameterized in the constants at the top of the file. :param individual: The individual to evaluate. :return: """ t_start = time() layers = individual.weights name = individual.encode() checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name) if os.path.exists(checkpoint_path): return (random.randint(MIN_SCORE, MAX_SCORE), ) os.makedirs(checkpoint_path, exist_ok=True) log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name) os.makedirs(log_path, exist_ok=True) results_path = os.path.join(checkpoint_path, "results.json") if not os.path.exists(results_path): env_args = dict( frame_skip=4, screen_size=84, terminal_on_life_loss=True, clip_reward=True, ) # Creates a gym environment for an atari game using the specified seed and number of environments # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors # for improved performance.. def atari_wrapper(env: gym.Env) -> gym.Env: env = AtariWrapper(env, **env_args) return env def make_env(rank: int, count: int) -> VecEnv: return make_vec_env( ENV_NAME, n_envs=count, seed=RANDOM_SEED + rank, start_index=0, monitor_dir=None, wrapper_class=atari_wrapper, env_kwargs=None, vec_env_cls=SubprocVecEnv, vec_env_kwargs=None, monitor_kwargs=None, ) train_env = make_env(0, N_ENVS) eval_env = make_env(1, 1) # required by models in baselines train_env = VecTransposeImage(train_env) eval_env = VecTransposeImage(eval_env) # setup callback to save model at fixed intervals save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name) stop_callback = StopTrainingOnRewardThreshold( reward_threshold=EVAL_THRESHOLD) time_callback = TimeLimitCallback(max_time=TIME_LIMIT) best_callback = EvalCallback( eval_env, eval_freq=EVAL_FREQ, best_model_save_path=checkpoint_path, callback_on_new_best=stop_callback, ) list_callback = CallbackList( [save_callback, best_callback, time_callback]) model = PPO( CnnPolicy, train_env, verbose=VERBOSE, batch_size=BATCH_SIZE, seed=RANDOM_SEED * 7, tensorboard_log=log_path, learning_rate=LEARNING_RATE, n_steps=UPDATE_STEPS, n_epochs=N_EPOCHS, ent_coef=ENT_COEF, vf_coef=VF_COEF, clip_range=CLIP_RANGE, device=device, policy_kwargs=dict(features_extractor_class=VariableBenchmark, features_extractor_kwargs=dict(layers=layers)), ) config_path = os.path.join(checkpoint_path, "cnn_config") zip_path = os.path.join(checkpoint_path, "model.zip") # output the model config to a file for easier viewing with open(config_path, "w") as file: file.write(f"{name}\n") file.write(str(model.policy.features_extractor.cnn)) print("Beginning training...") model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run") model.save(zip_path) del train_env del eval_env time_taken = time() - t_start print("Beginning evaluation...") # score of the game, standard deviation of multiple runs reward_mean, reward_std = evaluate_policy(model, make_env(2, 1)) with open(results_path, "w") as handle: handle.write(json.dumps((reward_mean, reward_std, time_taken))) else: reward_mean, reward_std, time_taken = json.load(open( results_path, "r")) reward_mean = abs(MIN_SCORE) + reward_mean value = (reward_mean * weighted_time(time_taken), ) print(f"Evaluated {name} with a score of {value} in {(time_taken):.2f}s") 
return value
for task in reward_threshold.keys():
    TASK_NAME = task
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100 * 150 / 2, verbose=1)
    env = gym.make(TASK_NAME)
    log_dir = "./logs"
    env_m = monitor.Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env_m])
    env = VecNormalize(env, norm_obs=True, norm_reward=True)
    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold[TASK_NAME], verbose=1)
    eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1)
    callback = CallbackList([callback_max_episodes, eval_callback])
    model = A2C('MlpPolicy', env, verbose=1, policy_kwargs=dict(net_arch=model_def))
    st = time.time()
    model.learn(total_timesteps=100 * 150 * 10000, callback=callback)
    elapse_time = time.time() - st
    with open("./outdir/" + TASK_NAME + ".plt", "wb") as fd:
        chkpt = {
            "elapse_time": elapse_time,
            "reward_threshold": reward_threshold,
            "reward_list": env_m.get_episode_rewards(),
            "timestep_list": env_m.get_episode_lengths(),
            "runtime_list": env_m.get_episode_times(),
            "totall_steps": env_m.get_total_steps()
        }
def main(): if(StartFresh): # Create Environment env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.) env.reset() # Separate evaluation env eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.) eval_env.reset() # Create Model model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log) else: print('duh') # tmp_test_name = 'SAC-Continued' # tb_log_name = tmp_test_name + '_' + env_name # tmp_log_dir = os.path.join('log', tmp_test_name) # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name) # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name) # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models') # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps') # # Load Enironment # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) # env = VecNormalize.load(tmp_env_stats_path, env) # env.reset() # # Separate evaluation env # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env) # eval_env.reset() # # Load Model # # model = SAC.load(model_stats_path, tensorboard_log=tb_log) # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6) # # model.learning_rate = 1e-5 # model.set_env(env) if(DoTraining): checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path) # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, best_model_save_path=best_path, log_path=best_path, eval_freq=eval_freq, deterministic=True, render=False) # Video Update Callback record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1) envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path) nStep_callback_list = CallbackList([record_callback, envSave_callback]) vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list) # Create the callback list callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback]) print(tb_log_name) model.learn(total_timesteps=total_timesteps, tb_log_name=tb_log_name, reset_num_timesteps=False, callback=callbacks) #, callback=callback, =TensorboardCallback() # Don't forget to save the VecNormalize statistics when saving the agent model.save(model_stats_path) env.save(env_stats_path) if(DoVideo): record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
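# RecordVideo above (used inside EveryNTimesteps) is project-specific. A hedged sketch of a
# callback that re-records a short rollout whenever triggered, reusing the record_video()
# helper called at the bottom of the same script; the constructor mirrors the usage above and
# the body is an assumption:
from stable_baselines3.common.callbacks import BaseCallback


class RecordVideo(BaseCallback):
    """Records a rollout video of the current policy each time it is triggered."""

    def __init__(self, env_name, videoName, videoPath, verbose=0):
        super().__init__(verbose)
        self.env_name = env_name
        self.video_name = videoName
        self.video_path = videoPath

    def _on_step(self) -> bool:
        # self.training_env is the (vectorized) env the model is being trained on.
        record_video(self.env_name, self.training_env, self.model,
                     videoLength=1000, prefix=self.video_name, videoPath=self.video_path)
        return True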
def main(): if(StartFresh): # Create Environment env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.) env.reset() # Separate evaluation env eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)]) eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.) eval_env.reset() # Create Model # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto") policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, 256])]) model = PPO('MlpPolicy', env, learning_rate = 3e-5, n_steps=512, batch_size=128, n_epochs=20, gamma=0.99, gae_lambda = 0.9, clip_range = 0.4, vf_coef = 0.5, use_sde = True, sde_sample_freq = 4, policy_kwargs = policy_kwargs, verbose=1, tensorboard_log=tb_log, device="auto") else: print('duh') # tmp_test_name = 'SAC-Continued' # tb_log_name = tmp_test_name + '_' + env_name # tmp_log_dir = os.path.join('log', tmp_test_name) # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name) # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name) # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models') # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps') # # Load Enironment # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) # env = VecNormalize.load(tmp_env_stats_path, env) # env.reset() # # Separate evaluation env # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)]) # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env) # eval_env.reset() # # Load Model # # model = SAC.load(model_stats_path, tensorboard_log=tb_log) # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6) # # model.learning_rate = 1e-5 # model.set_env(env) if(DoTraining): checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path) # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, best_model_save_path=best_path, log_path=best_path, eval_freq=eval_freq, deterministic=True, render=False) # Video Update Callback record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1) envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path) nStep_callback_list = CallbackList([record_callback, envSave_callback]) # nStep_callback_list = CallbackList([envSave_callback]) vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list) # Create the callback list callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback]) # callbacks = CallbackList([checkpoint_callback, eval_callback]) print(tb_log_name) model.learn(total_timesteps=total_timesteps, tb_log_name=tb_log_name, reset_num_timesteps=False, callback=callbacks) # Don't forget to save the VecNormalize statistics when saving the agent model.save(model_stats_path) env.save(env_stats_path) if(DoVideo): record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
n_epochs= 10, # 10, # number of passes to do over the whole rollout buffer (of size 2048*n_cpus) during one training iter create_eval_env=False, # todo seed=None, verbose=2, tensorboard_log="./ppo_logs/") # evaluate # mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True) # print(f"mean_reward={mean_reward:.2f} +/- {std_reward}") # save a checkpoint every n steps checkpoint_callback = CheckpointCallback(save_freq=49_000, save_path='./ppo_checkpoints/', name_prefix='debug_model') callbacks = CallbackList([checkpoint_callback, CustomCallback()]) # cf /Users/nathan/opt/anaconda3/envs/vae/lib/python3.7/site-packages/stable_baselines3/common/callbacks.py # to make own checkpoints to have more control # the save_freq of cp_callback here doesnt take parallelism into account, so like 10k train steps with # 3 agents will only be 3333 steps for cp callback not enough to reach 5k (save freq) # train model.learn(total_timesteps=10_000_000, callback=callbacks) # evaluate # mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True) # print(f"mean_reward={mean_reward:.2f} +/- {std_reward}") # ------------------------------------------------------------------------------------------------------ # model = PPO.load("model_save") # env = CoinrunEnv()
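# The caveat in the comment above (save_freq counts calls to the vectorized env, so with
# several parallel agents checkpoints fire less often than expected in per-env steps) is
# usually handled by dividing the desired frequency by the number of envs. A small sketch;
# n_envs and checkpoint_every_steps are hypothetical names:
from stable_baselines3.common.callbacks import CheckpointCallback

n_envs = 3                        # number of parallel environments
checkpoint_every_steps = 49_000   # desired per-env step interval between checkpoints

# Each callback step corresponds to one call of the vectorized env, i.e. n_envs env steps.
checkpoint_callback = CheckpointCallback(
    save_freq=max(checkpoint_every_steps // n_envs, 1),
    save_path='./ppo_checkpoints/',
    name_prefix='debug_model',
)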