def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=log_folder,
                                 log_path=log_folder,
                                 eval_freq=100)

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if the max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback, callback_max_episodes])

    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callbacks were called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]

    # Check that the internal callback counters match the model's counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected ones
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
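# The test above relies on attributes that every SB3 callback exposes (`locals`,
# `n_calls`, `num_timesteps`). A minimal custom-callback sketch (illustrative only,
# not part of the test suite) that uses the same BaseCallback API:
from stable_baselines3.common.callbacks import BaseCallback


class CountingCallback(BaseCallback):
    """Counts calls and inspects the local variables of the training loop."""

    def _on_step(self) -> bool:
        # self.num_timesteps is kept in sync by the model; self.locals holds the
        # local variables of the rollout collection (e.g. "new_obs")
        assert self.n_calls >= 1
        _ = self.locals.get("new_obs")
        return True  # returning False would stop training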
tasks.check_name(name)
save_parameter(save_model_path, args)

env_builder = importlib.import_module('{}.env_builder'.format(name))
env = env_builder.build_env(enable_randomizer=True,
                            version=version,
                            enable_rendering=False,
                            control_mode=control_mode)
eval_env = env_builder.build_env(enable_randomizer=True,
                                 version=version,
                                 enable_rendering=False,
                                 control_mode=control_mode)
eval_callback = EvalCallback(eval_env,
                             best_model_save_path=save_model_path,
                             log_path=save_model_path,
                             eval_freq=1000,
                             deterministic=True,
                             render=False)

policy_kwargs = dict(activation_fn=torch.nn.ReLU, net_arch=net_arch)
model = SAC('MlpPolicy',
            env,
            verbose=1,
            tensorboard_log=save_model_path,
            policy_kwargs=policy_kwargs,
            buffer_size=buffer_size,
            batch_size=batch_size,
            learning_starts=learning_starts,
            ent_coef=ent_coef)

if args.load_from_best:
    model = SAC.load(load_model_path)
    model.set_env(env)
# FOR REFERENCE
# policies[0] = fixed_rock_policy,
# policies[1] = fixed_paper_policy,
# policies[2] = fixed_scissors_policy,
# policies[3] = copycat_policy,
# policies[4] = random_policy,
# policies[5] = aggressive_policy,
# policies[6] = passive_policy,
opp_policies = policies

meta_env = gym.make('rps-meta-v0')
test_meta_env = gym.make('rps-meta-v0')

eval_callback = EvalCallback(test_meta_env, eval_freq=1000, deterministic=True, render=False)

policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[30, 30])
n_steps, batch_size, n_epochs = 50, 50, 10
model = PPO("MlpPolicy", meta_env, policy_kwargs=policy_kwargs,
            n_steps=n_steps, batch_size=batch_size, n_epochs=n_epochs, verbose=0)

# model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=False)  # no meta learning
model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=True)  # meta learning

opponent_policies = [
    np.array([0, 1, 2, 0, 1]),
    np.array([1, 2, 2, 1, 0]),
    np.array([2, 1, 0, 0, 0]),
    # np.array([2, 2, 1, 1, 0]),
    # np.array([0, 1, 2, 2, 2]),
def test_sac_phase():
    reward = []
    for i in [2000, 4000, 6000, 8000, 10000]:
        model = SAC("MlpPolicy",
                    "Pendulum-v0",
                    policy_kwargs=dict(net_arch=[64, 64]),
                    learning_starts=5000,
                    verbose=0,
                    create_eval_env=True,
                    buffer_size=i,
                    ent_coef=0,
                    action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                    batch_size=32)
        env = model.env
        eval_callback = EvalCallback(env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/alpha5_phase',
                                     eval_freq=250,
                                     n_eval_episodes=100,
                                     deterministic=True,
                                     render=False)
        model.learn(total_timesteps=20000, callback=eval_callback)
        reward.append(eval_callback.last_mean_reward)

        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi, num=definition)):
            for index_td, td in enumerate(np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                action = model.policy.forward(state)
                portrait[definition - (1 + index_td), index_t] = model.critic.q1_forward(state, action)
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="inferno",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.title("critic, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".format(
            reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="critic value")
        plt.scatter([0], [0])
        plt.show()

        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        portrait = np.zeros((definition, definition))
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi, num=definition)):
            for index_td, td in enumerate(np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                probs = model.policy.forward(state)
                action = probs.data.numpy().astype(float)
                portrait[definition - (1 + index_td), index_t] = action
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="coolwarm",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.title("action, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".format(
            reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="action")
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.scatter([0], [0])
        plt.show()
    return reward
def train_alg(model_alg, reset_optimizers, buffer_size, subsave, iteration, last_round_no_mer,
              is_evolving, gradient_steps=GRADIENT_STEPS, params_list=params_list):
    training_timesteps = META_TRAINING_TIMESTEPS
    params = params_list
    if not is_evolving:
        params = [params[-1]]
    start_time = time()
    env = gym.make(env_name)
    eval_env = gym.make(env_name)
    final_eval_env = gym.make(env_name)
    final_parameters_dict = params[-1]
    change_env_parameters(final_eval_env, parameter_dict=final_parameters_dict)
    tensorboard_path = subsave + '/tb_' + str(iteration)
    optimizer_kwargs = {}
    policy_kwargs = {
        'optimizer_class': th.optim.Adam,
        'optimizer_kwargs': optimizer_kwargs,
    }
    model = model_alg(MlpPolicy,
                      env,
                      verbose=1,
                      buffer_size=buffer_size,
                      batch_size=BATCH_SIZE,
                      learning_rate=LEARNING_RATE,
                      learning_starts=LEARNING_STARTS,
                      gradient_steps=gradient_steps,
                      policy_kwargs=policy_kwargs,
                      mer_s=MER_S,
                      mer_gamma=MER_GAMMA,
                      monitor_wrapper=True,
                      tensorboard_log=tensorboard_path)
    # Add memories from all buffers to the current one, filling it up entirely in the process
    for replay_buffer in replay_buffers_list:
        model.add_memories_from_another_replay_mem(replay_buffer)

    for i_param, param in enumerate(params):
        log_name = 'run_' + str(i_param)
        if i_param == (len(params) - 1):
            training_timesteps = FINAL_TRAINING_TIMESTEPS
            log_name += '_final'
        change_env_parameters(env, eval_env, parameter_dict=param)
        if model_alg.__name__ == 'SACMER' and last_round_no_mer and (i_param == (len(params) - 1)):
            is_reservoir = False
            is_mer = False
        else:  # This will not have any effect on regular SAC
            is_reservoir = True
            is_mer = True
        model.update_env(env,
                         monitor_wrapper=False,
                         is_reservoir=is_reservoir,
                         reset_optimizers=reset_optimizers)  # environment already wrapped, so monitor_wrapper=False
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=None,
                                     log_path=tensorboard_path + '/' + log_name + '/running_eval',
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=N_EVAL_EPISODES,
                                     deterministic=True,
                                     render=False)
        if is_evolving:
            final_eval_callback = EvalCallback(final_eval_env,
                                               best_model_save_path=None,
                                               log_path=tensorboard_path + '/' + log_name + '/final_eval',
                                               eval_freq=EVAL_FREQ,
                                               n_eval_episodes=N_EVAL_EPISODES,
                                               deterministic=True,
                                               render=False)
        else:
            final_eval_callback = EventCallback()  # empty callback
        model.learn(total_timesteps=training_timesteps,
                    log_interval=1,
                    reset_num_timesteps=False,
                    tb_log_name=log_name,
                    is_mer=is_mer,
                    callback=CallbackList([eval_callback, final_eval_callback]))
        env.reset()
        eval_env.reset()

    # if iteration == 0:  # saving models fills up storage, so we only save one (which we will also probably not use)
    model.save(subsave + 'model_' + str(iteration), include=['replay_buffer'])
    print(f"Done. Total time = {time() - start_time} seconds.")
import gym
from torch import nn
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback

env = gym.make('HumanoidBasicEnv-v0')
eval_env = gym.make('HumanoidBasicEnv-v0')

policy_kwargs = dict(activation_fn=nn.ReLU, net_arch=[1024, 512])

# model = PPO.load('walking_agent', env=env)
model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=0, tensorboard_log='./walk/logs/')

# Save the best model periodically during training
bestModelCallback = EvalCallback(eval_env=eval_env,
                                 eval_freq=10000,
                                 log_path='./walk/logs/',
                                 best_model_save_path='./walk/logs/')

model.learn(total_timesteps=200,
            eval_freq=4000,
            eval_env=eval_env,
            tb_log_name='static_run',
            callback=bestModelCallback)

model.save('static_agent')
env.close()
def evaluate(individual: Individual, device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return its mean score after the training time has elapsed.
    Models are trained and evaluated for a number of timesteps as parameterized in the constants
    at the top of the file.

    :param individual: The individual to evaluate.
    :return:
    """
    t_start = time()
    layers = individual.weights
    name = individual.encode()
    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)

    if os.path.exists(checkpoint_path):
        return (random.randint(MIN_SCORE, MAX_SCORE), )

    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Creates a gym environment for an Atari game using the specified seed and number of environments.
        # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
        # for improved performance.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)

        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # set up callbacks to save the model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList([save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")
        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env

        time_taken = time() - t_start

        print("Beginning evaluation...")
        # score of the game, standard deviation of multiple runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        reward_mean, reward_std, time_taken = json.load(open(results_path, "r"))

    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )

    print(f"Evaluated {name} with a score of {value} in {(time_taken):.2f}s")
    return value
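# TimeLimitCallback used above is project-specific, not an SB3 built-in. A hedged
# sketch of how such a wall-clock limit could be implemented with BaseCallback;
# the constructor argument name (max_time, in seconds) follows the usage above.
from time import time
from stable_baselines3.common.callbacks import BaseCallback


class TimeLimitCallbackSketch(BaseCallback):
    def __init__(self, max_time: float, verbose: int = 0):
        super().__init__(verbose)
        self.max_time = max_time
        self.start_time = None

    def _on_training_start(self) -> None:
        self.start_time = time()

    def _on_step(self) -> bool:
        # Returning False stops training once the wall-clock budget is exhausted
        return (time() - self.start_time) < self.max_time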
env = create_env(n_envs)

# Create test env if needed, do not normalize reward
eval_env = None
if args.eval_freq > 0:
    # Account for the number of parallel environments
    args.eval_freq = max(args.eval_freq // n_envs, 1)

    if 'NeckEnv' in env_id:
        # Use the training env as eval env when using the neck
        # because there is only one robot
        # there will be an issue with the reset
        eval_callback = EvalCallback(env,
                                     callback_on_new_best=None,
                                     best_model_save_path=save_path,
                                     log_path=save_path,
                                     eval_freq=args.eval_freq)
        callbacks.append(eval_callback)
    else:
        # Do not normalize the rewards of the eval env
        old_kwargs = None
        if normalize:
            if len(normalize_kwargs) > 0:
                old_kwargs = normalize_kwargs.copy()
                normalize_kwargs['norm_reward'] = False
            else:
                normalize_kwargs = {'norm_reward': False}

        if args.verbose > 0:
            print("Creating test environment")
# hyperparams
buffer_size = max(total_timesteps // 100, 500)
learning_starts = max(total_timesteps // 1000, 100)
train_freq = 1
target_update_interval = 100
exploration_fraction = (learning_starts + 1000) / total_timesteps

# evaluation parameters
eval_env = gym.make(env_name)
eval_env = Monitor(eval_env)
eval_freq = max(1000, total_timesteps // 20)
eval_log_path = "eval_logs/dqnclippedreg_{}_{}_{}_{}".format(env_name, loss_type, seed, time_int)
eval_callback = EvalCallback(eval_env,
                             log_path=eval_log_path,
                             eval_freq=eval_freq,
                             deterministic=True,
                             render=False,
                             n_eval_episodes=25)

if env_name == 'MountainCar-v0':
    buffer_size = 10000  # max(total_timesteps // 100, 500)
    learning_starts = 1000  # max(total_timesteps // 1000, 100)
    learning_rate = 4e-3
    batch_size = 128
    gamma = 0.98
    train_freq = 16
    target_update_interval = 600
    gradient_steps = 8
    exploration_fraction = 0.2  # (learning_starts + 1000) / total_timesteps
    exploration_final_eps = 0.07
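# Not shown in this fragment: a hedged sketch of how the hyperparameters above would
# typically be passed to a standard SB3 DQN. The actual script appears to use a custom
# "DQN clipped/regularized" variant, so treat this only as an illustration.
from stable_baselines3 import DQN

model = DQN("MlpPolicy",
            env_name,
            buffer_size=buffer_size,
            learning_starts=learning_starts,
            train_freq=train_freq,
            target_update_interval=target_update_interval,
            exploration_fraction=exploration_fraction,
            seed=seed,
            verbose=1)
model.learn(total_timesteps=total_timesteps, callback=eval_callback)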
# by frank tian, 2021-1-16
from stable_baselines3 import DQN
import gym_flappy_bird
from stable_baselines3.common.callbacks import EvalCallback
import gym

env = gym.make("FlappyBirdFeature-v1")
eval_env = gym.make("FlappyBirdFeature-v1")
eval_callback = EvalCallback(eval_env=eval_env,
                             eval_freq=5000,
                             log_path="logs",
                             best_model_save_path="logs")
model = DQN(policy="MlpPolicy",
            env=env,
            batch_size=32,
            buffer_size=1000000,
            learning_starts=50000,
            tensorboard_log="log")
print(model.policy)

if __name__ == "__main__":
    model.learn(int(1e7), callback=eval_callback)
    target_update_interval=10000,
    learning_starts=200000,
    buffer_size=500000,
    max_grad_norm=10,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(total_timesteps=5e5,
            tb_log_name="dqn_airsim_car_run_" + str(time.time()),
            **kwargs)

# Save policy weights
model.save("dqn_airsim_car_policy")
print(f"Saving VecNormalize to {self.save_path}") return True checkpoint_callback = CheckpointCallback(save_freq=30000, save_path=logger.output_dir, name_prefix='rl_model') savestats_callback = SaveNormalization(save_path=osp.join( logger.output_dir, "vec_normalization.pkl")) # If using normalize, must create this callback eval_callback = EvalCallback(eval_env=eval_env, n_eval_episodes=5, callback_on_new_best=savestats_callback, eval_freq=1000, best_model_save_path=osp.join( logger.output_dir, "best_model"), log_path=osp.join(logger.output_dir, "results")) callback = CallbackList([checkpoint_callback, eval_callback]) if custom_params['algo'] == 'sac': model = SAC(policy=custom_params['policy'], env=env, verbose=1, **custom_params['sac_parameters'], tensorboard_log=logger.output_dir) elif custom_params['algo'] == 'dqn': model = DQN(policy=custom_params['policy'], env=env,
            tensorboard_log='results/tb/',
            verbose=1)

# In[12]:

eval_env = gym.make("hover-aviary-v0",
                    aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS,
                    obs=ObservationType.KIN,
                    act=ActionType.RPM)

# In[13]:

EPISODE_REWARD_THRESHOLD = -0
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             verbose=1,
                             best_model_save_path='results/',
                             log_path='results/',
                             eval_freq=int(2000 / os.cpu_count()),
                             deterministic=True,
                             render=False)

# In[ ]:

model.learn(total_timesteps=int(50e6),
            callback=eval_callback,
            log_interval=100)
# FOR REFERENCE
# policies[0] = fixed_rock_policy,
# policies[1] = fixed_paper_policy,
# policies[2] = fixed_scissors_policy,
# policies[3] = copycat_policy,
# policies[4] = random_policy,
# policies[5] = aggressive_policy,
# policies[6] = passive_policy,
opp_policies = policies

multitask_env = gym.make('rps-multitask-v0', opp_policies=opp_policies)
test_multitask_env = gym.make('rps-multitask-v0', opp_policies=opp_policies)

eval_callback = EvalCallback(test_multitask_env, eval_freq=1000, deterministic=True, render=False)

policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[8, 8])
n_steps, batch_size, n_epochs = 10, 10, 10
model = PPO("MlpPolicy", multitask_env, policy_kwargs=policy_kwargs,
            n_steps=n_steps, batch_size=batch_size, n_epochs=n_epochs, verbose=0)
model.learn(total_timesteps=20000, callback=eval_callback)

rewards_fixed_rock = multitask_env.run_sim(policies[0], 50, model, 0)
rewards_fixed_paper = multitask_env.run_sim(policies[1], 50, model, 1)
rewards_fixed_scissors = multitask_env.run_sim(policies[2], 50, model, 2)
rewards_copycat = multitask_env.run_sim(policies[3], 50, model, 3)
rewards_random = multitask_env.run_sim(policies[4], 50, model, 4)
            env,
            verbose=1,
            buffer_size=100_000,
            batch_size=256,
            learning_rate=0.0003,
            learning_starts=1024,
            gamma=0.95,
            ent_coef='auto',
            policy_kwargs=policy_kwargs,
            train_freq=512,
            gradient_steps=-1,
            device="cpu")

eval_callback = EvalCallback(eval_env,
                             best_model_save_path=best_save_path,
                             log_path=log_dir,
                             eval_freq=1024,
                             deterministic=True,
                             render=False)

timesteps = 5_000_000
model.learn(timesteps, callback=eval_callback)
model.save(save_path)

# Evaluate
env.close()

env = BaselinifyWrapper(
    TimeLimit(gym.make("PepperReachCam-v0", gui=True, dense=True), max_episode_steps=100))

model = SAC.load(save_path)

obs = env.reset()
    return env


env = create_env(n_envs)

# Create test env if needed, do not normalize reward
eval_env = None
if args.eval_freq > 0 and not args.optimize_hyperparameters:
    # Account for the number of parallel environments
    args.eval_freq = max(args.eval_freq // n_envs, 1)

    if 'NeckEnv' in env_id:
        # Use the training env as eval env when using the neck
        # because there is only one robot
        # there will be an issue with the reset
        eval_callback = EvalCallback(env,
                                     callback_on_new_best=None,
                                     best_model_save_path=save_path,
                                     log_path=save_path,
                                     eval_freq=args.eval_freq)
        callbacks.append(eval_callback)
    else:
        if args.verbose > 0:
            print("Creating test environment")

        # save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=params_path)
        # eval_callback = EvalCallback(create_env(1, eval_env=True), callback_on_new_best=save_vec_normalize,
        #                              best_model_save_path=save_path, n_eval_episodes=args.eval_episodes,
        #                              log_path=save_path, eval_freq=args.eval_freq,
        #                              deterministic=not is_atari)
        # save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=params_path)
        eval_callback = EvalCallback(env,
                                     best_model_save_path=save_path,
                                     n_eval_episodes=args.eval_episodes,
                                     log_path=save_path,
                                     eval_freq=args.eval_freq,
                              rew_threshold=13.5,
                              task_mode=params['task_mode'],
                              verbose=1)

eval_env = Monitor(FlatlandEnv(task_manager,
                               PATHS.get('robot_setting'),
                               PATHS.get('robot_as'),
                               params['reward_fnc'],
                               params['discrete_action_space'],
                               goal_radius=1.00,
                               max_steps_per_episode=350),
                   PATHS.get('eval'),
                   info_keywords=("done_reason", ))

eval_cb = EvalCallback(eval_env,
                       n_eval_episodes=20,
                       eval_freq=10000,
                       log_path=PATHS.get('eval'),
                       best_model_save_path=PATHS.get('model'),
                       deterministic=True,
                       callback_on_new_best=trainstage_cb)

# determine mode
if args.custom_mlp:
    # custom mlp flag
    model = PPO("MlpPolicy",
                env,
                policy_kwargs=dict(net_arch=args.net_arch, activation_fn=get_act_fn(args.act_fn)),
                gamma=gamma,
                n_steps=n_steps,
                ent_coef=ent_coef,
                learning_rate=learning_rate,
                            seed=0)
if env_name == "tune-aviary-v0":
    eval_env = make_vec_env(TuneAviary,
                            env_kwargs=sa_env_kwargs,
                            n_envs=1,
                            seed=0)
eval_env = VecTransposeImage(eval_env)

#### Train the model #######################################
# checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model')
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             verbose=1,
                             best_model_save_path=filename + '/',
                             log_path=filename + '/',
                             eval_freq=int(2000 / ARGS.cpu),
                             deterministic=True,
                             render=False)
model.learn(total_timesteps=35000,  # int(1e12)
            callback=eval_callback,
            log_interval=100,
            )

#### Save the model ########################################
model.save(filename + '/success_model.zip')
print(filename)

#### Print training progression ############################
with np.load(filename + '/evaluations.npz') as data:
def train_alg(model_alg, reset_optimizers_between_envs, reset_optimizers_every_iter, buffer_size,
              subsave, iteration, last_round_no_mer, is_evolving, seed):
    seed_all(seed)
    training_timesteps = META_TRAINING_TIMESTEPS
    params = params_list
    if not is_evolving:
        params = [params[-1]]
    start_time = time()
    env = gym.make(env_name)
    eval_env = gym.make(env_name)
    final_eval_env = gym.make(env_name)
    final_parameters_dict = params_sampler.sample1_means()
    change_env_parameters(final_eval_env, parameter_dict=final_parameters_dict)
    tensorboard_path = subsave + '/tb_' + str(iteration)
    optimizer_kwargs = {}
    policy_kwargs = {
        'optimizer_class': th.optim.Adam,
        'optimizer_kwargs': optimizer_kwargs,
    }
    model = model_alg(MlpPolicy,
                      env,
                      verbose=0,
                      buffer_size=buffer_size,
                      batch_size=BATCH_SIZE,
                      learning_rate=LEARNING_RATE,
                      learning_starts=LEARNING_STARTS,
                      gradient_steps=GRADIENT_STEPS,
                      policy_kwargs=policy_kwargs,
                      mer_s=MER_S,
                      mer_gamma=MER_GAMMA,
                      monitor_wrapper=True,
                      tensorboard_log=tensorboard_path,
                      reset_optimizers_during_training=reset_optimizers_every_iter,
                      seed=seed)

    for i_param, param in enumerate(params):
        log_name = 'run_' + str(i_param)
        if i_param == (len(params) - 1):
            if not is_evolving:
                training_timesteps = FINAL_TRAINING_TIMESTEPS + NUM_TRAINING_ENVS * META_TRAINING_TIMESTEPS
            else:
                training_timesteps = FINAL_TRAINING_TIMESTEPS
            log_name += '_final'
        change_env_parameters(env, eval_env, parameter_dict=param)
        if model_alg.__name__ == 'DQNMER' and last_round_no_mer and (i_param == (len(params) - 1)):
            is_reservoir = False
            is_mer = False
        else:  # This will not have any effect on regular DQN
            is_reservoir = True
            is_mer = True
        # environment already wrapped, so monitor_wrapper=False
        model.update_env(env,
                         monitor_wrapper=False,
                         is_reservoir=is_reservoir,
                         reset_optimizers=reset_optimizers_between_envs)
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=None,
                                     log_path=tensorboard_path + '/' + log_name + '/running_eval',
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=N_EVAL_EPISODES,
                                     deterministic=True,
                                     render=False)
        if is_evolving:
            final_eval_callback = EvalCallback(final_eval_env,
                                               best_model_save_path=None,
                                               log_path=tensorboard_path + '/' + log_name + '/final_eval',
                                               eval_freq=EVAL_FREQ,
                                               n_eval_episodes=N_EVAL_EPISODES,
                                               deterministic=True,
                                               render=False)
        else:
            final_eval_callback = EventCallback()
        model.learn(total_timesteps=training_timesteps,
                    log_interval=1,
                    reset_num_timesteps=False,
                    tb_log_name=log_name,
                    is_mer=is_mer,
                    callback=CallbackList([eval_callback, final_eval_callback]))
        env.reset()
        eval_env.reset()

    if iteration == 0:
        # saving models fills up storage, so we only save one (which we will also probably not use)
        model.save(subsave + 'model_' + str(iteration))
    print(f"Done. Total time = {time() - start_time} seconds.")
PATHS = get_paths(AGENT_NAME, args)

if args.n is None:
    n_timesteps = 6000
else:
    n_timesteps = args.n

# instantiate gym environment
n_envs = 1
task = get_predefined_task("random")
env = DummyVecEnv([lambda: FlatlandEnv(task,
                                       PATHS.get('robot_setting'),
                                       PATHS.get('robot_as'),
                                       discrete_action_space)] * n_envs)

# instantiate eval environment
eval_env = Monitor(FlatlandEnv(task,
                               PATHS.get('robot_setting'),
                               PATHS.get('robot_as'),
                               discrete_action_space),
                   PATHS.get('eval'))
eval_env = EvalCallback(eval_env,
                        n_eval_episodes=10,
                        eval_freq=250,
                        log_path=PATHS.get('eval'),
                        best_model_save_path=PATHS.get('model'),
                        deterministic=True)

# determine mode
if args.custom_mlp:
    # custom mlp flag
    model = PPO("MlpPolicy",
                env,
                policy_kwargs=dict(net_arch=args.net_arch, activation_fn=get_act_fn(args.act_fn)),
                gamma=gamma,
                n_steps=n_steps,
                ent_coef=ent_coef,
                learning_rate=learning_rate,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                gae_lambda=gae_lambda,
                batch_size=batch_size,
                n_epochs=n_epochs,
                clip_range=clip_range,
                tensorboard_log=PATHS.get('tb'),
                verbose=1)
elif args.agent is not None:
    # predefined agent flag
    if args.agent == "MLP_ARENA2D":
def main():
    if StartFresh:
        # Create Environment
        env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()

        # Separate evaluation env
        eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # Create Model
        model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log)
    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')

        # # Load Environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()

        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()

        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)

        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=best_path,
                                     log_path=best_path,
                                     eval_freq=eval_freq,
                                     deterministic=True,
                                     render=False)

        # Video Update Callback
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)

        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False,
                    callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
def main():
    if StartFresh:
        # Create Environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()

        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # Create Model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[dict(pi=[256, 256], vf=[256, 256])])
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=3e-5,
                    n_steps=512,
                    batch_size=128,
                    n_epochs=20,
                    gamma=0.99,
                    gae_lambda=0.9,
                    clip_range=0.4,
                    vf_coef=0.5,
                    use_sde=True,
                    sde_sample_freq=4,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')

        # # Load Environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()

        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()

        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)

        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=best_path,
                                     log_path=best_path,
                                     eval_freq=eval_freq,
                                     deterministic=True,
                                     render=False)

        # Video Update Callback
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)

        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False,
                    callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
    default=-1,
    type=int,
)
args = parser.parse_args()

env_id = args.env
n_timesteps = args.n_timesteps
save_path = f"{args.algo}_{env_id}"

# Instantiate and wrap the environment
env = gym.make(env_id)

# Create the evaluation environment and callbacks
eval_env = Monitor(gym.make(env_id))

callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

# Save a checkpoint every n steps
if args.save_freq > 0:
    callbacks.append(
        CheckpointCallback(save_freq=args.save_freq, save_path=save_path, name_prefix="rl_model"))

algo = {
    "sac": SAC,
    "td3": TD3,
}[args.algo]

n_actions = env.action_space.shape[0]
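# Not part of the original snippet: a hedged sketch of how `n_actions` is commonly
# used with SB3's NormalActionNoise for off-policy algorithms such as SAC/TD3.
# The sigma value (0.1) is an illustrative assumption, not taken from the source.
import numpy as np
from stable_baselines3.common.noise import NormalActionNoise

action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
model = algo("MlpPolicy", env, action_noise=action_noise, verbose=1)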
from stable_baselines3.common.callbacks import EvalCallback

BUFFER_SIZE = int(1e6)
LEARNING_STARTS = int(1e4)
BATCH_SIZE = 64
ENT_COEF = 0.05
ENV_NAME = 'Walker2DBulletEnv-v0'
TIME_STEPS = 100000

env = gym.make(ENV_NAME)
eval_env = gym.make(ENV_NAME)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False)

model = SAC('MlpPolicy',
            env,
            verbose=1,
            tensorboard_log="./log/",
            buffer_size=BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            learning_starts=LEARNING_STARTS,
            ent_coef=ENT_COEF)
model.learn(total_timesteps=TIME_STEPS, callback=eval_callback)

env.render()
obs = env.reset()
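# Not part of the original script: a short hedged example of scoring the trained
# agent with SB3's evaluate_policy helper after training.
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")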
def main(env, args):
    global model

    # Fix random seeds and number of threads
    np.random.seed(args.seed)

    if args.recodex:
        models = []
        for path in args.load_from:
            models.append(SAC.load(path))

        while True:
            state, done = env.reset(start_evaluation=True), False
            ret = 0
            while not done:
                action = np.sum(np.array(
                    list(map(lambda m: m.predict(state, deterministic=True)[0], models))),
                    axis=0) / len(models)**0.5
                # print(action)
                # action, _states = model.predict(state, deterministic=True)
                # action, _states = model.predict(state)  ## TODO delete before submitting
                if not args.no_render:
                    env.render()
                state, reward, done, _ = env.step(action)
                ret += reward
            print("Episode return:", ret)
    else:
        tensorboard_log_dir = None if args.tensorboard_log_dir is None else os.path.join(
            args.tensorboard_log_dir, get_exp_name())
        model = SAC("MlpPolicy",
                    env,
                    learning_rate=lr_schedule,
                    buffer_size=args.buffer_size,
                    learning_starts=args.learning_starts,
                    n_episodes_rollout=args.train_episodes,
                    batch_size=args.batch_size,
                    tau=args.tau,
                    gamma=args.gamma,
                    train_freq=args.train_freq,
                    gradient_steps=args.gradient_steps,
                    ent_coef="auto" if args.ent_coef == "auto" else float(args.ent_coef),
                    use_sde=False,
                    policy_kwargs=dict(log_std_init=-3, net_arch=args.net_arch, use_expln=True),
                    tensorboard_log=tensorboard_log_dir,
                    rew_skip_thres=args.rew_skip_thres,
                    seed=args.seed)
        model.verbose = 2
        callbacks = [
            CheckpointCallback(20000, "checkpoints", name_prefix=get_exp_name()),
            EvalCallback(gym.make(getEnvName()),
                         callback_on_new_best=SaveBestModelCallback(
                             save_path="best/" + get_exp_name() + "_best_model.zip"),
                         eval_freq=20000,
                         n_eval_episodes=5,
                         deterministic=True),
            EpisodeCallback(env, model),
        ]
        print(args.log_interval)
        model.learn(args.timesteps, log_interval=args.log_interval, callback=callbacks)

        # Final evaluation
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()), evaluate_for=200, seed=args.seed)
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)

        model.save(get_exp_name())
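# `lr_schedule`, used in the SAC constructor above, is defined elsewhere in the script.
# As a hedged illustration only (an assumption, not the author's definition): SB3 accepts
# a callable mapping the remaining training progress (1.0 at the start, 0.0 at the end)
# to a learning rate, e.g. a linear decay.
def linear_schedule(initial_value: float):
    def schedule(progress_remaining: float) -> float:
        # progress_remaining goes from 1 (start of training) to 0 (end)
        return progress_remaining * initial_value

    return schedule


# Example: lr_schedule = linear_schedule(3e-4)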