def test_ddpg_normalization():
    """
    Test that observation and return normalization statistics are properly
    saved and loaded through a DDPG save/load round-trip.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=0.05)
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg')

    loaded_model = DDPG.load("test_ddpg")
    try:
        obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
        ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)
        # Every normalization statistic must survive the round-trip exactly.
        for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                       obs_rms_params_2 + ret_rms_params_2):
            assert np.allclose(param, param_loaded)
    finally:
        # Run cleanup even when the assertion above fails, so the saved model
        # does not leak into the working directory.
        del model, loaded_model
        # BaseRLModel.save() appends '.pkl' when the given path has no file
        # extension (which is also why DDPG.load("test_ddpg") above resolves),
        # so the original os.remove("./test_ddpg") missed the real file.
        # Check both candidate names to stay robust either way.
        for saved_file in ("./test_ddpg", "./test_ddpg.pkl"):
            if os.path.exists(saved_file):
                os.remove(saved_file)
def test_identity_ddpg():
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action).
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    # Parameter-space exploration noise with matching initial/target stddev.
    noise_std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(noise_std),
                                         desired_action_stddev=float(noise_std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    # Evaluate deterministically: the agent should echo observations closely
    # enough to collect near-maximal reward on almost every step.
    n_trials = 1000
    total_reward = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        total_reward += reward
    assert total_reward > 0.9 * n_trials
    # Free memory
    del model, env
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    # Draw every hyperparameter up front, in a fixed order.
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical(
        'memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical(
        'normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    # Actor and critic share the sampled learning rate.
    hyperparams = dict(
        gamma=gamma,
        actor_lr=learning_rate,
        critic_lr=learning_rate,
        batch_size=batch_size,
        memory_limit=buffer_size,
        normalize_observations=normalize_observations,
        normalize_returns=normalize_returns,
    )

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    else:
        # Both action-noise variants share the same mean/sigma vectors.
        mean = np.zeros(trial.n_actions)
        sigma = noise_std * np.ones(trial.n_actions)
        if noise_type == 'normal':
            hyperparams['action_noise'] = NormalActionNoise(mean=mean, sigma=sigma)
        elif noise_type == 'ornstein-uhlenbeck':
            hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=mean, sigma=sigma)

    return hyperparams
def train_SAC(env, out_dir, seed=None, **kwargs):
    """
    Create, train and return a SAC model on the given gym environment.

    Monitor logs and tensorboard events are written under ``out_dir/log``.

    :param env: (str) gym environment id to train on
    :param out_dir: (str) output directory for logs and saved models
    :param seed: (int or None) random seed forwarded to ``model.learn``
    :param kwargs: extra keyword arguments forwarded to the SAC constructor
    :return: (SAC) the trained model
    """
    # Update the module-level output_dir so other helpers/callbacks in this
    # module can locate the current run's output directory.
    global output_dir
    output_dir = out_dir
    # Logs will be saved in log_dir/monitor.csv
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    # The trailing '/' makes Monitor treat the path as a directory prefix.
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = 'MlpPolicy'
    n_timesteps = int(1e6)
    # NOTE: the original code also built an AdaptiveParamNoiseSpec here, but it
    # was never passed to the model (the action_noise argument was commented
    # out), so the unused locals have been removed.

    # Set to True to resume training from a previously saved model.
    continue_model = False
    if continue_model:
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)
    # NOTE(review): ``callback`` is resolved at module level — confirm it is
    # defined elsewhere in this file.
    model.learn(total_timesteps=n_timesteps, seed=seed, callback=callback,
                log_interval=10)
    return model
def train_identity_ddpg():
    """Train DDPG on the continuous identity environment and assert that the
    learned policy echoes observations closely enough to score near-maximal
    reward."""
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    stddev = 0.2
    noise_spec = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                        desired_action_stddev=float(stddev))
    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=noise_spec,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    # Deterministic evaluation over a fixed number of steps.
    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials

    del model, env
super(CustomPolicy, self).__init__(*args, **kwargs, layers=[256, 256, 256], act_fun=tf.nn.relu, feature_extraction="mlp") register_policy('CustomPolicy', CustomPolicy) # Define model if AGENT_ALGORITHM == "DQN": # Add some param noise for exploration #param_noise = None action_noise = None param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01) # Because we use parameter noise, we should use a MlpPolicy with layer normalization model = DQN(policy="CustomPolicy", env=env, param_noise=param_noise, verbose=1, tensorboard_log=global_path + "tb") # Load if pretrained if PRETRAINED_MODEL: del model model = DQN.load(global_path + pretrained_model_name, policy=CustomPolicy, env=env)
return env env = create_env(n_envs) # Stop env processes to free memory if args.optimize_hyperparameters and n_envs > 1: env.close() # Parse noise string for DDPG and SAC if algo_ in ['ddpg', 'sac', 'td3' ] and hyperparams.get('noise_type') is not None: noise_type = hyperparams['noise_type'].strip() noise_std = hyperparams['noise_std'] n_actions = env.action_space.shape[0] if 'adaptive-param' in noise_type: assert algo_ == 'ddpg', 'Parameter is not supported by SAC' hyperparams['param_noise'] = AdaptiveParamNoiseSpec( initial_stddev=noise_std, desired_action_stddev=noise_std) elif 'normal' in noise_type: if 'lin' in noise_type: hyperparams['action_noise'] = LinearNormalActionNoise( mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions), final_sigma=hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions), max_steps=n_timesteps) else: hyperparams['action_noise'] = NormalActionNoise( mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)) elif 'ornstein-uhlenbeck' in noise_type: hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise( mean=np.zeros(n_actions),
# Create and wrap the environment env_id = 'UR5Gripper-v0' num_cpu = 4 # Number of processes to use env = gym.make('UR5Gripper-v0') # Create the vectorized environment # env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) env = Monitor(env, log_dir, allow_early_resets=True) # env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)]) # env = SubprocVecEnv([lambda: env]) env = DummyVecEnv([lambda: env]) # env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)]) # Add some param noise for exploration param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1) # Because we use parameter noise, we should use a MlpPolicy with layer normalization # model = DDPG(MlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir) # model = PPO2(MlpPolicy, env, verbose=1) # model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir) model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir) # Random Agent, before training mean_reward_before_train = evaluate(model, num_steps=1000) # Train the agent model.learn(total_timesteps=int(1e7), callback=callback) mean_reward_after_train = evaluate(model, num_steps=1000) obs = env.reset() for _ in range(1000):
import pytest

from stable_baselines import A2C, ACER, ACKTR, DeepQ, DDPG, PPO1, PPO2, TRPO
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

# Parameter-space exploration noise shared by the DDPG learn function below.
PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                          desired_action_stddev=float(0.2))

# Hyperparameters for learning identity for each RL model
# Maps an algorithm key to a callable that takes an environment `e`, trains
# the corresponding model on it for 1000 timesteps, and returns the result
# of `.learn()`.
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'deepq': lambda e: DeepQ(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e,
                           param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}
def main(args): envconfig_string = args.envconfig custom_envconfig = _preprocess_custom_envconfig( args.envconfig) if args.envconfig is not None else {} env_id = 'gym_auv:' + args.env env_name = env_id.split(':')[-1] if ':' in env_id else env_id envconfig = gym_auv.SCENARIOS[env_name][ 'config'] if env_name in gym_auv.SCENARIOS else {} envconfig.update(custom_envconfig) NUM_CPU = multiprocessing.cpu_count() EXPERIMENT_ID = str(int(time())) + args.algo.lower() model = { 'ppo': PPO2, 'ddpg': DDPG, 'td3': TD3, 'a2c': A2C, 'acer': ACER, 'acktr': ACKTR, 'sac': SAC, 'trpo': TRPO }[args.algo.lower()] if args.mode == 'play': agent = model.load(args.agent) if args.agent is not None else None envconfig_play = envconfig.copy() envconfig_play['show_indicators'] = True #envconfig_play['autocamera3d'] = False env = create_env(env_id, envconfig_play, test_mode=True, render_mode=args.render, pilot=args.pilot, verbose=True) print('Created environment instance') if args.scenario: env.load(args.scenario) vec_env = DummyVecEnv([lambda: env]) recorded_env = VecVideoRecorder( vec_env, args.video_dir, record_video_trigger=lambda x: x == 0, video_length=args.recording_length, name_prefix=(args.env if args.video_name == 'auto' else args.video_name)) print(args.video_dir, args.video_name) play_scenario(env, recorded_env, args, agent=agent) recorded_env.env.close() elif (args.mode == 'enjoy'): agent = model.load(args.agent) figure_folder = os.path.join(DIR_PATH, 'logs', 'enjoys', args.env, EXPERIMENT_ID) os.makedirs(figure_folder, exist_ok=True) scenario_folder = os.path.join(figure_folder, 'scenarios') os.makedirs(scenario_folder, exist_ok=True) video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID) os.makedirs(video_folder, exist_ok=True) env = create_env(env_id, envconfig, test_mode=True, render_mode=args.render, pilot=args.pilot) if args.scenario: env.load(args.scenario) vec_env = DummyVecEnv([lambda: env]) recorded_env = VecVideoRecorder( vec_env, video_folder, 
record_video_trigger=lambda x: x == 0, video_length=args.recording_length, name_prefix=(args.env if args.video_name == 'auto' else args.video_name)) obs = recorded_env.reset() state = None t_steps = 0 ep_number = 1 done = [False for _ in range(vec_env.num_envs)] for _ in range(args.recording_length): if args.recurrent: action, _states = agent.predict( observation=obs, state=state, mask=done, deterministic=not args.stochastic) state = _states else: action, _states = agent.predict( obs, deterministic=not args.stochastic) obs, reward, done, info = recorded_env.step(action) recorded_env.render() t_steps += 1 if t_steps % 800 == 0 or done: if not done: env.save_latest_episode(save_history=False) gym_auv.reporting.plot_trajectory( env, fig_dir=scenario_folder, fig_prefix=(args.env + '_ep{}_step{}'.format(ep_number, t_steps))) gym_auv.reporting.plot_trajectory( env, fig_dir=scenario_folder, fig_prefix=( args.env + '_ep{}_step{}_local'.format(ep_number, t_steps)), local=True) if done: ep_number += 1 recorded_env.close() elif (args.mode == 'train'): figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env, EXPERIMENT_ID) os.makedirs(figure_folder, exist_ok=True) scenario_folder = os.path.join(figure_folder, 'scenarios') os.makedirs(scenario_folder, exist_ok=True) video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID) recording_length = 8000 os.makedirs(video_folder, exist_ok=True) agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env, EXPERIMENT_ID) os.makedirs(agent_folder, exist_ok=True) tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard', args.env, EXPERIMENT_ID) tensorboard_port = 6006 if (args.nomp or model == DDPG or model == TD3 or model == SAC or model == TRPO): num_cpu = 1 vec_env = DummyVecEnv( [lambda: create_env(env_id, envconfig, pilot=args.pilot)]) else: num_cpu = NUM_CPU vec_env = SubprocVecEnv([ make_mp_env(env_id, i, envconfig, pilot=args.pilot) for i in range(num_cpu) ]) if (args.agent is not None): 
agent = model.load(args.agent) agent.set_env(vec_env) else: if (model == PPO2): if args.recurrent: hyperparams = { # 'n_steps': 1024, # 'nminibatches': 32, # 'lam': 0.95, # 'gamma': 0.99, # 'noptepochs': 10, # 'ent_coef': 0.0, # 'learning_rate': 0.0003, # 'cliprange': 0.2, 'n_steps': 1024, 'nminibatches': 1, 'lam': 0.98, 'gamma': 0.999, 'noptepochs': 4, 'ent_coef': 0.01, 'learning_rate': 2e-3, } class CustomLSTMPolicy(MlpLstmPolicy): def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, net_arch=[ 256, 256, 'lstm', dict(vf=[64], pi=[64]) ], **_kwargs) agent = PPO2(CustomLSTMPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams) else: hyperparams = { # 'n_steps': 1024, # 'nminibatches': 32, # 'lam': 0.95, # 'gamma': 0.99, # 'noptepochs': 10, # 'ent_coef': 0.0, # 'learning_rate': 0.0003, # 'cliprange': 0.2, 'n_steps': 1024, 'nminibatches': 32, 'lam': 0.98, 'gamma': 0.999, 'noptepochs': 4, 'ent_coef': 0.01, 'learning_rate': 2e-4, } #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64]) #policy_kwargs = dict(net_arch=[64, 64, 64]) layers = [256, 128, 64] #layers = [64, 64] policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)]) agent = PPO2(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams, policy_kwargs=policy_kwargs) #dataset = ExpertDataset(expert_path='gail_expert.npz', traj_limitation=1, batch_size=128) #print('Pretraining {} agent on "{}"'.format(args.algo.upper(), env_id)) #agent.pretrain(dataset, n_epochs=1000) #print('Done pretraining {} agent on "{}"'.format(args.algo.upper(), env_id)) elif (model == DDPG): # rl-baselines-zoo inspired: # hyperparams = { # 'memory_limit': 50000, # 'normalize_observations': True, # 'normalize_returns': False, # 'gamma': 0.98, # 'actor_lr': 0.00156, # 'critic_lr': 0.00156, # 'batch_size': 256, # 'param_noise': 
AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1) # } hyperparams = { 'memory_limit': 1000000, 'normalize_observations': True, 'normalize_returns': False, 'gamma': 0.98, 'actor_lr': 0.00156, 'critic_lr': 0.00156, 'batch_size': 256, 'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.287, desired_action_stddev=0.287) } agent = DDPG(LnMlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams) elif (model == TD3): # rl-baselines-zoo inspired: # hyperparams = { # 'batch_size': 256, # 'buffer_size': 50000, # 'learning_starts': 1000 # } hyperparams = { 'buffer_size': 1000000, 'train_freq': 1000, 'gradient_steps': 1000, 'learning_starts': 10000 } action_noise = NormalActionNoise(mean=np.zeros(2), sigma=0.1 * np.ones(2)) agent = TD3(stable_baselines.td3.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, action_noise=action_noise, **hyperparams) elif model == A2C: # rl-baselines-zoo inspired: # hyperparams = { # 'n_steps': 5, # 'gamma': 0.995, # 'ent_coef': 0.00001, # 'learning_rate': 0.00083, # 'lr_schedule': 'linear' # } # layers = [256, 128, 64] hyperparams = { 'n_steps': 16, 'gamma': 0.99, 'ent_coef': 0.001, 'learning_rate': 2e-4, 'lr_schedule': 'linear' } layers = [64, 64] policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)]) agent = A2C(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams, policy_kwargs=policy_kwargs) elif model == ACER: agent = ACER(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log) elif model == ACKTR: # rl-baselines-zoo inspired: # hyperparams = { # 'gamma': 0.99, # 'n_steps': 16, # 'ent_coef': 0.0, # 'learning_rate': 0.06, # 'lr_schedule': 'constant' # } # agent = ACKTR(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams) agent = ACKTR(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log) elif model == SAC: # rl-baselines-zoo inspired: # hyperparams = { # 'batch_size': 256, # 'learning_starts': 
1000 # } # agent = SAC(stable_baselines.sac.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams) agent = SAC(stable_baselines.sac.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log) elif model == TRPO: agent = TRPO(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log) print('Training {} agent on "{}"'.format(args.algo.upper(), env_id)) n_updates = 0 n_episodes = 0 def callback(_locals, _globals): nonlocal n_updates nonlocal n_episodes sys.stdout.write('Training update: {}\r'.format(n_updates)) sys.stdout.flush() _self = _locals['self'] vec_env = _self.get_env() class Struct(object): pass report_env = Struct() report_env.history = [] report_env.config = envconfig report_env.nsensors = report_env.config[ "n_sensors_per_sector"] * report_env.config["n_sectors"] report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1) report_env.last_episode = vec_env.get_attr('last_episode')[0] report_env.config = vec_env.get_attr('config')[0] report_env.obstacles = vec_env.get_attr('obstacles')[0] env_histories = vec_env.get_attr('history') for episode in range(max(map(len, env_histories))): for env_idx in range(len(env_histories)): if (episode < len(env_histories[env_idx])): report_env.history.append( env_histories[env_idx][episode]) report_env.episode = len(report_env.history) + 1 total_t_steps = _self.get_env().get_attr( 'total_t_steps')[0] * num_cpu agent_filepath = os.path.join(agent_folder, str(total_t_steps) + '.pkl') if model == PPO2: recording_criteria = n_updates % 10 == 0 report_criteria = True _self.save(agent_filepath) elif model == A2C or model == ACER or model == ACKTR or model == SAC or model == TRPO: save_criteria = n_updates % 100 == 0 recording_criteria = n_updates % 1000 == 0 report_criteria = True if save_criteria: _self.save(agent_filepath) elif model == DDPG or model == TD3: save_criteria = n_updates % 10000 == 0 recording_criteria = n_updates % 50000 == 0 report_criteria = report_env.episode > 
n_episodes if save_criteria: _self.save(agent_filepath) if report_env.last_episode is not None and len( report_env.history) > 0 and report_criteria: try: #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode)) gym_auv.reporting.report(report_env, report_dir=figure_folder) #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode))) except OSError as e: print("Ignoring reporting OSError:") print(repr(e)) if recording_criteria: if args.pilot: cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format( args.env, agent_filepath, video_folder, args.env + '-' + str(total_t_steps), recording_length, args.algo, args.pilot, envconfig_string, ' --recurrent' if args.recurrent else '') else: cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format( args.env, agent_filepath, video_folder, args.env + '-' + str(total_t_steps), recording_length, args.algo, envconfig_string, ' --recurrent' if args.recurrent else '') subprocess.Popen(cmd) n_episodes = report_env.episode n_updates += 1 agent.learn(total_timesteps=1500000, tb_log_name='log', callback=callback) elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']): figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env, EXPERIMENT_ID) os.makedirs(figure_folder, exist_ok=True) agent = PPO2.load(args.agent) if args.testvals: testvals = json.load(open(args.testvals, 'r')) valuegrid = list(ParameterGrid(testvals)) for valuedict in valuegrid: customconfig = envconfig.copy() customconfig.update(valuedict) env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot) valuedict_str = '_'.join( (key + '-' + str(val) for key, val in valuedict.items())) print('Running {} test for {}...'.format( args.mode, valuedict_str)) if args.mode == 
'policyplot': gym_auv.reporting.plot_actions(env, agent, fig_dir=figure_folder, fig_prefix=valuedict_str) elif args.mode == 'vectorfieldplot': gym_auv.reporting.plot_vector_field( env, agent, fig_dir=figure_folder, fig_prefix=valuedict_str) elif args.mode == 'streamlinesplot': gym_auv.reporting.plot_streamlines( env, agent, fig_dir=figure_folder, fig_prefix=valuedict_str) else: env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot) with open(os.path.join(figure_folder, 'config.json'), 'w') as f: json.dump(env.config, f) if args.mode == 'policyplot': gym_auv.reporting.plot_actions(env, agent, fig_dir=figure_folder) elif args.mode == 'vectorfieldplot': gym_auv.reporting.plot_vector_field(env, agent, fig_dir=figure_folder) elif args.mode == 'streamlinesplot': gym_auv.reporting.plot_streamlines(env, agent, fig_dir=figure_folder) print('Output folder: ', figure_folder) elif args.mode == 'test': figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env, EXPERIMENT_ID) scenario_folder = os.path.join(figure_folder, 'scenarios') video_folder = os.path.join(figure_folder, 'videos') os.makedirs(figure_folder, exist_ok=True) os.makedirs(scenario_folder, exist_ok=True) os.makedirs(video_folder, exist_ok=True) if not args.onlyplot: agent = model.load(args.agent) def create_test_env(video_name_prefix, envconfig=envconfig): print('Creating test environment: ' + env_id) env = create_env(env_id, envconfig, test_mode=True, render_mode=args.render if args.video else None, pilot=args.pilot) vec_env = DummyVecEnv([lambda: env]) if args.video: video_length = min(500, args.recording_length) recorded_env = VecVideoRecorder(vec_env, video_folder, record_video_trigger=lambda x: (x % video_length) == 0, video_length=video_length, name_prefix=video_name_prefix) active_env = recorded_env if args.video else vec_env return env, active_env failed_tests = [] def run_test(id, reset=True, report_dir=figure_folder, scenario=None, max_t_steps=None, env=None, active_env=None): 
nonlocal failed_tests if env is None or active_env is None: env, active_env = create_test_env(video_name_prefix=args.env + '_' + id) if scenario is not None: obs = active_env.reset() env.load(args.scenario) print('Loaded', args.scenario) else: if reset: obs = active_env.reset() else: obs = env.observe() gym_auv.reporting.plot_scenario(env, fig_dir=scenario_folder, fig_postfix=id, show=args.onlyplot) if args.onlyplot: return cumulative_reward = 0 t_steps = 0 if max_t_steps is None: done = False else: done = t_steps > max_t_steps while not done: action, _states = agent.predict( obs, deterministic=not args.stochastic) obs, reward, done, info = active_env.step(action) if args.video: active_env.render() t_steps += 1 cumulative_reward += reward[0] report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format( id, t_steps, cumulative_reward, info[0]['progress']) sys.stdout.write(report_msg) sys.stdout.flush() if args.save_snapshots and t_steps % 1000 == 0 and not done: env.save_latest_episode(save_history=False) for size in (20, 50, 100, 200, 300, 400, 500): gym_auv.reporting.plot_trajectory( env, fig_dir=scenario_folder, fig_prefix=(args.env + '_t_step_' + str(t_steps) + '_' + str(size) + '_' + id), local=True, size=size) elif done: gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id)) env.close() gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1) #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id)) #env.save(os.path.join(scenario_folder, id)) if env.collision: failed_tests.append(id) with open(os.path.join(figure_folder, 'failures.txt'), 'w') as f: f.write(', '.join(map(str, failed_tests))) return copy.deepcopy(env.last_episode) print('Testing scenario "{}" for {} episodes.\n '.format( args.env, args.episodes)) report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format( 'Episode', 'Timesteps', 'Cum. 
Reward', 'Progress', 'Collisions', 'CT-Error [m]', 'H-Error [deg]') print(report_msg_header) print('-' * len(report_msg_header)) if args.testvals: testvals = json.load(open(args.testvals, 'r')) valuegrid = list(ParameterGrid(testvals)) if args.scenario: if args.testvals: episode_dict = {} for valuedict in valuegrid: customconfig = envconfig.copy() customconfig.update(valuedict) env, active_env = create_test_env(envconfig=customconfig) valuedict_str = '_'.join( (key + '-' + str(val) for key, val in valuedict.items())) colorval = -np.log10( valuedict['reward_lambda']) #should be general rep_subfolder = os.path.join(figure_folder, valuedict_str) os.makedirs(rep_subfolder, exist_ok=True) for episode in range(args.episodes): last_episode = run_test(valuedict_str + '_ep' + str(episode), report_dir=rep_subfolder) episode_dict[valuedict_str] = [last_episode, colorval] print('Plotting all') gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_all_agents'), episode_dict=episode_dict) else: run_test("ep0", reset=True, scenario=args.scenario) else: if args.testvals: episode_dict = {} agent_index = 1 for valuedict in valuegrid: customconfig = envconfig.copy() customconfig.update(valuedict) env, active_env = create_test_env(envconfig=customconfig) valuedict_str = '_'.join( (key + '-' + str(val) for key, val in valuedict.items())) colorval = np.log10( valuedict['reward_lambda']) #should be general rep_subfolder = os.path.join(figure_folder, valuedict_str) os.makedirs(rep_subfolder, exist_ok=True) for episode in range(args.episodes): last_episode = run_test(valuedict_str + '_ep' + str(episode), report_dir=rep_subfolder) episode_dict['Agent ' + str(agent_index)] = [last_episode, colorval] agent_index += 1 gym_auv.reporting.plot_trajectory(env, fig_dir=figure_folder, fig_prefix=(args.env + '_all_agents'), episode_dict=episode_dict) else: env, active_env = create_test_env(video_name_prefix=args.env) for episode in range(args.episodes): 
run_test('ep' + str(episode), env=env, active_env=active_env) if args.video and active_env: active_env.close()