def parse(argv=None):
    """Build and parse the command-line options for the evaluation script.

    Parameters
    ----------
    argv : list[str] | None
        Argument strings to parse. ``None`` (the default) falls back to
        ``sys.argv[1:]``, preserving the original call-site behaviour;
        passing an explicit list makes the function usable from tests or
        other code without touching ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        The parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents")
    # Choices come from the project-level ALGOS registry.
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument(
        "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)"
    )
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)"
    )
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    # parse_args(None) reads sys.argv[1:], so existing callers are unaffected.
    args = parser.parse_args(argv)
    return args
def main():  # noqa: C901
    """Evaluate a trained agent; for WidowX reacher envs, additionally compute
    success-ratio and reach-time metrics at several distance thresholds
    (50 mm down to 0.5 mm).

    Side effects: optionally renders or live-plots the end effector and goal
    (``--plot-dim``), writes one CSV per episode when ``--log-info`` is set,
    and writes a ``stats.csv`` summary into the experiment folder.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1")
    parser.add_argument("-f", "--log-folder", help="Log folder", type=str, default="rl-trained-agents")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-eval-steps", help="Number of evaluation timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    # NOTE: unlike the sibling scripts in this file, render/deterministic are
    # int (0/1) flags here, not store_true switches.
    parser.add_argument('--render', help="1: Render environment, 0: don't render", type=int, choices=[0, 1], default=0)
    parser.add_argument('--deterministic', help="1: Use deterministic actions, 0: Use stochastic actions", type=int, choices=[0, 1], default=0)
    parser.add_argument("--load-best", action="store_true", default=False, help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions (for DDPG/DQN/SAC)")
    parser.add_argument("--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument("--gym-packages", type=str, nargs="+", default=[], help="Additional external Gym environemnt package modules to import (e.g. gym_minigrid)")
    parser.add_argument("--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor")
    parser.add_argument('--log-info', help="1: Log information at each evaluation steps and save, 0: don't log", type=int, choices=[0, 1], default=0)
    parser.add_argument("--plot-dim", help="Plot end effector and goal position in real time (0: Don't plot, 2: 2D (default), 3: 3D)", type=int, default=0, choices=[0, 2, 3])
    args = parser.parse_args()

    #################################
    # Prepare log if needed
    if args.log_info:
        log_df = pd.DataFrame()
        log_dict = OrderedDict()

    # Prepare plot if needed
    if args.plot_dim == 2:
        # Two stacked panels: x-z and y-z projections of the workspace.
        fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5, 10))
    elif args.plot_dim == 3:
        fig = plt.figure()
        # NOTE(review): fig.gca(projection='3d') was removed in Matplotlib 3.6;
        # newer versions need add_subplot(projection='3d') — confirm pinned version.
        ax = fig.gca(projection='3d')

    # Going through custom gym packages to let them register
    # in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.log_folder

    # exp-id 0 means "use the latest run for this env/algo".
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # Locate the saved model: default <env_id>.zip, optionally overridden by
    # --load-best or --load-checkpoint (the last override wins).
    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["dqn", "ddpg", "sac", "her", "td3", "tqc"]

    # Off-policy algorithms are evaluated with a single environment.
    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:  # pytype: disable=module-attr
            # UnsafeLoader: args.yml is trusted local output of the training run.
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=args.render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    # Precedence: args.deterministic or (algo in off_policy_algos and not args.stochastic)
    deterministic = args.deterministic or algo in off_policy_algos and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    successes = []  # For HER, monitor success rate
    episode_nb = 0

    # Distance-to-goal thresholds, in metres (suffix = millimetres, 05 = 0.5 mm).
    success_threshold_50 = 0.05
    success_threshold_20 = 0.02
    success_threshold_10 = 0.01
    success_threshold_5 = 0.005
    success_threshold_2 = 0.002
    success_threshold_1 = 0.001
    success_threshold_05 = 0.0005
    # Per-episode success flags at each threshold (one entry per step).
    ep_success_list_50 = []
    ep_success_list_20 = []
    ep_success_list_10 = []
    ep_success_list_5 = []
    ep_success_list_2 = []
    ep_success_list_1 = []
    ep_success_list_05 = []
    # Per-run success flags (one entry per finished episode).
    success_list_50 = []
    success_list_20 = []
    success_list_10 = []
    success_list_5 = []
    success_list_2 = []
    success_list_1 = []
    success_list_05 = []

    # Moved render flag outside the loop (Pierre)
    if args.render:
        env.render("human")

    for t in range(args.n_eval_steps):
        action, state = model.predict(obs, state=state, deterministic=deterministic)
        obs, reward, done, infos = env.step(action)

        # Slow down simulation when rendering (Pierre)
        if args.render:
            if "widowx" in env_id:
                # WidowX envs render via the GUI already; just throttle to ~30 FPS.
                time.sleep(1. / 30.)
            else:
                env.render()

        if "widowx" in env_id:
            # Update episode success list
            ep_success_list_50 = calc_ep_success(success_threshold_50, ep_success_list_50, infos)
            ep_success_list_20 = calc_ep_success(success_threshold_20, ep_success_list_20, infos)
            ep_success_list_10 = calc_ep_success(success_threshold_10, ep_success_list_10, infos)
            ep_success_list_5 = calc_ep_success(success_threshold_5, ep_success_list_5, infos)
            ep_success_list_2 = calc_ep_success(success_threshold_2, ep_success_list_2, infos)
            ep_success_list_1 = calc_ep_success(success_threshold_1, ep_success_list_1, infos)
            ep_success_list_05 = calc_ep_success(success_threshold_05, ep_success_list_05, infos)

        episode_reward += reward[0]
        ep_len += 1

        # Real time plot
        if args.plot_dim == 2:
            goal = infos[0]['goal_position']
            tip = infos[0]['tip_position']
            # Top panel: x-z projection with threshold circles around the goal.
            ax1.cla()
            ax1.plot(goal[0], goal[2], marker='o', color='g', linestyle='', markersize=10, label="goal", alpha=0.5)
            ax1.plot(tip[0], tip[2], marker='x', color='r', linestyle='', markersize=10, label="end effector", mew=3)
            circ_1_50 = plt.Circle((goal[0], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--', label="50 mm")
            circ_1_20 = plt.Circle((goal[0], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--', label="20 mm")
            circ_1_10 = plt.Circle((goal[0], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--', label="10 mm")
            circ_1_5 = plt.Circle((goal[0], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--', label="5 mm")
            ax1.add_patch(circ_1_50)
            ax1.add_patch(circ_1_20)
            ax1.add_patch(circ_1_10)
            ax1.add_patch(circ_1_5)
            ax1.set_xlim([-0.25, 0.25])
            ax1.set_ylim([0, 0.5])
            ax1.set_xlabel("x (m)", fontsize=15)
            ax1.set_ylabel("z (m)", fontsize=15)
            # Bottom panel: y-z projection (no labels; legend lives on ax1).
            ax2.cla()
            ax2.plot(goal[1], goal[2], marker='o', color='g', linestyle='', markersize=10, alpha=0.5)
            ax2.plot(tip[1], tip[2], marker='x', color='r', linestyle='', markersize=10, mew=3)
            circ_2_50 = plt.Circle((goal[1], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--')
            circ_2_20 = plt.Circle((goal[1], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--')
            circ_2_10 = plt.Circle((goal[1], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--')
            circ_2_5 = plt.Circle((goal[1], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--')
            ax2.add_patch(circ_2_50)
            ax2.add_patch(circ_2_20)
            ax2.add_patch(circ_2_10)
            ax2.add_patch(circ_2_5)
            ax2.set_xlim([-0.25, 0.25])
            ax2.set_ylim([0, 0.5])
            ax2.set_xlabel("y (m)", fontsize=15)
            ax2.set_ylabel("z (m)", fontsize=15)
            ax1.legend(loc='upper left', bbox_to_anchor=(0, 1.2), ncol=3, fancybox=True, shadow=True)
            fig.suptitle("timestep " + str(ep_len) + " | distance to target: " + str(round(infos[0]['new_distance'] * 1000, 1)) + " mm")
            plt.pause(0.01)
            # plt.show()
        elif args.plot_dim == 3:
            goal = infos[0]['goal_position']
            tip = infos[0]['tip_position']
            ax.cla()
            ax.plot([goal[0]], [goal[1]], zs=[goal[2]], marker='o', color='g', linestyle='', markersize=10, alpha=0.5)
            ax.plot([tip[0]], [tip[1]], zs=[tip[2]], marker='x', color='r', linestyle='', markersize=10, mew=3)
            ax.set_xlim([-0.2, 0.2])
            ax.set_ylim([-0.2, 0.2])
            ax.set_zlim([0, 0.5])
            ax.set_xlabel("x (m)", fontsize=15)
            ax.set_ylabel("y (m)", fontsize=15)
            ax.set_zlabel("z (m)", fontsize=15)
            fig.suptitle("timestep " + str(ep_len) + " | distance to target: " + str(round(infos[0]['new_distance'] * 1000, 1)) + " mm")
            plt.pause(0.01)
            # plt.show()

        if args.log_info:
            # One row per step: actions, joint states, limits, rewards, positions.
            log_dict['episode'] = episode_nb
            log_dict['timestep'] = t
            log_dict['action_1'] = action[0][0]
            log_dict['action_2'] = action[0][1]
            log_dict['action_3'] = action[0][2]
            log_dict['action_4'] = action[0][3]
            log_dict['action_5'] = action[0][4]
            log_dict['action_6'] = action[0][5]
            log_dict['old_joint_pos_1'] = infos[0]['old_joint_pos'][0]
            log_dict['old_joint_pos_2'] = infos[0]['old_joint_pos'][1]
            log_dict['old_joint_pos_3'] = infos[0]['old_joint_pos'][2]
            log_dict['old_joint_pos_4'] = infos[0]['old_joint_pos'][3]
            log_dict['old_joint_pos_5'] = infos[0]['old_joint_pos'][4]
            log_dict['old_joint_pos_6'] = infos[0]['old_joint_pos'][5]
            log_dict['new_joint_pos_1'] = infos[0]['new_joint_pos'][0]
            log_dict['new_joint_pos_2'] = infos[0]['new_joint_pos'][1]
            log_dict['new_joint_pos_3'] = infos[0]['new_joint_pos'][2]
            log_dict['new_joint_pos_4'] = infos[0]['new_joint_pos'][3]
            log_dict['new_joint_pos_5'] = infos[0]['new_joint_pos'][4]
            log_dict['new_joint_pos_6'] = infos[0]['new_joint_pos'][5]
            log_dict['joint_vel_1'] = infos[0]['joint_vel'][0]
            log_dict['joint_vel_2'] = infos[0]['joint_vel'][1]
            log_dict['joint_vel_3'] = infos[0]['joint_vel'][2]
            log_dict['joint_vel_4'] = infos[0]['joint_vel'][3]
            log_dict['joint_vel_5'] = infos[0]['joint_vel'][4]
            log_dict['joint_vel_6'] = infos[0]['joint_vel'][5]
            # Hard-coded joint limits in radians (gripper joint 6 in metres?)
            # NOTE(review): presumably the WidowX URDF limits — confirm.
            log_dict['joint1_min'] = -3.1
            log_dict['joint1_max'] = 3.1
            log_dict['joint2_min'] = -1.571
            log_dict['joint2_max'] = 1.571
            log_dict['joint3_min'] = -1.571
            log_dict['joint3_max'] = 1.571
            log_dict['joint4_min'] = -1.745
            log_dict['joint4_max'] = 1.745
            log_dict['joint5_min'] = -2.617
            log_dict['joint5_max'] = 2.617
            log_dict['joint6_min'] = 0.003
            log_dict['joint6_max'] = 0.03
            log_dict['action_low1'] = env.action_space.low[0]
            log_dict['action_low2'] = env.action_space.low[1]
            log_dict['action_low3'] = env.action_space.low[2]
            log_dict['action_low4'] = env.action_space.low[3]
            log_dict['action_low5'] = env.action_space.low[4]
            log_dict['action_low6'] = env.action_space.low[5]
            log_dict['action_high1'] = env.action_space.high[0]
            log_dict['action_high2'] = env.action_space.high[1]
            log_dict['action_high3'] = env.action_space.high[2]
            log_dict['action_high4'] = env.action_space.high[3]
            log_dict['action_high5'] = env.action_space.high[4]
            log_dict['action_high6'] = env.action_space.high[5]
            log_dict['reward'] = reward[0]
            log_dict['return'] = episode_reward
            log_dict['new_distance'] = infos[0]['new_distance']
            log_dict['old_distance'] = infos[0]['old_distance']
            log_dict['target_x'] = infos[0]['goal_position'][0]
            log_dict['target_y'] = infos[0]['goal_position'][1]
            log_dict['target_z'] = infos[0]['goal_position'][2]
            log_dict['tip_y'] = infos[0]['tip_position'][1]
            log_dict['tip_x'] = infos[0]['tip_position'][0]
            log_dict['tip_z'] = infos[0]['tip_position'][2]
            log_dict['done'] = done[0]
            # log_dict['obs'] = obs
            # log_dict['obs_space_low'] = env.observation_space.low
            # log_dict['obs_space_high'] = env.observation_space.high
            # NOTE(review): DataFrame.append is deprecated/removed in pandas >= 2.0.
            log_df = log_df.append(log_dict, ignore_index=True)

        if args.n_envs == 1:
            # NOTE(review): episode statistics are only accumulated when
            # verbose > 0 — with --verbose 0 the stats.csv aggregates below
            # operate on empty lists. Confirm this gating is intentional.
            if done and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                # print(f"Episode Reward: {episode_reward:.2f}")  # commented by Pierre
                # print("Episode Length", ep_len)  # commented by Pierre
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_nb += 1

                if "widowx" in env_id:
                    # append the last element of the episode success list when
                    # episode is done
                    success_list_50 = calc_success_list(ep_success_list_50, success_list_50)
                    success_list_20 = calc_success_list(ep_success_list_20, success_list_20)
                    success_list_10 = calc_success_list(ep_success_list_10, success_list_10)
                    success_list_5 = calc_success_list(ep_success_list_5, success_list_5)
                    success_list_2 = calc_success_list(ep_success_list_2, success_list_2)
                    success_list_1 = calc_success_list(ep_success_list_1, success_list_1)
                    success_list_05 = calc_success_list(ep_success_list_05, success_list_05)

                    # If the episode is successful and it starts from an
                    # unsucessful step, calculate reach time
                    # NOTE(review): reachtime_list_* are only bound here, inside
                    # the done-branch; if no episode finishes (or verbose == 0)
                    # the aggregation after the loop raises NameError — confirm.
                    reachtime_list_50 = calc_reach_time(ep_success_list_50)
                    reachtime_list_20 = calc_reach_time(ep_success_list_20)
                    reachtime_list_10 = calc_reach_time(ep_success_list_10)
                    reachtime_list_5 = calc_reach_time(ep_success_list_5)
                    reachtime_list_2 = calc_reach_time(ep_success_list_2)
                    reachtime_list_1 = calc_reach_time(ep_success_list_1)
                    reachtime_list_05 = calc_reach_time(ep_success_list_05)

                if args.log_info:
                    log_df = log_df[log_dict.keys()]  # sort columns
                    # add estimated tip velocity and acceleration (according to
                    # the documentation, 1 timestep = 240 Hz)
                    log_df['est_vel'] = log_df['new_distance'].diff() * 240
                    log_df['est_vel'].loc[0] = 0  # initial velocity is 0
                    log_df['est_acc'] = log_df['est_vel'].diff() * 240
                    log_df['est_acc'].loc[0] = 0  # initial acceleration is 0
                    log_df.to_csv(log_path + "/res_episode_" + str(episode_nb) + ".csv", index=False)  # slow
                    # log_df.to_pickle(log_path+"/res_episode_"+str(episode)+".pkl")  # fast

                # Reset for the new episode
                episode_reward = 0.0
                ep_len = 0
                state = None
                ep_success_list_50 = []
                ep_success_list_20 = []
                ep_success_list_10 = []
                ep_success_list_5 = []
                ep_success_list_2 = []
                ep_success_list_1 = []
                ep_success_list_05 = []

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get("is_success") is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get("is_success", False))
                # Alternatively, you can add a check to wait for the end of the
                # episode
                if done:
                    obs = env.reset()
                if infos[0].get("is_success") is not None:
                    successes.append(infos[0].get("is_success", False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

    if "widowx" in env_id:
        # Aggregate per-threshold success ratio and mean reach time.
        SR_mean_50, RT_mean_50 = calc_mean_successratio_reachtime(success_threshold_50, success_list_50, reachtime_list_50)
        SR_mean_20, RT_mean_20 = calc_mean_successratio_reachtime(success_threshold_20, success_list_20, reachtime_list_20)
        SR_mean_10, RT_mean_10 = calc_mean_successratio_reachtime(success_threshold_10, success_list_10, reachtime_list_10)
        SR_mean_5, RT_mean_5 = calc_mean_successratio_reachtime(success_threshold_5, success_list_5, reachtime_list_5)
        SR_mean_2, RT_mean_2 = calc_mean_successratio_reachtime(success_threshold_2, success_list_2, reachtime_list_2)
        SR_mean_1, RT_mean_1 = calc_mean_successratio_reachtime(success_threshold_1, success_list_1, reachtime_list_1)
        SR_mean_05, RT_mean_05 = calc_mean_successratio_reachtime(success_threshold_05, success_list_05, reachtime_list_05)

        # log metrics to stats.csv
        d = {
            "Eval mean reward": np.mean(episode_rewards),
            "Eval std": np.std(episode_rewards),
            "success ratio 50mm": SR_mean_50,
            "Average reach time 50mm": RT_mean_50,
            "success ratio 20mm": SR_mean_20,
            "Average reach time 20mm": RT_mean_20,
            "success ratio 10mm": SR_mean_10,
            "Average reach time 10mm": RT_mean_10,
            "success ratio 5mm": SR_mean_5,
            "Average reach time 5mm": RT_mean_5,
            "success ratio 2mm": SR_mean_2,
            "Average reach time 2mm": RT_mean_2,
            "success ratio 1mm": SR_mean_1,
            "Average reach time 1mm": RT_mean_1,
            "success ratio 0.5mm": SR_mean_05,
            "Average reach time 0.5mm": RT_mean_05
        }
        # print("path:", log_path)
        df = pd.DataFrame(d, index=[0])
        df.to_csv(log_path + "/stats.csv", index=False)

    # Workaround for https://github.com/openai/gym/issues/893
    if args.render:
        if args.n_envs == 1 and "Bullet" not in env_id and isinstance(env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
def main():
    """Collect expert trajectories from a pre-trained agent.

    Loads the agent saved under ``--folder``/``--algo``, rebuilds its test
    environment (restoring saved VecNormalize stats when present) and hands
    everything to ``runner``, which records ``--n-episodes`` episodes under
    a per-algorithm save directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1')
    parser.add_argument('-f', '--folder', help='Log folder', type=str, default='trained_agents')
    parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-episodes', help='number of episodes to collect', default=20, type=int)
    parser.add_argument('--n-envs', help='number of environments', default=1, type=int)
    parser.add_argument('--exp-id', help='Experiment ID (default: -1, no exp folder, 0: latest)', default=-1, type=int)
    parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int)
    parser.add_argument('--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)')
    # --deterministic / --no-deterministic form a boolean on/off switch,
    # defaulting to deterministic actions.
    parser.add_argument('--deterministic', dest='deterministic', action='store_true')
    parser.add_argument('--no-deterministic', dest='deterministic', action='store_false')
    parser.set_defaults(deterministic=True)
    parser.add_argument('--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
    parser.add_argument('--reward-log', help='Where to log reward', default='', type=str)
    parser.add_argument('--gym-packages', type=str, nargs='+', default=[], help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)')
    args = parser.parse_args()

    # Import user-supplied gym packages so their environments self-register.
    for extra_package in args.gym_packages:
        importlib.import_module(extra_package)

    env_id, algo, folder = args.env, args.algo, args.folder

    # An exp-id of 0 means "pick the most recent run for this env/algo pair".
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Positive ids live in their own numbered sub-folder; otherwise use the
    # algorithm folder directly.
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)
    assert os.path.isdir(log_path), "The {} folder was not found".format(log_path)

    # Recover the training-time hyperparameters (incl. normalization stats).
    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = args.reward_log if args.reward_log != '' else None

    # Off-policy algorithms are rolled out with a single environment.
    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        is_atari=is_atari,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
    )

    model = ALGOS[algo].load(find_saved_model(algo, log_path, env_id), env=env)

    deterministic = args.deterministic

    # Trajectories are grouped per algorithm under a fixed top-level folder.
    save_dir = os.path.join("expert_trajs_by_info_deterministic_with_std", algo)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    runner(env, env_id, model, args.n_episodes, deterministic, save=True, save_dir=save_dir)
def main():  # noqa: C901
    """Evaluate a trained agent while recording rendered frames, then save a
    GIF (``dVRL_10ep.gif``) plus per-episode rewards/successes as CSV files
    in the current working directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=300, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument("--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument("--load-best", action="store_true", default=False, help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument("--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument("--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registory
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    # exp-id 0 means "use the latest run for this env/algo".
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # Locate the saved model: default <env_id>.zip, optionally overridden by
    # --load-best or --load-checkpoint (the last override wins).
    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    # Off-policy algorithms are evaluated with a single environment.
    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            # UnsafeLoader: args.yml is trusted local output of the training run.
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # The following 3 variables are used to create GIFS (the images stacked to
    # create a GIF are those acquired by the vision sensor)
    images = []
    # NOTE(review): second reset right after the one above — looks redundant;
    # confirm it is not needed to sync the vision sensor.
    obs = env.reset()
    img = env.render(mode='rgb')

    # Deterministic by default except for atari games
    # Precedence: args.stochastic or (is_atari and not args.deterministic)
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    successes = []

    for _ in range(args.n_timesteps):
        # Capture the frame from *before* this step, then advance the env.
        images.append(img)
        action, state = model.predict(obs, state=state, deterministic=deterministic)
        obs, reward, done, infos = env.step(action)
        img = model.env.render(mode='rgb')
        if not args.no_render:
            env.render("human")

        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(episode_reward))
                print("Episode Length", ep_len)
                state = None
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0

            # Reset also when the goal is achieved when using HER
            if done:
                print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                successes.append(infos[0].get('is_success', False))
                episode_reward = 0.0
                ep_len = 0

    # Creation of a GIF and saving of rewards and successes in a csv file :
    # Every other frame is kept to halve the GIF size.
    # NOTE(review): `savetxt` is called unqualified — presumably numpy's
    # savetxt imported at module level; confirm the import.
    imageio.mimsave(
        'dVRL_10ep.gif',
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=5)
    savetxt('ep_rewards.csv', episode_rewards, delimiter=',')
    savetxt('ep_success.csv', successes, delimiter=',')

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
d = {'macro_ratio': macro_probs, 'return': episode_rewards, 'success': successes} for key in d: f.write(key + '\n') for v in d[key]: f.write(str(v) + ' ') f.write('\n\n') if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, nargs='+', default=["CartPole-v1"], help='environment ID(s)') parser.add_argument('--domain', type=str, default="", help='domain name for DM-control suite') parser.add_argument('--task', type=str, default="", help='task name for DM-control suite') parser.add_argument('-tb', '--tensorboard-log', help='Tensorboard log dir', default='', type=str) parser.add_argument('-i', '--trained-agent', help='Path to a pretrained agent to continue training', default='', type=str) parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument('-n', '--n-timesteps', help='Overwrite the number of timesteps', default=-1, type=int) parser.add_argument('--log-interval', help='Override log interval (default: -1, no change)', default=-1, type=int) parser.add_argument('-f', '--log-folder', help='Log folder', type=str, default='logs') parser.add_argument('--seed', help='Random generator seed', type=int, default=-1) parser.add_argument('--n-trials', help='Number of trials for optimizing hyperparameters', type=int, default=10) parser.add_argument('-optimize', '--optimize-hyperparameters', action='store_true', default=False, help='Run hyperparameters search') parser.add_argument('--n-jobs', help='Number of parallel jobs when optimizing hyperparameters', type=int, default=1) parser.add_argument('--sampler', help='Sampler to use when optimizing hyperparameters', type=str, default='tpe', choices=['random', 'tpe', 'skopt']) parser.add_argument('--pruner', help='Pruner to use when optimizing hyperparameters', type=str, default='median', choices=['halving', 'median', 'none']) parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', 
default=1,
tensorboard_log="tensorboard/", n_cpu_tf_sess=None, **hyperparams) model.learn(total_timesteps=time_steps, callback=callbacks, tb_log_name=full_tag, log_interval=10) model.save(current_dir + "/" + full_tag + "_final") if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-a', '--algo', help='RL Algorithm', default='DQN', type=lambda x: str(x).upper(), required=False, choices=list(ALGOS.keys())) parser.add_argument('-t', '--tag', help='Name of configuration tag used for algorithm parameters ' ' Default: TUNED', default='TUNED', choices=['CLEAN', 'TUNED', 'NOISE'], type=lambda x: str(x).upper(), required=False) parser.add_argument('-s', '--suf', help='Suffix added for nametag of trained model', default='', type=str, required=False) parser.add_argument('-l', '--lockRot', help='Should lock rotation of targeted object Default: True', required=False, default=True, type=lambda x: (str(x).lower() == 'true'), choices=[True, False]) parser.add_argument('--saveFreq', help='Save checkpoint model every n steps (if negative, no checkpoint)', default=25000, type=int) parser.add_argument('--evalNum', help='Number of episodes to use for evaluation', default=25, type=int) parser.add_argument('--evalFreq', help='Evaluate the model every n steps (if negative, no evaluation', default=50000, type=int) args = parser.parse_args()
def main():  # noqa: C901
    """Run a trained stable-baselines3 agent in its environment.

    Parses CLI options, locates the saved model inside the log folder
    (latest experiment, best model, or a specific checkpoint), rebuilds the
    evaluation VecEnv, then steps the policy for ``--n-timesteps`` steps,
    printing per-episode reward/length statistics (and HER success rate).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1')
    parser.add_argument('-f', '--folder', help='Log folder', type=str, default='rl-trained-agents')
    parser.add_argument('--algo', help='RL Algorithm', default='ppo', type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000, type=int)
    parser.add_argument(
        '--num-threads', help='Number of threads for PyTorch (-1 to use default)', default=-1, type=int)
    parser.add_argument('--n-envs', help='number of environments', default=1, type=int)
    parser.add_argument(
        '--exp-id', help='Experiment ID (default: 0: latest, -1: no exp folder)', default=0, type=int)
    parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int)
    parser.add_argument(
        '--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic', action='store_true', default=False, help='Use deterministic actions')
    parser.add_argument(
        '--load-best', action='store_true', default=False, help='Load best model instead of last model if available')
    parser.add_argument(
        '--load-checkpoint', type=int,
        help='Load checkpoint instead of last model if available, '
        'you must pass the number of timesteps corresponding to it')
    parser.add_argument('--stochastic', action='store_true', default=False, help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
    parser.add_argument('--reward-log', help='Where to log reward', default='', type=str)
    parser.add_argument(
        '--gym-packages', type=str, nargs='+', default=[],
        help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)')
    parser.add_argument(
        '--env-kwargs', type=str, nargs='+', action=StoreDict,
        help='Optional keyword argument to pass to the env constructor')
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    # exp-id 0 means "use the most recent run folder for this env/algo"
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # Search for the saved model; --load-best / --load-checkpoint override the default path
    found = False
    for ext in ['zip']:
        model_path = os.path.join(log_path, f'{env_id}.{ext}')
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    # Off-policy algorithms are evaluated with a single environment
    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id, n_envs=args.n_envs, stats_path=stats_path,
                          seed=args.seed, log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams, env_kwargs=env_kwargs)

    model = ALGOS[algo].load(model_path, env=env)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    # NOTE: `or` binds looser than `and`, so --deterministic alone is enough,
    # and --stochastic disables the forced-deterministic behavior for these algos
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, state = model.predict(obs, state=state, deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low, env.action_space.high)
        obs, reward, done, infos = env.step(action)
        if not args.no_render:
            env.render('human')

        # reward is a VecEnv batch; index 0 is the single evaluated env
        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print(f"Episode Reward: {episode_reward:.2f}")
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0
                state = None

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get('is_success') is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                if infos[0].get('is_success') is not None:
                    successes.append(infos[0].get('is_success', False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))

    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(
            np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if (args.n_envs == 1 and 'Bullet' not in env_id and not is_atari
                and isinstance(env, VecEnv)):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
parser.add_argument('-tb', '--tensorboard-log', help='Tensorboard log dir', default='', type=str) parser.add_argument('-i', '--trained-agent', help='Path to a pretrained agent to continue training', default='', type=str) parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument('-n', '--n-timesteps', help='Overwrite the number of timesteps', default=-1, type=int) parser.add_argument('--log-interval', help='Override log interval (default: -1, no change)', default=-1, type=int) parser.add_argument('-f', '--log-folder', help='Log folder', type=str, default='logs') parser.add_argument('--seed',
def main():
    """Run a trained stable-baselines (v2) agent in its environment.

    Parses CLI options, finds the saved model via ``find_saved_model``,
    rebuilds the evaluation VecEnv, then steps the policy for
    ``--n-timesteps`` steps, printing per-episode reward statistics
    (and the HER success rate when applicable).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1')
    parser.add_argument('-f', '--folder', help='Log folder', type=str, default='trained_agents')
    parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000, type=int)
    parser.add_argument('--n-envs', help='number of environments', default=1, type=int)
    parser.add_argument(
        '--exp-id', help='Experiment ID (default: -1, no exp folder, 0: latest)', default=-1, type=int)
    parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int)
    parser.add_argument(
        '--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic', action='store_true', default=False, help='Use deterministic actions')
    parser.add_argument('--stochastic', action='store_true', default=False, help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
    parser.add_argument('--reward-log', help='Where to log reward', default='', type=str)
    parser.add_argument(
        '--gym-packages', type=str, nargs='+', default=[],
        help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)')
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    # exp-id 0 means "use the most recent run folder for this env/algo"
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), "The {} folder was not found".format(
        log_path)

    model_path = find_saved_model(algo, log_path, env_id)

    # Off-policy algorithms are evaluated with a single environment
    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari,
                          stats_path=stats_path, seed=args.seed,
                          log_dir=log_dir, should_render=not args.no_render,
                          hyperparams=hyperparams)

    # ACER raises errors because the environment passed must have
    # the same number of environments as the model was trained on.
    load_env = None if algo == 'acer' else env
    model = ALGOS[algo].load(model_path, env=load_env)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    # NOTE: `or` binds looser than `and`, so --deterministic alone is enough,
    # and --stochastic disables the forced-deterministic behavior for these algos
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    episode_reward = 0.0
    episode_rewards = []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, _ = model.predict(obs, deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low, env.action_space.high)
        obs, reward, done, infos = env.step(action)
        if not args.no_render:
            env.render('human')

        # reward is a VecEnv batch; index 0 is the single evaluated env
        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(
                        episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(episode_reward))
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_reward = 0.0
                ep_len = 0

            # Reset also when the goal is achieved when using HER
            if done or infos[0].get('is_success', False):
                if args.algo == 'her' and args.verbose > 1:
                    print("Success?", infos[0].get('is_success', False))
                # Alternatively, you can add a check to wait for the end of the episode
                # if done:
                obs = env.reset()
                if args.algo == 'her':
                    successes.append(infos[0].get('is_success', False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f}".format(np.mean(episode_rewards)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
def main():
    """Probe a trained agent's per-action feature counts on random start states.

    Loads a trained agent, then for ``num_samples`` random start states rolls
    out once per initial action (following the loaded policy afterwards),
    accumulates feature counts via ``indicator_feature``, and records the
    pairwise feature-count differences in ``halfspaces``.
    Experimental / exploratory code — see inline TODOs.
    """
    seed = 0
    num_samples = 20
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1')
    parser.add_argument('-f', '--folder', help='Log folder', type=str, default='rl-baselines-zoo/trained_agents')
    parser.add_argument('--algo', help='RL Algorithm', default='dqn', type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=2000, type=int)
    parser.add_argument('--n-envs', help='number of environments', default=1, type=int)
    parser.add_argument(
        '--exp-id', help='Experiment ID (default: -1, no exp folder, 0: latest)', default=-1, type=int)
    parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int)
    parser.add_argument(
        '--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic', action='store_true', default=False, help='Use deterministic actions')
    parser.add_argument('--stochastic', action='store_true', default=False, help='Use stochastic actions (for DDPG/DQN/SAC)')
    parser.add_argument(
        '--load-best', action='store_true', default=False, help='Load best model instead of last model if available')
    parser.add_argument(
        '--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)')
    args = parser.parse_args()

    env_id = args.env
    algo = args.algo
    folder = args.folder

    # exp-id 0 means "use the most recent run folder for this env/algo"
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print('Loading latest experiment, id={}'.format(args.exp_id))

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id))
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), "The {} folder was not found".format(
        log_path)

    model_path = find_saved_model(algo, log_path, env_id, load_best=args.load_best)

    # Off-policy algorithms are evaluated with a single environment
    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
        args.n_envs = 1

    # NOTE: uses the local `seed` constant, not a CLI argument
    set_global_seeds(seed)

    is_atari = 'NoFrameskip' in env_id

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    log_dir = None
    env_kwargs = {}
    env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari,
                          stats_path=stats_path, seed=seed, log_dir=log_dir,
                          should_render=not args.no_render,
                          hyperparams=hyperparams, env_kwargs=env_kwargs)

    # ACER raises errors because the environment passed must have
    # the same number of environments as the model was trained on.
    load_env = None if algo == 'acer' else env
    model = ALGOS[algo].load(model_path, env=load_env)

    # NOTE(review): the carefully-built test env above is discarded here and
    # replaced by a raw CartPole-v1; the custom `reset(uniform=...)` /
    # `reset(start_state=...)` calls below suggest a patched CartPole — confirm
    env = gym.make('CartPole-v1')

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    deterministic = args.deterministic or algo in [
        'dqn', 'ddpg', 'sac', 'her', 'td3'
    ] and not args.stochastic

    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    state = None

    embedder = indicator_feature
    halfspaces = {}

    for i in range(num_samples):
        print("+" * 10)
        #sample random state to start in
        #TODO: maybe reset with random actions? How to make it realistic? Does it matter. Let's just try random for now to test weird edge cases.
        obs = env.reset(uniform=True)  #sample more uniformly than typical
        print("start state", obs)
        # input()
        #obs = env.reset_state(env.observation_space.sample())
        #rollout once for each action and compute feature counts
        start_state = obs.copy()
        fcount_vectors = []
        init_actions = []
        ##rollout code:
        for init_action in range(env.action_space.n):
            print("ACTION", init_action)
            # restart the episode from the same sampled state for each action
            obs = env.reset(start_state=start_state)
            print("init state", obs)
            env.render()
            # input()
            ep_ret = 0
            fcounts = embedder(start_state)
            #do initial action
            obs, r, done, info = env.step(init_action)  # take a random action
            fcounts += embedder(obs)
            #TODO: discount??
            ep_ret += r
            #print(r, obs)
            if done:
                print("final state", obs)
                print("return", ep_ret)
                print("fcounts", fcounts)
                fcount_vectors.append(fcounts)
                init_actions.append(init_action)
                continue
            #run tester policy thereafter
            while True:
                #env.render()
                #TODO: sample within allowable range of angle and position
                action, state = model.predict(obs, state=state, deterministic=deterministic)
                # Random Agent
                # action = [env.action_space.sample()]
                # Clip Action to avoid out of bound errors
                if isinstance(env.action_space, gym.spaces.Box):
                    action = np.clip(action, env.action_space.low,
                                     env.action_space.high)
                #a = env.action_space.sample()
                #print(obs, action)
                obs, r, done, info = env.step(action)  # take a random action
                fcounts += embedder(obs)
                #print(obs)
                #print(done)
                ep_ret += r
                #print(r, obs)
                if done:
                    print("final state", obs)
                    print("return", ep_ret)
                    print("fcounts", fcounts)
                    fcount_vectors.append(fcounts)
                    init_actions.append(init_action)
                    break

        print("action {} over {} => fcount diff = {}".format(
            init_actions[0], init_actions[1],
            fcount_vectors[0] - fcount_vectors[1]))
        # NOTE(review): keying on `state` assumes it is hashable (None for
        # feed-forward policies; a recurrent state array would raise) — confirm
        halfspaces[state, init_actions[0], init_actions[1]] = fcount_vectors[0] - fcount_vectors[1]
        # input()

    #TODO: put this inside one of the value alignment verification classes to get sa_fcount_diffs and hopefully reuse that code
    #then visualize test cases
    # input()

    # for _ in range(args.n_timesteps):
    #     action, state = model.predict(obs, state=state, deterministic=deterministic)
    #     # Random Agent
    #     # action = [env.action_space.sample()]
    #     # Clip Action to avoid out of bound errors
    #     if isinstance(env.action_space, gym.spaces.Box):
    #         action = np.clip(action, env.action_space.low, env.action_space.high)
    #     obs, reward, done, infos = env.step(action)
    #     if not args.no_render:
    #         env.render('human')
    #     episode_reward += reward
    #     ep_len += 1
    #     if args.n_envs == 1:
    #         # For atari the return reward is not the atari score
    #         # so we have to get it from the infos dict
    #         if is_atari and infos is not None and args.verbose >= 1:
    #             episode_infos = infos.get('episode')
    #             if episode_infos is not None:
    #                 print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
    #                 print("Atari Episode Length", episode_infos['l'])
    #         if done and not is_atari and args.verbose > 0:
    #             # NOTE: for env using VecNormalize, the mean reward
    #             # is a normalized reward when `--norm_reward` flag is passed
    #             print("Episode Reward: {:.2f}".format(episode_reward))
    #             print("Episode Length", ep_len)
    #             state = None
    #             episode_rewards.append(episode_reward)
    #             episode_lengths.append(ep_len)
    #             episode_reward = 0.0
    #             ep_len = 0
    #         # Reset also when the goal is achieved when using HER
    #         if done or infos.get('is_success', False):
    #             if args.algo == 'her' and args.verbose > 1:
    #                 print("Success?", infos[0].get('is_success', False))
    #             # Alternatively, you can add a check to wait for the end of the episode
    #             # if done:
    #             obs = env.reset()
    #             if args.algo == 'her':
    #                 successes.append(infos[0].get('is_success', False))
    #                 episode_reward, ep_len = 0.0, 0
    # if args.verbose > 0 and len(successes) > 0:
    #     print("Success rate: {:.2f}%".format(100 * np.mean(successes)))
    # if args.verbose > 0 and len(episode_rewards) > 0:
    #     print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))
    # if args.verbose > 0 and len(episode_lengths) > 0:
    #     print("Mean episode length: {:.2f} +/- {:.2f}".format(np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
def main():
    """Run a trained stable-baselines (v2, pickle-format) agent.

    Loads ``<folder>/<algo>/<env_id>.pkl``, rebuilds the evaluation VecEnv
    (with saved VecNormalize stats if the companion stats folder exists),
    then steps the policy for ``--n-timesteps`` steps, printing per-episode
    reward/length when verbose.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1')
    parser.add_argument('-f', '--folder', help='Log folder', type=str, default='trained_agents')
    parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000, type=int)
    parser.add_argument('--n-envs', help='number of environments', default=1, type=int)
    parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int)
    parser.add_argument(
        '--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)')
    parser.add_argument('--deterministic', action='store_true', default=False, help='Use deterministic actions')
    parser.add_argument(
        '--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)')
    parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
    parser.add_argument('--reward-log', help='Where to log reward', default='', type=str)
    args = parser.parse_args()

    env_id = args.env
    algo = args.algo
    folder = args.folder
    model_path = "{}/{}/{}.pkl".format(folder, algo, env_id)

    # Sanity checks
    assert os.path.isdir(folder + '/' + algo), "The {}/{}/ folder was not found".format(
        folder, algo)
    assert os.path.isfile(
        model_path), "No model found for {} on {}, path: {}".format(
            algo, env_id, model_path)

    # Off-policy algorithms are evaluated with a single environment
    if algo in ['dqn', 'ddpg']:
        args.n_envs = 1

    set_global_seeds(args.seed)

    is_atari = 'NoFrameskip' in env_id

    # VecNormalize/stats folder is optional for this older layout
    stats_path = "{}/{}/{}/".format(folder, algo, env_id)
    if not os.path.isdir(stats_path):
        stats_path = None

    log_dir = args.reward_log if args.reward_log != '' else None

    env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari,
                          stats_path=stats_path, norm_reward=args.norm_reward,
                          seed=args.seed, log_dir=log_dir,
                          should_render=not args.no_render)

    model = ALGOS[algo].load(model_path)

    obs = env.reset()

    # Force deterministic for DQN and DDPG
    deterministic = args.deterministic or algo in ['dqn', 'ddpg']

    running_reward = 0.0
    ep_len = 0
    for _ in range(args.n_timesteps):
        action, _ = model.predict(obs, deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low, env.action_space.high)
        obs, reward, done, infos = env.step(action)
        if not args.no_render:
            env.render('human')
        # reward is a VecEnv batch; index 0 is the single evaluated env
        running_reward += reward[0]
        ep_len += 1
        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get('episode')
                if episode_infos is not None:
                    print("Atari Episode Score: {:.2f}".format(
                        episode_infos['r']))
                    print("Atari Episode Length", episode_infos['l'])
            if done and not is_atari and args.verbose >= 1:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print("Episode Reward: {:.2f}".format(running_reward))
                print("Episode Length", ep_len)
                running_reward = 0.0
                ep_len = 0

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and not 'Bullet' in env_id and not is_atari:
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecNormalize) or isinstance(
                    env, VecFrameStack):
                env = env.venv
            env.envs[0].env.close()
        else:
            # SubprocVecEnv
            env.close()
def main():  # noqa: C901
    """Run a trained agent while logging step/episode info to TensorBoard and XLSX.

    Parses CLI options, locates the saved model (latest experiment, best
    model, or checkpoint), rebuilds the evaluation env (restoring saved
    ``env_kwargs`` from args.yml), loads the model (patching schedule
    callables for Python 3.8+), then steps the policy for ``--n-timesteps``
    steps, recording info values via ``logger`` and ``Xlsx_Logger`` every
    ``--info-freq`` steps and at episode ends. KeyboardInterrupt stops the
    loop cleanly and still flushes the loggers.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-tb", "--tensorboard-log", help="Tensorboard log dir", default="", type=str)
    parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument(
        "--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    # parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    # NOTE: exp-id is a string here (unlike the int version above) so it can
    # be compared against '0'/'-1' below after being replaced by the latest id
    parser.add_argument(
        "--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default='0', type=str)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument(
        "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)")
    parser.add_argument(
        "--render-mode", default='step', help="Whether to render at each step or at the end of an episode")
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument(
        "--load-best", action="store_true", default=False, help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint", type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--info-freq", help="Frequency on which info valuers are logged", type=int, default=10)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages", type=str, nargs="+", default=[],
        help="Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs", type=str, nargs="+", action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    # exp-id '0' means "use the most recent run folder for this env/algo"
    if args.exp_id == '0':
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id != '0' and args.exp_id != '-1':
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # Search for the saved model; --load-best / --load-checkpoint override the default path
    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")
    else:
        print(f"Loading model for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    # Off-policy algorithms are evaluated with a single environment
    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        # Schedules saved under older pickles can't be unpickled on 3.8+;
        # replace them with dummies (not needed for inference)
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs)

    # tb_path = ''
    # for i in range(0,100000,1):
    #     tb_path = os.path.join(args.tensorboard_log, env_id, algo.upper() + "_" + str(i))
    #     if not os.path.exists(tb_path):
    #         break
    # print("algo=",algo, " logdir=", tb_path)
    # writer = SummaryWriter(log_dir=tb_path)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    ep_count = 0
    # For HER, monitor success rate
    successes = []

    sbcommon_utils.configure_logger(args.verbose, os.path.join(args.tensorboard_log, env_id), algo.upper(), reset_num_timesteps=True)
    xlsx_logpath = os.path.join(
        args.tensorboard_log,
        env_id) if logger.get_dir() is None else logger.get_dir()
    xlsx_logger = Xlsx_Logger(xlsx_logpath, env_id)
    # Persist the evaluation arguments next to the logs for reproducibility
    with open(os.path.join(xlsx_logpath, 'args.yaml'), 'w') as file:
        yaml.dump(args, file)

    fig: plt.Figure = None
    info_freq = args.info_freq
    try:
        for step in range(args.n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, infos = env.step(action)

            # reward is a VecEnv batch; index 0 is the single evaluated env
            episode_reward += reward[0]
            ep_len += 1

            if args.n_envs == 1:
                # log info variables to tensorboard
                if (step % info_freq == 0 or done) and type(infos[0]) is dict:
                    if not args.no_render:
                        if not done and args.render_mode == 'step':
                            fig = env.render("human")
                        elif done and args.render_mode == 'episode':
                            # NOTE(review): relies on the wrapped env exposing
                            # `rendered_episode` — custom env attribute, confirm
                            fig = env.envs[0].rendered_episode
                    xlsx_logger.set_step_ep(ep_count, step)
                    for key in infos[0]:
                        # skip bookkeeping keys that are not scalar metrics
                        if key == 'episode' or key == 'terminal_observation' or key == 'render':
                            continue
                        val = infos[0].get(key)
                        logger.record("eval/" + key, val, exclude='stdout')
                        xlsx_logger.log(key, val)
                    if fig is not None:
                        log_fig = logger.Figure(fig, False)
                        logger.record("eval/figure", log_fig, exclude='stdout')
                        # writer.add_scalar("eval/"+key, val, step)
                    logger.dump(step=step)

                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print("Episode #{}, step#{}".format(ep_count, step))
                    print(f" Episode Reward: {episode_reward:.2f}")
                    print(" Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    logger.record("eval/ep_len", ep_len, exclude='stdout')
                    logger.record("eval/ep_reward", episode_reward, exclude='stdout')
                    xlsx_logger.log('ep_len', ep_len)
                    xlsx_logger.log('reward', episode_reward)
                    logger.dump(step=step)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    ep_count += 1
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))
                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0
                        ep_count += 1

            # if (not args.no_render) and args.render_mode=='step':
            #     fig = env.render("human")
            # else:
            #     fig = None
    except KeyboardInterrupt:
        # Allow stopping evaluation early; loggers are still flushed below
        pass

    logger.dump(step=step)
    xlsx_logger.close()

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    env.close()
import argparse import os import ur5e_env from stable_baselines3.common.vec_env import DummyVecEnv, VecEnvWrapper, VecVideoRecorder from utils import ALGOS, create_test_env, get_latest_run_id, get_saved_hyperparams parser = argparse.ArgumentParser() parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1") parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents") parser.add_argument("-o", "--output-folder", help="Output folder", type=str, default="logs/videos/") parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int) parser.add_argument("--n-envs", help="number of environments", default=1, type=int) parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions") parser.add_argument("--seed", help="Random generator seed", type=int, default=0) parser.add_argument("--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)") parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int) args = parser.parse_args() env_id = args.env algo = args.algo folder = args.folder video_folder = args.output_folder seed = args.seed deterministic = args.deterministic video_length = args.n_timesteps n_envs = args.n_envs if args.exp_id == 0:
# if env_.val_reachability < avg_val_reachability: if env_.val_reachability < val_median_reachability: print("Improved the model at episode {}!".format(episode_)) self_.save(OUTPUT_PATH + "val_model_episode_{}".format(episode_), cloudpickle=True) env_.val_reachability = val_median_reachability n_steps += 1 return True if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, nargs='+', default=["gym_sacrum_nav:sacrum_nav-v2"], help='environment ID(s)') parser.add_argument('-tb', '--tensorboard-log', help='Tensorboard log dir', default='./runs/', type=str) parser.add_argument('-i', '--trained-agent', help='Path to a pretrained agent to continue training', default='', type=str) parser.add_argument('--algo', help='RL Algorithm', default='dqn', type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument('-n', '--n-timesteps', help='Overwrite the number of timesteps', default=-1, type=int) parser.add_argument('--log-interval', help='Override log interval (default: -1, no change)', default=-1, type=int) parser.add_argument('-f', '--log-folder', help='Log folder', type=str, default='logs') parser.add_argument('--data-folder', help='Data folder', type=str, default='./data/') parser.add_argument('--seed', help='Random generator seed', type=int, default=0) parser.add_argument('--n-trials', help='Number of trials for optimizing hyperparameters', type=int, default=10) parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO, 2: debug)', default=1, type=int) parser.add_argument('--gym-packages', type=str, nargs='+', default=[], help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)') args = parser.parse_args() env_params['data_path'] = args.data_folder env_ids = args.env set_global_seeds(args.seed)
def main():  # noqa: C901
    """Evaluate a trained agent on one morphology from a walker2d dataset.

    Parses CLI options, loads the body selected by ``--body-id`` from
    ``--dataset``, rebuilds the evaluation environment with the saved
    hyperparameters, loads the model and runs it for ``--n-timesteps`` steps
    while printing per-episode statistics (reward, length, HER success rate).
    Intended as the script entry point; returns nothing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="Walker2DBulletEnv-v0")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument("--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)")
    # NOTE(review): `store_true` with default=True means args.deterministic is
    # always True — passing --deterministic has no effect. Confirm intent.
    parser.add_argument("--deterministic", action="store_true", default=True, help="Use deterministic actions")
    parser.add_argument("--load-best", action="store_true", default=False, help="Load best model instead of last model if available")
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions (for DDPG/DQN/SAC)")
    parser.add_argument("--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environemnt package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs", type=str, nargs="+", action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    # ===
    # parser.add_argument("--load-checkpoint", type=str, help="pass the path of zip file corresponding to it")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents")
    parser.add_argument("--dataset", type=str, default="dataset/walker2d_v6")
    parser.add_argument("--body-id", type=int, default=0)
    args = parser.parse_args()

    # Load the morphology dataset; only the train split is used
    # (train_proportion=1, no shuffling, fixed seed).
    dataset_name, env_id, train_files, train_params, train_names, test_files, test_params, test_names = load_dataset.load_dataset(
        args.dataset, seed=0, shuffle=False, train_proportion=1)

    # Going through custom gym packages to let them register in the global registory
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    # env_id = args.env
    algo = args.algo
    log_path = args.folder
    # if args.exp_id == 0:
    #     args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
    #     print(f"Loading latest experiment, id={args.exp_id}")
    # # Sanity checks
    # if args.exp_id > 0:
    #     log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    # else:
    #     log_path = os.path.join(folder, algo)
    # assert os.path.isdir(log_path), f"The {log_path} folder was not found"
    # found = False
    # for ext in ["zip"]:
    #     model_path = os.path.join(log_path, f"{env_id}.{ext}")
    #     found = os.path.isfile(model_path)
    #     if found:
    #         break
    # if args.load_best:
    #     model_path = os.path.join(log_path, "best_model.zip")
    #     found = os.path.isfile(model_path)
    # if args.load_checkpoint is not None:
    #     model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
    #     found = os.path.isfile(model_path)
    # if not found:
    #     raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    # NOTE(review): the --load-checkpoint argument is commented out above, so
    # `args.load_checkpoint` will raise AttributeError when this runs —
    # confirm how the model path is meant to be supplied.
    model_path = args.load_checkpoint

    # Off-policy algorithms are evaluated with a single environment.
    if algo in ["dqn", "ddpg", "sac", "td3", "tqc"]:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = "NoFrameskip" in env_id

    # Recover normalization statistics / hyperparameters saved at train time.
    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    # env_kwargs = {}
    # args_path = os.path.join(log_path, env_id, "args.yml")
    # if os.path.isfile(args_path):
    #     with open(args_path, "r") as f:
    #         loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
    #         if loaded_args["env_kwargs"] is not None:
    #             env_kwargs = loaded_args["env_kwargs"]
    # # overwrite with command line arguments
    # if args.env_kwargs is not None:
    #     env_kwargs.update(args.env_kwargs)

    # Env kwargs come from the selected dataset body instead of args.yml.
    args.watch_eval = True
    env_kwargs = {
        "xml": train_files[args.body_id],
        "param": train_params[args.body_id],
        "render": args.watch_eval,
    }

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in ["dqn", "ddpg", "sac", "her", "td3", "tqc"]:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    model = ALGOS[algo].load(model_path, env=env, **kwargs)

    obs = env.reset()

    # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around)
    # Precedence note: `or` binds looser than `and`, so this reads
    # deterministic OR (off-policy AND not stochastic).
    deterministic = args.deterministic or algo in [
        "dqn", "ddpg", "sac", "her", "td3", "tqc"
    ] and not args.stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    for _ in range(args.n_timesteps):
        action, state = model.predict(obs, state=state, deterministic=deterministic)
        # Random Agent
        # action = [env.action_space.sample()]
        # Clip Action to avoid out of bound errors
        if isinstance(env.action_space, gym.spaces.Box):
            action = np.clip(action, env.action_space.low, env.action_space.high)
        obs, reward, done, infos = env.step(action)
        # Slow the loop slightly so on-screen rendering stays watchable.
        sleep(0.01)
        if not args.no_render:
            env.render("human")
        episode_reward += reward[0]
        ep_len += 1

        if args.n_envs == 1:
            # For atari the return reward is not the atari score
            # so we have to get it from the infos dict
            if is_atari and infos is not None and args.verbose >= 1:
                episode_infos = infos[0].get("episode")
                if episode_infos is not None:
                    print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                    print("Atari Episode Length", episode_infos["l"])

            if done and not is_atari and args.verbose > 0:
                # NOTE: for env using VecNormalize, the mean reward
                # is a normalized reward when `--norm_reward` flag is passed
                print(f"Episode Reward: {episode_reward:.2f}")
                print("Episode Length", ep_len)
                episode_rewards.append(episode_reward)
                episode_lengths.append(ep_len)
                episode_reward = 0.0
                ep_len = 0
                state = None

            # Reset also when the goal is achieved when using HER
            if done and infos[0].get("is_success") is not None:
                if args.verbose > 1:
                    print("Success?", infos[0].get("is_success", False))
                # Alternatively, you can add a check to wait for the end of the episode
                if done:
                    obs = env.reset()
                if infos[0].get("is_success") is not None:
                    successes.append(infos[0].get("is_success", False))
                    episode_reward, ep_len = 0.0, 0

    if args.verbose > 0 and len(successes) > 0:
        print("Success rate: {:.2f}%".format(100 * np.mean(successes)))

    if args.verbose > 0 and len(episode_rewards) > 0:
        print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards)))

    if args.verbose > 0 and len(episode_lengths) > 0:
        print("Mean episode length: {:.2f} +/- {:.2f}".format(
            np.mean(episode_lengths), np.std(episode_lengths)))

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(
                env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
def main():  # noqa: C901
    """Enjoy script for a coating-control environment.

    Loads a trained agent from an experiment folder (latest run, best model,
    or a specific checkpoint), runs it for ``--n-timesteps`` steps, prints
    episode statistics, then plots reward/observation traces with matplotlib.
    Intended as the script entry point; returns nothing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="CartPole-v1")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="rl-trained-agents")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument("--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)")
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument("--load-best", action="store_true", default=False, help="Load best model instead of last model if available")
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument("--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs", type=str, nargs="+", action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registory
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    # exp-id 0 means "use the most recent run folder".
    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    # Resolve the model file: last model by default, optionally the best
    # model or a specific training checkpoint.
    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(
            log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(
            f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    # Off-policy algorithms are evaluated with a single environment.
    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    # Recover normalization statistics / hyperparameters saved at train time.
    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(
        stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None
    print(env_kwargs)
    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        # Schedules cannot be un-pickled across versions; replace them with
        # dummies (unused at inference time).
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = ALGOS[algo].load(model_path, env=env, custom_objects=custom_objects, **kwargs)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []

    # Live plot of evaluation traces.
    plt.figure(f"Enjoy {env_id}")
    plt.title(f"{env_id}", fontsize=14)
    plt.xlabel(f"Timesteps", fontsize=14)
    # plt.ylabel("Score", fontsize=14)
    observations = []
    rewards = []
    # NOTE(review): `infos` here is a list of per-step "coating" values and
    # shadows the usual env info dicts; the `infos[0].get(...)` calls below
    # would then operate on a coating value, not a dict — confirm intent.
    infos = []
    try:
        for _ in range(args.n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, info = env.step(action)
            if not args.no_render:
                env.render("human")
            episode_reward += reward[0]
            ep_len += 1
            observations.append(obs)
            rewards.append(reward)
            infos.append(info[0].get("coating"))

            if args.n_envs == 1:
                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print(f"Episode Reward: {episode_reward:.2f}")
                    print("Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))
                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0
    except KeyboardInterrupt:
        pass

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(
            f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}"
        )

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(
            f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}"
        )

    env.close()

    # Aggregate rewards: `gesamt` sums only positive step rewards,
    # `gesamt_mit` sums all of them ("mit" = including negatives).
    gesamt = 0
    gesamt_mit = 0
    for el in rewards:
        if (el > 0):
            gesamt += el
        gesamt_mit += el
    print(f"Gesamt reward: {gesamt}")
    print(f"Gesamt reward mit: {gesamt_mit}")

    # Plot traces. The constants (202, 8, 700) presumably de-normalize the
    # observation channels back to physical units — verify against the env.
    plt.plot(np.arange(len(observations)), rewards, label="reward", linewidth=1)
    plt.plot(np.arange(len(observations)), [obs[0][3] * 202 + 8 for obs in observations], label="coating_dist", linewidth=1)
    plt.plot(np.arange(len(observations)), [obs[0][1] * 202 + 8 for obs in observations], label="coating_targets", linewidth=1)
    plt.plot(np.arange(len(observations)), infos, label="coating_real", linewidth=1)
    plt.plot(np.arange(len(observations)), [obs[0][4] * 700 for obs in observations], label="pressure", linewidth=1)
    plt.legend()
    plt.show()
def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``type=bool`` is a classic argparse pitfall: any non-empty string —
    including ``"False"`` — is truthy, so the option could never be turned
    off from the command line. This converter accepts the usual spellings
    and rejects everything else.

    :param value: raw CLI token (an actual bool is returned unchanged)
    :return: the parsed boolean
    :raises argparse.ArgumentTypeError: if the token is not a recognized boolean
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def get_train_args():
    """Build and parse the command-line arguments for training.

    The option set (names, defaults, choices) is unchanged from before; the
    only behavioral fix is that ``--truncate-last-trajectory`` now parses
    ``"False"``/``"0"`` as ``False`` instead of coercing every non-empty
    string to ``True`` (the ``type=bool`` bug).

    :return: the parsed ``argparse.Namespace``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("--env", type=str, default="CartPole-v1", help="environment ID")
    parser.add_argument("--hyperparameters", type=str, default="Walker2DBulletEnv-v0", help="")
    parser.add_argument("-tb", "--tensorboard-log", help="Tensorboard log dir", default="tb", type=str)
    parser.add_argument(
        "--truncate-last-trajectory",
        help="When using HER with online sampling the last trajectory in the replay buffer will be truncated after reloading the replay buffer.",
        default=True,
        # Fix: was `type=bool`, which turned ANY non-empty string (e.g. "False") into True.
        type=_str2bool,
    )
    parser.add_argument("-n", "--n-timesteps", help="Overwrite the number of timesteps", default=-1, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=1, type=int)
    parser.add_argument("--log-interval", help="Override log interval (default: -1, no change)", default=-1, type=int)
    parser.add_argument("--eval-freq", help="Evaluate the agent every n steps (if negative, no evaluation)", default=10000, type=int)
    parser.add_argument("--eval-episodes", help="Number of episodes to use for evaluation", default=5, type=int)
    parser.add_argument("--save-freq", help="Save the model every n steps (if negative, no checkpoint)", default=-1, type=int)
    parser.add_argument("--save-replay-buffer", help="Save the replay buffer too (when applicable)", action="store_true", default=False)
    parser.add_argument("-f", "--log-folder", help="Log folder", type=str, default="logs")
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--n-trials", help="Number of trials for optimizing hyperparameters", type=int, default=10)
    parser.add_argument("-optimize", "--optimize-hyperparameters", action="store_true", default=False, help="Run hyperparameters search")
    parser.add_argument("--n-jobs", help="Number of parallel jobs when optimizing hyperparameters", type=int, default=1)
    parser.add_argument(
        "--sampler",
        help="Sampler to use when optimizing hyperparameters",
        type=str,
        default="tpe",
        choices=["random", "tpe", "skopt"],
    )
    parser.add_argument(
        "--pruner",
        help="Pruner to use when optimizing hyperparameters",
        type=str,
        default="median",
        choices=["halving", "median", "none"],
    )
    parser.add_argument("--n-startup-trials", help="Number of trials before using optuna sampler", type=int, default=10)
    parser.add_argument("--n-evaluations", help="Number of evaluations for hyperparameter optimization", type=int, default=20)
    parser.add_argument("--storage", help="Database storage path if distributed optimization should be used", type=str, default=None)
    parser.add_argument("--study-name", help="Study name for distributed optimization", type=str, default=None)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs", type=str, nargs="+", action=StoreDict,
        help="Optional keyword argument to pass to the env constructor")
    parser.add_argument(
        "-params", "--hyperparams", type=str, nargs="+", action=StoreDict,
        help="Overwrite hyperparameter (e.g. learning_rate:0.01 train_freq:10)")
    parser.add_argument("--watch-train", action="store_true", default=False)
    parser.add_argument("--watch-eval", action="store_true", default=False)
    parser.add_argument("--powercoeff", type=float, nargs=3, default=[1., 1., 1.], help="Only useful for adjusting powercoeff. Default is [1 1 1].")
    parser.add_argument("--single-idx", type=int, default=-1, help="The body id that will be trained.")
    parser.add_argument("--single-group", type=int, default=-1, help="The group of body that will be trained.")
    parser.add_argument("--dataset", type=str, default="dataset/walker2d_v6", help="Path to dataset")
    return parser.parse_args()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1') parser.add_argument('-f', '--folder', help='Log folder', type=str, default='trained_agents') parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000, type=int) parser.add_argument('--n-envs', help='number of environments', default=1, type=int) parser.add_argument('--exp-id', help='Experiment ID (default: -1, no exp folder, 0: latest)', default=-1, type=int) parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int) parser.add_argument('--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)') parser.add_argument('--deterministic', action='store_true', default=False, help='Use deterministic actions') parser.add_argument('--stochastic', action='store_true', default=False, help='Use stochastic actions (for DDPG/DQN/SAC)') parser.add_argument('--load-best', action='store_true', default=False, help='Load best model instead of last model if available') parser.add_argument('--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)') parser.add_argument('--seed', help='Random generator seed', type=int, default=0) parser.add_argument('--reward-log', help='Where to log reward', default='', type=str) parser.add_argument('--gym-packages', type=str, nargs='+', default=[], help='Additional external Gym environemnt package modules to import (e.g. 
gym_minigrid)') parser.add_argument('--render-pybullet', help='Slow down Pybullet simulation to render', default=False) # added by Pierre parser.add_argument('--random-pol', help='Random policy', default=False) # added by Pierre args = parser.parse_args() plot_bool = True plot_dim = 2 log_bool = False if plot_bool: if plot_dim == 2: fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5, 10)) elif plot_dim == 3: fig = plt.figure() ax = fig.gca(projection='3d') if log_bool: output_df = pd.DataFrame() # Going through custom gym packages to let them register in the global registory for env_module in args.gym_packages: importlib.import_module(env_module) env_id = args.env algo = args.algo folder = args.folder if args.exp_id == 0: args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id) print('Loading latest experiment, id={}'.format(args.exp_id)) # Sanity checks if args.exp_id > 0: log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id)) else: log_path = os.path.join(folder, algo) assert os.path.isdir(log_path), "The {} folder was not found".format(log_path) if not args.random_pol: # added by Pierre model_path = find_saved_model(algo, log_path, env_id, load_best=args.load_best) if algo in ['dqn', 'ddpg', 'sac', 'td3']: args.n_envs = 1 set_global_seeds(args.seed) is_atari = 'NoFrameskip' in env_id stats_path = os.path.join(log_path, env_id) hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True) log_dir = args.reward_log if args.reward_log != '' else None env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari, stats_path=stats_path, seed=args.seed, log_dir=log_dir, should_render=not args.no_render, hyperparams=hyperparams) # ACER raises errors because the environment passed must have # the same number of environments as the model was trained on. 
load_env = None if algo == 'acer' else env if not args.random_pol: # added by Pierre model = ALGOS[algo].load(model_path, env=load_env) # if not args.no_render: # env.render(mode="human") # added by Pierre (to work with ReachingJaco-v1) obs = env.reset() # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around) deterministic = args.deterministic or algo in ['dqn', 'ddpg', 'sac', 'her', 'td3'] and not args.stochastic episode_reward = 0.0 episode_rewards, episode_lengths = [], [] ep_len = 0 episode = 0 # success_threshold_001 = 0.01 # success_list_001, reachtime_list_001, episode_success_list_001 = [], [], [] # success_threshold_0002 = 0.002 # success_list_0002, reachtime_list_0002, episode_success_list_0002 = [], [], [] # success_threshold_0001 = 0.001 # success_list_0001, reachtime_list_0001, episode_success_list_0001 = [], [], [] # success_threshold_00005 = 0.0005 # success_list_00005, reachtime_list_00005, episode_success_list_00005 = [], [], [] # changed for the paper success_threshold_50 = 0.05 success_list_50, reachtime_list_50, episode_success_list_50 = [], [], [] success_threshold_20 = 0.02 success_list_20, reachtime_list_20, episode_success_list_20 = [], [], [] success_threshold_10 = 0.01 success_list_10, reachtime_list_10, episode_success_list_10 = [], [], [] success_threshold_5 = 0.005 success_list_5, reachtime_list_5, episode_success_list_5 = [], [], [] # For HER, monitor success rate successes = [] state = None for _ in range(args.n_timesteps): if args.random_pol: # Random Agent action = [env.action_space.sample()] else: action, state = model.predict(obs, state=state, deterministic=deterministic) # Clip Action to avoid out of bound errors if isinstance(env.action_space, gym.spaces.Box): action = np.clip(action, env.action_space.low, env.action_space.high) obs, reward, done, infos = env.step(action) if args.render_pybullet: time.sleep(1./30.) 
# added by Pierre (slow down Pybullet for rendering) if infos[0]['total_distance'] <= success_threshold_50: episode_success_list_50.append(1) else: episode_success_list_50.append(0) if infos[0]['total_distance'] <= success_threshold_20: episode_success_list_20.append(1) else: episode_success_list_20.append(0) if infos[0]['total_distance'] <= success_threshold_10: episode_success_list_10.append(1) else: episode_success_list_10.append(0) if infos[0]['total_distance'] <= success_threshold_5: episode_success_list_5.append(1) else: episode_success_list_5.append(0) if plot_bool: goal = infos[0]['goal position'] tip = infos[0]['tip position'] if plot_dim == 2: ax1.cla() ax1.plot(goal[0], goal[2], marker='x', color='b', linestyle='', markersize=10, label="goal", mew=3) ax1.plot(tip[0], tip[2], marker='o', color='r', linestyle='', markersize=10, label="end effector") circ_1_50 = plt.Circle((goal[0], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--', label="50 mm") circ_1_20 = plt.Circle((goal[0], goal[2]), radius=success_threshold_20, edgecolor='b', facecolor='w', linestyle='--', label="20 mm") circ_1_10 = plt.Circle((goal[0], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--', label="10 mm") circ_1_5 = plt.Circle((goal[0], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--', label="5 mm") ax1.add_patch(circ_1_50) ax1.add_patch(circ_1_20) ax1.add_patch(circ_1_10) ax1.add_patch(circ_1_5) ax1.set_xlim([-0.25, 0.25]) ax1.set_ylim([0, 0.5]) ax1.set_xlabel("x (m)", fontsize=15) ax1.set_ylabel("z (m)", fontsize=15) ax2.cla() ax2.plot(goal[1], goal[2], marker='x', color='b', linestyle='', markersize=10, mew=3) ax2.plot(tip[1], tip[2], marker='o', color='r', linestyle='', markersize=10) circ_2_50 = plt.Circle((goal[1], goal[2]), radius=success_threshold_50, edgecolor='g', facecolor='w', linestyle='--') circ_2_20 = plt.Circle((goal[1], goal[2]), radius=success_threshold_20, 
edgecolor='b', facecolor='w', linestyle='--') circ_2_10 = plt.Circle((goal[1], goal[2]), radius=success_threshold_10, edgecolor='m', facecolor='w', linestyle='--') circ_2_5 = plt.Circle((goal[1], goal[2]), radius=success_threshold_5, edgecolor='r', facecolor='w', linestyle='--') ax2.add_patch(circ_2_50) ax2.add_patch(circ_2_20) ax2.add_patch(circ_2_10) ax2.add_patch(circ_2_5) ax2.set_xlim([-0.25, 0.25]) ax2.set_ylim([0, 0.5]) ax2.set_xlabel("y (m)", fontsize=15) ax2.set_ylabel("z (m)", fontsize=15) ax1.legend(loc='upper left', bbox_to_anchor=(0, 1.2), ncol=3, fancybox=True, shadow=True) elif plot_dim == 3: ax.cla() ax.plot([tip[0]], [tip[1]], zs=[tip[2]], marker='x', color='b') ax.plot([goal[0]], [goal[1]], zs=[goal[2]], marker='o', color='r', linestyle="None") ax.set_xlim([-0.2, 0.2]) ax.set_ylim([-0.2, 0.2]) ax.set_zlim([0, 0.5]) ax.set_xlabel("x (m)", fontsize=15) ax.set_ylabel("y (m)", fontsize=15) ax.set_zlabel("z (m)", fontsize=15) fig.suptitle("timestep "+str(ep_len)+" | distance to target: "+str(round(infos[0]['total_distance']*1000, 1))+" mm") plt.pause(0.01) # plt.show() if log_bool: dict_log = infos[0] dict_log['action'] = action[0] dict_log['obs'] = obs[0] dict_log['reward'] = reward[0] dict_log['done'] = done[0] dict_log['timestep'] = ep_len dict_log['episode'] = episode output_df = output_df.append(dict_log, ignore_index=True) # if not args.no_render: # env.render('human') episode_reward += reward[0] ep_len += 1 if args.n_envs == 1: # For atari the return reward is not the atari score # so we have to get it from the infos dict if is_atari and infos is not None and args.verbose >= 1: episode_infos = infos[0].get('episode') if episode_infos is not None: print("Atari Episode Score: {:.2f}".format(episode_infos['r'])) print("Atari Episode Length", episode_infos['l']) if done and not is_atari and args.verbose > 0: # NOTE: for env using VecNormalize, the mean reward # is a normalized reward when `--norm_reward` flag is passed print("Episode nb: {} | Episode 
Reward: {:.2f} | Episode Length: {}".format(episode, episode_reward, ep_len)) # print("Episode Length", ep_len) # commented by Pierre state = None episode_rewards.append(episode_reward) episode_lengths.append(ep_len) # append the last element of the episode success list when episode is done success_list_50.append(episode_success_list_50[-1]) success_list_20.append(episode_success_list_20[-1]) success_list_10.append(episode_success_list_10[-1]) success_list_5.append(episode_success_list_5[-1]) # if the episode is successful and it starts from an unsucessful step, calculate reach time if episode_success_list_50[-1] == True and episode_success_list_50[0] == False: idx = 0 while episode_success_list_50[idx] == False: idx += 1 reachtime_list_50.append(idx) if episode_success_list_20[-1] == True and episode_success_list_20[0] == False: idx = 0 while episode_success_list_20[idx] == False: idx += 1 reachtime_list_20.append(idx) if episode_success_list_10[-1] == True and episode_success_list_10[0] == False: idx = 0 while episode_success_list_10[idx] == False: idx += 1 reachtime_list_10.append(idx) if episode_success_list_5[-1] == True and episode_success_list_5[0] == False: idx = 0 while episode_success_list_5[idx] == False: idx += 1 reachtime_list_5.append(idx) if log_bool: # output_df.to_csv(log_path+"/res_episode_"+str(episode)+".csv", index=False) # slow output_df.to_pickle(log_path+"/res_episode_"+str(episode)+".pkl") # reset for new episode episode_reward = 0.0 ep_len = 0 episode_success_list_50 = [] episode_success_list_20 = [] episode_success_list_10 = [] episode_success_list_5 = [] episode += 1 # Reset also when the goal is achieved when using HER if done or infos[0].get('is_success', False): if args.algo == 'her' and args.verbose > 1: print("Success?", infos[0].get('is_success', False)) # Alternatively, you can add a check to wait for the end of the episode # if done: obs = env.reset() if args.algo == 'her': successes.append(infos[0].get('is_success', False)) 
episode_reward, ep_len = 0.0, 0 if args.verbose > 0 and len(successes) > 0: print("Success rate: {:.2f}%".format(100 * np.mean(successes))) if args.verbose > 0 and len(episode_rewards) > 0: print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards))) print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_50, np.mean(success_list_50), np.mean(reachtime_list_50))) print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_20, np.mean(success_list_20), np.mean(reachtime_list_20))) print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_10, np.mean(success_list_10), np.mean(reachtime_list_10))) print("success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}".format(success_threshold_5, np.mean(success_list_5), np.mean(reachtime_list_5))) # added by Pierre print("path:", log_path) d = { "Eval mean reward": np.mean(episode_rewards), "Eval std": np.std(episode_rewards), "success ratio 50mm": np.mean(success_list_50), "Average reach time 50mm": np.mean(reachtime_list_50), "success ratio 20mm": np.mean(success_list_20), "Average reach time 20mm": np.mean(reachtime_list_20), "success ratio 10mm": np.mean(success_list_10), "Average reach time 10mm": np.mean(reachtime_list_10), "success ratio 5mm": np.mean(success_list_5), "Average reach time 5mm": np.mean(reachtime_list_5), } df = pd.DataFrame(d, index=[0]) if args.random_pol: df.to_csv("logs/random_policy_0.2M/"+env_id+"/stats.csv", index=False) # make path naming more robust else: df.to_csv(log_path+"/stats.csv", index=False) if args.verbose > 0 and len(episode_lengths) > 0: print("Mean episode length: {:.2f} +/- {:.2f}".format(np.mean(episode_lengths), np.std(episode_lengths))) # Workaround for https://github.com/openai/gym/issues/893 if not args.no_render: if args.n_envs == 1 and 'Bullet' not in env_id and not 
is_atari and isinstance(env, VecEnv): # DummyVecEnv # Unwrap env while isinstance(env, VecNormalize) or isinstance(env, VecFrameStack): env = env.venv env.envs[0].env.close() else: # SubprocVecEnv env.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', help='environment ID', type=str, default='CartPole-v1') parser.add_argument('-f', '--folder', help='Log folder', type=str, default='trained_agents') parser.add_argument('--algo', help='RL Algorithm', default='ppo2', type=str, required=False, choices=list(ALGOS.keys())) parser.add_argument('-n', '--n-timesteps', help='number of timesteps', default=1000, type=int) parser.add_argument('--n-envs', help='number of environments', default=1, type=int) parser.add_argument( '--exp-id', help='Experiment ID (default: -1, no exp folder, 0: latest)', default=-1, type=int) parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int) parser.add_argument( '--no-render', action='store_true', default=False, help='Do not render the environment (useful for tests)') parser.add_argument('--deterministic', action='store_true', default=False, help='Use deterministic actions') parser.add_argument('--stochastic', action='store_true', default=False, help='Use stochastic actions (for DDPG/DQN/SAC)') parser.add_argument( '--load-best', action='store_true', default=False, help='Load best model instead of last model if available') parser.add_argument( '--norm-reward', action='store_true', default=False, help='Normalize reward if applicable (trained with VecNormalize)') parser.add_argument('--seed', help='Random generator seed', type=int, default=0) parser.add_argument('--reward-log', help='Where to log reward', default='', type=str) parser.add_argument( '--gym-packages', type=str, nargs='+', default=[], help= 'Additional external Gym environemnt package modules to import (e.g. 
gym_minigrid)' ) parser.add_argument( '--env-kwargs', type=str, nargs='+', action=StoreDict, help='Optional keyword argument to pass to the env constructor') parser.add_argument('--render-pybullet', help='Slow down Pybullet simulation to render', default=False) # added by Pierre parser.add_argument('--random-pol', help='Random policy', default=False) # added by Pierre parser.add_argument( '--log-dir-random', help='Log directory of the random policy') # added by Pierre args = parser.parse_args() # Going through custom gym packages to let them register in the global registory for env_module in args.gym_packages: importlib.import_module(env_module) env_id = args.env algo = args.algo folder = args.folder if args.exp_id == 0: args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id) print('Loading latest experiment, id={}'.format(args.exp_id)) # Sanity checks if args.exp_id > 0: log_path = os.path.join(folder, algo, '{}_{}'.format(env_id, args.exp_id)) else: log_path = os.path.join(folder, algo) assert os.path.isdir(log_path), "The {} folder was not found".format( log_path) if not args.random_pol: # added by Pierre model_path = find_saved_model(algo, log_path, env_id, load_best=args.load_best) if algo in ['dqn', 'ddpg', 'sac', 'td3']: args.n_envs = 1 set_global_seeds(args.seed) is_atari = 'NoFrameskip' in env_id stats_path = os.path.join(log_path, env_id) hyperparams, stats_path = get_saved_hyperparams( stats_path, norm_reward=args.norm_reward, test_mode=True) log_dir = args.reward_log if args.reward_log != '' else None env_kwargs = {} if args.env_kwargs is None else args.env_kwargs env = create_test_env(env_id, n_envs=args.n_envs, is_atari=is_atari, stats_path=stats_path, seed=args.seed, log_dir=log_dir, should_render=not args.no_render, hyperparams=hyperparams, env_kwargs=env_kwargs) # ACER raises errors because the environment passed must have # the same number of environments as the model was trained on. 
load_env = None if algo == 'acer' else env if not args.random_pol: # added by Pierre model = ALGOS[algo].load(model_path, env=load_env) obs = env.reset() # Force deterministic for DQN, DDPG, SAC and HER (that is a wrapper around) deterministic = args.deterministic or algo in [ 'dqn', 'ddpg', 'sac', 'her', 'td3' ] and not args.stochastic # INITIALISE METRICS episode_reward = 0.0 episode_rewards, episode_lengths = [], [] ep_len = 0 success_threshold_50 = 0.05 success_list_50, reachtime_list_50, episode_success_list_50 = [], [], [] success_threshold_20 = 0.02 success_list_20, reachtime_list_20, episode_success_list_20 = [], [], [] success_threshold_10 = 0.01 success_list_10, reachtime_list_10, episode_success_list_10 = [], [], [] success_threshold_5 = 0.005 success_list_5, reachtime_list_5, episode_success_list_5 = [], [], [] # For HER, monitor success rate successes = [] state = None for _ in range(args.n_timesteps): # Added by Pierre if args.random_pol: action = [env.action_space.sample()] # Random Agent else: action, state = model.predict(obs, state=state, deterministic=deterministic) # Clip Action to avoid out of bound errors if isinstance(env.action_space, gym.spaces.Box): action = np.clip(action, env.action_space.low, env.action_space.high) obs, reward, done, infos = env.step(action) # if args.render_pybullet: # time.sleep(1./30.) 
# added by Pierre (slow down Pybullet for rendering) # added by Pierre if infos[0]['dist_ft_t'] <= success_threshold_50: episode_success_list_50.append(1) else: episode_success_list_50.append(0) if infos[0]['dist_ft_t'] <= success_threshold_20: episode_success_list_20.append(1) else: episode_success_list_20.append(0) if infos[0]['dist_ft_t'] <= success_threshold_10: episode_success_list_10.append(1) else: episode_success_list_10.append(0) if infos[0]['dist_ft_t'] <= success_threshold_5: episode_success_list_5.append(1) else: episode_success_list_5.append(0) if not args.no_render: env.render('human') # env.render(mode="human") episode_reward += reward[0] ep_len += 1 if args.n_envs == 1: # For atari the return reward is not the atari score # so we have to get it from the infos dict if is_atari and infos is not None and args.verbose >= 1: episode_infos = infos[0].get('episode') if episode_infos is not None: print("Atari Episode Score: {:.2f}".format( episode_infos['r'])) print("Atari Episode Length", episode_infos['l']) if done and not is_atari and args.verbose > 0: # NOTE: for env using VecNormalize, the mean reward # is a normalized reward when `--norm_reward` flag is passed print("Episode Reward: {:.2f}".format(episode_reward)) print("Episode Length", ep_len) episode_rewards.append(episode_reward) episode_lengths.append(ep_len) # Pierre: append the last element of the episode success list when episode is done success_list_50.append(episode_success_list_50[-1]) success_list_20.append(episode_success_list_20[-1]) success_list_10.append(episode_success_list_10[-1]) success_list_5.append(episode_success_list_5[-1]) # if the episode is successful and it starts from an unsucessful step, calculate reach time if episode_success_list_50[ -1] == True and episode_success_list_50[0] == False: idx = 0 while episode_success_list_50[idx] == False: idx += 1 reachtime_list_50.append(idx) if episode_success_list_20[ -1] == True and episode_success_list_20[0] == False: idx = 0 while 
episode_success_list_20[idx] == False: idx += 1 reachtime_list_20.append(idx) if episode_success_list_10[ -1] == True and episode_success_list_10[0] == False: idx = 0 while episode_success_list_10[idx] == False: idx += 1 reachtime_list_10.append(idx) if episode_success_list_5[ -1] == True and episode_success_list_5[0] == False: idx = 0 while episode_success_list_5[idx] == False: idx += 1 reachtime_list_5.append(idx) # RESET FOR NEW EPISODE state = None episode_reward = 0.0 ep_len = 0 episode_success_list_50 = [] episode_success_list_20 = [] episode_success_list_10 = [] episode_success_list_5 = [] # Reset also when the goal is achieved when using HER if done or infos[0].get('is_success', False): if args.algo == 'her' and args.verbose > 1: print("Success?", infos[0].get('is_success', False)) # Alternatively, you can add a check to wait for the end of the episode # if done: obs = env.reset() if args.algo == 'her': successes.append(infos[0].get('is_success', False)) episode_reward, ep_len = 0.0, 0 if args.verbose > 0 and len(successes) > 0: print("Success rate: {:.2f}%".format(100 * np.mean(successes))) if args.verbose > 0 and len(episode_rewards) > 0: print("Mean reward: {:.2f} +/- {:.2f}".format(np.mean(episode_rewards), np.std(episode_rewards))) print( "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}" .format(success_threshold_50, np.mean(success_list_50), np.mean(reachtime_list_50))) print( "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}" .format(success_threshold_20, np.mean(success_list_20), np.mean(reachtime_list_20))) print( "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}" .format(success_threshold_10, np.mean(success_list_10), np.mean(reachtime_list_10))) print( "success threshold: {} | success ratio: {:.2f} | Average reach time: {:.2f}" .format(success_threshold_5, np.mean(success_list_5), np.mean(reachtime_list_5))) if args.verbose > 0 and len(episode_lengths) > 0: print("Mean 
episode length: {:.2f} +/- {:.2f}".format( np.mean(episode_lengths), np.std(episode_lengths))) # added by Pierre print("path:", log_path) d = { "Eval mean reward": np.mean(episode_rewards), "Eval std": np.std(episode_rewards), "success ratio 50mm": np.mean(success_list_50), "Average reach time 50mm": np.mean(reachtime_list_50), "success ratio 20mm": np.mean(success_list_20), "Average reach time 20mm": np.mean(reachtime_list_20), "success ratio 10mm": np.mean(success_list_10), "Average reach time 10mm": np.mean(reachtime_list_10), "success ratio 5mm": np.mean(success_list_5), "Average reach time 5mm": np.mean(reachtime_list_5), } df = pd.DataFrame(d, index=[0]) if args.random_pol: log_rand = args.log_dir_random df.to_csv(log_rand + "/stats.csv", index=False) else: df.to_csv(log_path + "/stats.csv", index=False) # Workaround for https://github.com/openai/gym/issues/893 if not args.no_render: if args.n_envs == 1 and 'Bullet' not in env_id and not is_atari and isinstance( env, VecEnv): # DummyVecEnv # Unwrap env while isinstance(env, VecNormalize) or isinstance( env, VecFrameStack): env = env.venv env.envs[0].env.close() else: # SubprocVecEnv env.close()