def configure_env(arg_dict, model_logdir=None, for_train=True): env_arguments = {"render_on": True, "visualize": arg_dict["visualize"], "workspace": arg_dict["workspace"], "robot": arg_dict["robot"], "robot_init_joint_poses": arg_dict["robot_init"], "robot_action": arg_dict["robot_action"],"max_velocity": arg_dict["max_velocity"], "max_force": arg_dict["max_force"],"task_type": arg_dict["task_type"], "action_repeat": arg_dict["action_repeat"], "task_objects":arg_dict["task_objects"], "observation":arg_dict["observation"], "distractors":arg_dict["distractors"], "num_networks":arg_dict.get("num_networks", 1), "network_switcher":arg_dict.get("network_switcher", "gt"), "distance_type": arg_dict["distance_type"], "used_objects": arg_dict["used_objects"], "active_cameras": arg_dict["camera"], "color_dict":arg_dict.get("color_dict", {}), "max_steps": arg_dict["max_episode_steps"], "visgym":arg_dict["visgym"], "reward": arg_dict["reward"], "logdir": arg_dict["logdir"], "vae_path": arg_dict["vae_path"], "yolact_path": arg_dict["yolact_path"], "yolact_config": arg_dict["yolact_config"]} if for_train: env_arguments["gui_on"] = False else: env_arguments["gui_on"] = arg_dict["gui"] if arg_dict["algo"] == "her": env = gym.make(arg_dict["env_name"], **env_arguments, obs_space="dict") # her needs obs as a dict else: env = gym.make(arg_dict["env_name"], **env_arguments) if for_train: if arg_dict["engine"] == "mujoco": env = VecMonitor(env, model_logdir) if arg_dict["multiprocessing"] else Monitor(env, model_logdir) elif arg_dict["engine"] == "pybullet": env = Monitor(env, model_logdir, info_keywords=tuple('d')) if arg_dict["algo"] == "her": env = HERGoalEnvWrapper(env) return env
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy, verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
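The test above relies on HERGoalEnvWrapper flattening the goal-dict observation into a single array before predict is called on the unwrapped-env case. A minimal sketch of that behaviour in isolation, assuming stable-baselines 2.x; BitFlippingEnv and the bit count are just convenient stand-ins:

import numpy as np
from stable_baselines.common.bit_flipping_env import BitFlippingEnv
from stable_baselines.her import HERGoalEnvWrapper

# The wrapped env exposes a flat observation space instead of the
# {"observation", "achieved_goal", "desired_goal"} dict of the GoalEnv.
env = HERGoalEnvWrapper(BitFlippingEnv(10, continuous=True, max_steps=10))
obs = env.reset()
assert isinstance(obs, np.ndarray) and obs.shape == env.observation_space.shape
obs, reward, done, info = env.step(env.action_space.sample())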
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id, **env_kwargs)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper, log_dir=log_dir,
                                        env_kwargs=env_kwargs)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                        wrapper_class=env_wrapper,
                                        env_kwargs=env_kwargs) for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']

    if args.algo == 'her':
        # Wrap the env if need to flatten the dict obs
        if isinstance(env, VecEnv):
            env = _UnvecWrapper(env)
        env = HERGoalEnvWrapper(env)
    return env
def __init__(self, env: ISettableGoalEnv, verbose=1, experiment_name="ppo", rank=0):
    self._env = env
    self._dirs = Dirs(experiment_name=f"{type(env).__name__}-{experiment_name}", rank=rank)
    self._flat_env = HERGoalEnvWrapper(env)
    options = {"env": DummyVecEnv([lambda: self._flat_env]),
               "tensorboard_log": self._dirs.tensorboard,
               "gamma": 1, "seed": rank, "nminibatches": 1}
    if os.path.isdir(self._dirs.models) and os.path.isfile(self._dirs.best_model):
        self._model = PPO2.load(load_path=self._dirs.best_model, **options)
        print(f"Loaded model {self._dirs.best_model}")
    else:
        self._model = PPO2("MlpPolicy", verbose=verbose, **options)
def objective(trial):
    kwargs = hyperparams.copy()

    trial.model_class = None
    if algo == 'her':
        trial.model_class = hyperparams['model_class']

    # Hack to use DDPG/TD3 noise sampler
    if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
        trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
    kwargs.update(algo_sampler(trial))

    model = model_fn(**kwargs)

    eval_env = env_fn(n_envs=1, eval_env=True)
    # Account for parallel envs
    eval_freq_ = eval_freq
    if isinstance(model.get_env(), VecEnv):
        eval_freq_ = max(eval_freq // model.get_env().num_envs, 1)
    # TODO: use non-deterministic eval for Atari?
    eval_callback = TrialEvalCallback(eval_env, trial, n_eval_episodes=n_eval_episodes,
                                      eval_freq=eval_freq_, deterministic=True)

    if algo == 'her':
        # Wrap the env if need to flatten the dict obs
        if isinstance(eval_env, VecEnv):
            eval_env = _UnvecWrapper(eval_env)
        eval_env = HERGoalEnvWrapper(eval_env)

    try:
        model.learn(n_timesteps, callback=eval_callback)
        # Free memory
        model.env.close()
        eval_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        # Free memory
        model.env.close()
        eval_env.close()
        raise optuna.exceptions.TrialPruned()

    is_pruned = eval_callback.is_pruned
    cost = -1 * eval_callback.last_mean_reward

    del model.env, eval_env
    del model

    if is_pruned:
        raise optuna.exceptions.TrialPruned()

    return cost
class PPOAgent(Agent): name = "ppo" def __init__(self, env: ISettableGoalEnv, verbose=1, experiment_name="ppo", rank=0): self._env = env self._dirs = Dirs( experiment_name=f"{type(env).__name__}-{experiment_name}", rank=rank) self._flat_env = HERGoalEnvWrapper(env) options = { "env": DummyVecEnv([lambda: self._flat_env]), "tensorboard_log": self._dirs.tensorboard, "gamma": 1, "seed": rank, "nminibatches": 1 } if os.path.isdir(self._dirs.models) and os.path.isfile( self._dirs.best_model): self._model = PPO2.load(load_path=self._dirs.best_model, **options) print(f"Loaded model {self._dirs.best_model}") else: self._model = PPO2("MlpPolicy", verbose=verbose, **options) def __call__(self, obs: Observation) -> np.ndarray: flat_obs = self._flat_env.convert_dict_to_obs(obs) action, _ = self._model.predict(flat_obs, deterministic=True) return action def train(self, timesteps: int, num_checkpoints=4, callbacks: Sequence[BaseCallback] = None): ppo_offset = 128 callbacks = [] if callbacks is None else callbacks cb = CheckpointCallback(save_freq=timesteps // num_checkpoints, save_path=self._dirs.models, name_prefix=self._dirs.prefix) self._model.learn(total_timesteps=timesteps + ppo_offset, callback=CallbackList([cb, *callbacks]), log_interval=100)
def main():
    panda_env = PandaGraspGymEnv(urdf_root=object_data.getDataPath(),
                                 is_rendering=True,
                                 use_ik=True,
                                 is_discrete=True,
                                 num_controlled_joints=7,
                                 reward_type="sparse")

    env = HERGoalEnvWrapper(panda_env)

    model = HER.load("logs/rl_model_1000000_steps.zip")

    episode_rewards, episode_lengths, episode_success = evaluate_policy(
        model, env, n_eval_episodes=50, render=False,
        deterministic=True, return_episode_rewards=True)

    print("Final Reward {}, Episode Length {}, Success Rate {}".format(
        np.mean(episode_rewards), np.mean(episode_lengths), np.mean(episode_success)))
def main(_algo_name, _trained_models_dir, _trained_model_name, _lock_rotation,
         _should_eval, _eval_num_episodes, _should_render):
    is_discrete = True if _algo_name == 'DQN' else False

    eval_env = HERGoalEnvWrapper(get_environment(_lock_rotation=_lock_rotation,
                                                 _is_discrete=is_discrete,
                                                 _should_eval=_should_eval,
                                                 _should_render=_should_render))

    _trained_models_dir = _trained_models_dir if _trained_models_dir.endswith('/') else _trained_models_dir + '/'

    model = ALGOS[_algo_name].load(_trained_models_dir + _trained_model_name)

    if _should_eval:
        episode_rewards, episode_lengths, episode_success = evaluate_policy(
            model=model,
            env=eval_env,
            n_eval_episodes=_eval_num_episodes,
            render=(not _should_eval) or _should_render,
            deterministic=True,
            return_episode_rewards=True)

        print("Final evaluation for DDPG algorithm on {} episodes: "
              "\nReward: \n \tMEAN: {}, \tSTD: {}, "
              "\nEpisode Length: \n \tMEAN: {}, \tSTD: {}, "
              "\nSuccess Rate: {}".format(_eval_num_episodes,
                                          np.mean(episode_rewards), np.std(episode_rewards),
                                          np.mean(episode_lengths), np.std(episode_lengths),
                                          np.mean(episode_success)))
    else:
        obs = eval_env.reset()
        for i in range(2000):
            action, _states = model.predict(obs)
            obs, rewards, done, info = eval_env.step(action)
            eval_env.render(mode='human')
            if done:
                obs = eval_env.reset()
def objective(trial):
    kwargs = hyperparams.copy()

    trial.model_class = None
    if algo == 'her':
        trial.model_class = hyperparams['model_class']

    # Hack to use DDPG/TD3 noise sampler
    if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
        trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
    kwargs.update(algo_sampler(trial))

    def callback(_locals, _globals):
        """
        Callback for monitoring learning progress.

        :param _locals: (dict)
        :param _globals: (dict)
        :return: (bool) If False: stop training
        """
        self_ = _locals['self']
        trial = self_.trial

        # Initialize variables
        if not hasattr(self_, 'is_pruned'):
            self_.is_pruned = False
            self_.last_mean_test_reward = -np.inf
            self_.last_time_evaluated = 0
            self_.eval_idx = 0

        if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
            return True

        self_.last_time_evaluated = self_.num_timesteps

        # Evaluate the trained agent on the test env
        rewards = []
        n_episodes, reward_sum = 0, 0.0

        # Sync the obs rms if using vecnormalize
        # NOTE: this does not cover all the possible cases
        if isinstance(self_.test_env, VecNormalize):
            self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
            # Do not normalize reward
            self_.test_env.norm_reward = False

        obs = self_.test_env.reset()
        while n_episodes < n_test_episodes:
            # Use default value for deterministic
            action, _ = self_.predict(obs)
            obs, reward, done, _ = self_.test_env.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = self_.test_env.reset()

        mean_reward = np.mean(rewards)
        self_.last_mean_test_reward = mean_reward
        self_.eval_idx += 1

        # report best or report current ?
        # report num_timesteps or elapsed time ?
        trial.report(-1 * mean_reward, self_.eval_idx)
        # Prune trial if needed
        if trial.should_prune(self_.eval_idx):
            self_.is_pruned = True
            return False

        return True

    model = model_fn(**kwargs)
    model.test_env = env_fn(n_envs=1)
    model.trial = trial
    if algo == 'her':
        model.model.trial = trial
        # Wrap the env if need to flatten the dict obs
        if isinstance(model.test_env, VecEnv):
            model.test_env = _UnvecWrapper(model.test_env)
        model.model.test_env = HERGoalEnvWrapper(model.test_env)

    try:
        model.learn(n_timesteps, callback=callback)
        # Free memory
        model.env.close()
        model.test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        # Free memory
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf
    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned
        cost = -1 * model.last_mean_test_reward

    del model.env, model.test_env
    del model

    if is_pruned:
        raise optuna.structs.TrialPruned()

    return cost
def main(_algo_name, _algo_tag, _tag_suffix, _save_freq, _lock_rotation, _eval_num, _eval_freq, hyperparams):
    rotation_tag = "_LOCKED_ROT_" if _lock_rotation else "_ROTATION_"
    full_tag = _algo_name + rotation_tag + _algo_tag + _tag_suffix
    current_dir = _algo_name + "/" + full_tag
    log_dir = current_dir + "/log/"
    eval_log_dir = current_dir + "/log/eval/"
    trained_models_dir = current_dir + "/models/"

    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(trained_models_dir, exist_ok=True)

    is_discrete = True if _algo_name == 'DQN' else False

    panda_env = HERGoalEnvWrapper(CustomMonitor(
        get_environment(_lock_rotation=_lock_rotation, _is_discrete=is_discrete), log_dir))
    eval_env = HERGoalEnvWrapper(CustomMonitor(
        get_environment(_lock_rotation=_lock_rotation, _is_discrete=is_discrete), eval_log_dir))

    callbacks = []
    if _save_freq > 0:
        callbacks.append(CheckpointCallback(_save_freq, trained_models_dir))
    callbacks.append(MeanHundredEpsTensorboardCallback(log_dir))
    callbacks.append(StdHundredEpsTensorboardCallback(log_dir))
    callbacks.append(SuccessRateTensorboardCallback(log_dir))

    if _algo_name == 'DDPG':
        callbacks.append(SaveOnBestTrainingRewardCallback(10000, log_dir))
    elif _eval_freq > 0:
        callbacks.append(EvalCallback(eval_env,
                                      best_model_save_path=trained_models_dir,
                                      log_path=log_dir,
                                      eval_freq=_eval_freq,
                                      deterministic=True,
                                      render=False,
                                      n_eval_episodes=_eval_num))

    time_steps = hyperparams.pop('n_timesteps') if hyperparams.get('n_timesteps') is not None else None

    param_noise = None
    action_noise = None
    if hyperparams.get('noise_type') is not None:
        noise_type = hyperparams.pop('noise_type').strip()
        if 'ornstein-uhlenbeck' in noise_type:
            n_actions = panda_env.action_space.shape[-1]
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=float(0.005) * np.ones(n_actions))
        elif 'param_noise' in noise_type:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)

    # add action noise for DDPG or TD3; for DQN, noise is already a flag in hyperparams
    if _algo_name == 'DDPG' or _algo_name == 'TD3':
        hyperparams['action_noise'] = action_noise

    # add hyperparams specific only to DDPG
    if _algo_name == 'DDPG':
        hyperparams['param_noise'] = param_noise
        hyperparams['eval_env'] = eval_env

    model = ALGOS[_algo_name](env=panda_env, tensorboard_log="tensorboard/",
                              n_cpu_tf_sess=None, **hyperparams)

    model.learn(total_timesteps=time_steps, callback=callbacks,
                tb_log_name=full_tag, log_interval=10)

    model.save(current_dir + "/" + full_tag + "_final")
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id, **env_kwargs)
        env.seed(args.seed)
        # added by Pierre (for some reason, monitoring the training wasn't enabled for DDPG)
        log_file = os.path.join(log_dir, str(rank)) if log_dir is not None else None
        env = Monitor(env, log_file)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper,
                                        log_dir=log_dir,
                                        env_kwargs=env_kwargs)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed,
                                        log_dir=log_dir,
                                        wrapper_class=env_wrapper,
                                        env_kwargs=env_kwargs) for i in range(n_envs)])
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs['norm_reward'] = False
                else:
                    local_normalize_kwargs = {'norm_reward': False}

            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(local_normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))

    if args.algo == 'her':
        # Wrap the env if need to flatten the dict obs
        if isinstance(env, VecEnv):
            env = _UnvecWrapper(env)
        env = HERGoalEnvWrapper(env)
    return env
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path
    if eval_env:
        eval_env_kwargs = env_kwargs

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    else:
        if n_envs == 1:
            if eval_env:
                eval_env_kwargs["goal_tolerance_parameters"]["set_tol"] = 0.001
                env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                            wrapper_class=env_wrapper,
                                            log_dir=log_dir,
                                            info_keywords=("is_success", "error"),
                                            env_kwargs=eval_env_kwargs)])
            else:
                env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                            wrapper_class=env_wrapper,
                                            log_dir=log_dir,
                                            info_keywords=(),
                                            env_kwargs=env_kwargs)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed,
                                        log_dir=log_dir,
                                        wrapper_class=env_wrapper,
                                        env_kwargs=env_kwargs) for i in range(n_envs)])
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs['norm_reward'] = False
                else:
                    local_normalize_kwargs = {'norm_reward': False}

            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(local_normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))

    if args.algo == 'her':
        # Wrap the env if need to flatten the dict obs
        if isinstance(env, VecEnv):
            env = _UnvecWrapper(env)
        env = HERGoalEnvWrapper(env)
    return env
def configure_env(arg_dict, model_logdir=None, for_train=True): if arg_dict["engine"] == "pybullet": env_arguments = { "render_on": True, "visualize": arg_dict["visualize"], "workspace": arg_dict["workspace"], "robot": arg_dict["robot"], "robot_init_joint_poses": arg_dict["robot_init"], "robot_action": arg_dict["robot_action"], "task_type": arg_dict["task_type"], "num_subgoals": arg_dict["num_subgoals"], "task_objects": arg_dict["task_objects"], "distractors": arg_dict["distractors"], "distractor_moveable": arg_dict["distractor_moveable"], "distractor_constant_speed": arg_dict["distractor_constant_speed"], "distractor_movement_dimensions": arg_dict["distractor_movement_dimensions"], "distractor_movement_endpoints": arg_dict["distractor_movement_endpoints"], "coefficient_kd": arg_dict["coefficient_kd"], "coefficient_kw": arg_dict["coefficient_kw"], "coefficient_ka": arg_dict["coefficient_ka"], "observed_links_num": arg_dict["observed_links_num"], "reward_type": arg_dict["reward_type"], "distance_type": arg_dict["distance_type"], "used_objects": arg_dict["used_objects"], "object_sampling_area": arg_dict["object_sampling_area"], "active_cameras": arg_dict["camera"], "max_steps": arg_dict["max_episode_steps"], "visgym": arg_dict["visgym"], "reward": arg_dict["reward"], "logdir": arg_dict["logdir"], "vae_path": arg_dict["vae_path"], "yolact_path": arg_dict["yolact_path"], "yolact_config": arg_dict["yolact_config"] } if for_train: env_arguments["gui_on"] = False else: env_arguments["gui_on"] = arg_dict["gui"] if arg_dict["algo"] == "her": env = gym.make(arg_dict["env_name"], **env_arguments, obs_space="dict") # her needs obs as a dict else: env = gym.make(arg_dict["env_name"], **env_arguments) elif arg_dict["engine"] == "mujoco": if arg_dict["multiprocessing"]: # ACKTR, PPO2, A2C, DDPG can use vectorized environments, but the only way to display the results (for me) is using CV2 imshow. -(TensorFlow comment) env = make_vec_env(arg_dict["env_name"], n_envs=arg_dict["vectorized_envs"]) else: env = gym.make(arg_dict["env_name"]) if for_train: if arg_dict["engine"] == "mujoco": env = VecMonitor( env, model_logdir) if arg_dict["multiprocessing"] else Monitor( env, model_logdir) elif arg_dict["engine"] == "pybullet": env = Monitor(env, model_logdir, info_keywords=tuple('d')) if arg_dict["algo"] == "her": env = HERGoalEnvWrapper(env) return env
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    # Set initializer and action type for the environment; the standard implementation
    # currently does not support custom types, so pass them here
    # (env_kwargs is global, so do not set again during repeated calls)
    if "initializer" in env_kwargs.keys() and isinstance(env_kwargs["initializer"], int):
        if env_kwargs["initializer"] == 0:
            env_kwargs["initializer"] = RandomInitializer(env_kwargs.pop("difficulty"))
        elif env_kwargs["initializer"] == 1:
            env_kwargs["initializer"] = CompletelyRandomInitializer()
        else:
            raise RuntimeError('Unsupported initializer "{}"'.format(env_kwargs["initializer"]))

    if "action_type" in env_kwargs.keys() and isinstance(env_kwargs["action_type"], str):
        if env_kwargs["action_type"] == "POSITION":
            env_kwargs["action_type"] = ActionType.POSITION
        elif env_kwargs["action_type"] == "TORQUE":
            env_kwargs["action_type"] = ActionType.TORQUE
        elif env_kwargs["action_type"] == "TORQUE_AND_POSITION":
            env_kwargs["action_type"] = ActionType.TORQUE_AND_POSITION
        else:
            raise RuntimeError('Unsupported action type "{}"'.format(env_kwargs["action_type"]))
    else:
        env_kwargs["action_type"] = ActionType.POSITION

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id, **env_kwargs)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper,
                                        log_dir=log_dir,
                                        env_kwargs=env_kwargs)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed,
                                        log_dir=log_dir,
                                        wrapper_class=env_wrapper,
                                        env_kwargs=env_kwargs) for i in range(n_envs)])
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs['norm_reward'] = False
                else:
                    local_normalize_kwargs = {'norm_reward': False}

            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(local_normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))

    if args.algo == 'her':
        # Wrap the env if need to flatten the dict obs
        if isinstance(env, VecEnv):
            env = _UnvecWrapper(env)
        env = HERGoalEnvWrapper(env)
    return env