def make_env(i, this_seed):
    # Previously, we directly called `gym.make(env_name)`, but running
    # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
    # created a weird interaction between Gym and Ray -- `gym.make` would fail
    # inside this function for any of our custom environments unless those
    # environments were also `gym.register()`ed inside `make_env`. Even
    # registering the custom environment in the scope of `make_vec_env` didn't
    # work. For more discussion and hypotheses on this issue see PR #160:
    # https://github.com/HumanCompatibleAI/imitation/pull/160.
    env = spec.make()

    # Seed each environment with a different, non-sequential seed for diversity
    # (even if the caller is passing us sequentially-assigned base seeds). int()
    # is necessary to work around a gym bug where it chokes on numpy int64s.
    env.seed(int(this_seed))

    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps)
    elif spec.max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

    # Use Monitor to record statistics needed for Baselines algorithms logging.
    # Optionally, save them to disk.
    log_path = None
    if log_dir is not None:
        log_subdir = os.path.join(log_dir, "monitor")
        os.makedirs(log_subdir, exist_ok=True)
        log_path = os.path.join(log_subdir, f"mon{i:03d}")
    env = bench.Monitor(env, log_path)
    env = wrappers.RolloutInfoWrapper(env)
    return env
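# A hedged sketch of how a factory like `make_env(i, this_seed)` above is
# typically consumed: one thunk per worker, handed to a vectorized env. The
# `DummyVecEnv`/`SubprocVecEnv` classes are from stable_baselines; `n_envs`
# and the seed arithmetic here are illustrative assumptions, not part of the
# original snippet.
import functools

from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv

def make_vec_env_sketch(make_env, base_seed, n_envs=8, parallel=False):
    # functools.partial binds `i` eagerly; a bare `lambda: make_env(i, ...)`
    # would close over the final loop value of `i` instead.
    env_fns = [functools.partial(make_env, i, base_seed + i) for i in range(n_envs)]
    return SubprocVecEnv(env_fns) if parallel else DummyVecEnv(env_fns)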
def train():
    """
    Train a PPO1 model for slime volleyball, using MPI for multiprocessing.
    Tested with 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    # Give each MPI worker its own seed, offset by rank.
    workerseed = SEED + 10000 * rank
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1(MlpPolicy, env,
                 timesteps_per_actorbatch=4096,
                 clip_param=0.2, entcoeff=0.0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                 gamma=0.99, lam=0.95,
                 schedule='linear', verbose=1)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point
def test_deepq():
    """
    Test DQN on Atari.
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env,
                policy=CnnPolicy,
                learning_rate=1e-4,
                buffer_size=10000,
                exploration_fraction=0.1,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                prioritized_replay_alpha=0.6,
                checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
def make_env(datapaths):
    if len(datapaths) > 1:
        env = EnsembleEnv(datapaths)
    else:
        env = Env(datapaths[0])
    env = bench.Monitor(env, logger.get_dir())
    return env
def test_deepq():
    """
    Test DeepQ on Atari.
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    q_func = deepq_models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                     hiddens=[256],
                                     dueling=True)

    model = DeepQ(env=env,
                  policy=q_func,
                  learning_rate=1e-4,
                  buffer_size=10000,
                  exploration_fraction=0.1,
                  exploration_final_eps=0.01,
                  train_freq=4,
                  learning_starts=10000,
                  target_network_update_freq=1000,
                  gamma=0.99,
                  prioritized_replay=True,
                  prioritized_replay_alpha=0.6,
                  checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
def train():
    """
    Train a PPO1 model for slime volleyball, using MPI for multiprocessing.
    Tested with 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    workerseed = SEED + 10000 * rank
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    # Resume from the best checkpoint rather than training from scratch.
    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point
def make_env():
    env_out = gym.make(env_id)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    env_out.seed(seed)
    return env_out
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env,
                 timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
                 optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                 gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
def make_env():
    env_out = gym.make(args.env)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out
def make_env():
    env_out = gym.make(args.env)
    # Disable rendering so the environment can run headless.
    env_out.env.disableViewer = True
    env_out.env.visualize = False
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out
def _thunk():
    local_env_kwargs = dict(env_kwargs)  # copy this to avoid altering the others
    local_env_kwargs["env_rank"] = rank
    env = _make(env_id, env_kwargs=local_env_kwargs)
    env.seed(seed + rank)
    if log_dir is not None:
        env = bench.Monitor(env, os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    return env
def make_env():
    # env_out = gym.make(env_id, reset_noise_scale=1.0)
    env_out = gym.make(env_id)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    env_out.seed(seed)
    env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
    return env_out
def create_env(env_id, delay_step, env_str=str(0)):
    # if env_type in ["mujoco", "Mujoco", "MuJoCo", "raw", "mujoco_raw", "raw_mujoco"]:
    env = gym.make(env_id)
    env = TimestepWrapper(env)
    env = DelayedRewardWrapper(env, delay_step)
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), env_str))
    return env
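# `TimestepWrapper` and `DelayedRewardWrapper` above are project-specific and
# not shown. Below is a minimal sketch of the usual delayed-reward idea,
# assuming rewards are accumulated and only paid out every `delay_step` steps
# (and at episode end); the actual implementation may differ.
import gym

class DelayedRewardSketch(gym.Wrapper):
    def __init__(self, env, delay_step):
        super().__init__(env)
        self.delay_step = delay_step
        self._accumulated = 0.0
        self._steps = 0

    def reset(self, **kwargs):
        self._accumulated = 0.0
        self._steps = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._accumulated += reward
        self._steps += 1
        if done or (self.delay_step > 0 and self._steps % self.delay_step == 0):
            # Pay out everything accumulated since the last payout.
            reward, self._accumulated = self._accumulated, 0.0
        else:
            reward = 0.0
        return obs, reward, done, info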
def main(args):
    """
    Start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, sess=sess,
                                        hid_size=args.policy_hidden_size,
                                        num_hid_layers=2,
                                        placeholders=placeholders)

        env = bench.Monitor(env, logger.get_dir() and
                            os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)

        # Build the (long) task name and derive the checkpoint/log directories from it.
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)

        if args.task == 'train':
            dataset = MujocoDset(expert_path=args.expert_path,
                                 traj_limitation=args.traj_limitation)
            # Create the discriminator network.
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                                entcoeff=args.adversary_entcoeff)
            # Train the policy network; note that the policy network is the one
            # declared as `policy_fn` above.
            train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
                  args.g_step, args.d_step, args.policy_entcoeff,
                  args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
                  args.pretrained, args.bc_max_iter, task_name)
        elif args.task == 'evaluate':
            # Used when evaluating a trained model. (To revisit later.)
            runner(env, policy_fn, args.load_model_path,
                   timesteps_per_batch=1024,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample)
        else:
            raise NotImplementedError
        env.close()
def make_envs(env_id, do_eval, seed, conf, normalize_observations=False, normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env, norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None

    env.base_env = base_env
    return base_env, env, base_eval_env, eval_env
def make_env(i):
    env = gym.make(env_id)
    env.seed(seed + i)  # seed each environment separately for diversity
    # Use Monitor to record statistics needed for Baselines algorithms logging.
    # Optionally, save to disk.
    log_path = None
    if log_dir is not None:
        log_subdir = os.path.join(log_dir, 'monitor')
        os.makedirs(log_subdir, exist_ok=True)
        log_path = os.path.join(log_subdir, f'mon{i:03d}')
    return bench.Monitor(env, log_path, allow_early_resets=True)
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2,
                                    num_options=num_options, dc=dc)

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only primitive actions (one option) or two options are currently supported.")
        sys.exit()

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=optimsize,
                        gamma=0.99, lam=0.95, schedule='constant',
                        num_options=num_options,
                        app=app, saves=saves, wsaves=wsaves,
                        epoch=epoch, seed=seed, dc=dc)
    env.close()
def main(args):
    """
    Start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, sess=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        sess=sess, reuse=reuse,
                                        hid_size=args.policy_hidden_size,
                                        num_hid_layers=2)

        env = bench.Monitor(
            env, logger.get_dir() and os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)
        dataset = MujocoDset(expert_path=args.expert_path,
                             traj_limitation=args.traj_limitation)
        savedir_fname = learn(env, policy_fn, dataset,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=args.checkpoint_dir,
                              task_name=task_name,
                              verbose=True)
        runner(env, policy_fn, savedir_fname,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample,
               reuse=True)
def main():
    """
    Run the Atari test
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    q_func = deepq_models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    model = DeepQ(
        env=env,
        policy=q_func,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )
    model.learn(total_timesteps=args.num_timesteps)

    env.close()
def test_rollout_stats():
    """Wrapping the env in `ObsRewHalveWrapper` halves the reward mean.

    `rollout_stats` should reflect this.
    """
    env = gym.make("CartPole-v1")
    env = bench.Monitor(env, None)
    env = ObsRewHalveWrapper(env)
    venv = vec_env.DummyVecEnv([lambda: env])

    with serialize.load_policy("zero", "UNUSED", venv) as policy:
        trajs = rollout.generate_trajectories(policy, venv, rollout.min_episodes(10))
    s = rollout.rollout_stats(trajs)

    np.testing.assert_allclose(s["return_mean"], s["monitor_return_mean"] / 2)
    np.testing.assert_allclose(s["return_std"], s["monitor_return_std"] / 2)
    np.testing.assert_allclose(s["return_min"], s["monitor_return_min"] / 2)
    np.testing.assert_allclose(s["return_max"], s["monitor_return_max"] / 2)
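# `ObsRewHalveWrapper` is defined elsewhere in the test suite. A minimal
# sketch consistent with the assertions above (observations and rewards are
# halved relative to what the underlying `bench.Monitor` records):
import gym

class ObsRewHalveWrapperSketch(gym.Wrapper):
    def reset(self, **kwargs):
        return self.env.reset(**kwargs) / 2

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        return obs / 2, rew / 2, done, info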
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the Atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
    #                      sess=sess, placeholders=placeholders)

    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy, env,
                 timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
                 cg_damping=1e-3, entcoeff=0.0, gamma=0.98, lam=1,
                 vf_iters=3, vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
def make_env():
    env_out = GymWrapper(
        suite.make(
            "SawyerLift",
            use_camera_obs=False,  # do not use pixel observations
            has_offscreen_renderer=False,  # not needed since we are not using pixel obs
            has_renderer=True,  # make sure we can render to the screen
            reward_shaping=True,  # use dense rewards
            control_freq=10,  # control should happen fast enough that the simulation looks smooth
        ))
    # Stub out attributes that gym wrappers such as Monitor expect to find on
    # the wrapped environment but that GymWrapper does not provide.
    env_out.reward_range = None
    env_out.metadata = None
    env_out.spec = None
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out
def main():
    """
    Run the Atari test
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    policy = partial(CnnPolicy, dueling=args.dueling == 1)

    model = DQN(
        env=env,
        policy=policy,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
    )
    model.learn(total_timesteps=args.num_timesteps)

    env.close()
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noises ('adaptive-param', 'normal' or 'ou'),
        can use multiple noise types by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory,
                 eval_env=eval_env, param_noise=param_noise,
                 action_noise=action_noise, memory_limit=int(1e6),
                 layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)

    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
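# Illustrative invocation (the environment ID and stddev values here are made
# up): `noise_type` is a comma-separated list of `<name>_<stddev>` entries, so
# parameter-space noise and Ornstein-Uhlenbeck action noise can be combined.
run('HalfCheetah-v2', seed=0,
    noise_type='adaptive-param_0.2,ou_0.1',
    layer_norm=True, evaluation=False)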
def make_env(i):
    env = CartPoleNoVelEnv()
    env = bench.Monitor(env, None, allow_early_resets=True)
    env.seed(i)
    return env
def _init():
    env = gym.make(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    return env
def make_env(i):
    env = CartPoleNoVelEnv()
    env = TimeLimit(env, max_episode_steps=500)
    env = bench.Monitor(env, None, allow_early_resets=True)
    env.seed(i)
    return env
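# `CartPoleNoVelEnv` (used in the two fixtures above) is presumably CartPole
# with the velocity components masked out, making the task partially
# observable; instantiating the raw class also skips the registry's TimeLimit,
# which would explain why the fixture above adds one explicitly. A minimal
# sketch of the masking idea, under those assumptions:
import gym
import numpy as np
from gym.envs.classic_control import CartPoleEnv

class CartPoleNoVelSketch(gym.ObservationWrapper):
    """Drop cart velocity and pole angular velocity from the observation."""

    def __init__(self):
        super().__init__(CartPoleEnv())
        full = self.env.observation_space
        # Keep only cart position (index 0) and pole angle (index 2).
        self.observation_space = gym.spaces.Box(
            low=full.low[[0, 2]], high=full.high[[0, 2]], dtype=np.float32)

    def observation(self, obs):
        return np.asarray(obs, dtype=np.float32)[[0, 2]]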
def make_env(i):
    env = gym.make("Breakout-v0")
    env = bench.Monitor(env, None, allow_early_resets=True)
    env.seed(i)
    return env
def make_env():
    env_out = StudentEnv()
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out
def make_env():
    env_out = gym.make('CartPole-v0')
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out