Example #1
 def _thunk():
   env = make_atari(env_id)
   env = gym.wrappers.Monitor(env, '/tmp/video', force=True, video_callable=lambda ep: True)
   env.seed(seed + rank)
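   # logger.get_dir() may return None; the `and` idiom then passes None as the
   # Monitor filename, so no monitor log file is written for this worker.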
   env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                 allow_early_resets=allow_early_resets)
   return wrap_deepmind(env, **wrapper_kwargs)
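The _thunk closures shown throughout these examples are normally returned from a factory function and passed, one per worker, to a vectorized environment. Below is a minimal sketch of that pattern, assuming stable-baselines-style imports; the factory name and environment id are illustrative assumptions, not part of the example above.

import gym
from stable_baselines.common.vec_env import SubprocVecEnv

def make_env(env_id, rank, seed=0):
    # Each worker gets its own thunk with a distinct seed offset.
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _thunk

# Hypothetical usage: four parallel workers.
# vec_env = SubprocVecEnv([make_env('PongNoFrameskip-v4', rank=i) for i in range(4)])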
Example #2
def _create_single_football_env(
    level,
    stacked,
    representation,
    reward_experiment,
    write_goal_dumps,
    write_full_episode_dumps,
    write_video,
    dump_frequency,
    render,
    process_number=0,
):
    """
  Creates gfootball environment.
  Meaning of all variables you can find in footbal.gfootball.examples.run_ppo2.py
  """
    env = create_environment(
        env_name=level,
        stacked=stacked,
        representation=representation,
        rewards=reward_experiment,
        logdir=logger.get_dir(),
        write_goal_dumps=write_goal_dumps and (process_number == 0),
        write_full_episode_dumps=write_full_episode_dumps
        and (process_number == 0),
        write_video=write_video,
        render=render and (process_number == 0),
        dump_frequency=dump_frequency if render and process_number == 0 else 0)
    env = monitor.Monitor(
        env,
        logger.get_dir()
        and os.path.join(logger.get_dir(), str(process_number)))
    return env
Example #3
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "virtualbox":
        print("Make virtualbox")
        env = make_virtualbox()

    if add_monitor:
        logger_dir = "log" if logger.get_dir() is None else logger.get_dir()
        print("Adding monitor at dir", logger_dir)
        env = Monitor(env, osp.join(logger_dir, '%.2i' % rank))
    return env
Example #4
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4,
                 optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
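A hypothetical invocation of the function above (the environment id and step budget are illustrative assumptions; the script is typically launched under mpirun so each rank configures its own logger):

# e.g. mpirun -np 8 python run_atari.py
train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0)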
Example #5
 def _thunk():
     env = gym.make(env_id)
     env.seed(seed + rank)
     env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=allow_early_resets, reset_keywords=())
     # return wrap_deepmind(env, **wrapper_kwargs)
     return env
Example #6
def train():
  """
  Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
  """
  rank = MPI.COMM_WORLD.Get_rank()

  if rank == 0:
    logger.configure(folder=LOGDIR)

  else:
    logger.configure(format_strs=[])
  workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
  set_global_seeds(workerseed)
  env = make_env(workerseed)

  env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
  env.seed(workerseed)

  model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
               optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',
               verbose=1)

  eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  env.close()
  del env
  if rank == 0:
    model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.
Example #7
def train():
    """
  Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
  """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)

    else:
        logger.configure(format_strs=[])
    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(
            LOGDIR, "final_model"))  # probably never get to this point.
Example #8
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
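A hypothetical usage of the helper above (the environment id is an illustrative assumption and requires a robotics-enabled MuJoCo/Gym installation):

env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
obs = env.reset()
done = False
while not done:
    # Random policy, just to exercise the monitored environment for one episode.
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()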
Example #9
 def _thunk():
     env = make_atari(env_id)
     env.seed(seed + rank)
     env = Monitor(env,
                   logger.get_dir()
                   and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=allow_early_resets)
     return wrap_deepmind(env, **wrapper_kwargs)
def main(args):
    """
    start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, sess=sess,
                                        hid_size=args.policy_hidden_size, num_hid_layers=2, placeholders=placeholders)


        #========================================================================================================
        env = bench.Monitor(env, logger.get_dir() and
                            os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)

        # Get the long task name. ===========================================================
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)
        # =======================================================================================================


        if args.task == 'train':
            dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)

            # Create the discriminator network
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)

            # Train the policy network
            # Note that the policy network is declared via policy_fn
            train(env, args.seed, policy_fn, reward_giver,
                  dataset, args.algo, args.g_step, args.d_step,
                  args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.pretrained, args.bc_max_iter, task_name)



        # ======================================= Come back to this later =============================================
        # Used when evaluating the trained model.
        elif args.task == 'evaluate':
            runner(env,
                   policy_fn,
                   args.load_model_path,
                   timesteps_per_batch=1024,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample
                   )
        else:
            raise NotImplementedError

        env.close()
Example #11
File: run_util.py Project: MouseHu/gem
def create_env(env_id, delay_step, env_str=str(0)):
    # if env_type in ["mujoco", "Mujoco", "MuJoCo", "raw", "mujoco_raw", "raw_mujoco"]:
    env = gym.make(env_id)
    env = TimestepWrapper(env)
    env = DelayedRewardWrapper(env, delay_step)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), env_str))
    return env
Example #12
 def _thunk():
     no_graphics = not use_visual
     unity_env = UnityEnvironment(env_directory,
                                  no_graphics=no_graphics)
     env = UnityToGymWrapper(unity_env, uint8_visual=False)
     env = Monitor(
         env,
         logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
     return env
Example #13
File: cmd_util.py Project: buoyancy99/sap
 def _thunk():
     env = make_atari(env_id,
                      max_episode_steps=max_episode_steps,
                      action_space=action_space)
     env.seed(seed + rank)
     env = Monitor(env,
                   logger.get_dir()
                   and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=True)
     return wrap_deepmind(env, **wrapper_kwargs)
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch,
          dc):

    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    num_options=num_options,
                                    dc=dc)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions is currently supported.")
        sys.exit()

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=optimsize,
                        gamma=0.99,
                        lam=0.95,
                        schedule='constant',
                        num_options=num_options,
                        app=app,
                        saves=saves,
                        wsaves=wsaves,
                        epoch=epoch,
                        seed=seed,
                        dc=dc)
    env.close()
        def _thunk():
            env = make_mario(env_id)
            env.seed(seed + rank)

            if cut_map:
                env = CutMarioMap(env)

            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)

            # FIXME do if wrap deepmind, create other methods
            return wrap_deepmind_custom(
                env, **wrapper_kwargs)  # converts to 84*84 bw, keep for now
Example #16
def create_single_football_env(seed):
  """Creates gfootball environment."""
#   env = football_env.create_environment(
#       env_name=FLAGS.level, stacked=('stacked' in FLAGS.state),
#       rewards=FLAGS.reward_experiment,
#       logdir=logger.get_dir() + str(seed),
#       enable_goal_videos=FLAGS.dump_scores and (seed == 0),
#       enable_full_episode_videos=FLAGS.dump_full_episodes and (seed == 0),
#       render= True and (seed == 0),
#       dump_frequency=50 if FLAGS.render and seed == 0 else 0)

  env = football_env.create_environment(env_name="academy_3_vs_1_with_keeper", stacked=True,
                                        representation='extracted', render=False and (seed == 0), channel_dimensions=(64, 64), rewards='scoring,checkpoints')
  env = monitor.Monitor(env, logger.get_dir()
                        and os.path.join(logger.get_dir(), str(seed)))
  return env
Example #17
def test_deepq():
    """
    test DeepQ on atari
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    q_func = deepq_models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                            (64, 3, 1)],
                                     hiddens=[256],
                                     dueling=True)

    model = DeepQ(env=env,
                  policy=q_func,
                  learning_rate=1e-4,
                  buffer_size=10000,
                  exploration_fraction=0.1,
                  exploration_final_eps=0.01,
                  train_freq=4,
                  learning_starts=10000,
                  target_network_update_freq=1000,
                  gamma=0.99,
                  prioritized_replay=True,
                  prioritized_replay_alpha=0.6,
                  checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
Example #18
def test_deepq():
    """
    test DeepQ on atari
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env,
                policy=CnnPolicy,
                learning_rate=1e-4,
                buffer_size=10000,
                exploration_fraction=0.1,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                prioritized_replay_alpha=0.6,
                checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
Example #19
    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out
def main():
    """
    Runs the test
    """
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)

    else:
        # construct the model object, load pre-trained model and render
        model = train(num_timesteps=1, seed=args.seed)
        tf_util.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        obs = env.reset()
        while True:
            action = model.policy.act(stochastic=False, obs=obs)[0]
            obs, _, done, _ = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example #21
def make_env(datapaths):
    if len(datapaths) > 1:
        env = EnsembleEnv(datapaths)
    else:
        env = Env(datapaths[0])
    env = bench.Monitor(env, logger.get_dir())
    return env
Example #22
 def make_env():
     env_out = gym.make(env_id)
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     env_out.seed(seed)
     return env_out
Example #23
 def make_env():
     env_out = gym.make(args.env)
     env_out.env.disableViewer = True
     env_out.env.visualize = False
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     return env_out
Example #24
def main(args):
    """
    start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, sess=None):
            return mlp_policy.MlpPolicy(name=name,
                                        ob_space=ob_space,
                                        ac_space=ac_space,
                                        sess=sess,
                                        reuse=reuse,
                                        hid_size=args.policy_hidden_size,
                                        num_hid_layers=2)

        env = bench.Monitor(
            env,
            logger.get_dir()
            and os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)
        dataset = MujocoDset(expert_path=args.expert_path,
                             traj_limitation=args.traj_limitation)
        savedir_fname = learn(env,
                              policy_fn,
                              dataset,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=args.checkpoint_dir,
                              task_name=task_name,
                              verbose=True)
        runner(env,
               policy_fn,
               savedir_fname,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample,
               reuse=True)
Example #25
def log_eval(num_update, mean_eval_reward, file_name='eval.csv'):
    if not os.path.exists(os.path.join(logger.get_dir(), file_name)):
        with open(os.path.join(logger.get_dir(), file_name), 'a',
                  newline='') as csvfile:
            csvwriter = csv.writer(csvfile,
                                   delimiter=',',
                                   quotechar=',',
                                   quoting=csv.QUOTE_MINIMAL)
            title = ['n_updates', 'mean_eval_reward']
            csvwriter.writerow(title)
    with open(os.path.join(logger.get_dir(), file_name), 'a',
              newline='') as csvfile:
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar=',',
                               quoting=csv.QUOTE_MINIMAL)
        data = [num_update, mean_eval_reward]
        csvwriter.writerow(data)
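A hypothetical call and the resulting CSV layout (the values are illustrative assumptions):

# Appends one row to <logger.get_dir()>/eval.csv; the header row is written only
# if the file does not exist yet:
#   n_updates,mean_eval_reward
#   100,21.5
log_eval(num_update=100, mean_eval_reward=21.5)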
Example #26
 def make_env():
     # env_out = gym.make(env_id, reset_noise_scale=1.0)
     env_out = gym.make(env_id)
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     env_out.seed(seed)
     env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
     return env_out
Example #27
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
  """
  Create a wrapped, monitored gym.Env for MuJoCo.

  :param env_id: (str) the environment ID
  :param seed: (int) the initial seed for RNG
  :param rank: (int) the rank of the environment (for logging)
  :param allow_early_resets: (bool) allows early reset of the environment
  :return: (Gym Environment) The robotic environment
  """
  set_global_seeds(seed)
  env = gym.make(env_id)
  env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
  env = Monitor(
    env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
    info_keywords=('is_success',), allow_early_resets=allow_early_resets)
  env.seed(seed)
  return env
Example #28
File: util.py Project: knaggita/ac-teach
def make_envs(env_id,
              do_eval,
              seed,
              conf,
              normalize_observations=False,
              normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env,
                           norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None
    env.base_env = base_env

    return base_env, env, base_eval_env, eval_env
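A hypothetical call to the helper above (the environment id and conf contents are illustrative assumptions; logger.configure() should be called first, since the eval Monitor joins logger.get_dir() with 'gym_eval' and would fail if the log directory is None):

logger.configure()
base_env, env, base_eval_env, eval_env = make_envs(
    'Pendulum-v0', do_eval=True, seed=0, conf={'env_params': {}})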
Example #29
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy,
                 env,
                 timesteps_per_batch=512,
                 max_kl=0.001,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.98,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
Example #30
def make_mujoco_env(env_id, seed, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The mujoco environment
    """
    set_global_seeds(seed + 10000 * mpi_rank_or_zero())
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), '0'), allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
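A hypothetical usage of the helper above (the environment id is an illustrative assumption; logger.configure() must have been called first, because os.path.join(logger.get_dir(), '0') raises a TypeError when the log directory is None):

logger.configure()
env = make_mujoco_env('HalfCheetah-v2', seed=42)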