Example #1
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    # Concatenate the goal-based dict observation into a single flat vector.
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    # Log per-episode stats (including the 'is_success' flag) under the logger directory.
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
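The FlattenDictWrapper call above is what makes the goal-based observation usable by a standard policy: it selects the listed keys from the dict observation and concatenates them into one flat vector. A minimal sketch of that behavior with numpy only and illustrative shapes (not the actual wrapper implementation):

import numpy as np

# Goal-based envs return a dict observation; shapes here are illustrative.
obs = {
    'observation': np.zeros(10),
    'desired_goal': np.zeros(3),
    'achieved_goal': np.zeros(3),   # not selected above, so it would be dropped
}
# Keep only the listed keys and concatenate them, as the wrapper does.
flat = np.concatenate([obs[k] for k in ('observation', 'desired_goal')])
assert flat.shape == (13,)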
Example #2
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    # Give each MPI worker its own seed so parallel workers do not share RNG streams.
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from deephyper.search.nas.utils.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
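The seeding idiom in make_mujoco_env (seed + 1000 * rank) derives a distinct but reproducible seed for every MPI worker from a single base seed. A standalone sketch of the same pattern, assuming mpi4py is installed and the script is launched under MPI; the 1000 offset is arbitrary as long as it exceeds the number of workers:

import random
import numpy as np
from mpi4py import MPI

def per_worker_seed(base_seed):
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = base_seed + 1000 * rank if base_seed is not None else None
    # Seed the common RNG sources so each worker draws its own stream.
    random.seed(myseed)
    np.random.seed(myseed)
    return myseed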
Example #3
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch,
          reward_rule):

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)

    # MAKE ENV_NAS
    structure = space['create_structure']['func'](
        **space['create_structure']['kwargs'])

    num_nodes = structure.num_nodes
    timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
    num_timesteps = timesteps_per_actorbatch * num_episodes

    env = NasEnvEmb(space, evaluator, structure)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return lstm.LstmPolicy(name=name,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               num_units=32,
                               async_update=True)

    pposgd_async.learn(env,
                       policy_fn,
                       max_timesteps=num_timesteps,
                       timesteps_per_actorbatch=timesteps_per_actorbatch,
                       clip_param=0.2,
                       entcoeff=0.01,
                       optim_epochs=4,
                       optim_stepsize=1e-3,
                       optim_batchsize=15,
                       gamma=0.99,
                       lam=0.95,
                       schedule='linear',
                       reward_rule=reward_rule)
    env.close()
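The batch sizes handed to pposgd_async.learn follow directly from the search structure: one timestep per node of the generated structure, and one actor batch per group of episodes. A worked example with purely illustrative numbers:

num_nodes = 10                  # nodes in the generated structure
num_episodes_per_batch = 5
num_episodes = 100

timesteps_per_actorbatch = num_nodes * num_episodes_per_batch   # 10 * 5 = 50
num_timesteps = timesteps_per_actorbatch * num_episodes         # 50 * 100 = 5000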
Example #4
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch):

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:  # rank zero simulates the use of a parameter server and does not train
        pass
    else:
        workerseed = seed + 10000 * rank if seed is not None else None
        set_global_seeds(workerseed)

        # MAKE ENV_NAS
        structure = space['create_structure']['func'](
            **space['create_structure']['kwargs'])

        num_nodes = structure.num_nodes
        timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
        num_timesteps = timesteps_per_actorbatch * num_episodes

        max_timesteps = num_timesteps

        env = NasEnv(space, evaluator, structure)

        seg_gen = traj_segment_generator(env, timesteps_per_actorbatch)

        timesteps_so_far = 0
        iters_so_far = 0

        # Exactly one stopping criterion must be set; only max_timesteps is used here.
        cond = sum([max_timesteps > 0])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_timesteps={max_timesteps}"

        while True:
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break

            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            dh_logger.info(
                jm(type='seg', rank=MPI.COMM_WORLD.Get_rank(), **seg))
            iters_so_far += 1

        env.close()
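The loop above drains a trajectory-segment generator: each __next__() call returns one batch of rollout data with a fixed horizon. A simplified, self-contained stand-in for that pattern (this is not the actual traj_segment_generator; it samples random actions and uses the classic gym step API):

def simple_segment_generator(env, horizon):
    ob = env.reset()
    obs, rews, dones = [], [], []
    while True:
        ac = env.action_space.sample()        # placeholder for the policy
        next_ob, rew, done, _ = env.step(ac)
        obs.append(ob); rews.append(rew); dones.append(done)
        ob = env.reset() if done else next_ob
        if len(obs) == horizon:
            yield {'ob': obs, 'rew': rews, 'new': dones}
            obs, rews, dones = [], [], []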
Example #5
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    def make_env(rank): # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
            env.seed(seed + 10000*mpi_rank + rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)

            if env_type == 'atari': return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1: return RewardScaler(env, reward_scale)
            else: return env
        return _thunk
    set_global_seeds(seed)
    if num_env > 1: return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
    else: return DummyVecEnv([make_env(start_index)])
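make_vec_env relies on the closure ("thunk") pattern: each worker index gets a zero-argument callable that builds its own environment, so SubprocVecEnv can invoke it inside a subprocess. A minimal sketch using plain gym only; 'CartPole-v1' and the seed offset are illustrative:

import gym

def make_env(env_id, seed, rank):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank if seed is not None else None)
        return env
    return _thunk

thunks = [make_env('CartPole-v1', seed=0, rank=i) for i in range(4)]
envs = [thunk() for thunk in thunks]   # SubprocVecEnv would call these in worker processes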
Example #6
def train(num_iter, seed, evaluator, num_episodes_per_iter):

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)

    # MAKE ENV (math environment)
    timesteps_per_episode = 10
    timesteps_per_actorbatch = timesteps_per_episode*num_episodes_per_iter
    num_timesteps = timesteps_per_actorbatch * num_iter

    env = MathEnv(evaluator)

    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return lstm_ph.LstmPolicy(name=name, ob_space=ob_space, ac_space=ac_space, num_units=64)

    pposgd_sync_ph.learn(env, policy_fn,
        max_timesteps=int(num_timesteps),
        timesteps_per_actorbatch=timesteps_per_actorbatch,
        clip_param=0.2,
        entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3,
        optim_batchsize=10,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        reward_rule=reward_for_final_timestep
    )
    env.close()