# NOTE: these helpers assume the module-level imports of the vendored
# baselines layout they come from (os, gym, mpi4py's MPI, plus logger,
# Monitor, set_global_seeds, tf_util as U, ...), which are not shown here.


def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for Fetch/robotics (goal-based) tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    # Goal-based envs return dict observations; flatten the parts the policy needs.
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
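# Usage sketch for make_robotics_env (hypothetical values; assumes gym's
# robotics extras so a goal-based env such as 'FetchReach-v1' exposes the
# 'observation' and 'desired_goal' dict keys flattened above):
def _example_robotics_env():
    env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
    ob = env.reset()  # flat vector: observation concatenated with desired_goal
    ob, rew, done, info = env.step(env.action_space.sample())
    env.close()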
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    # Offset the seed per MPI rank so workers do not replay identical episodes.
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(
        logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from deephyper.search.nas.utils.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
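# Usage sketch for make_mujoco_env (hypothetical env id and scale; assumes a
# working MuJoCo install; under MPI each rank writes its own Monitor file):
def _example_mujoco_env():
    env = make_mujoco_env('HalfCheetah-v2', seed=42, reward_scale=0.1)
    ob = env.reset()
    ob, rew, done, info = env.step(env.action_space.sample())  # rew scaled by 0.1
    env.close()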
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch,
          reward_rule):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    # Only rank 0 writes full logs; the other workers stay quiet.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)

    # Build the NAS search space and size the PPO batches from it: one
    # episode builds one architecture, i.e. `num_nodes` decision timesteps.
    structure = space['create_structure']['func'](
        **space['create_structure']['kwargs'])
    num_nodes = structure.num_nodes
    timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
    num_timesteps = timesteps_per_actorbatch * num_episodes

    env = NasEnvEmb(space, evaluator, structure)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return lstm.LstmPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                               num_units=32, async_update=True)

    pposgd_async.learn(env, policy_fn,
                       max_timesteps=num_timesteps,
                       timesteps_per_actorbatch=timesteps_per_actorbatch,
                       clip_param=0.2,
                       entcoeff=0.01,
                       optim_epochs=4,
                       optim_stepsize=1e-3,
                       optim_batchsize=15,
                       gamma=0.99,
                       lam=0.95,
                       schedule='linear',
                       reward_rule=reward_rule)
    env.close()
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch):
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        # Rank zero simulates the use of a parameter server and does no rollouts.
        pass
    else:
        workerseed = seed + 10000 * rank if seed is not None else None
        set_global_seeds(workerseed)

        # Build the NAS search space and size the batches from it.
        structure = space['create_structure']['func'](
            **space['create_structure']['kwargs'])
        num_nodes = structure.num_nodes
        timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
        num_timesteps = timesteps_per_actorbatch * num_episodes
        max_timesteps = num_timesteps

        env = NasEnv(space, evaluator, structure)
        seg_gen = traj_segment_generator(env, timesteps_per_actorbatch)

        timesteps_so_far = 0
        iters_so_far = 0
        cond = sum([max_timesteps > 0])
        assert cond == 1, (
            f"Only one time constraint permitted: cond={cond}, "
            f"max_timesteps={max_timesteps}")

        while True:
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break
            logger.log("********** Iteration %i ************" % iters_so_far)
            seg = seg_gen.__next__()
            dh_logger.info(jm(type='seg', rank=rank, **seg))
            # Each segment contains one actor batch of timesteps; without this
            # update the stopping condition above would never be reached.
            timesteps_so_far += timesteps_per_actorbatch
            iters_so_far += 1
        env.close()
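# Minimal sketch of the generator contract the worker loop above relies on:
# each __next__() call rolls out `horizon` environment steps and yields them
# as one batch. This is an assumption for illustration, not deephyper's
# actual traj_segment_generator (which also records policy outputs):
def _example_traj_segment_generator(env, horizon):
    ob = env.reset()
    while True:
        obs, rews, news = [], [], []
        for _ in range(horizon):
            ac = env.action_space.sample()  # stand-in for the policy's action
            obs.append(ob)
            ob, rew, new, _ = env.step(ac)
            rews.append(rew)
            news.append(new)
            if new:  # episode ended inside the segment
                ob = env.reset()
        yield {'ob': obs, 'rew': rews, 'new': news}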
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None, start_index=0, reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
            env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(
                              logger.get_dir(), str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)
            if env_type == 'atari':
                return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1:
                return RewardScaler(env, reward_scale)
            else:
                return env
        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
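# Usage sketch for make_vec_env (hypothetical ids/counts; assumes Atari ROMs
# are installed). With num_env > 1 every env runs in its own subprocess:
def _example_vec_env():
    venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
    obs = venv.reset()  # batched observations, one row per sub-environment
    obs, rews, dones, infos = venv.step(
        [venv.action_space.sample() for _ in range(4)])
    venv.close()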
def train(num_iter, seed, evaluator, num_episodes_per_iter):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)

    # Build the math environment and size the batches: episodes have a fixed
    # length of 10 timesteps here.
    timesteps_per_episode = 10
    timesteps_per_actorbatch = timesteps_per_episode * num_episodes_per_iter
    num_timesteps = timesteps_per_actorbatch * num_iter

    env = MathEnv(evaluator)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return lstm_ph.LstmPolicy(name=name, ob_space=ob_space,
                                  ac_space=ac_space, num_units=64)

    pposgd_sync_ph.learn(env, policy_fn,
                         max_timesteps=int(num_timesteps),
                         timesteps_per_actorbatch=timesteps_per_actorbatch,
                         clip_param=0.2,
                         entcoeff=0.01,
                         optim_epochs=4,
                         optim_stepsize=1e-3,
                         optim_batchsize=10,
                         gamma=0.99,
                         lam=0.95,
                         schedule='linear',
                         reward_rule=reward_for_final_timestep)
    env.close()
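# Sketch of a reward rule matching the name used above (an assumed interface,
# not deephyper's actual reward_for_final_timestep: here a rule maps the
# per-timestep rewards of an episode to the rewards used for the update,
# crediting the whole episode at its final step):
def _example_reward_for_final_timestep(rewards):
    shaped = [0.0] * len(rewards)
    shaped[-1] = rewards[-1]  # keep only the episode's last reward
    return shaped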