# Assumes the usual run-script imports are in scope: gym, os, logging,
# tensorflow as tf, plus the repo's pois, MlpPolicy, ParallelSampler,
# SparseReward, set_global_seeds and the tf_util module aliased as U.
def train(env, num_episodes, horizon, iw_method, iw_norm, natural, bound, delta,
          seed, policy, max_offline_iters, gamma, center_return, clipping=False,
          njobs=1, entropy='none', max_iters=500, positive_return=False):

    # Environment factory: the sparse LunarLander variant wraps the dense
    # continuous-control task with a reward-sparsifying wrapper.
    def make_env():
        if env == 'lunarlander-sparse':
            _env = gym.make('LunarLanderContinuous-v2')
            _env = SparseReward(_env)
        else:
            _env = gym.make(env)
        return _env

    # Policy architecture: a linear policy or a 3-layer MLP.
    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    def make_policy(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hid_size, num_hid_layers=num_hid_layers,
                         gaussian_fixed_var=True, use_bias=False, use_critic=False,
                         hidden_W_init=tf.contrib.layers.xavier_initializer(),
                         output_W_init=tf.contrib.layers.xavier_initializer())

    sampler = ParallelSampler(make_policy, make_env, num_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    # Size the TF session to the number of CPUs this process may run on.
    affinity = len(os.sched_getaffinity(0))
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env, make_policy, n_episodes=num_episodes, max_iters=max_iters,
               horizon=horizon, gamma=gamma, delta=delta, use_natural_gradient=natural,
               iw_method=iw_method, iw_norm=iw_norm, bound=bound, save_weights=True,
               sampler=sampler, center_return=center_return, render_after=None,
               max_offline_iters=max_offline_iters, clipping=clipping, entropy=entropy,
               positive_return=positive_return)

    sampler.close()
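
# `SparseReward` is imported from elsewhere in the repo and is not shown
# here. The sketch below is only an assumption of what such a wrapper
# might do (accumulate the dense reward and release the sum at episode
# end); it is not the repo's actual implementation.
class SparseRewardSketch(gym.Wrapper):
    def __init__(self, env):
        super(SparseRewardSketch, self).__init__(env)
        self._accumulated = 0.0

    def reset(self, **kwargs):
        self._accumulated = 0.0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._accumulated += reward
        # Emit the summed reward only when the episode terminates.
        sparse = self._accumulated if done else 0.0
        return obs, sparse, done, info
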
def train(env, num_episodes, horizon, iw_method, iw_norm, natural, bound, delta,
          seed, policy, max_offline_iters, njobs=1):

    env_rllab_class = rllab_env_from_name(env)

    # Environment factory: instantiate the rllab env and adapt it to the
    # gym API. The result is bound to _env so the enclosing `env` name
    # argument is not shadowed.
    def make_env():
        env_rllab = env_rllab_class()
        _env = Rllab2GymWrapper(env_rllab)
        return _env

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    def make_policy(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hid_size, num_hid_layers=num_hid_layers,
                         gaussian_fixed_var=True, use_bias=False, use_critic=False,
                         hidden_W_init=tf.contrib.layers.xavier_initializer(),
                         output_W_init=tf.contrib.layers.xavier_initializer())

    sampler = ParallelSampler(make_policy, make_env, num_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    affinity = len(os.sched_getaffinity(0))
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    # This variant hard-codes undiscounted (gamma=1), centered returns and
    # a fixed budget of 500 iterations.
    pois.learn(make_env, make_policy, n_episodes=num_episodes, max_iters=500,
               horizon=horizon, gamma=1., delta=delta, use_natural_gradient=natural,
               iw_method=iw_method, iw_norm=iw_norm, bound=bound, save_weights=True,
               sampler=sampler, center_return=True, render_after=None,
               max_offline_iters=max_offline_iters)

    sampler.close()
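
# `Rllab2GymWrapper` is likewise defined elsewhere in the repo. As a rough
# illustration only, an adapter of this kind might look like the sketch
# below; the real wrapper would additionally convert rllab spaces into
# their gym.spaces equivalents.
class Rllab2GymWrapperSketch(gym.Env):
    def __init__(self, env_rllab):
        self.env_rllab = env_rllab
        # Assumption: the rllab spaces are usable as-is; a real adapter
        # would translate them into gym.spaces.Box and friends.
        self.observation_space = env_rllab.observation_space
        self.action_space = env_rllab.action_space

    def reset(self):
        return self.env_rllab.reset()

    def step(self, action):
        # rllab's step() returns a Step namedtuple; unpack it into the
        # (obs, reward, done, info) tuple the gym API expects.
        step = self.env_rllab.step(action)
        return step.observation, step.reward, step.done, {}
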
def train(env, policy, policy_init, n_episodes, horizon, seed, njobs=1,
          save_weights=False, **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym: determine whether the env is Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."

        # Define the correct env maker
        if env_type == 'atari':
            # Atari: custom env creation with the DeepMind wrappers
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari: standard env creation
            def make_env():
                return gym.make(env)

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy in ('linear', 'nn'):
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    sampler = ParallelSampler(make_policy, make_env, n_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only; fall back to the worker count.
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    # All remaining keyword arguments are forwarded to the learner untouched.
    pois.learn(make_env, make_policy, n_episodes=n_episodes, horizon=horizon,
               sampler=sampler, save_weights=save_weights, **alg_args)

    sampler.close()
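
# Because this variant forwards **alg_args verbatim to pois.learn, any
# learner keyword seen in the other variants (gamma, delta, iw_method,
# iw_norm, bound, use_natural_gradient, max_iters, max_offline_iters, ...)
# can be passed straight through it. A hypothetical invocation; the values
# below are illustrative placeholders, not recommended settings:
train(env='rllab.swimmer',
      policy='linear',
      policy_init='xavier',
      n_episodes=100,
      horizon=500,
      seed=0,
      njobs=4,
      save_weights=True,
      # Everything below is collected into alg_args and reaches pois.learn():
      max_iters=500,
      gamma=1.0,
      delta=0.99,
      iw_method='is',
      iw_norm='none',
      bound='max-d2',
      use_natural_gradient=False,
      max_offline_iters=10)
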
def train(env, num_episodes, horizon, iw_method, iw_norm, natural, bound, delta,
          seed, policy, max_offline_iters, gamma, center_return, clipping=False,
          njobs=1, entropy='none', max_iters=500, positive_return=False):

    # Map the short env name onto the corresponding rllab environment class;
    # an unknown name now fails loudly instead of raising an obscure
    # NameError when make_env is first called.
    rllab_env_classes = {
        'swimmer': SwimmerEnv,
        'ant': AntEnv,
        'half-cheetah': HalfCheetahEnv,
        'hopper': HopperEnv,
        'simple-humanoid': SimpleHumanoidEnv,
        'full-humanoid': HumanoidEnv,
        'walker': Walker2DEnv,
        'cartpole': CartpoleEnv,
        'mountain-car': MountainCarEnv,
        'inverted-pendulum': InvertedPendulumEnv,
        'acrobot': AcrobotEnv,
        'inverted-double-pendulum': InvertedDoublePendulumEnv,
    }
    try:
        make_env_rllab = rllab_env_classes[env]
    except KeyError:
        raise ValueError('Unrecognized rllab environment: %s' % env)

    def make_env():
        env_rllab = make_env_rllab()
        return Rllab2GymWrapper(env_rllab)

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    def make_policy(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hid_size, num_hid_layers=num_hid_layers,
                         gaussian_fixed_var=True, use_bias=False, use_critic=False,
                         hidden_W_init=tf.contrib.layers.xavier_initializer(),
                         output_W_init=tf.contrib.layers.xavier_initializer())

    sampler = ParallelSampler(make_policy, make_env, num_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    affinity = len(os.sched_getaffinity(0))
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env, make_policy, n_episodes=num_episodes, max_iters=max_iters,
               horizon=horizon, gamma=gamma, delta=delta, use_natural_gradient=natural,
               iw_method=iw_method, iw_norm=iw_norm, bound=bound, save_weights=True,
               sampler=sampler, center_return=center_return, render_after=None,
               max_offline_iters=max_offline_iters, clipping=clipping, entropy=entropy,
               positive_return=positive_return)

    sampler.close()
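
# Run scripts like these are normally driven from the command line. Below
# is a minimal, illustrative sketch of such an entry point for the rllab
# variant above; flag names mirror the train() signature, and the defaults
# are placeholders, not tuned settings.
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='cartpole')
    parser.add_argument('--num_episodes', type=int, default=100)
    parser.add_argument('--horizon', type=int, default=500)
    parser.add_argument('--iw_method', type=str, default='is')
    parser.add_argument('--iw_norm', type=str, default='none')
    parser.add_argument('--natural', action='store_true')
    parser.add_argument('--bound', type=str, default='max-d2')
    parser.add_argument('--delta', type=float, default=0.99)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy', type=str, default='linear', choices=['linear', 'nn'])
    parser.add_argument('--max_offline_iters', type=int, default=10)
    parser.add_argument('--gamma', type=float, default=1.0)
    parser.add_argument('--center_return', action='store_true')
    parser.add_argument('--njobs', type=int, default=1)
    args = parser.parse_args()
    train(env=args.env,
          num_episodes=args.num_episodes,
          horizon=args.horizon,
          iw_method=args.iw_method,
          iw_norm=args.iw_norm,
          natural=args.natural,
          bound=args.bound,
          delta=args.delta,
          seed=args.seed,
          policy=args.policy,
          max_offline_iters=args.max_offline_iters,
          gamma=args.gamma,
          center_return=args.center_return,
          njobs=args.njobs)

if __name__ == '__main__':
    main()
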