def train(env, policy, horizon, seed, trainable_std, gain_init, std_init, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\w+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari is not tested here
            raise Exception('Not tested on atari.')
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    # Create the policy
    if policy == 'linear':
        hid_layers = []
    else:
        raise NotImplementedError

    def make_policy(name, ob_space, ac_space):
        return PeMlpPolicy(name, ob_space, ac_space, hid_layers,
                           deterministic=True, diagonal=True,
                           trainable_std=trainable_std,
                           use_bias=False, use_critic=False, seed=seed,
                           verbose=True,
                           hidden_W_init=U.normc_initializer(1.0),
                           higher_mean_init=tf.constant_initializer(gain_init),
                           higher_logstd_init=tf.constant_initializer(np.log(std_init)))

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = -1
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    # Learn
    ucb1.learn(make_env, make_policy, horizon=horizon, **alg_args)
def create_env(env, seed):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Not used below, kept for parity with the training scripts
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    env = make_env()
    env.seed(seed)
    return env
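# A minimal, hedged usage sketch for create_env. The id
# 'MountainCarContinuous-v0' is only an example: any registered Gym id, or an
# 'rllab.'-prefixed name, follows the same path through the function.
def _example_create_env():
    env = create_env('MountainCarContinuous-v0', seed=42)
    ob = env.reset()
    print('observation space:', env.observation_space)
    print('first observation:', ob)
    env.close()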
def train(env, policy, policy_init, num_episodes, episode_cap, horizon, **alg_args):
    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Hidden sizes match the mean network selected above
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If episode_cap is falsy, the number of
    # episodes per batch is unbounded.
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()
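# A hedged invocation sketch for the rllab TRPO entry point above. The env
# name 'cartpole' must be resolvable by rllab_env_from_name, and the extra
# alg_args (n_itr, discount, step_size are standard rllab TRPO keywords) are
# illustrative defaults, not values prescribed by this script.
def _example_trpo_train():
    train(env='cartpole',
          policy='linear',
          policy_init='zeros',
          num_episodes=10,
          episode_cap=True,
          horizon=500,
          n_itr=100,
          discount=0.99,
          step_size=0.01)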
def create_policy_and_env(env, seed, policy, policy_file):
    # Session
    sess = U.single_threaded_session()
    sess.__enter__()

    # NOTE: the generic gym/rllab env-maker path below is disabled; this
    # script always builds a normalized rllab env.
    '''
    # Create the environment
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    env = make_env()
    env.seed(seed)
    ob_space = env.observation_space
    ac_space = env.action_space
    '''
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # NOTE: the baselines MlpPolicy/CnnPolicy path, including loading the
    # weights from policy_file, is disabled as well.
    '''
    # Make policy
    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'simple-nn':
        hid_size = [16]
        num_hid_layers = 1
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    # Temp initializer
    policy_initializer = U.normc_initializer(0.0)
    if policy == 'linear' or policy == 'nn' or policy == 'simple-nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=True, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    pi = make_policy('pi', ob_space, ac_space)

    # Load policy weights from file
    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split('/')[1].startswith('pol')]
    set_parameter = U.SetFromFlat(var_list)
    '''
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim

    policy_init = 'zeros'
    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type. NOTE: this only validates the argument; the
    # networks below hardcode hidden_sizes=[16].
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=[16],
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[16],
        mean_network=mean_network)

    # policy_file is unused while the loader above is disabled:
    # weights = pkl.load(open(policy_file, 'rb'))
    # TMP: overriding weights
    # weights = [-0.19337249, -0.12103618, 0.00849289, -0.1105529, -3.6525128]  # TRPO
    # weights = [-0.5894, -0.2585, -0.0137, -0.2464, -0.2788]  # POIS
    # weights = list(map(float, ['-0.5807', '-0.3046', '-0.0127', '-0.3045', '-0.7427']))
    weights = list(map(float, """
        0.02483223 -0.17645608 0.77450023 0.54770311 0.33464952 -0.29827444
        -0.62524864 0.46413191 -0.31990006 -0.32972003 0.38753632 -0.15170416
        -0.43518174 -0.15718946 0.19542838 -0.02774486 0.13546377 -0.18621497
        0.18444675 0.774653 0.19710147 -0.20958339 0.15098953 0.42278248
        -0.53121678 -0.33369185 -0.04331141 -0.2140371 0.27077572 0.58111134
        0.34637848 0.56956591 0.45061681 -0.15826946 -1.06925573 -0.39311001
        -0.35695692 0.14414285 -1.25332428 -0.24016012 0.17774961 0.23973508
        -0.65415459 1.53059934 -0.71953132 1.79764386 0.18561774 1.4640445
        -0.1625999 0.0606595 -0.22058723 -0.34247517 0.46232139 0.07013392
        -0.32074007 0.14488911 0.1123158 0.28914362 0.6727726 -0.58491444
        0.35895434 1.32873906 -0.0708237 -0.05147256 0.01689644 0.38244615
        0.10005984 0.71253728 -0.18824528 -0.15552894 -0.05634595 0.3517145
        0.20900426 -0.19631462 -0.03828797 0.08125694 -0.22894259 -0.08030374
        0.59522035 -0.1752422 -0.40809067 1.62409963 -1.39307047 0.81438794
        -0.54068521 0.19321547 -1.65661292 0.3264788 0.46482921 -0.01649974
        -0.79186757 -1.3378886 -0.57094913 -1.57079733 -1.78056839 1.05324632
        -2.14386428
    """.split()))
    print(weights)
    # pi.set_param(weights)
    return env, policy
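# A hedged usage sketch for create_policy_and_env. Note two quirks of the
# active code path above: policy_file is ignored (the weight loader is
# disabled), and policy only passes validation ('linear' or 'simple-nn'); the
# network itself is hardcoded to one hidden layer of 16 units. 'cartpole' is
# an illustrative name assumed resolvable by rllab_env_from_name.
def _example_create_policy_and_env():
    env, policy = create_policy_and_env(env='cartpole', seed=0,
                                        policy='simple-nn', policy_file=None)
    return env, policy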
def train(env, policy, seed, njobs=1, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\w+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env(seed=0):
            def _thunk():
                env_rllab = Rllab2GymWrapper(env_rllab_class())
                env_rllab.seed(seed)
                return env_rllab
            return _thunk

        parallel_env = SubprocVecEnv([make_env(i + seed) for i in range(njobs)])
        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env(seed=0):
                def _thunk():
                    _env = make_atari(env)
                    _env.seed(seed)
                    return wrap_deepmind(_env)
                return _thunk

            parallel_env = VecFrameStack(
                SubprocVecEnv([make_env(i + seed) for i in range(njobs)]), 4)
        else:
            # Not Atari, standard env creation
            def make_env(seed=0):
                def _thunk():
                    _env = gym.make(env)
                    _env.seed(seed)
                    return _env
                return _thunk

            parallel_env = SubprocVecEnv([make_env(i + seed) for i in range(njobs)])

    # Create the policy
    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy == 'linear' or policy == 'nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=tf.contrib.layers.xavier_initializer(),
                             output_W_init=tf.contrib.layers.xavier_initializer())
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=tf.contrib.layers.xavier_initializer(),
                             output_W_init=tf.contrib.layers.xavier_initializer())
    else:
        raise Exception('Unrecognized policy type.')

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois2.learn(parallel_env, make_policy, **alg_args)
def train(env, policy, policy_init, n_episodes, horizon, seed, njobs=1,
          save_weights=False, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy == 'linear' or policy == 'nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    sampler = ParallelSampler(make_policy, make_env, n_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env, make_policy, n_episodes=n_episodes, horizon=horizon,
               sampler=sampler, save_weights=save_weights, **alg_args)

    sampler.close()
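# A hedged invocation sketch for the POIS entry point above. The env name
# 'rllab.cartpole' is illustrative, and the trailing keywords (max_iters,
# gamma, delta) are assumed examples of alg_args: the exact keyword set is
# defined by pois.learn, not by this launcher.
def _example_pois_train():
    train(env='rllab.cartpole',
          policy='linear',
          policy_init='xavier',
          n_episodes=100,
          horizon=500,
          seed=0,
          njobs=4,
          save_weights=False,
          max_iters=100,
          gamma=1.0,
          delta=0.99)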
def train(env, policy, horizon, seed, bounded_policy, mu_init, std_init, njobs=1, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\w+)', env).group(1)
        print('env_name', env_name)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab, env_name)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari is not tested here
            raise Exception('Not tested on atari.')
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

        env_name = make_env().spec.id

    # Create the policy
    if policy == 'linear':
        hid_layers = []
    else:
        raise NotImplementedError

    const_std_init = False
    if mu_init is not None:
        higher_mean_init = tf.constant_initializer(mu_init)
    else:
        higher_mean_init = U.normc_initializer(1.0)
    if std_init is not None:
        higher_logstd_init = tf.constant_initializer(np.log(std_init))
    else:
        higher_logstd_init = tf.constant_initializer(np.log(1e-2))
        # higher_logstd_init = tf.constant(np.log([0.15, 1.5]).astype(np.float32))
        # const_std_init = True

    def make_policy(name, ob_space, ac_space):
        return PeMlpPolicy(name, ob_space, ac_space, hid_layers,
                           deterministic=True, diagonal=True,
                           trainable_std=alg_args['trainable_std'],
                           use_bias=False, use_critic=False, seed=seed,
                           verbose=True,
                           hidden_W_init=U.normc_initializer(1.0),
                           higher_mean_init=higher_mean_init,
                           higher_logstd_init=higher_logstd_init,
                           const_std_init=const_std_init)

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    # Sequential sampling: trajectories are generated one at a time
    sampler = None

    # Learn
    optimist.learn(env_name, make_env, seed, make_policy, horizon=horizon,
                   sampler=sampler, **alg_args)
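# A hedged invocation sketch for the OPTIMIST entry point above. One gotcha
# worth showing: trainable_std must arrive through **alg_args (it is read as
# alg_args['trainable_std'] and remains in the kwargs forwarded to
# optimist.learn). 'LQG1D-v0' and the remaining values are illustrative only.
def _example_optimist_train():
    train(env='LQG1D-v0',
          policy='linear',
          horizon=20,
          seed=0,
          bounded_policy=False,
          mu_init=-0.6,
          std_init=0.2,
          njobs=1,
          trainable_std=False)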
def train(env, max_iters, num_episodes, horizon, iw_norm, bound, delta, gamma, seed,
          policy, max_offline_iters, aggregate, center, use_bias, njobs=1):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\w+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari is not tested here
            raise Exception('Not tested on atari.')
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    # Create the policy
    if policy == 'linear':
        hid_layers = []
    elif policy == 'nn':
        hid_layers = [100, 50, 25]
    elif policy == 'cnn':
        raise Exception('CNN policy not tested.')

    # Select learner and policy class by aggregation method
    if aggregate == 'none':
        learner = pbpois
        PolicyClass = PeMlpPolicy
    elif aggregate == 'neuron':
        learner = nbpois
        PolicyClass = MultiPeMlpPolicy
    else:
        print("Unknown aggregation method, defaulting to none")
        learner = pbpois
        PolicyClass = PeMlpPolicy

    make_policy = lambda name, observation_space, action_space: PolicyClass(
        name, observation_space, action_space, hid_layers,
        use_bias=use_bias, seed=seed)

    sampler = ParallelSampler(make_env, make_policy, gamma, horizon, np.ravel,
                              num_episodes, njobs, seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    learner.learn(
        make_env,
        make_policy,
        sampler,
        gamma=gamma,
        n_episodes=num_episodes,
        horizon=horizon,
        max_iters=max_iters,
        verbose=1,
        feature_fun=np.ravel,
        iw_norm=iw_norm,
        bound=bound,
        max_offline_iters=max_offline_iters,
        delta=delta,
        center_return=center,
        line_search_type='parabola')
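# A hedged invocation sketch for the parameter-based POIS entry point above,
# exercising both aggregation modes ('none' -> pbpois with PeMlpPolicy,
# 'neuron' -> nbpois with MultiPeMlpPolicy). The env name and the iw_norm and
# bound strings are illustrative assumptions; valid values are defined by the
# learner modules, not by this launcher.
def _example_pbpois_train():
    for aggregate in ('none', 'neuron'):
        train(env='rllab.cartpole', max_iters=50, num_episodes=100,
              horizon=500, iw_norm='sn', bound='max-d2', delta=0.99,
              gamma=1.0, seed=0, policy='linear', max_offline_iters=10,
              aggregate=aggregate, center=False, use_bias=False, njobs=4)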
def create_sampler(env=None, policy='linear', n_episodes=100, horizon=500, njobs=1, seed=42):
    assert env is not None, "An environment name is required."

    # Create the environment
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Not used below, kept for parity with the training scripts
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    # Select policy architecture
    if policy == 'linear':
        hid_size = num_hid_layers = 0
        use_bias = False
    elif policy == 'simple-nn':
        hid_size = [16]
        num_hid_layers = 1
        use_bias = True
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3
        use_bias = True

    policy_initializer = U.normc_initializer(0.0)
    if policy == 'linear' or policy == 'nn' or policy == 'simple-nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=use_bias, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    # Create the sampler
    sampler = ParallelSampler(make_policy, make_env, n_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    # Set random seed
    set_global_seeds(seed)

    return sampler
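# A hedged usage sketch for create_sampler: build a sampler over an example
# Gym env, then release its workers. 'MountainCarContinuous-v0' is only an
# illustrative id; ParallelSampler.close() is the same shutdown call used by
# the POIS training script above.
def _example_create_sampler():
    sampler = create_sampler(env='MountainCarContinuous-v0', policy='linear',
                             n_episodes=10, horizon=200, njobs=2, seed=0)
    # ... collect batches with the sampler inside the active TF session ...
    sampler.close()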
def set_script_test(env, policy, horizon, seed, bounded_policy, trainable_std,
                    gain_init, max_mean, min_mean, max_std, min_std, std_init):
    # Common imports
    import sys, re, os, time, logging
    from collections import defaultdict
    # Framework imports
    import gym
    import tensorflow as tf
    # Self imports: utils
    from baselines.common import set_global_seeds
    from baselines import logger
    import baselines.common.tf_util as U
    from baselines.common.rllab_utils import Rllab2GymWrapper, rllab_env_from_name
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
    # Import custom envs
    import baselines.envs.lqg1d  # registered at import as gym env

    def get_env_type(env_id):
        # First load all envs
        _game_envs = defaultdict(set)
        for env in gym.envs.registry.all():
            env_type = env._entry_point.split(':')[0].split('.')[-1]
            _game_envs[env_type].add(env.id)
        # Get env type
        env_type = None
        for g, e in _game_envs.items():
            if env_id in e:
                env_type = g
                break
        return env_type

    # NOTE: the env argument is deliberately overridden; this test script
    # always runs on the LQG1D env registered above.
    env = 'LQG1D-v0'

    # Prepare environment maker
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab\.(\w+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, check if Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari, standard env creation
            def make_env():
                return gym.make(env)

    # Prepare policy maker
    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    def make_policy(name, ob_space, ac_space):
        return MlpPolicyBounded(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=hid_size, num_hid_layers=num_hid_layers,
            gaussian_fixed_var=True, trainable_std=trainable_std,
            use_bias=False, use_critic=False,
            # hidden_W_init=tf.constant_initializer(1.1),
            gain_init=gain_init,
            max_mean=max_mean,
            min_mean=min_mean,
            max_std=max_std,
            min_std=min_std,
            std_init=std_init)

    # Initialize
    try:
        affinity = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is Linux-only
        affinity = -1
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    env = make_env()
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = make_policy('pi', ob_space, ac_space)
    return pi
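# A hedged usage sketch for set_script_test. The bounds and initial gain are
# illustrative numbers, and remember that the env argument is ignored inside
# (the script pins 'LQG1D-v0').
def _example_set_script_test():
    pi = set_script_test(env='LQG1D-v0', policy='linear', horizon=20, seed=0,
                         bounded_policy=True, trainable_std=False,
                         gain_init=-0.6, max_mean=1.0, min_mean=-1.0,
                         max_std=1.0, min_std=0.1, std_init=0.5)
    return pi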
def create_env_rllab(env, seed):
    # Strip the 'rllab.' prefix and build a normalized rllab env.
    # NOTE: seed is currently unused.
    env_name = re.match(r'rllab\.(\S+)', env).group(1)
    env_rllab_class = rllab_env_from_name(env_name)
    env = normalize(env_rllab_class())
    return env
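# A hedged usage sketch for create_env_rllab; 'rllab.cartpole' is an
# illustrative name whose suffix must be resolvable by rllab_env_from_name.
def _example_create_env_rllab():
    env = create_env_rllab('rllab.cartpole', seed=0)
    ob = env.reset()
    print('first observation:', ob)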