def lstm_launcher(variant):
    """
    Run DDPG with feedforward networks on an environment. (Despite the name,
    this launcher does not build an LSTM.)

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
def rllab_ddpg_launcher(variant):
    from rllab.algos.ddpg import DDPG as RllabDDPG
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.q_functions.continuous_mlp_q_function import (
        ContinuousMLPQFunction as TheanoContinuousMLPQFunction
    )
    from rllab.policies.deterministic_mlp_policy import (
        DeterministicMLPPolicy as TheanoDeterministicMLPPolicy
    )
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    policy = TheanoDeterministicMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )
    es = OUStrategy(env_spec=env.spec)
    qf = TheanoContinuousMLPQFunction(env_spec=env.spec)
    algorithm = RllabDDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        **variant['algo_params']
    )
    algorithm.train()
def rllab_trpo_launcher(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.algos.trpo import TRPO
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from railrl.launchers.launcher_util import get_env_settings
    import lasagne.nonlinearities as NL

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        output_nonlinearity=NL.tanh,
    )
    baseline = LinearFeatureBaseline(env.spec)
    batch_size = 5000
    algorithm = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
    )
    algorithm.train()
def shane_ddpg_launcher(variant):
    from rllab.exploration_strategies.gaussian_strategy import GaussianStrategy
    from sandbox.rocky.tf.algos.ddpg import DDPG as ShaneDDPG
    from sandbox.rocky.tf.envs.base import TfEnv
    from sandbox.rocky.tf.policies.deterministic_mlp_policy import (
        DeterministicMLPPolicy
    )
    from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
        ContinuousMLPQFunction
    )
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = TfEnv(env_settings['env'])
    es = GaussianStrategy(env.spec)
    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        **variant['policy_params']
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        **variant['qf_params']
    )
    algorithm = ShaneDDPG(
        env,
        policy,
        qf,
        es,
        **variant['algo_params']
    )
    algorithm.train()
def naf_launcher(variant):
    from railrl.algos.naf import NAF
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    if 'es_init' in variant:
        es = variant['es_init'](env, **variant['exploration_strategy_params'])
    else:
        es = OUStrategy(
            env_spec=env.spec,
            **variant['exploration_strategy_params']
        )
    qf = QuadraticNAF(
        name_or_scope="qf",
        env_spec=env.spec,
        batch_norm_config=bn_config,
    )
    algorithm = NAF(
        env,
        es,
        qf,
        batch_norm_config=bn_config,
        **variant['algo_params']
    )
    algorithm.train()
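# Illustrative sketch of the `es_init` hook read by naf_launcher above: the
# factory is called as es_init(env, **variant['exploration_strategy_params']),
# so any callable with that signature works. SimpleGaussianStrategy is used
# elsewhere in this module; the sigma default here is a made-up placeholder.
def _example_gaussian_es_factory(env, sigma=0.1):
    from railrl.exploration_strategies.simple_gaussian_strategy import (
        SimpleGaussianStrategy
    )
    return SimpleGaussianStrategy(env_spec=env.spec, sigma=sigma)
# Usage: variant['es_init'] = _example_gaussian_es_factory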
def random_action_launcher(variant):
    from railrl.algos.noop_algo import NoOpAlgo
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.policies.uniform_control_policy import UniformControlPolicy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    # Pass the env spec, as the other launchers in this module do.
    es = OUStrategy(env_spec=env.spec)
    policy = UniformControlPolicy(env_spec=env.spec)
    algorithm = NoOpAlgo(
        env,
        policy,
        es,
        **variant['algo_params']
    )
    algorithm.train()
def my_ddpg_launcher(variant):
    """
    Run DDPG.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        es,
        policy,
        qf,
        variant['tensorboard'],
        batch_norm_config=bn_config,
        **variant['algo_params']
    )
    algorithm.train()
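# A minimal sketch (illustrative, not prescriptive) of a `variant` accepted by
# my_ddpg_launcher. Only the top-level keys are grounded in the launcher code
# above; the nested values ('env_id', 'n_epochs', the tensorboard path) are
# hypothetical placeholders, since the accepted parameters depend on
# get_env_settings and the DDPG constructor.
_example_ddpg_variant = dict(
    env_params=dict(env_id='cartpole'),  # forwarded to get_env_settings(**...)
    algo_params=dict(n_epochs=100),      # forwarded to MyDDPG(**...)
    qf_params=dict(),                    # forwarded to FeedForwardCritic
    policy_params=dict(),                # forwarded to FeedForwardPolicy
    batch_norm_params=None,              # None skips BatchNormConfig entirely
    tensorboard='/tmp/tensorboard',      # log directory (hypothetical path)
)
# my_ddpg_launcher(_example_ddpg_variant)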
def run_algorithm(
        launch_settings,
        env_params,
        exp_prefix,
        seed,
        tensorboard_path,
        icm,
        exp_id=1,
        **kwargs):
    """
    Launch an algorithm.

    :param launch_settings: See get_launch_settings_list_from_args
    :param env_params: See get_env_settings
    :param exp_prefix: Experiment prefix
    :param seed: Experiment seed
    :param tensorboard_path: Directory for TensorBoard logs
    :param icm: If True, wrap the launched algorithm with ICM
    :param exp_id: Experiment ID # to identify it later (e.g. for plotting
        data)
    :param kwargs: Other kwargs to pass to run_experiment_lite
    :return:
    """
    variant = launch_settings['variant']
    variant['env_params'] = env_params
    variant['algo_params'] = launch_settings['algo_params']
    variant['batch_norm_params'] = launch_settings['batch_norm_params']
    variant['exp_id'] = exp_id
    variant['tensorboard'] = tensorboard_path
    env_settings = get_env_settings(**env_params)
    variant['Environment'] = env_settings['name']
    algorithm_launcher = launch_settings['algorithm_launcher']
    if icm:
        algorithm_launcher = icm_launcher
    run_experiment(
        algorithm_launcher,
        exp_prefix,
        seed,
        variant,
        **kwargs)
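# Illustrative sketch of the `launch_settings` dict that run_algorithm expects.
# Only the four keys read above are grounded in this module; the values shown
# are hypothetical placeholders.
_example_launch_settings = dict(
    variant={},                            # launcher-specific extras
    algo_params=dict(),                    # becomes variant['algo_params']
    batch_norm_params=None,                # becomes variant['batch_norm_params']
    algorithm_launcher=my_ddpg_launcher,   # any launcher in this module
)
# run_algorithm(_example_launch_settings, env_params=dict(env_id='cartpole'),
#               exp_prefix='demo', seed=0, tensorboard_path='/tmp/tensorboard',
#               icm=False)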
def oat_qddpg_launcher(variant):
    """
    Quadratic optimal action target DDPG
    """
    from railrl.algos.optimal_action_target_ddpg import (
        OptimalActionTargetDDPG as OAT
    )
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant['policy_params']
    )
    algorithm = OAT(
        env,
        es,
        policy,
        qf,
        batch_norm_config=bn_config,
        **variant['algo_params']
    )
    algorithm.train()
def rllab_vpg_launcher(variant):
    from rllab.algos.vpg import VPG
    from railrl.launchers.launcher_util import get_env_settings
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        **variant['algo_params']
    )
    algorithm.train()
def tf_trpo_launcher(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.baselines.gaussian_conv_baseline import (
        GaussianConvBaseline
    )
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.policies.gaussian_conv_policy import (
        GaussianConvPolicy
    )
    from sandbox.rocky.tf.algos.trpo import TRPO
    from sandbox.rocky.tf.envs.base import TfEnv
    from railrl.launchers.launcher_util import get_env_settings
    import tensorflow as tf

    env_settings = get_env_settings(**variant['env_params'])
    env = TfEnv(env_settings['env'])
    if len(env.observation_space.shape) == 1:
        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            output_nonlinearity=tf.nn.tanh,
        )
        baseline = LinearFeatureBaseline(env.spec)
    elif len(env.observation_space.shape) == 2:
        policy = GaussianConvPolicy(
            "conv_policy",
            env_spec=env.spec,
            conv_filters=(32, 32, 32, 32),
            conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3)),
            conv_strides=(2, 2, 2, 2),
            conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
            hidden_sizes=(256,),
        )
        baseline = GaussianConvBaseline(
            env.spec,
            regressor_args={
                'conv_filters': (32, 32, 32, 32),
                'conv_filter_sizes': ((3, 3), (3, 3), (3, 3), (3, 3)),
                'conv_strides': (2, 2, 2, 2),
                'conv_pads': ('SAME', 'SAME', 'SAME', 'SAME'),
                'hidden_sizes': (256,),
            },
        )
    else:
        raise NotImplementedError(
            "No support for observation space: {}".format(
                env.observation_space.shape
            )
        )
    batch_size = 5000
    algorithm = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
    )
    algorithm.train()
def icm_launcher(variant):
    if variant["Algorithm"] == "DDPG":
        from railrl.algos.ddpg import DDPG as MyDDPG
        from railrl.policies.nn_policy import FeedForwardPolicy
        from railrl.qfunctions.nn_qfunction import FeedForwardCritic
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from railrl.exploration_strategies.simple_gaussian_strategy import (
            SimpleGaussianStrategy
        )
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.core.tf_util import BatchNormConfig
        from railrl.algos.icm import ICM

        if ('batch_norm_params' in variant
                and variant['batch_norm_params'] is not None):
            bn_config = BatchNormConfig(**variant['batch_norm_params'])
        else:
            bn_config = None
        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        es = OUStrategy(env_spec=env.spec)
        # es = SimpleGaussianStrategy(env_spec=env.spec, sigma=0.5)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            batch_norm_config=bn_config,
            **variant.get('qf_params', {})
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
            batch_norm_config=bn_config,
            **variant.get('policy_params', {})
        )
        algo = MyDDPG(
            env,
            es,
            policy,
            qf,
            variant['tensorboard'],
            batch_norm_config=bn_config,
            **variant['algo_params']
        )
        algorithm = ICM(
            env,
            algo,
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.9,
            external_reward_weight=0.95,
            inverse_tanh=True,
            init_learning_rate=1e-3,
        )
        algorithm.train()
    elif variant["Algorithm"] == "Idle":
        from railrl.algos.idle import IdleAlgo
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.algos.icm import ICM

        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        algo = IdleAlgo(env, variant['tensorboard'])
        algorithm = ICM(
            env,
            algo,
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.9,
            external_reward_weight=0.0,
            inverse_tanh=True,
            init_learning_rate=1e-3,
        )
        algorithm.train()
    elif variant["Algorithm"] == "rllab-TRPO":
        from rllab.algos.trpo import TRPO
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.algos.icm_trpo import ICM
        from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
        from rllab.baselines.linear_feature_baseline import (
            LinearFeatureBaseline
        )
        import lasagne.nonlinearities as NL

        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            output_nonlinearity=NL.tanh,
        )
        baseline = LinearFeatureBaseline(env.spec)
        batch_size = 5000
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
        )
        algorithm = ICM(
            env,
            algo,
            variant['tensorboard'],
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.2,
            external_reward_weight=0.99,
            inverse_tanh=True,
            init_learning_rate=1e-4,
        )
        algorithm.train()
    elif variant["Algorithm"] == 'tf-TRPO':
        from rllab.baselines.linear_feature_baseline import (
            LinearFeatureBaseline
        )
        from sandbox.rocky.tf.baselines.gaussian_conv_baseline import (
            GaussianConvBaseline
        )
        from sandbox.rocky.tf.policies.gaussian_mlp_policy import (
            GaussianMLPPolicy
        )
        from sandbox.rocky.tf.policies.gaussian_conv_policy import (
            GaussianConvPolicy
        )
        from sandbox.rocky.tf.algos.trpo import TRPO
        from sandbox.rocky.tf.envs.base import TfEnv
        from railrl.launchers.launcher_util import get_env_settings
        # from railrl.algos.icm_trpo_tf import ICM
        from railrl.algos.icm_trpo_tf_box3d import ICM
        import tensorflow as tf

        env_settings = get_env_settings(**variant['env_params'])
        env = TfEnv(env_settings['env'])
        if len(env.observation_space.shape) == 1:
            policy = GaussianMLPPolicy(
                "mlp_policy",
                env_spec=env.spec,
                hidden_sizes=(64, 32),
                output_nonlinearity=tf.nn.tanh,
            )
            baseline = LinearFeatureBaseline(env.spec)
        elif len(env.observation_space.shape) == 2:
            policy = GaussianConvPolicy(
                "conv_policy",
                env_spec=env.spec,
                conv_filters=(32, 32, 32, 32),
                conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3)),
                conv_strides=(2, 2, 2, 2),
                conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
                hidden_sizes=(256,),
            )
            baseline = GaussianConvBaseline(
                env.spec,
                regressor_args={
                    'conv_filters': (32, 32, 32, 32),
                    'conv_filter_sizes': ((3, 3), (3, 3), (3, 3), (3, 3)),
                    'conv_strides': (2, 2, 2, 2),
                    'conv_pads': ('SAME', 'SAME', 'SAME', 'SAME'),
                    'hidden_sizes': (256,),
                },
            )
        else:
            raise NotImplementedError(
                "No support for observation space: {}".format(
                    env.observation_space.shape
                )
            )
        batch_size = 5000
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=500,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
        )
        algorithm = ICM(
            env,
            algo,
            variant['tensorboard'],
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.2,
            external_reward_weight=0.99,
            inverse_tanh=True,
            init_learning_rate=1e-4,
        )
        algorithm.train()
    else:
        raise NotImplementedError(
            "Unsupported algorithm: {}. Supported values are 'DDPG', 'Idle', "
            "'rllab-TRPO', and 'tf-TRPO'.".format(variant["Algorithm"])
        )
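# Illustrative: icm_launcher dispatches on variant["Algorithm"], which must be
# one of "DDPG", "Idle", "rllab-TRPO", or "tf-TRPO". The "Idle" branch is the
# smallest (IdleAlgo wrapped in ICM); the env_params contents and tensorboard
# path below are hypothetical placeholders.
_example_icm_variant = dict(
    Algorithm='Idle',
    env_params=dict(env_id='cartpole'),
    tensorboard='/tmp/tensorboard',
)
# icm_launcher(_example_icm_variant)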