def lstm_launcher(variant):
    """
    Run a simple LSTM on an environment.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
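# Hypothetical usage sketch (not from the source): the docstring above only
# names the top-level keys, so the nested values below are illustrative
# assumptions that depend on the local get_env_settings and DDPG signatures.
example_variant = dict(
    env_params=dict(env_id='cart'),          # assumed get_env_settings kwarg
    algo_params=dict(batch_size=128, n_epochs=10),
    qf_params=dict(),
    policy_params=dict(),
)
lstm_launcher(example_variant)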
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
def test_serialize_feedforward_policy(self):
    policy = FeedForwardPolicy(
        name_or_scope="b",
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
    )
    self.sess.run(tf.global_variables_initializer())
    pickle.dumps(policy)
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(
            env,
            vitchyr_es,
            vitchyr_policy,
            vitchyr_qf,
            **ddpg_params
        )

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        shane_ddpg = ShaneDDPG(
            env,
            shane_policy,
            shane_qf,
            shane_es,
            **ddpg_params
        )

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def run_task(variant):
    import tensorflow as tf
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def my_ddpg_launcher(variant):
    """
    Run DDPG.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        es,
        policy,
        qf,
        variant['tensorboard'],
        batch_norm_config=bn_config,
        **variant['algo_params'],
    )
    algorithm.train()
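# Hypothetical call sketch: in addition to the keys listed in the docstring,
# my_ddpg_launcher reads variant['tensorboard'] (passed positionally to
# MyDDPG) and an optional 'batch_norm_params' entry. All nested values here
# are assumptions for illustration.
example_variant = dict(
    env_params=dict(env_id='cheetah'),       # assumed get_env_settings kwarg
    algo_params=dict(batch_size=128, n_epochs=100),
    qf_params=dict(),
    policy_params=dict(),
    batch_norm_params=None,                  # or a dict of BatchNormConfig kwargs
    tensorboard='/tmp/ddpg_tensorboard',     # assumed log directory
)
my_ddpg_launcher(example_variant)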
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(
            name,
            record_video=False,
            log_dir='/tmp/gym-test',  # Ignore gym log.
            record_log=False,
        )

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
def _create_network_internal(self, observation_input, action_input):
    observation_input = self._process_layer(
        observation_input,
        scope_name="observation_input",
    )
    action_input = self._process_layer(
        action_input,
        scope_name="action_input",
    )
    self._vf = MlpStateNetwork(
        name_or_scope="V_function",
        output_dim=1,
        observation_dim=self.observation_dim,
        observation_input=observation_input,
        observation_hidden_sizes=(100, 100),
        hidden_W_init=None,
        hidden_b_init=None,
        output_W_init=None,
        output_b_init=None,
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.identity,
        batch_norm_config=self._batch_norm_config,
    )
    self._policy = FeedForwardPolicy(
        name_or_scope="implict_policy",
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
        observation_input=observation_input,
        observation_hidden_sizes=(100, 100),
        hidden_W_init=None,
        hidden_b_init=None,
        output_W_init=None,
        output_b_init=None,
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        batch_norm_config=self._batch_norm_config,
    )
    self._af = QuadraticQF(
        name_or_scope="advantage_function",
        action_input=action_input,
        observation_input=observation_input,
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
        policy=self._policy,
        batch_norm_config=self._batch_norm_config,
    )
    vf_out = self._add_subnetwork_and_get_output(self._vf)
    af_out = self._add_subnetwork_and_get_output(self._af)
    return vf_out + af_out
def main(): env = TfEnv(CartpoleEnv()) es = OUStrategy(env_spec=env.spec) qf = FeedForwardCritic( name_or_scope="critic", env_spec=env.spec, ) policy = FeedForwardPolicy( name_or_scope="actor", env_spec=env.spec, ) default_ddpg_params = dict( batch_size=32, n_epochs=10, epoch_length=1000, eval_samples=1000, max_path_length=100, min_pool_size=1000, ) sweeper = DeterministicHyperparameterSweeper( {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]}, ) exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp()) for ddpg_params in sweeper.iterate_hyperparameters(): algorithm = DDPG( env, es, policy, qf, scale_reward=ddpg_params['scale_reward'], **default_ddpg_params, ) for seed in range(3): run_experiment_lite( algorithm.train(), n_parallel=1, snapshot_mode="last", exp_prefix=exp_prefix, seed=seed, # mode="local", # use_cloudpickle=True, )
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def oat_qddpg_launcher(variant):
    """
    Quadratic optimal action target DDPG.
    """
    from railrl.algos.optimal_action_target_ddpg import OptimalActionTargetDDPG as OAT
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant['policy_params']
    )
    algorithm = OAT(
        env,
        es,
        policy,
        qf,
        batch_norm_config=bn_config,
        **variant['algo_params']
    )
    algorithm.train()
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)
        algorithm.train()
# Hyperparameter grid. The excerpt starts here; policy_lrs and qf_lrs are not
# defined in it, so the values below are assumed placeholders.
policy_lrs = [1e-4]
qf_lrs = [1e-3]
gammas = [0.9, 0.99, 0.995]
taus = [1e-3, 1e-2]

for policy_lr, qf_lr, gamma, tau in itertools.product(
        policy_lrs, qf_lrs, gammas, taus):
    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v4',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.tanh,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.tanh,
    )
    algo = DDPG(
        env,
        es,
        policy,
        qf,
        "/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
            policy_lr,
            qf_lr,
            gamma,
            tau,
        ),
        qf_learning_rate=qf_lr,
        # The excerpt is cut off after the line above; the remaining keyword
        # arguments are reconstructed from the loop variables and are assumptions.
        policy_learning_rate=policy_lr,
        discount=gamma,
        soft_target_tau=tau,
    )
    algo.train()
def icm_launcher(variant):
    if variant["Algorithm"] == "DDPG":
        from railrl.algos.ddpg import DDPG as MyDDPG
        from railrl.policies.nn_policy import FeedForwardPolicy
        from railrl.qfunctions.nn_qfunction import FeedForwardCritic
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from railrl.exploration_strategies.simple_gaussian_strategy import (
            SimpleGaussianStrategy
        )
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.core.tf_util import BatchNormConfig
        from railrl.algos.icm import ICM

        if ('batch_norm_params' in variant
                and variant['batch_norm_params'] is not None):
            bn_config = BatchNormConfig(**variant['batch_norm_params'])
        else:
            bn_config = None
        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        es = OUStrategy(env_spec=env.spec)
        # es = SimpleGaussianStrategy(env_spec=env.spec, sigma=0.5)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            batch_norm_config=bn_config,
            **variant.get('qf_params', {})
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
            batch_norm_config=bn_config,
            **variant.get('policy_params', {})
        )
        algo = MyDDPG(
            env,
            es,
            policy,
            qf,
            variant['tensorboard'],
            batch_norm_config=bn_config,
            **variant['algo_params'],
        )
        algorithm = ICM(
            env,
            algo,
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.9,
            external_reward_weight=0.95,
            inverse_tanh=True,
            init_learning_rate=1e-3,
        )
        algorithm.train()
    elif variant["Algorithm"] == "Idle":
        from railrl.algos.idle import IdleAlgo
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.algos.icm import ICM

        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        algo = IdleAlgo(env, variant['tensorboard'])
        algorithm = ICM(
            env,
            algo,
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.9,
            external_reward_weight=0.0,
            inverse_tanh=True,
            init_learning_rate=1e-3,
        )
        algorithm.train()
    elif variant["Algorithm"] == "rllab-TRPO":
        from rllab.algos.trpo import TRPO
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.algos.icm_trpo import ICM
        from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
        from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
        import lasagne.nonlinearities as NL

        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            output_nonlinearity=NL.tanh,
        )
        baseline = LinearFeatureBaseline(
            env.spec,
        )
        batch_size = 5000
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
        )
        algorithm = ICM(
            env,
            algo,
            variant['tensorboard'],
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.2,
            external_reward_weight=0.99,
            inverse_tanh=True,
            init_learning_rate=1e-4,
        )
        algorithm.train()
    elif variant["Algorithm"] == 'tf-TRPO':
        from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
        from sandbox.rocky.tf.baselines.gaussian_conv_baseline import GaussianConvBaseline
        from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
        from sandbox.rocky.tf.policies.gaussian_conv_policy import GaussianConvPolicy
        from sandbox.rocky.tf.algos.trpo import TRPO
        from sandbox.rocky.tf.envs.base import TfEnv
        from railrl.launchers.launcher_util import get_env_settings
        # from railrl.algos.icm_trpo_tf import ICM
        from railrl.algos.icm_trpo_tf_box3d import ICM
        import tensorflow as tf

        env_settings = get_env_settings(**variant['env_params'])
        env = TfEnv(env_settings['env'])
        if len(env.observation_space.shape) == 1:
            policy = GaussianMLPPolicy(
                "mlp_policy",
                env_spec=env.spec,
                hidden_sizes=(64, 32),
                output_nonlinearity=tf.nn.tanh,
            )
            baseline = LinearFeatureBaseline(
                env.spec,
            )
        elif len(env.observation_space.shape) == 2:
            policy = GaussianConvPolicy(
                "conv_policy",
                env_spec=env.spec,
                conv_filters=(32, 32, 32, 32),
                conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3)),
                conv_strides=(2, 2, 2, 2),
                conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
                hidden_sizes=(256,),
            )
            baseline = GaussianConvBaseline(
                env.spec,
                regressor_args={
                    'conv_filters': (32, 32, 32, 32),
                    'conv_filter_sizes': ((3, 3), (3, 3), (3, 3), (3, 3)),
                    'conv_strides': (2, 2, 2, 2),
                    'conv_pads': ('SAME', 'SAME', 'SAME', 'SAME'),
                    'hidden_sizes': (256,),
                }
            )
        else:
            raise NotImplementedError(
                "No support for observation space: {}".format(
                    env.observation_space.shape
                )
            )
        batch_size = 5000
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=500,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
        )
        algorithm = ICM(
            env,
            algo,
            variant['tensorboard'],
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.2,
            external_reward_weight=0.99,
            inverse_tanh=True,
            init_learning_rate=1e-4,
        )
        algorithm.train()
    else:
        raise NotImplementedError(
            "Algorithm not recognized: {}".format(variant["Algorithm"])
        )
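# Hypothetical usage sketch for icm_launcher: it dispatches on
# variant["Algorithm"] ("DDPG", "Idle", "rllab-TRPO", or "tf-TRPO") and, for
# the DDPG branch, reads the same keys as my_ddpg_launcher plus 'tensorboard'.
# Nested values are illustrative assumptions only.
example_variant = dict(
    Algorithm='DDPG',
    env_params=dict(env_id='box3d'),         # assumed get_env_settings kwarg
    algo_params=dict(batch_size=128, n_epochs=100),
    qf_params=dict(),
    policy_params=dict(),
    batch_norm_params=None,
    tensorboard='/tmp/icm_tensorboard',      # assumed log directory
)
icm_launcher(example_variant)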
def main():
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)
    parser.add_argument('--qf_decay', type=float, default=.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)
    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v11',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))

    name = 'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}-ou_theta{4}-ou_sigma{5}'.format(
        args.policy_initlr,
        args.qf_initlr,
        args.qf_soft_tau,
        args.qf_decay,
        args.ou_theta,
        args.ou_sigma,
    )

    es = OUStrategy(env_spec=env.spec, theta=args.ou_theta, sigma=args.ou_sigma)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )
    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path, name, '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix=name,
        n_parallel=1,
        snapshot_mode="last",
        seed=args.seed,
        mode="local",
    )