def shane_ddpg_launcher(variant):
    """Launch Shane's TF implementation of DDPG, configured by `variant`."""
    from rllab.exploration_strategies.gaussian_strategy import GaussianStrategy
    from sandbox.rocky.tf.algos.ddpg import DDPG as ShaneDDPG
    from sandbox.rocky.tf.envs.base import TfEnv
    from sandbox.rocky.tf.policies.deterministic_mlp_policy import (
        DeterministicMLPPolicy
    )
    from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
        ContinuousMLPQFunction
    )
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = TfEnv(env_settings['env'])
    es = GaussianStrategy(env.spec)
    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        **variant['policy_params']
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        **variant['qf_params']
    )
    algorithm = ShaneDDPG(
        env,
        policy,
        qf,
        es,
        **variant['algo_params']
    )
    algorithm.train()
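# A minimal sketch of calling the launcher above. The exact keys accepted by
# get_env_settings and by the algorithm are assumptions here; adjust them to
# match railrl's launcher_util.
example_variant = dict(
    env_params=dict(env_id='cheetah'),            # hypothetical env id
    policy_params=dict(hidden_sizes=(100, 100)),
    qf_params=dict(hidden_sizes=(100, 100)),
    algo_params=dict(n_epochs=100, batch_size=128),
)
# shane_ddpg_launcher(example_variant)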
import tensorflow as tf

from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
from rllab.exploration_strategies.gaussian_strategy import GaussianStrategy
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.algos.ddpg import DDPG
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.deterministic_mlp_policy import (
    DeterministicMLPPolicy
)
from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
    ContinuousMLPQFunction
)


def main():
    stub(globals())
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = GaussianStrategy(env.spec)
        policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        # Deliberately tiny settings: this script is a smoke test of rllab's
        # DDPG and of seeding, not a real training run.
        ddpg_params = dict(
            batch_size=4,
            n_epochs=100,
            epoch_length=50,
            eval_samples=50,
            max_path_length=10,
            min_pool_size=5,
        )
        algorithm = DDPG(env, policy, qf, es, **ddpg_params)
        # Launch the same stubbed experiment three times per seed to check
        # that runs with identical seeds produce identical results.
        for _ in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="check-rllab-ddpg-seed",
                seed=seed,
                variant={"seed": seed},
            )
import tensorflow as tf

# NOTE: the railrl import paths below are assumed from the era of this script.
from railrl.algos.ddpg import DDPG
from railrl.policies.nn_policy import FeedForwardPolicy
from railrl.qfunctions.nn_qfunction import FeedForwardCritic
from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
from rllab.exploration_strategies.gaussian_strategy import GaussianStrategy
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.algos.ddpg import DDPG as ShaneDDPG
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.deterministic_mlp_policy import (
    DeterministicMLPPolicy
)
from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
    ContinuousMLPQFunction
)


def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        # Shared hyperparameters, passed to both implementations.
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )

        # Vitchyr's (railrl) DDPG. Note the argument order differs from
        # Shane's: railrl takes the exploration strategy before the policy.
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(
            env,
            vitchyr_es,
            vitchyr_policy,
            vitchyr_qf,
            **ddpg_params
        )

        # Shane's (sandbox.rocky.tf) DDPG.
        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        shane_ddpg = ShaneDDPG(
            env,
            shane_policy,
            shane_qf,
            shane_es,
            **ddpg_params
        )

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def get_qf(env, algo_name, qf_hid_size, qf_hidden_nonlinearity, **kwargs):
    """Build a critic for the given algorithm, or return None if it needs none."""
    qf = None
    if algo_name in ['ddpg', 'qprop', 'qvpg']:
        if qf_hidden_nonlinearity == 'relu':
            hidden_nonlinearity = tf.nn.relu
        elif qf_hidden_nonlinearity == 'tanh':
            hidden_nonlinearity = tf.nn.tanh
        else:
            raise NotImplementedError(qf_hidden_nonlinearity)
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=(qf_hid_size, qf_hid_size),
            hidden_nonlinearity=hidden_nonlinearity,
        )
    return qf
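# A minimal usage sketch for get_qf above. The TfEnv-wrapped env and the
# concrete settings are assumptions for illustration only.
env = TfEnv(HalfCheetahEnv())
qf = get_qf(
    env,
    algo_name='ddpg',
    qf_hid_size=100,
    qf_hidden_nonlinearity='relu',
)
assert qf is not None  # 'ddpg' is one of the algorithms that uses a critic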
import tensorflow as tf

from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.algos.ddpg import DDPG
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.deterministic_mlp_policy import (
    DeterministicMLPPolicy
)
from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
    ContinuousMLPQFunction
)


def main():
    stub(globals())
    ddpg_params = dict(
        batch_size=64,
        n_epochs=2000,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        soft_target_tau=0.001,
        replay_pool_size=1000000,
        min_pool_size=1000,
        scale_reward=0.1,
    )
    env = TfEnv(HalfCheetahEnv())
    es = OUStrategy(env_spec=env.spec)
    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        bn=False,  # no batch normalization in the critic
    )
    algorithm = DDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="ddpg-shane-half-cheetah-script",
        seed=1,
        variant=ddpg_params,
    )
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.baselines.q_baseline import QfunctionBaseline
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
    ContinuousMLPQFunction
)

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers,
    # each with 32 hidden units.
    hidden_sizes=(32, 32),
)
qf = ContinuousMLPQFunction(env_spec=env.spec)
baseline = LinearFeatureBaseline(env_spec=env.spec)
qf_baseline = QfunctionBaseline(env_spec=env.spec, policy=policy, qf=qf)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    qf_baseline=qf_baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
)  # closing paren added; any remaining arguments are truncated in the source

# Assumed launch boilerplate, mirroring the other scripts in this collection:
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)
for r in range(learning_rate_size):
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        name="policy",
        # Three hidden layers with 100, 50, and 25 units.
        hidden_sizes=(100, 50, 25),
        hidden_nonlinearity=tf.nn.relu,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
        hidden_nonlinearity=tf.nn.relu,
    )
    for e in range(num_experiments):
        algo = ddpg_class(
            env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=32,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
        )  # remaining keyword arguments are truncated in the source
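# The loop above references names that are not defined in the snippet. A
# plausible (assumed) setup for the sweep bookkeeping:
learning_rates = [1e-3, 1e-4, 1e-5]       # hypothetical sweep values
learning_rate_size = len(learning_rates)
num_experiments = 3                       # independent runs per setting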
for l in range(layer_size):
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        name="policy",
        # Two hidden layers whose widths come from the layer_1/layer_2 sweep.
        hidden_sizes=(layer_1[l], layer_2[l]),
        hidden_nonlinearity=tf.nn.relu,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(layer_1[l], layer_2[l]),
        hidden_nonlinearity=tf.nn.relu,
    )
    for e in range(num_experiments):
        algo = ddpg_class(
            env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=32,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
        )  # remaining keyword arguments are truncated in the source
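# The sweep variables consumed by the loop above are not defined in the
# snippet; a plausible (assumed) setup pairs candidate widths per layer:
layer_1 = [32, 100, 400]
layer_2 = [32, 100, 300]
layer_size = len(layer_1)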
def get_qf(env, info, algo_name, qf_hidden_sizes, qf_hidden_nonlinearity,
           **kwargs):
    """Build the Q-function for `algo_name`, dispatching on whether the
    action space is discrete and on algorithm-specific flags."""
    qf = None
    qf_class = None
    hidden_sizes = get_hidden_sizes(qf_hidden_sizes)
    hidden_nonlinearity = get_nonlinearity(qf_hidden_nonlinearity)
    extra_kwargs = dict()
    if algo_name in [
        'ddpg', 'trpg', 'trpgoff', 'qprop', 'mqprop', 'nuqprop',
        'nuqfqprop', 'qfqprop', 'actrpo', 'acqftrpo', 'qvpg',
        'dspg', 'dspgoff',
    ]:
        if info['is_action_discrete']:
            qf = DiscreteMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
            )
            qf_class = 'DiscreteMLPQFunction'
        else:
            if algo_name in [
                'trpg', 'trpgoff', 'dspg', 'dspgoff',
                'acqftrpo', 'qfqprop', 'nuqfqprop',
            ]:
                extra_kwargs['eqf_use_full_qf'] = True
            elif algo_name == 'mqprop':
                extra_kwargs['mqprop'] = True
            qf = ContinuousMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                **extra_kwargs
            )
            qf_class = 'ContinuousMLPQFunction'
    elif algo_name in ['nafqprop']:
        assert not info['is_action_discrete']
        qf = NAFMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
        )
        qf_class = 'NAFMLPQFunction'
    elif algo_name in ['dqn']:
        if info['is_action_discrete']:
            qf = DeterministicDiscreteMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
            )
            qf_class = 'DeterministicDiscreteMLPQFunction'
        else:
            qf = DeterministicNAFMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
            )
            qf_class = 'DeterministicNAFMLPQFunction'
    elif algo_name in ['dsqn']:
        assert info['is_action_discrete']
        qf = StochasticDiscreteMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
        )
        qf_class = 'StochasticDiscreteMLPQFunction'
    print('[get_qf] Instantiating %s, with sizes=%s, hidden_nonlinearity=%s.'
          % (qf_class, str(hidden_sizes), qf_hidden_nonlinearity))
    return qf
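# A minimal usage sketch for the dispatcher above. The `info` dict mirrors
# how it is read inside get_qf; the string-valued size/nonlinearity arguments
# are parsed by get_hidden_sizes/get_nonlinearity, and the concrete values
# here are assumptions for illustration.
env = TfEnv(HalfCheetahEnv())
qf = get_qf(
    env,
    info={'is_action_discrete': False},
    algo_name='qprop',
    qf_hidden_sizes='100x100',        # hypothetical encoding of (100, 100)
    qf_hidden_nonlinearity='relu',
)
# Expected output:
# [get_qf] Instantiating ContinuousMLPQFunction, with sizes=..., hidden_nonlinearity=relu.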
activation_map = {"relu": tf.nn.relu, "tanh": tf.nn.tanh, "leaky_relu": lrelu} policy = DeterministicMLPPolicy( env_spec=env.spec, name="policy", # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=args.policy_size, hidden_nonlinearity=activation_map[args.policy_activation], ) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_nonlinearity=activation_map[args.vf_activation], hidden_sizes=args.vf_size, ) algo = DDPG(env=env, policy=policy, es=es, qf=qf, batch_size=128, max_path_length=env.horizon, epoch_length=1000, min_pool_size=10000, n_epochs=args.num_epochs, discount=0.995, scale_reward=args.reward_scale, qf_learning_rate=1e-3,