def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0 / 140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/' % (
        env_name.replace('-', '_'), str(algo.__class__.__name__), exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
def run_task(*_):
    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()
def run_task(*_):
    env = normalize(Walker2DEnv())
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers whose sizes are swept over.
        hidden_sizes=(H_layer_first[h], H_layer_second[h])
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
def naf_launcher(variant):
    from railrl.algos.naf import NAF
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    if 'es_init' in variant:
        es = variant['es_init'](env, **variant['exploration_strategy_params'])
    else:
        es = OUStrategy(
            env_spec=env.spec,
            **variant['exploration_strategy_params']
        )
    qf = QuadraticNAF(
        name_or_scope="qf",
        env_spec=env.spec,
        batch_norm_config=bn_config,
    )
    algorithm = NAF(
        env,
        es,
        qf,
        batch_norm_config=bn_config,
        **variant['algo_params']
    )
    algorithm.train()
def rllab_ddpg_launcher(variant):
    from rllab.algos.ddpg import DDPG as RllabDDPG
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.q_functions.continuous_mlp_q_function import (
        ContinuousMLPQFunction as TheanoContinuousMLPQFunction
    )
    from rllab.policies.deterministic_mlp_policy import (
        DeterministicMLPPolicy as TheanoDeterministicMLPPolicy
    )
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    policy = TheanoDeterministicMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )
    es = OUStrategy(env_spec=env.spec)
    qf = TheanoContinuousMLPQFunction(env_spec=env.spec)
    algorithm = RllabDDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        **variant['algo_params']
    )
    algorithm.train()
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
def run_task(*_):
    env = normalize(SwimmerEnv())
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is enabled; comment out the line below to disable it.
        plot=True,
    )
    algo.train()
def lstm_launcher(variant):
    """
    Run DDPG with a feedforward policy and critic on an environment.

    :param variant: Dictionary of dictionary with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(name_or_scope="critic",
                           env_spec=env.spec,
                           **variant.get('qf_params', {}))
    policy = FeedForwardPolicy(name_or_scope="actor",
                               env_spec=env.spec,
                               **variant.get('policy_params', {}))
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
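# Illustrative sketch only: a hypothetical `variant` dictionary with the key
# structure the launcher above reads. The key names mirror the docstring and
# the code ('env_params', 'algo_params', optional 'qf_params'/'policy_params');
# every value, including the 'env_id' entry passed to get_env_settings, is an
# assumption for demonstration and not taken from the source.
example_variant = dict(
    env_params=dict(env_id='cartpole'),   # forwarded to get_env_settings (assumed key)
    algo_params=dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        min_pool_size=1000,
    ),
    qf_params=dict(),      # optional; defaults to {} via variant.get
    policy_params=dict(),  # optional; defaults to {} via variant.get
)
# lstm_launcher(example_variant)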
def run_task(*_): """ DPG on Swimmer environment """ env = normalize(SwimmerEnv()) """ Initialise the policy as a neural network policy """ policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) """ Defining exploration strategy : OUStrategy - """ """ This strategy implements the Ornstein-Uhlenbeck process, which adds time-correlated noise to the actions taken by the deterministic policy. The OU process satisfies the following stochastic differential equation: dxt = theta*(mu - xt)*dt + sigma*dWt where Wt denotes the Wiener process """ es = OUStrategy(env_spec=env.spec) """ Defining the Q network """ qf = ContinuousMLPQFunction(env_spec=env.spec) w = qf.get_param_values(regularizable=True) """ Persistence Length Exploration """ lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy) """ Using the DDPG algorithm """ algo = DDPG( env=env, policy=policy, es=es, qf=qf, lp=lp, batch_size=32, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=15000, discount=0.99, scale_reward=0.01, qf_learning_rate=1e-3, policy_learning_rate=1e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) """ Training the networks based on the DDPG algorithm """ algo.train()
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def setUp(self):
    super().setUp()
    self.env = TfEnv(CartpoleEnv())
    self.es = OUStrategy(env_spec=self.env.spec)
    self.sum_policy = SumPolicy(name_or_scope='policies',
                                observation_dim=4,
                                action_dim=1)
    self.sum_critic = SumCritic(name_or_scope='qf',
                                observation_dim=4,
                                action_dim=1)
def run_task(*_):
    env = normalize(SimpleHumanoidEnv())
    # env = SimpleHumanoidEnv()
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(32, 32))

    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )

    """
    DDPG
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_): """ DPG on Hopper environment """ env = normalize(HopperEnv()) policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(400, 300)) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction(env_spec=env.spec) """ Using the DDPG algorithm """ # algo = DDPG( # env=env, # policy=policy, # es=es, # qf=qf, # batch_size=32, # max_path_length=500, # epoch_length=500, # min_pool_size=10000, # n_epochs=20000, # discount=0.99, # scale_reward=0.01, # qf_learning_rate=1e-3, # policy_learning_rate=1e-4, # #Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, # ) algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=64, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=10000, discount=0.99, scale_reward=0.01, qf_learning_rate=10e-3, policy_learning_rate=10e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) algo.train()
def run_task(variant):
    import tensorflow as tf
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def example(variant):
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **variant['ddpg_params'])
    algorithm.train()
def run_task(*_):
    f = open('/home/qingkai/ddpg_performance.csv', "w+")
    env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2,
                         activity_range=6)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    qf_cost = ContinuousMLPQFunction(env_spec=env.spec)
    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO_DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        qf_cost=qf_cost,
        dual_var=0,
        safety_constraint=safety_constraint,
        batch_size=64,
        max_path_length=15,
        epoch_length=10000,
        min_pool_size=10000,
        n_epochs=150,
        discount=0.99,
        qf_learning_rate=1e-3,
        qf_cost_learning_rate=1e-3,
        dual_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        scale_reward=1,
        scale_cost=5,
        soft_target=True,
        soft_target_tau=0.001,
        eval_samples=10000,
        qf_weight_decay=0.,
        qf_cost_weight_decay=0.,
        avg_horizon=100000,
        # plot=True,
    )
    algo.train()
    f.close()
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
def main():
    stub(globals())
    ddpg_params = dict(
        batch_size=64,
        n_epochs=2000,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        soft_target_tau=0.001,
        replay_pool_size=1000000,
        min_pool_size=1000,
        scale_reward=0.1,
    )
    env = TfEnv(HalfCheetahEnv())
    es = OUStrategy(env_spec=env.spec)
    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        bn=False,
    )
    algorithm = DDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="ddpg-shane-half-cheetah-script",
        seed=1,
        variant=ddpg_params,
    )
def random_action_launcher(variant):
    from railrl.algos.noop_algo import NoOpAlgo
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.policies.uniform_control_policy import UniformControlPolicy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    policy = UniformControlPolicy(env_spec=env.spec)
    algorithm = NoOpAlgo(
        env,
        policy,
        es,
        **variant['algo_params']
    )
    algorithm.train()
def my_ddpg_launcher(variant):
    """
    Run DDPG
    :param variant: Dictionary of dictionary with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        es,
        policy,
        qf,
        variant['tensorboard'],
        batch_norm_config=bn_config,
        **variant['algo_params'],
    )
    algorithm.train()
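# Illustrative sketch only: a hypothetical `variant` for my_ddpg_launcher. The
# keys mirror what the function above reads ('env_params', 'algo_params',
# 'qf_params', 'policy_params', 'batch_norm_params', 'tensorboard'); all values,
# including the 'env_id' key and the tensorboard path, are assumptions.
ddpg_launcher_variant = dict(
    env_params=dict(env_id='cartpole'),    # passed through to get_env_settings (assumed key)
    algo_params=dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
    ),
    qf_params=dict(),
    policy_params=dict(),
    batch_norm_params=None,                # None skips BatchNormConfig above
    tensorboard='/tmp/ddpg_tensorboard',   # assumed value for the positional tensorboard argument
)
# my_ddpg_launcher(ddpg_launcher_variant)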
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(name,
                      record_video=False,
                      log_dir='/tmp/gym-test',  # Ignore gym log.
                      record_log=False)

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
def test_rllab(patient_id=1, Initial_Bg=0):
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    env = GymEnv('simglucose-adult{}-CHO{}-v0'.format(Initial_Bg, patient_id + 1))
    env = normalize(env)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each
        # with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(env=env,
                policy=policy,
                es=es,
                qf=qf,
                batch_size=32,
                max_path_length=100,
                epoch_length=1000,
                min_pool_size=10000,
                n_epochs=5,
                discount=0.99,
                scale_reward=0.01,
                qf_learning_rate=1e-3,
                policy_learning_rate=1e-4)
    algo.train()
    # env.close()
    return es, policy
def run_task(*_):
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)
    elif args.hidden_sizes == 3:
        hidden_sizes = (400, 300)
    else:
        raise ValueError('Unknown hidden_sizes option: {}'.format(args.hidden_sizes))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes are selected via args.hidden_sizes above.
        hidden_sizes=hidden_sizes
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )
        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = normalize(GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # env = normalize(GymEnv(env_name="BipedalWalker-v2", force_reset=True, record_video=True))
    max_path_length = 400
    # print("env.horizon: ", env.horizon)
    # input()
    # env._max_episode_steps = max_path_length
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64))
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def oat_qddpg_launcher(variant):
    """
    Quadratic optimal action target DDPG
    """
    from railrl.algos.optimal_action_target_ddpg import (
        OptimalActionTargetDDPG as OAT
    )
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant['policy_params']
    )
    algorithm = OAT(
        env,
        es,
        policy,
        qf,
        batch_norm_config=bn_config,
        **variant['algo_params']
    )
    algorithm.train()
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)
        algorithm.train()
import tensorflow as tf

stub(globals())

# Param ranges
seed = 3
policy_lrs = [1e-5, 1e-4, 1e-3]
qf_lrs = [1e-5, 1e-4, 1e-3]
gammas = [0.9, 0.99, 0.995]
taus = [1e-3, 1e-2]

for policy_lr, qf_lr, gamma, tau in itertools.product(
        policy_lrs, qf_lrs, gammas, taus):
    env = TfEnv(normalize(env=GymEnv('Box3dReach-v4',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.tanh,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.tanh,
    )
    algo = DDPG(
        env,
        es,
        policy,