Example #1
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0/140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/'%(env_name.replace('-', '_'), 
                                             str(algo.__class__.__name__), 
                                             exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
Example #2
def run_task(*_):

    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()
Example #3
# Note: H_layer_first, H_layer_second, size_of_batch, number_of_episodes,
# discount_factor, reward_scaling, critic_learning_rate, actor_learning_rate
# and the indices h, r, c are defined by an enclosing hyperparameter sweep
# that is not shown in this fragment.
def run_task(*_):
    env = normalize(Walker2DEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers whose sizes are set by the sweep.
        hidden_sizes=(H_layer_first[h], H_layer_second[h])
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #4
def naf_launcher(variant):
	from railrl.algos.naf import NAF
	from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
			and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	if 'es_init' in variant:
		es = variant['es_init'](env, **variant['exploration_strategy_params'])
	else:
		es = OUStrategy(
			env_spec=env.spec,
			**variant['exploration_strategy_params']
		)
	qf = QuadraticNAF(
		name_or_scope="qf",
		env_spec=env.spec,
		batch_norm_config=bn_config,
	)
	algorithm = NAF(
		env,
		es,
		qf,
		batch_norm_config=bn_config,
		**variant['algo_params']
	)
	algorithm.train()
Example #5
def rllab_ddpg_launcher(variant):
	from rllab.algos.ddpg import DDPG as RllabDDPG
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from rllab.q_functions.continuous_mlp_q_function import (
		ContinuousMLPQFunction as TheanoContinuousMLPQFunction
	)
	from rllab.policies.deterministic_mlp_policy import (
		DeterministicMLPPolicy as TheanoDeterministicMLPPolicy
	)
	from railrl.launchers.launcher_util import get_env_settings
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	policy = TheanoDeterministicMLPPolicy(
		env_spec=env.spec,
		hidden_sizes=(32, 32)
	)

	es = OUStrategy(env_spec=env.spec)

	qf = TheanoContinuousMLPQFunction(env_spec=env.spec)

	algorithm = RllabDDPG(
		env=env,
		policy=policy,
		es=es,
		qf=qf,
		**variant['algo_params']
	)
	algorithm.train()
Example #6
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
Example #7
def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting during training
        plot=True,
    )
    algo.train()
Example #8
def lstm_launcher(variant):
    """
    Run a simple LSTM on an environment.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
        (a sketch of such a variant appears after this example)
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(name_or_scope="critic",
                           env_spec=env.spec,
                           **variant.get('qf_params', {}))
    policy = FeedForwardPolicy(name_or_scope="actor",
                               env_spec=env.spec,
                               **variant.get('policy_params', {}))
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
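For reference, here is a minimal sketch of the nested variant dictionary that lstm_launcher reads. Only the key names come from the docstring and code above; the individual values and the contents of env_params are illustrative assumptions, since get_env_settings is not shown in this fragment.

# Hypothetical variant for lstm_launcher; key names follow the docstring above,
# values are placeholder assumptions.
example_variant = dict(
    env_params=dict(env_id='cartpole'),   # forwarded to get_env_settings (assumed key/value)
    algo_params=dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
    ),
    qf_params=dict(),      # optional; critic defaults are used when omitted
    policy_params=dict(),  # optional; policy defaults are used when omitted
)
# lstm_launcher(example_variant)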
Example #9
def run_task(*_):
    """
    DPG on Swimmer environment
    """
    env = normalize(SwimmerEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining exploration strategy : OUStrategy - 
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
    dxt = theta*(mu - xt)*dt + sigma*dWt
    where Wt denotes the Wiener process
    """
    es = OUStrategy(env_spec=env.spec)
    """
    Defining the Q network
    """
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    w = qf.get_param_values(regularizable=True)
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy)
    """
    Using the DDPG algorithm
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=32,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=15000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting during training
        plot=True,
    )
    """
    Training the networks based on the DDPG algorithm
    """
    algo.train()
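The OU exploration noise described in the docstring above can be simulated directly. Below is a minimal NumPy sketch of the Euler-Maruyama discretization of dx_t = theta*(mu - x_t)*dt + sigma*dW_t; the parameter values are illustrative and not necessarily the defaults used by rllab's OUStrategy.

import numpy as np

def ou_noise(n_steps, theta=0.15, mu=0.0, sigma=0.3, dt=1.0, x0=0.0):
    # Euler-Maruyama simulation of the Ornstein-Uhlenbeck process:
    # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
    x = np.empty(n_steps)
    x_prev = x0
    for t in range(n_steps):
        x_prev = (x_prev + theta * (mu - x_prev) * dt
                  + sigma * np.sqrt(dt) * np.random.randn())
        x[t] = x_prev
    return x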
Example #10
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
Example #11
def setUp(self):
    super().setUp()
    self.env = TfEnv(CartpoleEnv())
    self.es = OUStrategy(env_spec=self.env.spec)
    self.sum_policy = SumPolicy(name_or_scope='policies',
                                observation_dim=4,
                                action_dim=1)
    self.sum_critic = SumCritic(name_or_scope='qf',
                                observation_dim=4,
                                action_dim=1)
Example #12
# Note: L_p_param, b_step_size, sigma_param, max_exploratory_steps_iters,
# batch_size_value, num_episodes, steps_per_episode and the indices
# l_p_ind, b_ind, s_ind are defined by an enclosing hyperparameter sweep
# that is not shown in this fragment.
def run_task(*_):

    env = normalize(SimpleHumanoidEnv())
    # env = SimpleHumanoidEnv()

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(32, 32))
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )
    """
    DDPG
    """

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #13
def run_task(*_):
    """
    DPG on Hopper environment
    """
    env = normalize(HopperEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(400, 300))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    """
    Using the DDPG algorithm
    """
    # algo = DDPG(
    #     env=env,
    #     policy=policy,
    #     es=es,
    #     qf=qf,
    #     batch_size=32,
    #     max_path_length=500,
    #     epoch_length=500,
    #     min_pool_size=10000,
    #     n_epochs=20000,
    #     discount=0.99,
    #     scale_reward=0.01,
    #     qf_learning_rate=1e-3,
    #     policy_learning_rate=1e-4,
    #     #Uncomment both lines (this and the plot parameter below) to enable plotting
    #     plot=True,
    # )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=10000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=10e-3,
        policy_learning_rate=10e-4,
        # plot=True enables live plotting during training
        plot=True,
    )

    algo.train()
Example #14
def run_task(variant):
    import tensorflow as tf
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )

    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
Example #15
def example(variant):
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **variant['ddpg_params'])
    algorithm.train()
Example #16
def run_task(*_):

    f = open('/home/qingkai/ddpg_performance.csv', "w+")

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    qf_cost = ContinuousMLPQFunction(env_spec=env.spec)

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO_DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        qf_cost=qf_cost,
        dual_var=0,
        safety_constraint=safety_constraint,
        batch_size=64,
        max_path_length=15,
        epoch_length=10000,
        min_pool_size=10000,
        n_epochs=150,
        discount=0.99,
        qf_learning_rate=1e-3,
        qf_cost_learning_rate=1e-3,
        dual_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        scale_reward=1,
        scale_cost=5,
        soft_target=True,
        soft_target_tau=0.001,
        eval_samples=10000,
        qf_weight_decay=0.,
        qf_cost_weight_decay=0.,
        avg_horizon=100000,
        #plot=True,
    )

    algo.train()
    f.close()
Example #17
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env, policy=policy, qf=qf, es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
Example #18
def main():
    stub(globals())
    ddpg_params = dict(
        batch_size=64,
        n_epochs=2000,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        soft_target_tau=0.001,
        replay_pool_size=1000000,
        min_pool_size=1000,
        scale_reward=0.1,
    )
    env = TfEnv(HalfCheetahEnv())
    es = OUStrategy(env_spec=env.spec)

    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        bn=False,
    )

    algorithm = DDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="ddpg-shane-half-cheetah-script",
        seed=1,
        variant=ddpg_params,
    )
Example #19
def random_action_launcher(variant):
	from railrl.algos.noop_algo import NoOpAlgo
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from rllab.policies.uniform_control_policy import UniformControlPolicy
	from railrl.launchers.launcher_util import get_env_settings
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env)
	policy = UniformControlPolicy(env_spec=env.spec)
	algorithm = NoOpAlgo(
		env,
		policy,
		es,
		**variant['algo_params']
	)
	algorithm.train()
Example #20
def my_ddpg_launcher(variant):
	"""
	Run DDPG
	:param variant: Dictionary of dictionaries with the following keys:
		- algo_params
		- env_params
		- qf_params
		- policy_params
		(a sketch of such a variant appears after this example)
	:return:
	"""
	from railrl.algos.ddpg import DDPG as MyDDPG
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.nn_qfunction import FeedForwardCritic
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('qf_params', {})
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('policy_params', {})
	)

	algorithm = MyDDPG(
		env,
		es,
		policy,
		qf,
		variant['tensorboard'],
		batch_norm_config=bn_config,
		**variant['algo_params'],
	)
	algorithm.train()
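Similarly, a sketch of the variant expected by my_ddpg_launcher. In addition to the keys documented above, the code reads an optional 'batch_norm_params' entry and a required 'tensorboard' entry that is passed positionally to MyDDPG; the values below, and the meaning of 'tensorboard' as a log path, are assumptions for illustration only.

# Hypothetical variant for my_ddpg_launcher; values are placeholder assumptions.
example_variant = dict(
    env_params=dict(env_id='cartpole'),    # forwarded to get_env_settings (assumed key/value)
    algo_params=dict(batch_size=128, n_epochs=100, epoch_length=1000),
    qf_params=dict(),
    policy_params=dict(),
    batch_norm_params=None,                # or a dict of BatchNormConfig kwargs
    tensorboard='/tmp/ddpg-tensorboard',   # assumed log directory; depends on MyDDPG's signature
)
# my_ddpg_launcher(example_variant)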
Example #21
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(name,
                      record_video=False,
                      log_dir='/tmp/gym-test',  # Ignore gym log.
                      record_log=False)

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
Example #22
def test_rllab(patient_id=1, Initial_Bg=0):
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    env = GymEnv('simglucose-adult{}-CHO{}-v0'.format(Initial_Bg,
                                                      patient_id + 1))
    env = normalize(env)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each
        # with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(env=env,
                policy=policy,
                es=es,
                qf=qf,
                batch_size=32,
                max_path_length=100,
                epoch_length=1000,
                min_pool_size=10000,
                n_epochs=5,
                discount=0.99,
                scale_reward=0.01,
                qf_learning_rate=1e-3,
                policy_learning_rate=1e-4)
    algo.train()

    # env.close()

    return es, policy
Example #23
def run_task(*_):
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)
    elif args.hidden_sizes == 3:
        hidden_sizes = (400, 300)
    else:
        raise ValueError('Unknown hidden_sizes option: {}'.format(args.hidden_sizes))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes are selected above from args.hidden_sizes.
        hidden_sizes=hidden_sizes
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #24
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]}, )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )

        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
Example #25
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = normalize(GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # env = normalize(GymEnv(env_name="BipedalWalker-v2", force_reset=True, record_video=True))
    max_path_length = 400
    # print("env.horizon: ",env.horizon)
    # input()
    # env._max_episode_steps = max_path_length

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64)
                                )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #26
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
Example #27
def oat_qddpg_launcher(variant):
	"""
	Quadratic optimal action target DDPG
	"""
	from railrl.algos.optimal_action_target_ddpg import OptimalActionTargetDDPG as OAT
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = QuadraticNAF(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant['qf_params']
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant['policy_params']
	)
	algorithm = OAT(
		env,
		es,
		policy,
		qf,
		batch_norm_config=bn_config,
		**variant['algo_params']
	)
	algorithm.train()
Example #28
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)

    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
Example #29
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)

        algorithm.train()
Example #30
# The imports below (other than tensorflow) are assumed, following the
# railrl/rllab examples above; they were not part of the original fragment.
import itertools

import tensorflow as tf

from railrl.algos.ddpg import DDPG
from railrl.policies.nn_policy import FeedForwardPolicy
from railrl.qfunctions.nn_qfunction import FeedForwardCritic
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc.instrument import stub
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

# Param ranges
seed = 3
policy_lrs = [1e-5, 1e-4, 1e-3]
qf_lrs = [1e-5, 1e-4, 1e-3]
gammas = [0.9, 0.99, 0.995]
taus = [1e-3, 1e-2]

for policy_lr, qf_lr, gamma, tau in itertools.product(policy_lrs, qf_lrs, gammas, taus):
	env = TfEnv(normalize(env=GymEnv('Box3dReach-v4', record_video=False,
		log_dir='/tmp/gym_test', record_log=False)))

	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)

	algo = DDPG(
		env,
		es,
		policy,