def run_task(*_):
    """
    DDPG on the Swimmer environment
    """
    env = normalize(SwimmerEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining exploration strategy : OUStrategy - 
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
    dxt = theta*(mu - xt)*dt + sigma*dWt
    where Wt denotes the Wiener process
    """
    es = OUStrategy(env_spec=env.spec)
    """
    Defining the Q network
    """
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    # Snapshot the Q-network's regularizable parameters (typically the weight
    # matrices); the value is not used further in this example.
    w = qf.get_param_values(regularizable=True)
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy)
    """
    Using the DDPG algorithm
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=32,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=15000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of rollouts during training.
        plot=True,
    )
    """
    Training the networks based on the DDPG algorithm
    """
    algo.train()
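
The OU docstring in the example above gives the continuous-time stochastic differential equation; the following standalone sketch (not rllab code; the parameter values are illustrative defaults, not values read from OUStrategy) shows the Euler-Maruyama discretization that such an exploration strategy applies step by step:

import numpy as np

def ou_noise(n_steps, theta=0.15, mu=0.0, sigma=0.3, dt=1.0, seed=0):
    """Discretized OU process: x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1)."""
    rng = np.random.RandomState(seed)
    x = np.zeros(n_steps)
    for t in range(1, n_steps):
        x[t] = (x[t - 1] + theta * (mu - x[t - 1]) * dt
                + sigma * np.sqrt(dt) * rng.randn())
    return x

# Example: time-correlated noise of the kind that gets added to the deterministic actions.
# ou_noise(5)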
Example #2
def run_task(*_):

    f = open('/home/qingkai/ddpg_performance.csv', "w+")

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    qf_cost = ContinuousMLPQFunction(env_spec=env.spec)

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO_DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        qf_cost=qf_cost,
        dual_var=0,
        safety_constraint=safety_constraint,
        batch_size=64,
        max_path_length=15,
        epoch_length=10000,
        min_pool_size=10000,
        n_epochs=150,
        discount=0.99,
        qf_learning_rate=1e-3,
        qf_cost_learning_rate=1e-3,
        dual_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        scale_reward=1,
        scale_cost=5,
        soft_target=True,
        soft_target_tau=0.001,
        eval_samples=10000,
        qf_weight_decay=0.,
        qf_cost_weight_decay=0.,
        avg_horizon=100000,
        #plot=True,
    )

    algo.train()
    f.close()
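
The dual_var, dual_learning_rate, and safety_constraint arguments above point to a primal-dual (Lagrangian) treatment of the safety constraint. The function below is only a rough sketch of the standard dual-ascent update such methods use, not PDO_DDPG's actual implementation; dual_ascent_step, avg_episode_cost, and max_value are illustrative names, with max_value mirroring GatherSafetyConstraint(max_value=0.2):

def dual_ascent_step(dual_var, avg_episode_cost, max_value=0.2,
                     dual_learning_rate=1e-2):
    # Raise the cost penalty when the measured cost exceeds the threshold,
    # lower it (never below zero) when the policy is safely inside it.
    dual_var += dual_learning_rate * (avg_episode_cost - max_value)
    return max(dual_var, 0.0)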
def run_task(*_):

    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()
Example #4
def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of rollouts during training.
        plot=True,
    )
    algo.train()
Example #5
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0/140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment to enable live plotting:
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
Example #6
            def run_task(*_):
                env = normalize(Walker2DEnv())

                policy = DeterministicMLPPolicy(
                    env_spec=env.spec,
                    # Hidden layer sizes are swept via H_layer_first / H_layer_second.
                    hidden_sizes=(H_layer_first[h], H_layer_second[h])
                )

                es = OUStrategy(env_spec=env.spec)

                qf = ContinuousMLPQFunction(env_spec=env.spec)

                algo = DDPG(
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    batch_size=size_of_batch,
                    max_path_length=100,
                    epoch_length=1000,
                    min_pool_size=10000,
                    n_epochs=number_of_episodes,
                    discount=discount_factor,
                    scale_reward=reward_scaling[r],
                    qf_learning_rate=critic_learning_rate[c],
                    policy_learning_rate=actor_learning_rate[c],
                    # Uncomment to enable live plotting:
                    # plot=True,
                )
                algo.train()
Example #7
            def run_task(*_):

                env = normalize(SimpleHumanoidEnv())
                # env = SimpleHumanoidEnv()

                policy = DeterministicMLPPolicy(
                    env_spec=env.spec,
                    # The neural network policy should have two hidden layers, each with 32 hidden units.
                    hidden_sizes=(32, 32))

                es = OUStrategy(env_spec=env.spec)

                qf = ContinuousMLPQFunction(env_spec=env.spec,
                                            hidden_sizes=(32, 32))
                """
                Persistence Length Exploration
                """
                lp = Persistence_Length_Exploration(
                    env=env,
                    qf=qf,
                    policy=policy,
                    L_p=L_p_param[l_p_ind],
                    b_step_size=b_step_size[b_ind],
                    sigma=sigma_param[s_ind],
                    max_exploratory_steps=max_exploratory_steps_iters,
                    batch_size=batch_size_value,
                    n_epochs=num_episodes,
                    scale_reward=0.01,
                    epoch_length=steps_per_episode,
                    qf_learning_rate=0.001,
                    policy_learning_rate=0.0001,
                )
                """
                DDPG
                """

                algo = DDPG(
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    lp=lp,
                    batch_size=batch_size_value,
                    max_path_length=100,
                    epoch_length=steps_per_episode,
                    min_pool_size=10000,
                    n_epochs=num_episodes,
                    discount=0.99,
                    scale_reward=0.01,
                    qf_learning_rate=0.001,
                    policy_learning_rate=0.0001,
                    # Uncomment to enable live plotting:
                    # plot=True,
                )
                algo.train()
Example #8
def run_task(*_):
    """
    DDPG on the Hopper environment
    """
    env = normalize(HopperEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers with 400 and 300 units.
        hidden_sizes=(400, 300))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    """
    Using the DDPG algorithm
    """
    # algo = DDPG(
    #     env=env,
    #     policy=policy,
    #     es=es,
    #     qf=qf,
    #     batch_size=32,
    #     max_path_length=500,
    #     epoch_length=500,
    #     min_pool_size=10000,
    #     n_epochs=20000,
    #     discount=0.99,
    #     scale_reward=0.01,
    #     qf_learning_rate=1e-3,
    #     policy_learning_rate=1e-4,
    #     #Uncomment both lines (this and the plot parameter below) to enable plotting
    #     plot=True,
    # )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=10000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=10e-3,  # note: 10e-3 == 1e-2
        policy_learning_rate=10e-4,  # note: 10e-4 == 1e-3
        # plot=True enables live plotting of rollouts during training.
        plot=True,
    )

    algo.train()
Example #9
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env, policy=policy, qf=qf, es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
def test_rllab(patient_id=1, Initial_Bg=0):
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    env = GymEnv('simglucose-adult{}-CHO{}-v0'.format(Initial_Bg,
                                                      patient_id + 1))
    env = normalize(env)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each
        # with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(env=env,
                policy=policy,
                es=es,
                qf=qf,
                batch_size=32,
                max_path_length=100,
                epoch_length=1000,
                min_pool_size=10000,
                n_epochs=5,
                discount=0.99,
                scale_reward=0.01,
                qf_learning_rate=1e-3,
                policy_learning_rate=1e-4)
    algo.train()

    # env.close()

    return es, policy
Example #11
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = normalize(GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # env = normalize(GymEnv(env_name="BipedalWalker-v2", force_reset=True, record_video=True))
    max_path_length = 400
    # print("env.horizon: ",env.horizon)
    # input()
    # env._max_episode_steps = max_path_length

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64)
                                )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Uncomment to enable live plotting:
        # plot=True,
    )
    algo.train()
Example #12
def run_task(*_):
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)
    elif args.hidden_sizes == 3:
        hidden_sizes = (400, 300)
    else:
        raise ValueError('unrecognized hidden_sizes option: %s' % args.hidden_sizes)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes are selected above from args.hidden_sizes.
        hidden_sizes=hidden_sizes
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Uncomment to enable live plotting:
        # plot=True,
    )
    algo.train()
Example #13
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.misc import instrument
import sys

instrument.stub(globals())

env = normalize(PegEnv(), normalize_reward=True)

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    # The neural network policy has two hidden layers, each with 42 hidden units.
    hidden_sizes=(42, 42)
)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_sizes=(42, 42)
)

vg = instrument.VariantGenerator()
vg.add("scale_reward", [0.01])#, 0.001, 0.1])
vg.add("policy_learning_rate", [1e-4])#, 1e-3, 1e-5])
vg.add("qf_learning_rate", [1e-3]) #, 1e-3, 1e-4])
vg.add("decay_period", [1E+6, 1E+5, 1E+4, 1E+3, 1E+7, 1E+8, 1E+9, 1E+10])

variants = vg.variants()
num = int(sys.argv[1])

print("#Experiments number: {}".format(num))
variant = variants[num]

# es = OUStrategy(env_spec=env.spec, theta=0.15, sigma=0.3)
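
For context, each entry returned by vg.variants() is a dict keyed by the names passed to vg.add, so the selected settings would typically be consumed along these lines (a hypothetical continuation; the rest of the original script is truncated):

# Hypothetical continuation; not part of the original script.
scale_reward = variant["scale_reward"]
policy_learning_rate = variant["policy_learning_rate"]
qf_learning_rate = variant["qf_learning_rate"]
decay_period = variant["decay_period"]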
Example #14
envs = {
    "Arm": ArmEnv,
    "Stand": StandEnv,
    "Gait": GaitEnv,
    "Crouch": CrouchEnv,
    "Hop": HopEnv
}

env = normalize(envs[parsed.env](visualize=False))

# env = normalize(CartpoleEnv())
# env = normalize(GymEnv("Pendulum-v0", record_video=False, record_log=False))

if alg == "DDPG":
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64, 64))

    es = OUStrategy(env_spec=env.spec, theta=0.5)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has three hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32, 32))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
Example #15
    "tanh": NL.tanh,
    "leaky_relu": NL.LeakyRectify
}

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    # Hidden layer sizes are taken from the command-line arguments.
    hidden_sizes=args.policy_size,
    hidden_nonlinearity=activation_map[args.policy_activation],
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=activation_map[args.vf_activation],
    hidden_sizes=args.vf_size,
)

algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=128,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.995,
            scale_reward=args.reward_scale,
            qf_learning_rate=1e-3,
Example #16
env = normalize(FWMAVSimEnv())
policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    hidden_nonlinearity=NL.tanh,  # alternatives: NL.rectify, NL.LeakyRectify
    output_nonlinearity=NL.tanh,
    hidden_sizes=(32, 32),
)

es = OUStrategy(
    env_spec=env.spec, theta=0.15, sigma=0.3
)
# theta = decay rate of the noise: a smaller theta decays more slowly, so the
# noise stays correlated (fluctuates) for longer. Roughly, theta = 0.01 is
# about 220 steps, theta = 0.022 about 100 steps, theta = 0.1 about 20 steps,
# and theta = 0.15 about 15 steps.
# sigma = scale (size) of the noise.
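# Rough sanity check on the step counts quoted above (an added note, not from
# the source): the OU noise relaxes toward mu roughly like exp(-theta * t), so
# the number of steps needed to shrink by a factor of 10 is about
# ln(10) / theta ~= 2.3 / theta:
#   theta = 0.15  -> ~15 steps
#   theta = 0.022 -> ~105 steps
#   theta = 0.01  -> ~230 steps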

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=NL.tanh,
    output_nonlinearity=None,
    hidden_sizes=(128, 128),
    # Small uniform init of the output layer keeps the initial Q estimates near zero.
    output_W_init=LI.Uniform(-3e-6, 3e-6),
    output_b_init=LI.Uniform(-3e-6, 3e-6),
)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=256,  # Number of samples for each minibatch.
    max_path_length=1500,  # 5 seconds
    epoch_length=15000,  # How many timesteps for each epoch.
    min_pool_size=15000,  # Minimum size of the pool to start training.
    replay_pool_size=15000000,
    n_epochs=
Example #17
        max_path_length=env.horizon,
        n_itr=2000000,
        discount=0.99,
        step_size=0.01,
        # plot=True enables live plotting during training.
        plot=True,
    )
else:
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
Example #18
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor),
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    ddpg_policy = DeterministicMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=(64, 32))

    ddpg_es = OUStrategy(env_spec=env.spec)

    ddpg_qf = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=(100, 100))
    ddpg_qf_cost = ContinuousMLPQFunction(env_spec=env.spec,
                                          hidden_sizes=(100, 100))

    offline_itr_n = 100000

    algo = PDO_OFF(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=20000,
        max_path_length=15,
        n_itr=200,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        ddpg_policy=ddpg_policy,
        ddpg_qf=ddpg_qf,
        ddpg_qf_cost=ddpg_qf_cost,
        ddpg_es=ddpg_es,
        ddpg_dual_var=0,
        ddpg_batch_size=64,
        ddpg_qf_learning_rate=1e-4,
        ddpg_qf_cost_learning_rate=1e-4,
        ddpg_dual_learning_rate=1e-3,
        ddpg_policy_learning_rate=1e-3,
        ddpg_scale_reward=1,
        ddpg_scale_cost=1,
        offline_itr_n=offline_itr_n,
        balance=0,
        safety_tradeoff_coeff_lr=1e-2,
        ddpg_avg_horizon=offline_itr_n,
        adjust_epoch=5,
        ddpg_qf_weight_decay=0.,
        #plot=True,
    )

    algo.train()
    f.close()