def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
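The run_task functions in these examples are typically launched through rllab's run_experiment_lite (see Example #16 below). A minimal launcher sketch, with an illustrative exp_prefix and seed, might look like this:

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_prefix="vpg_mountaincar",  # illustrative name
    seed=1,
)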
Example #2
def run_vpg_baseline_large_batch_size_no_critic(*_):
    env = normalize(env_name())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(
            100,
            50,
            25,
        ),
        adaptive_std=False,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
Example #3
def run_task(*_):
    env = normalize(Cassie2dEnv())

    if load_policy:
        filename = "123"
        data = joblib.load(filename)
        policy = data['policy']
        print("Loading Pretrained Policy ...............................")
    else:
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32),
            init_std=1.0,
            #adaptive_std=True,
        )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=1000,  # dt = (1/2000)*n, where n is Step(n)
        n_itr=400,
        discount=0.99,
        step_size=0.005,  # default was 0.01
        # Set plot=True to enable plotting
        plot=False,
    )
    algo.train()
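The load_policy branch above expects a joblib pickle containing a 'policy' entry. As a save-side counterpart sketch (run, e.g., at the end of run_task; the filename below is purely illustrative), the trained policy could be persisted in the same layout:

import joblib

# Illustrative only: store the policy under the 'policy' key that the
# loading branch above reads back with joblib.load(filename)['policy'].
joblib.dump(dict(policy=policy), "cassie_vpg_policy.pkl")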
Example #4
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env, policy=policy, baseline=baseline,
        n_itr=1, batch_size=1000, max_path_length=100
    )
    algo.train()
Example #5
def run_task(*_):
    import gym_driving
    env = normalize(GymEnv('DrivingEnv-v0'))
    # env = normalize(GymEnv('CartPole-v0'))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=env.horizon,
        n_itr=250,
        discount=0.99,
        step_size=0.01,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
Example #6
        def run_task(*_):
            env = normalize(GymEnv('HovorkaInterval-v0'))
            # env.wrapped_env.env.env.env.reward_flag = 'absolute'
            env.wrapped_env.env.env.reward_flag = reward_functions[k]

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            # hidden_sizes=(100, 50, 25)

            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes,
                                       learn_std=learn_std,
                                       init_std=init_std)

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            algo = VPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
            algo.train()
Example #7
        def run_task(*_):
            env = normalize(GymEnv(models[k]))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            # hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            hidden_sizes = (100, 50, 25)

            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes,
                                       learn_std=learn_std,
                                       init_std=init_std)

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            algo = VPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
            algo.train()
Example #8
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = normalize(
        GymEnv(env_name="Acrobot-v1", force_reset=True, record_video=True))

    max_path_length = env.horizon
    print(max_path_length)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))
    # optimizer = FirstOrderOptimizer(update_method=lasagne.updates.adam, learning_rate=1e-1)

    algo = VPG(env=env,
               policy=policy,
               baseline=baseline,
               batch_size=800,
               max_path_length=500,
               n_itr=10000,
               discount=0.99,
               optimizer_args=dict(learning_rate=0.01))
    algo.train()
Example #9
def main(args):
    env = GymEnv(args.env_id)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        # n_itr=2000,
        # max_path_length=env.horizon,
        # discount=0.99,
        # batch_size=4000,
    )
    algo.train()
    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)
Example #10
from contrib.alexbeloi.is_sampler import ISSampler

"""
Example using VPG with ISSampler; iterations alternate between live and
importance-sampled iterations.
"""

env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
)
algo.train()
Example #11
    zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                            is_protagonist=False,
                                            constant_val=0.0)

    ## Optimizer for the Protagonist ##
    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_process)
    if adv_name == 'no_adv':
        pro_algo = VPG(
            env=env,
            pro_policy=pro_policy,
            policy=pro_policy,
            adv_policy=zero_adv_policy,
            pro_baseline=pro_baseline,
            baseline=pro_baseline,
            adv_baseline=pro_baseline,
            batch_size=batch_size,
            max_path_length=path_length,
            n_itr=n_pro_itr,
            discount=0.995,
            gae_lambda=gae_lambda,
            step_size=step_size,
            # optimizer=optimizer,
            is_protagonist=True)

    ## Joint optimization ##
    if ifRender:
        test_const_adv(env,
                       pro_policy,
                       path_length=path_length,
                       n_traj=1,
                       render=True)
Example #12
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # A single hidden layer with 8 units; larger alternatives are kept commented below.
    hidden_sizes=(8, ),
    # hidden_sizes=(32, 32),
    # hidden_sizes=(100, 50, 25),
    learn_std=True,
    init_std=1)
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=200,
    # discount=0.80,
    discount=.9,
    step_size=0.01,
    # Uncomment the line below to enable plotting
    # plot=True,
)
algo.train()

## Testing the policy

reward = []
actions = []

s = env.reset()
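The testing code is cut off here. A minimal rollout sketch consistent with the variables above (reward, actions, s) and rllab's policy/env interfaces could continue as follows:

# Sketch only: roll out the trained policy for one episode,
# recording actions and per-step rewards.
done = False
while not done:
    a, _ = policy.get_action(s)
    s, r, done, _ = env.step(a)
    actions.append(a)
    reward.append(r)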
Example #13
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from contrib.alexbeloi.is_sampler import ISSampler
"""
Example using VPG with ISSampler; iterations alternate between live and
importance-sampled iterations.
"""

env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
)
algo.train()
Example #14
n_parallel = 1
exp_prefix, algo = None, None
if algo_name == 'vpg':
    exp_prefix = args.env_name + "_vpg_larger"
    if args.test:
        exp_prefix = "test_" + exp_prefix
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(256, 64),  # (64, 64)
        min_std=1e-4)
    optimizer_args = dict(learning_rate=0.01)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=max_path_length,
        n_itr=n_itr,  # 40
        discount=discount,
        optimizer_args=optimizer_args)
    exp_prefix = "{0}_lr{1}".format(exp_prefix, str(learning_rate))
elif algo_name == 'hierarchical_vpg':
    exp_common_prefix = args.env_name + "_hiervpg"
    if args.test:
        exp_common_prefix = "test_" + exp_common_prefix
    if skill_baseline:
        exp_common_prefix += "_sbl"
    if args.random_init:
        exp_common_prefix += "_randominit"
    if trainable_snn:
        exp_common_prefix += "_trainablelat"
Example #15
# ===================
# Defining the policy
# ===================
policy = GaussianMLPPolicy(env_spec=env.spec,
                           hidden_sizes=hidden_sizes,
                           learn_std=learn_std,
                           init_std=init_std)

# =======================
# Defining the algorithm
# =======================
algo = VPG(env=env,
           policy=policy,
           baseline=baseline,
           batch_size=batch_size,
           n_itr=n_itr,
           discount=gamma,
           step_size=step_size)

# Formatting string for data directory
hidden_arc = [str(i) for i in hidden_sizes]
hidden_arc = '_'.join(hidden_arc)

data_dir = 'Reinforce_batchSize_{}_nIters_{}_stepSize_{}_gamma_{}_initStd_{}{}_policyPar_{}_reward_{}'\
        .format(batch_size, n_itr, step_size, ''.join(str(gamma).split('.')),
                init_std, learn_std, hidden_arc, reward_fun)

now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

DROPBOX_DIR = '/home/jonas/Dropbox/results/jonas_experiments/'
Example #16
from rllab.misc.instrument import run_experiment_lite, stub
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.uniform_control_policy import UniformControlPolicy

from rllab.sampler import parallel_sampler

stub(globals())

env = GymEnv("SpaceInvaders-v0", force_reset=True)
# env = AtariEnvWrapper(env, 4, 84, 84)
# env = CartpoleEnv()
policy = CategoricalMLPPolicy(env.spec)
baseline = ZeroBaseline(env.spec)
# parallel_sampler.initialize(n_parallel=1)
algo = VPG(env, policy, baseline)
algo = DummyAlgo(env, policy)
# algo.train()

run_experiment_lite(
    algo.train(),
    exp_prefix="dummy-tester",
    # Number of parallel workers for sampling
    n_parallel=2,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # mode="ec2",
    # use_gpu=True,
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
Example #17
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2

    # hidden_sizes=(8,)
    hidden_sizes = (32, 32)
    # hidden_sizes=(100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    elif args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    elif args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    #     algo = DDPG(env=env,
    #                 policy=policy,
    #                 baseline=baseline,
    #                 batch_size=batch_size,
    #                 n_itr=n_itr,
    #                 discount=gamma,
    #                 step_size=step_size)
    algo.train()

    return algo
Example #18
from rllab.algos.vpg import VPG
from rllab.baselines.zero_baseline import ZeroBaseline
from three_card_poker_env import ThreeCardPokerEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy

env = normalize(ThreeCardPokerEnv())
policy = CategoricalMLPPolicy(env_spec=env.spec)
baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(env=env, policy=policy, baseline=baseline, n_itr=300)
algo.train()