def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
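# Usage sketch (not from the original source): run_task above is written for
# rllab's run_experiment_lite launcher rather than for direct invocation.
# A minimal, assumed wiring looks like this:
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    seed=1,
)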
def run_vpg_baseline_large_batch_size_no_critic(*_):
    # n_itr, learning_rate, batch_size, num_of_agents, env_name and
    # BatchSampler_no_critic are assumed to be defined at module level.
    env = normalize(env_name())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
        adaptive_std=False,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
def run_task(*_):
    env = normalize(Cassie2dEnv())

    if load_policy:
        filename = "123"
        data = joblib.load(filename)
        policy = data['policy']
        print("Loading Pretrained Policy ...............................")
    else:
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32),
            init_std=1.0,
            # adaptive_std=True,
        )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=1000,  # dt = (1/2000)*n, where n is Step(n)
        n_itr=400,
        discount=0.99,
        step_size=0.005,  # default was 0.01
        # Set plot=True to enable plotting
        plot=False,
    )
    algo.train()
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100
    )
    algo.train()
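# Usage sketch (assumption, not in the original): the smoke test above can be
# driven with any rllab baseline class, for example:
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.baselines.zero_baseline import ZeroBaseline

for baseline_cls in (ZeroBaseline, LinearFeatureBaseline):
    test_baseline(baseline_cls)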
def run_task(*_):
    import gym_driving
    env = normalize(GymEnv('DrivingEnv-v0'))
    # env = normalize(GymEnv('CartPole-v0'))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=env.horizon,
        n_itr=250,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(GymEnv('HovorkaInterval-v0'))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = reward_functions[k]

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 1

    hidden_sizes = NN_sizes[i]
    # hidden_sizes = (8,)
    # hidden_sizes = (32, 32)
    # hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        learn_std=learn_std,
        init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = 200
    gamma = .99
    step_size = 0.01
    # max_path_length = 96

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        # max_path_length=max_path_length,
        n_itr=n_itr,
        discount=gamma,
        step_size=step_size)
    algo.train()
def run_task(*_):
    env = normalize(GymEnv(models[k]))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 1

    # hidden_sizes = NN_sizes[i]
    # hidden_sizes = (8,)
    # hidden_sizes = (32, 32)
    hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        learn_std=learn_std,
        init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = 200
    gamma = .99
    step_size = 0.01
    # max_path_length = 96

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        # max_path_length=max_path_length,
        n_itr=n_itr,
        discount=gamma,
        step_size=step_size)
    algo.train()
def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = normalize(
        GymEnv(env_name="Acrobot-v1", force_reset=True, record_video=True))
    max_path_length = env.horizon
    print(max_path_length)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))

    # optimizer = FirstOrderOptimizer(update_method=lasagne.updates.adam, learning_rate=1e-1)
    algo = VPG(env=env,
               policy=policy,
               baseline=baseline,
               batch_size=800,
               max_path_length=500,
               n_itr=10000,
               discount=0.99,
               optimizer_args=dict(learning_rate=0.01))
    algo.train()
def main(args):
    env = GymEnv(args.env_id)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        # n_itr=2000,
        # max_path_length=env.horizon,
        # discount=0.99,
        # batch_size=4000,
    )
    algo.train()

    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)
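# Evaluation sketch (assumption, not in the original): main() pickles the
# trained policy to args.output_policy, so a separate script can reload it and
# roll it out with rllab's rollout helper. The file name and env id below are
# illustrative only.
import pickle

from rllab.envs.gym_env import GymEnv
from rllab.sampler.utils import rollout

with open("output_policy.pkl", "rb") as f:
    policy = pickle.load(f)

env = GymEnv("CartPole-v0")  # hypothetical env id; use the same one as training
path = rollout(env, policy, max_path_length=env.horizon)
print("episode return:", sum(path["rewards"]))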
from rllab.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from contrib.alexbeloi.is_sampler import ISSampler

"""
Example using VPG with ISSampler, iterations alternate between live and
importance sampled iterations.
"""

env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
)
algo.train()
zero_adv_policy = ConstantControlPolicy(
    env_spec=env.spec,
    is_protagonist=False,
    constant_val=0.0)

## Optimizer for the Protagonist ##
from rllab.sampler import parallel_sampler
parallel_sampler.initialize(n_process)

if adv_name == 'no_adv':
    pro_algo = VPG(
        env=env,
        pro_policy=pro_policy,
        policy=pro_policy,
        adv_policy=zero_adv_policy,
        pro_baseline=pro_baseline,
        baseline=pro_baseline,
        adv_baseline=pro_baseline,
        batch_size=batch_size,
        max_path_length=path_length,
        n_itr=n_pro_itr,
        discount=0.995,
        gae_lambda=gae_lambda,
        step_size=step_size,
        # optimizer=optimizer,
        is_protagonist=True)

## Joint optimization ##
if ifRender:
    test_const_adv(env, pro_policy, path_length=path_length, n_traj=1, render=True)
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy has a single hidden layer with 8 units.
    hidden_sizes=(8,),
    # hidden_sizes=(32, 32),
    # hidden_sizes=(100, 50, 25),
    learn_std=True,
    init_std=1)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=200,
    # discount=0.80,
    discount=.9,
    step_size=0.01
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
algo.train()

## Testing the policy
reward = []
actions = []
s = env.reset()
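# Rollout sketch (assumption): the testing block above only initializes the
# bookkeeping lists, so one plausible evaluation loop is shown here. The
# 200-step horizon is illustrative.
for _ in range(200):
    a, _ = policy.get_action(s)
    actions.append(a)
    s, r, done, _ = env.step(a)
    reward.append(r)
    if done:
        break
print("Episode return: {}".format(sum(reward)))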
from rllab.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from contrib.alexbeloi.is_sampler import ISSampler

"""
Example using VPG with ISSampler, iterations alternate between live and
importance sampled iterations.
"""

env = normalize(CartpoleEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
)
algo.train()
n_parallel = 1

exp_prefix, algo = None, None
if algo_name == 'vpg':
    exp_prefix = args.env_name + "_vpg_larger"
    if args.test:
        exp_prefix = "test_" + exp_prefix
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(256, 64),  # (64, 64)
        min_std=1e-4)
    optimizer_args = dict(learning_rate=0.01)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=max_path_length,
        n_itr=n_itr,  # 40
        discount=discount,
        optimizer_args=optimizer_args)
    exp_prefix = "{0}_lr{1}".format(exp_prefix, str(learning_rate))
elif algo_name == 'hierarchical_vpg':
    exp_common_prefix = args.env_name + "_hiervpg"
    if args.test:
        exp_common_prefix = "test_" + exp_common_prefix
    if skill_baseline:
        exp_common_prefix += "_sbl"
    if args.random_init:
        exp_common_prefix += "_randominit"
    if trainable_snn:
        exp_common_prefix += "_trainablelat"
# ===================
# Defining the policy
# ===================
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    hidden_sizes=hidden_sizes,
    learn_std=learn_std,
    init_std=init_std)

# =======================
# Defining the algorithm
# =======================
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=batch_size,
    n_itr=n_itr,
    discount=gamma,
    step_size=step_size)

# Formatting string for data directory
hidden_arc = [str(i) for i in hidden_sizes]
hidden_arc = '_'.join(hidden_arc)

data_dir = 'Reinforce_batchSize_{}_nIters_{}_stepSize_{}_gamma_{}_initStd_{}{}_policyPar_{}_reward_{}'\
    .format(batch_size, n_itr, step_size, ''.join(str(gamma).split('.')),
            init_std, learn_std, hidden_arc, reward_fun)

now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

DROPBOX_DIR = '/home/jonas/Dropbox/results/jonas_experiments/'
from rllab.algos.vpg import VPG
from rllab.baselines.zero_baseline import ZeroBaseline
from rllab.envs.gym_env import GymEnv
from rllab.misc.instrument import run_experiment_lite, stub
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.uniform_control_policy import UniformControlPolicy
from rllab.sampler import parallel_sampler

stub(globals())

env = GymEnv("SpaceInvaders-v0", force_reset=True)
# env = AtariEnvWrapper(env, 4, 84, 84)
# env = CartpoleEnv()

policy = CategoricalMLPPolicy(env.spec)
baseline = ZeroBaseline(env.spec)

# parallel_sampler.initialize(n_parallel=1)
algo = VPG(env, policy, baseline)
# DummyAlgo is assumed to be defined or imported elsewhere in this project.
algo = DummyAlgo(env, policy)
# algo.train()

run_experiment_lite(
    algo.train(),
    exp_prefix="dummy-tester",
    # Number of parallel workers for sampling
    n_parallel=2,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # mode="ec2",
    # use_gpu=True,
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
)
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2

    # hidden_sizes = (8,)
    hidden_sizes = (32, 32)
    # hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        learn_std=learn_std,
        init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    if args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    if args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    #     algo = DDPG(
    #         env=env,
    #         policy=policy,
    #         baseline=baseline,
    #         batch_size=batch_size,
    #         n_itr=n_itr,
    #         discount=gamma,
    #         step_size=step_size
    #     )

    algo.train()

    return algo
from rllab.algos.vpg import VPG
from rllab.baselines.zero_baseline import ZeroBaseline
from three_card_poker_env import ThreeCardPokerEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy

env = normalize(ThreeCardPokerEnv())

policy = CategoricalMLPPolicy(env_spec=env.spec)
baseline = ZeroBaseline(env_spec=env.spec)

algo = VPG(env=env, policy=policy, baseline=baseline, n_itr=300)
algo.train()
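# Post-training sanity check (assumption, not in the original): sample a few
# hands with the learned policy and report per-episode returns.
for episode in range(5):
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action, _ = policy.get_action(obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    print("Episode {}: return {}".format(episode, total_reward))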