def run_vpg_baseline_large_batch_size_no_critic(*_):
    """Train VPG with a linear feature baseline and the no-critic sampler.

    Any positional arguments are accepted and ignored. The environment
    factory (``env_name``), ``n_itr``, ``learning_rate``, ``batch_size`` and
    ``num_of_agents`` are read from names defined outside this function.
    """
    environment = normalize(env_name())
    spec = environment.spec

    gaussian_policy = GaussianMLPPolicy(
        env_spec=spec,
        hidden_sizes=(100, 50, 25),
        adaptive_std=False,
    )
    linear_baseline = LinearFeatureBaseline(env_spec=spec)

    # Echo the run configuration before training starts.
    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))

    vpg_settings = {
        "env": environment,
        "policy": gaussian_policy,
        "baseline": linear_baseline,
        # The total batch is the per-agent batch scaled by the agent count.
        "batch_size": batch_size * num_of_agents,
        "max_path_length": 500,
        "n_itr": n_itr,
        "discount": 0.99,
        "optimizer_args": {'learning_rate': learning_rate},
        "sampler_cls": BatchSampler_no_critic,
    }
    VPG(**vpg_settings).train()
def run_task(*_):
    """Train VPG on the ``normalize``-wrapped ``Cassie2dEnv``.

    Positional arguments are accepted and ignored. When the outer
    ``load_policy`` flag is truthy the policy is taken from the ``'policy'``
    entry of a joblib snapshot; otherwise a new Gaussian MLP policy is built.
    """
    env = normalize(Cassie2dEnv())

    if not load_policy:
        # Two hidden layers with 32 units each.
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            init_std=1.0,
        )
    else:
        snapshot_path = "123"
        snapshot = joblib.load(snapshot_path)
        policy = snapshot['policy']
        print("Loading Pretrained Policy ...............................")

    trainer = VPG(
        env=env,
        policy=policy,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        batch_size=10000,
        # dt = (1/2000)*n, where n is Step(n)
        max_path_length=1000,
        n_itr=400,
        discount=0.99,
        # default was 0.01
        step_size=0.005,
        plot=False,
    )
    trainer.train()
def run_task(*_):
    """Train VPG on ``MountainCarContinuous-v0`` with a zero baseline.

    Positional arguments are accepted and ignored.
    """
    gym_env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)
    spec = gym_env.spec

    zero_baseline = ZeroBaseline(env_spec=spec)
    # Policy network with two hidden layers.
    mlp_policy = GaussianMLPPolicy(env_spec=spec, hidden_sizes=(64, 64))

    vpg_settings = {
        "env": gym_env,
        "policy": mlp_policy,
        "baseline": zero_baseline,
        "batch_size": 100,
        "max_path_length": 100,
        "n_itr": 10000,
        "discount": 0.99,
        "optimizer_args": {"learning_rate": 0.01},
    }
    VPG(**vpg_settings).train()
def test_baseline(baseline_cls):
    """Run a single VPG iteration on ``CartpoleEnv`` with the given baseline.

    Args:
        baseline_cls: Callable invoked as ``baseline_cls(env_spec=...)`` to
            build the baseline under test.
    """
    cartpole = CartpoleEnv()
    spec = cartpole.spec

    small_policy = GaussianMLPPolicy(env_spec=spec, hidden_sizes=(6,))
    baseline_under_test = baseline_cls(env_spec=spec)

    # One iteration only: the check is that training runs with this baseline.
    VPG(
        env=cartpole,
        policy=small_policy,
        baseline=baseline_under_test,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
    ).train()
def run_task(*_):
    """Train VPG with a categorical MLP policy on ``DrivingEnv-v0``.

    Positional arguments are accepted and ignored. The rollout length is
    taken from the environment's ``horizon`` attribute.
    """
    # NOTE(review): presumably imported only for its side effect of making
    # 'DrivingEnv-v0' available to gym -- confirm before removing.
    import gym_driving

    driving_env = normalize(GymEnv('DrivingEnv-v0'))
    spec = driving_env.spec

    categorical_policy = CategoricalMLPPolicy(env_spec=spec, hidden_sizes=(64, 64))
    linear_baseline = LinearFeatureBaseline(env_spec=spec)

    trainer = VPG(
        env=driving_env,
        policy=categorical_policy,
        baseline=linear_baseline,
        batch_size=40000,
        max_path_length=driving_env.horizon,
        n_itr=250,
        discount=0.99,
        step_size=0.01,
    )
    trainer.train()
def run_task(*_):
    """Train VPG on ``HovorkaInterval-v0`` for one reward/network setting.

    Positional arguments are accepted and ignored. The reward flag is taken
    from ``reward_functions[k]`` and the hidden layer sizes from
    ``NN_sizes[i]``; all four names come from outside this function.
    """
    env = normalize(GymEnv('HovorkaInterval-v0'))
    # Select the reward function on the wrapped environment.
    env.wrapped_env.env.env.reward_flag = reward_functions[k]

    linear_baseline = LinearFeatureBaseline(env_spec=env.spec)

    gaussian_policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=NN_sizes[i],
        learn_std=True,
        init_std=1,
    )

    # max_path_length is not passed, so VPG uses its own default.
    vpg_settings = {
        "env": env,
        "policy": gaussian_policy,
        "baseline": linear_baseline,
        "batch_size": 5000,
        "n_itr": 200,
        "discount": .99,
        "step_size": 0.01,
    }
    VPG(**vpg_settings).train()
def run_task(*_):
    """Train VPG on the gym environment named by ``models[k]``.

    Positional arguments are accepted and ignored; ``models`` and ``k`` are
    read from outside this function.
    """
    env = normalize(GymEnv(models[k]))
    spec = env.spec

    linear_baseline = LinearFeatureBaseline(env_spec=spec)
    gaussian_policy = GaussianMLPPolicy(
        env_spec=spec,
        hidden_sizes=(100, 50, 25),
        learn_std=True,
        init_std=1,
    )

    # max_path_length is not passed, so VPG uses its own default.
    trainer = VPG(
        env=env,
        policy=gaussian_policy,
        baseline=linear_baseline,
        batch_size=5000,
        n_itr=200,
        discount=.99,
        step_size=0.01,
    )
    trainer.train()
def run_task(*_):
    """Train VPG with a categorical MLP policy on ``Acrobot-v1``.

    Positional arguments are accepted and ignored. The environment is
    created with ``force_reset=True`` and ``record_video=True``, and its
    ``horizon`` is printed before training.
    """
    acrobot = GymEnv(env_name="Acrobot-v1", force_reset=True, record_video=True)
    env = normalize(acrobot)

    # NOTE(review): the horizon is only printed; VPG below is given a
    # hard-coded max_path_length of 500 -- confirm this is intended.
    print(env.horizon)

    linear_baseline = LinearFeatureBaseline(env_spec=env.spec)
    # Policy network with two hidden layers.
    categorical_policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
    )

    vpg_settings = {
        "env": env,
        "policy": categorical_policy,
        "baseline": linear_baseline,
        "batch_size": 800,
        "max_path_length": 500,
        "n_itr": 10000,
        "discount": 0.99,
        "optimizer_args": {"learning_rate": 0.01},
    }
    VPG(**vpg_settings).train()
def main(args):
    """Train a policy with VPG and pickle it to ``args.output_policy``.

    Args:
        args: Object exposing ``env_id`` (gym environment id),
            ``input_policy`` (path to a pickled starting policy, or ``None``
            to start from a new one) and ``output_policy`` (path the trained
            policy is written to).
    """
    env = GymEnv(args.env_id)

    warm_start_path = args.input_policy
    if warm_start_path is None:
        # No starting policy supplied: begin from a new one.
        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    else:
        # NOTE(review): unpickling runs arbitrary code from the file; only
        # point input_policy at files from a trusted source.
        with open(warm_start_path, "rb") as policy_file:
            policy = pickle.load(policy_file)

    # Only env, policy and baseline are passed; every other VPG setting is
    # left at whatever VPG defaults to.
    trainer = VPG(
        env=env,
        policy=policy,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    trainer.train()

    with open(args.output_policy, "wb") as policy_file:
        pickle.dump(policy, policy_file)
rand_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) step_testing_rews = [] step_testing_rews.append( test_step_adv(env, pro_policy, path_length=path_length)) rand_step_testing_rews = [] rand_step_testing_rews.append( test_rand_step_adv(env, pro_policy, path_length=path_length)) adv_testing_rews = [] adv_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) #embed() for ni in range(n_itr): logger.log('\n\n\n####expNO{}_{} global itr# {}####\n\n\n'.format( ne, adv_name, ni)) pro_algo.train() pro_rews += pro_algo.rews all_rews += pro_algo.rews logger.log('Protag Reward: {}'.format(np.array(pro_algo.rews).mean())) const_testing_rews.append( test_const_adv(env, pro_policy, path_length=path_length)) rand_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) step_testing_rews.append( test_step_adv(env, pro_policy, path_length=path_length)) rand_step_testing_rews.append( test_rand_step_adv(env, pro_policy, path_length=path_length)) adv_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) if ni != 0 and ni % save_every == 0: ## SAVING INFO ##
algo = VPG( env=env, policy=policy, # baseline=baseline, baseline=baseline, batch_size=5000, max_path_length=env.horizon, n_itr=200, # discount=0.80, discount=.9, step_size=0.01 # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) algo.train() ## Testing the policy reward = [] actions = [] s = env.reset() # done = False # Testing the algorithm # while not done: for i in range(48): # Get action recommended by policy
if args.extra is not None: exp_prefix += "_" + args.extra if args.get_exp_paths: print('data/s3/' + exp_prefix.replace('_', '-')) else: if (not args.eval) and (not args.pchange) and (not args.savecomvel) and ( not args.savejointangles): if len(algo_lst) > 0: # algo_lst = list(filter(lambda x: x[0] != 20 and x[0] != 30, algo_lst)) # algo_lst = list(filter(lambda x: x[0] != 10, algo_lst)) print("algo list", algo_lst) for seed, algo in algo_lst: print(seed) exp_name = '{0}_{1}_{2}'.format( exp_prefix, str(seed), time.strftime("%d-%m-%Y_%H-%M-%S")) run_experiment_lite(stub_method_call=algo.train(), mode=mode, use_gpu=use_gpu, use_cloudpickle=False, pre_commands=['pip install --upgrade pip'], n_parallel=n_parallel, snapshot_mode=snapshot_mode, snapshot_gap=snapshot_gap, seed=seed, confirm_remote=False, exp_prefix=exp_prefix, exp_name=exp_name) else: print("seeds", seeds) for seed in seeds: exp_name = '{0}_{1}_{2}'.format(
hidden_arc = [str(i) for i in hidden_sizes] hidden_arc = '_'.join(hidden_arc) data_dir = 'Reinforce_batchSize_{}_nIters_{}_stepSize_{}_gamma_{}_initStd_{}{}_policyPar_{}_reward_{}'\ .format(batch_size, n_itr, step_size,''.join(str(gamma).split('.')), init_std, learn_std, hidden_arc, reward_fun) now = datetime.datetime.now(dateutil.tz.tzlocal()) timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') DROPBOX_DIR = '/home/jonas/Dropbox/results/jonas_experiments/' # log_dir = PROJECT_PATH + '/data/local/' + data_dir + timestamp log_dir = DROPBOX_DIR + data_dir + timestamp # Running and saving the experiment run_experiment_lite( algo.train(), log_dir=log_dir, # Number of parallel workers for sampling n_parallel=1, # Only keep the snapshot parameters for the last iteration snapshot_mode="last", # Specifies the seed for the experiment. If this is not provided, a random seed # will be used # exp_prefix="Reinforce_" + env_name, # exp_prefix=data_dir seed=1, mode="local", plot=False, # terminate_machine=args.dont_terminate_machine, added_project_directories=[ osp.abspath(osp.join(osp.dirname(__file__), '.'))
from contrib.alexbeloi.is_sampler import ISSampler

""" Example using VPG with ISSampler, iterations alternate between live and importance sampled iterations. """

# Script body: build the environment, policy and baseline, then train.
env = normalize(CartpoleEnv())

# Two hidden layers with 32 units each.
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    # Sample through ISSampler, configured with n_backtrack=1.
    sampler_cls=ISSampler,
    sampler_args={"n_backtrack": 1},
)
algo.train()
def run_task(*_):
    """Train the algorithm selected by ``args.algorithm`` and return it.

    Positional arguments are accepted and ignored. The settings are read
    from the outer ``args`` object: ``args.algorithm`` selects the algorithm
    class (0 = VPG, 1 = TRPO, 2 = TNPG), ``args.env`` names the gym
    environment, ``args.reward`` is the reward flag set on the wrapped
    environment and ``args.n_itr`` is the number of iterations.

    Returns:
        The algorithm instance after ``train()`` has been called on it.

    Raises:
        ValueError: If ``args.algorithm`` is not 0, 1 or 2.
    """
    # Resolve the algorithm class before building anything else, so an
    # unsupported id fails immediately with a clear message instead of an
    # UnboundLocalError on ``algo`` after the env and policy are created.
    algorithm_id = args.algorithm
    if algorithm_id == 0:
        algo_cls = VPG
    elif algorithm_id == 1:
        algo_cls = TRPO
    elif algorithm_id == 2:
        algo_cls = TNPG
    else:
        raise ValueError(
            "Unsupported args.algorithm {!r}; expected 0 (VPG), 1 (TRPO) "
            "or 2 (TNPG)".format(algorithm_id)
        )

    env = normalize(GymEnv(args.env))
    # Select the reward function on the wrapped environment.
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2
    hidden_sizes = (32, 32)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        learn_std=learn_std,
        init_std=init_std,
    )

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    # All three supported algorithms are built with the same keyword set.
    algo = algo_cls(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        n_itr=n_itr,
        discount=gamma,
        step_size=step_size,
    )

    algo.train()

    return algo