def run_task(*_):
    env = normalize(PointEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
def run_task(*_): """Implement the ``run_task`` method needed to run experiments with rllab. Note that the flow-specific parameters are imported at the start of this script and unzipped and processed here. """ env_name = flow_params["env_name"] exp_tag = flow_params["exp_tag"] sumo_params = flow_params["sumo"] vehicles = flow_params["veh"] env_params = flow_params["env"] net_params = flow_params["net"] initial_config = flow_params.get("initial", InitialConfig()) traffic_lights = flow_params.get("tls", TrafficLights()) # import the scenario and generator classes module = __import__("flow.scenarios", fromlist=[flow_params["scenario"]]) scenario_class = getattr(module, flow_params["scenario"]) module = __import__("flow.scenarios", fromlist=[flow_params["generator"]]) generator_class = getattr(module, flow_params["generator"]) # create the scenario object scenario = scenario_class(name=exp_tag, generator_class=generator_class, vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=traffic_lights) pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25)) baseline = LinearFeatureBaseline(env_spec=env.spec) horizon = flow_params["env"].horizon algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=horizon * (N_ROLLOUTS - PARALLEL_ROLLOUTS + 1), max_path_length=horizon, n_itr=500, discount=0.999, step_size=0.01, ) algo.train(),
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1,))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=10,
                batch_size=1000,
                max_path_length=100,
                step_size=0.01)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_nonlinearity=naive_relu,
                               hidden_sizes=(1,))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=1,
                batch_size=1000,
                max_path_length=100,
                step_size=0.001)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def run_task(v):
    print("_________________________________")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("### agents_number : " + str(agents_number) + " ####")
    print("### ####")
    print("### participation_rate : " + str(participation_rate) + " ####")
    print("### ####")
    print("### average_period : " + str(average_period) + " ####")
    print("### ####")
    print("### quantization_tuning : " + str(quantization_tuning) + " ####")
    print("### ####")
    print("### discount : " + str(discount) + " ####")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("_________________________________")

    env = normalize(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        participation_rate=participation_rate,
        agents_number=agents_number,
        average_period=average_period,
        env=env,
        policy=policy,
        baseline=baseline,
        difference_params=True,
        quantize=True,
        quantization_tuning=quantization_tuning,
        batch_size=400,
        max_path_length=100,
        n_itr=50,
        discount=discount,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    tot_cars = 6
    auton_cars = 6

    sumo_params = SumoParams(time_step=0.1,
                             rl_speed_mode="no_collide",
                             sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("rl", (RLController, {}), (StaticLaneChanger, {}),
                          (ContinuousRouter, {}), 0, auton_cars)

    env_params = EnvParams(additional_params={"target_velocity": 25,
                                              "num_steps": 1000})

    additional_net_params = {"length": 220, "lanes": 1,
                             "speed_limit": 30, "resolution": 40}
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig()

    scenario = LoopScenario("rl-test", CircleGenerator, vehicles, net_params,
                            initial_config)

    env_name = "SimpleAccelerationEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    logging.info("Experiment Set Up complete")

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16,)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=2,  # 1000
        # discount=0.99,
        # step_size=0.01,
    )
    algo.train()
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=50000,
        max_path_length=15,
        n_itr=100,
        gae_lambda=0.95,
        discount=0.995,
        safety_tradeoff_coeff_lr=1e-1,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )
    algo.train()
    f.close()
def experiment_scratch_baseline():
    k = 100
    for seed in [10, 30, 50, 100]:
        for _ in range(4):
            env = StandardControllerEnv(k=4, noise=0.05, num_dynamics=4,
                                        num_points=k)
            now = datetime.datetime.now()
            timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=1000,
                max_path_length=env.horizon,
                n_itr=100,
                discount=0.995,
                step_size=0.001,
                plot=False,
            )

            run_experiment_lite(
                algo.train(),
                # Number of parallel workers for sampling
                n_parallel=4,
                # Only keep the snapshot parameters for the last iteration
                snapshot_mode="last",
                # script="scripts/run_experiment_lite_rl.py",
                script="scripts/run_experiment_lite.py",
                exp_name=os.path.join("Baseline %d" % k, timestamp),
                log_dir=os.path.join(
                    "Results/Controls/Increasing_Points/Baseline", timestamp),
                # Specifies the seed for the experiment. If this is not
                # provided, a random seed will be used
                seed=seed,
                # plot=True,
            )
def test(num=1, path="./Results/Tmp", save=False):
    # env = normalize(GymEnv("BipedalWalkerPit-v2"))
    env = normalize(GymEnv("BipedalWalker-v2", record_video=False))
    # env = DoublePendulumEnv()

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    rollout(env, policy)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.995,
        step_size=0.01,
        # plot=True,
    )
def run_task(*_): env = normalize( GymEnv("DartWalker3d-v1", record_log=False, record_video=False)) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(128, 64), net_mode=0, ) #policy = joblib.load('data/local/experiment/walker3d_symmetry1_sd13_2alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_2d_hardvelenforce_contsupport/policy.pkl') # increase policy std a bit for exploration #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5) print('trainable parameter size: ', policy.get_param_values(trainable=True).shape) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) algo = TRPO_Symmetry( env=env, policy=policy, baseline=baseline, batch_size=60000, max_path_length=env.horizon, n_itr=500, discount=0.99, step_size=0.02, gae_lambda=0.97, observation_permutation=np.array([0.0001,-1, 2,-3,-4, -5,-6,7, 14,-15,-16, 17, 18,-19, 8,-9,-10, 11, 12,-13,\ 20,21,-22, 23,-24,-25, -26,-27,28, 35,-36,-37, 38, 39,-40, 29,-30,-31, 32, 33,-34, 42, 41]), #observation_permutation=np.array([0.0001, 1, 5,6,7, 2,3,4, 8,9,10, 14,15,16, 11,12,13]), #action_permutation=np.array([3,4,5, 0.00001,1,2]), action_permutation=np.array([-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]), sym_loss_weight=2.0, whole_paths=False, ) algo.train()
def run_task(*_): env = normalize(GymEnv("Pendulum-v0")) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=env.horizon, n_itr=50, discount=0.99, step_size=0.01, plot=True, ) algo.train()
def run_task(*_): env = normalize( GymEnv("DartHumanWalker-v1", record_log=False, record_video=False)) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(128, 64), net_mode=0, ) #policy = joblib.load('data/local/experiment/humanwalker_symmetry1_sd11_1alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_spd20002000/policy.pkl') # increase policy std a bit for exploration #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5) print('trainable parameter size: ', policy.get_param_values(trainable=True).shape) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) algo = TRPO_Symmetry( env=env, policy=policy, baseline=baseline, batch_size=50000, max_path_length=env.horizon, n_itr=1000, discount=0.99, step_size=0.02, gae_lambda=0.97, observation_permutation=np.array([0.0001,-1,2,-3,-4, -11,12,-13,14,15,16, -5,6,-7,8,9,10, -17,18, -19, -24,25,-26,27, -20,21,-22,23,\ 28,29,-30,31,-32,-33, -40,41,-42,43,44,45, -34,35,-36,37,38,39, -46,47, -48, -53,54,-55,56, -49,50,-51,52, 58,57]), action_permutation=np.array([-6,7,-8, 9, 10,11, -0.001,1,-2, 3, 4,5, -12,13, -14, -19,20,-21,22, -15,16,-17,18]), sym_loss_weight=1.0, action_reg_weight=0.0, whole_paths=False, ) algo.train()
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Acrobot-v2'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.95,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=LinearFeatureBaseline(env_spec=env))

    data_path = 'data/acrobat_data_rllab_ppo/%s/' % exp_name
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
def run_vime(vv):
    setup_rllab_logging(vv)
    seed = vv['seed']
    eta = 0.0001
    path_len = vv['path_len']

    mdp = get_env(vv)
    policy = GaussianMLPPolicy(
        env_spec=mdp.spec,
        hidden_sizes=(300, 200, 100),
        init_std=1.0,
    )

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = path_len * 100
    algo = TRPOVIME(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=path_len,
        n_itr=1000,
        step_size=0.01,
        eta=eta,
        snn_n_samples=10,
        subsample_factor=1.0,
        use_replay_pool=True,
        use_kl_ratio=True,
        use_kl_ratio_q=True,
        n_itr_update=1,
        kl_batch_size=1,
        normalize_reward=False,
        replay_pool_size=1000000,
        n_updates_per_sample=5000,
        second_order_update=True,
        unn_n_hidden=[32],
        unn_layers_type=[1, 1],
        unn_learning_rate=0.0001
    )
    algo.train()
def run_task(v):
    which_agent = v["which_agent"]
    env, _ = create_env(which_agent)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    optimizer_params = dict(base_eps=1e-5)

    # how many iters
    num_trpo_iters = 2500
    if which_agent == 1:
        num_trpo_iters = 2500
    if which_agent == 2:
        steps_per_rollout = 333
        num_trpo_iters = 200
    if which_agent == 4:
        num_trpo_iters = 2000
    if which_agent == 6:
        num_trpo_iters = 2000

    # recreate the policy
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(v["depth_fc_layers"],
                                             v["depth_fc_layers"]),
                               init_std=v["std_on_mlp_policy"])
    all_params = np.concatenate(
        (v["policy_values"], policy._l_log_std.get_params()[0].get_value()))
    policy.set_param_values(all_params)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v["trpo_batchsize"],
                max_path_length=v["steps_per_rollout"],
                n_itr=num_trpo_iters,
                discount=0.995,
                optimizer=v["ConjugateGradientOptimizer"](
                    hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
                step_size=0.05,
                plot_true=True)

    # train the policy
    algo.train()
def run_task(*_):
    env = normalize(GymEnv(models[k]))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 1

    # hidden_sizes = NN_sizes[i]
    # hidden_sizes = (8,)
    # hidden_sizes = (32, 32)
    hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        learn_std=learn_std,
        init_std=init_std
    )

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = 200
    gamma = .99
    step_size = 0.01
    # max_path_length = 96

    # algo = VPG(
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        # max_path_length=max_path_length,
        n_itr=n_itr,
        discount=gamma,
        step_size=step_size
    )
    algo.train()
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    baseline = GaussianMLPBaseline(env_spec=env)
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.99,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=baseline)

    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
def run_task(v):
    env, _ = create_env(v["which_agent"])

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    optimizer_params = dict(base_eps=1e-5)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v["batch_size"],
                max_path_length=v["steps_per_rollout"],
                n_itr=v["num_trpo_iters"],
                discount=0.995,
                optimizer=v["ConjugateGradientOptimizer"](
                    hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
                step_size=0.05,
                plot_true=True)

    # train the policy
    algo.train()
def run_task(*_):
    env = normalize(GymEnv('HovorkaInterval-v0'))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = reward_functions[k]

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 1

    hidden_sizes = NN_sizes[i]
    # hidden_sizes = (8,)
    # hidden_sizes = (32, 32)
    # hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = 200
    gamma = .99
    step_size = 0.01
    # max_path_length = 96

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        # max_path_length=max_path_length,
        n_itr=n_itr,
        discount=gamma,
        step_size=step_size)
    algo.train()
def run_vpg_baseline_large_batch_size_no_critic(*_):
    env = normalize(env_name())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(50, 25),
        adaptive_std=False,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True)
    algo.train()
def run_task(*_):
    # Non-registration of this custom environment is an rllab bug
    # See https://github.com/openai/rllab/issues/68
    # At the moment I'm bypassing this problem by adding the
    # import statement in gym_env.py
    import gym_follower_2d
    import lasagne.nonlinearities as NL

    gymenv = GymEnv(args.env, force_reset=True,
                    record_video=False, record_log=True)
    env = normalize(gymenv)

    logger.log("Training Policy on %s" % args.env)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(100, 50, 25),
                               hidden_nonlinearity=NL.tanh)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.num_epochs,
        discount=0.99,
        step_size=args.step_size,
        optimizer=ConjugateGradientOptimizer(
            reg_coeff=args.reg_coeff,
            hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)),
        plot=False,
    )
    algo.train()
def run_trpo(vv):
    setup_rllab_logging(vv)
    path_len = vv['path_len']

    env = get_env(vv)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(300, 200, 100),
        init_std=1.0,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * path_len,
        max_path_length=path_len,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
def run_task(*_): """TRY OUT normalized environment""" env = normalize(TendonOneSegmentEnv()) policy = GaussianMLPPolicy( env_spec = env.spec, hidden_sizes=(64, 64) # output_nonlinearity=NL.tanh ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size = 4000, max_path_length=np.inf, n_itr=20001, discount=0.99, step_size=0.01, ) algo.train()
def run_task(*_):
    # env = normalize(SwimmerWrapperGym('Swimmer-v1'))
    env = normalize(GymEnv('Swimmer-v1'))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        learn_std=True)

    print('horizon {}'.format(env.horizon))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
def rllab_vpg_launcher(variant):
    from rllab.algos.trpo import TRPO
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.algos.icm_trpo import ICM
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

    env_settings = get_env_settings(**variant['env_params'])
    env = TfEnv(env_settings['env'])
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        **variant['algo_params']
    )
    algorithm.train()
def run_task(v):
    env = ServerEnv(agents_number, -10, 10)
    policy = GaussianMLPPolicy(env_spec=env.agents_envs[0].spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        agents_number=agents_number,
        average_period=average_period,
        server_env=env,
        policy=policy,
        baseline=baseline,
        batch_size=400,
        max_path_length=100,
        n_itr=20,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(GymEnv(args.env))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = args.learn_std
    init_std = args.init_std

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = args.batch_size
    n_itr = args.n_itr
    gamma = args.gamma
    step_size = args.step_size

    algo = TNPG(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
    algo.train()
def run_task(*_): env = normalize(GymEnv("Pendulum-v0", record_video=False)) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(8, 8) ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=env.horizon, n_itr=50, discount=0.99, step_size=0.01, # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) algo.train()
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import NormalizedEnv
from rllab.algos.trpo import TRPO
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.bradly.third_person.envs.reacher import ReacherEnv
from rllab.envs.gym_env import GymEnv

stub(globals())

env = GymEnv("Reacher3DOF-v1", mode='oracle', force_reset=True)  # , imsize=(48,48)
# env = TfEnv(normalize(ReacherEnv()))

policy = GaussianMLPPolicy(
    # name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32),
    init_std=10
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=25000,
    max_path_length=50,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,