def run_task(v):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True enables live plotting of the policy during training.
        plot=True,
    )
    algo.train()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    env = TfEnv(DmControlEnv.from_suite(*task))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10,
        max_path_length=5,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )

    runner = LocalRunner(self.sess)
    runner.setup(algo, env)
    runner.train(n_epochs=1, batch_size=10)

    env.close()
def run_task(*_): with LocalRunner() as runner: env = TfEnv(env_name="CartPole-v1") policy = CategoricalLSTMPolicy( name="policy", env_spec=env.spec, lstm_layer_cls=L.TfBasicLSTMLayer, # gru_layer_cls=L.GRULayer, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env=env, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01, optimizer=ConjugateGradientOptimizer, optimizer_args=dict(hvp_approach=FiniteDifferenceHvp( base_eps=1e-5))) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=4000)
def run_task(*_):
    env_name = "HumanoidStandup-v2"
    hidden_sizes = (100, 50, 25)
    env = TheanoEnv(normalize(gym.make(env_name)))
    print(env.spec.observation_space, env.spec.action_space)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
        n_timestep=5e6,
        learning_rate=0.05,
        batch_size=5000,
        minibatch_size=500,
        n_sub_itr=10,
        baseline=baseline,
        max_path_length=500,
        discount=0.99,
        decay_learing_rate=True,
        log_dir='./logs/' + env_name,
    )
    algo.train()
def run_task(v): """ We wrap the main training loop in the run_task function so that run_experiment can easily execute variants of the experiment on different machines """ env = TfEnv(env_name="CartPole-v1") policy = CategoricalMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, # each with 32 hidden units. hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=40, discount=0.99, step_size=v["step_size"], # Uncomment both lines (this and the plot parameter below) to enable # plotting plot=True, ) algo.train()
def run_task(*_): """Run task function.""" initial_goal = np.array([0.6, -0.1, 0.30]) rospy.init_node('trpo_real_sawyer_reacher_exp', anonymous=True) env = TheanoEnv( ReacherEnv( initial_goal, initial_joint_pos=INITIAL_ROBOT_JOINT_POS, simulated=False, robot_control_mode='position')) rospy.on_shutdown(env.shutdown) env.initialize() policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=100, discount=0.99, step_size=0.01, plot=False, force_batch_sampler=True, ) algo.train()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    with self.graph.as_default():
        env = TfEnv(DmControlEnv.from_suite(*task))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=10,
            max_path_length=5,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )

        algo.train()

        env.close()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(normalize(PointEnv(goal=(-1, 0))))

        policy = GaussianMLPPolicy(
            name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        batch_size = 4000
        max_path_length = 100
        n_envs = batch_size // max_path_length

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size, plot=False)
def run_task(*_): """Run task function.""" initial_goal = np.array([0.6, -0.1, 0.40]) # Initialize moveit_commander moveit_commander.roscpp_initialize(sys.argv) rospy.init_node('trpo_sim_sawyer_reacher_exp', anonymous=True) env = ReacherEnv(initial_goal, initial_joint_pos=INITIAL_ROBOT_JOINT_POS, simulated=True) rospy.on_shutdown(env.shutdown) env.initialize() policy = GaussianMLPPolicy(env_spec=spec(env), hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=spec(env)) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=100, discount=0.99, step_size=0.01, plot=False, force_batch_sampler=True, ) algo.train()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(
            normalize(
                OneHotMultiTaskEnv(
                    task_env_cls=PointEnv,
                    task_args=TASK_ARGS,
                    task_kwargs=TASK_KWARGS)))

        policy = GaussianMLPPolicy(
            name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with LocalRunner(self.sess) as runner:
        logger.reset()
        env = TfEnv(gym.make("CartPole-v0"))

        policy = CategoricalMLPPolicy(
            env_spec=env.spec, hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            max_kl_step=1e6)

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5

        env.close()
def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(env_name="CartPole-v1")

        policy = CategoricalMLPPolicy(
            name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01)))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 90

        env.close()
def run_task(*_):
    env = TfEnv(
        normalize(
            MinibotEnv(
                use_maps=[0, 1],  # 'all' or a list of map indices
                discretized=True)))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=100,
        n_itr=15,
        discount=0.99,
        step_size=0.01,
        plot=plot,  # `plot` is expected to be defined at module level
        pause_for_plot=False)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
def run_task(vv):
    with LocalRunner() as runner:
        env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec, hidden_sizes=(32, 32), name="policy")

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=vv["step_size"],
        )

        runner.setup(algo=algo, env=env)
        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )
def run_task(*_):
    env = TfEnv(
        normalize(
            OneHotMultiTaskEnv(
                task_env_cls=PR2ArmClockEnv,
                task_args=TASK_ARGS,
                task_kwargs=TASK_KWARGS)))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        # Effectively run until manually stopped.
        n_itr=400000000,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
def test_batch_sampler(self):
    max_cpus = 8
    with LocalRunner(max_cpus=max_cpus) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(
            name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=1,
            whole_paths=True,
            discount=0.99)

        runner.setup(
            algo,
            env,
            sampler_cls=BatchSampler,
            sampler_args={'n_envs': max_cpus})

        try:
            runner.initialize_tf_vars()
        except BaseException:
            raise self.failureException(
                "LocalRunner should be able to initialize tf variables.")

        runner.start_worker()

        paths = runner.sampler.obtain_samples(0, 8)
        self.assertGreaterEqual(
            len(paths), max_cpus, "BatchSampler should sample at least "
            "max_cpus=%d trajectories" % max_cpus)
def test_categorical_policies(self, policy_cls):
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make("CartPole-v0")))

        policy = policy_cls(name="policy", env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=0.01,
            plot=True,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=4000)

        env.close()
def run_task(v): """ We wrap the main training loop in the run_task function so that run_experiment can easily execute variants of the experiment on different machines """ with LocalRunner() as runner: env = TfEnv(env_name="CartPole-v1") policy = CategoricalMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, # each with 32 hidden units. hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, step_size=v["step_size"], ) runner.setup(algo=algo, env=env) runner.train( n_epochs=40, batch_size=4000, # Uncomment to enable plotting # plot=True )
def run_task(*_):
    env = normalize(
        DmControlEnv(
            domain_name='cartpole',
            task_name='balance',
            visualize_reward=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, a Discrete action space works
    # with a CategoricalMLPPolicy, but a Box action space may need a
    # GaussianMLPPolicy instead (see the trpo_gym_pendulum.py example and the
    # policy-selection sketch after this function).
    env = TheanoEnv(normalize(gym.make("CartPole-v0")))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # plot=True enables live plotting of the policy during training.
        plot=True,
    )
    algo.train()
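# A minimal sketch of the action-space-based policy selection described in the
# comment above, assuming gym-style Discrete/Box spaces. The helper name
# `make_policy` is hypothetical; adapt the spec access to your env wrapper.
from gym.spaces import Box, Discrete

def make_policy(env):
    action_space = env.spec.action_space
    if isinstance(action_space, Discrete):
        # Discrete actions: a categorical distribution over actions.
        return CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    if isinstance(action_space, Box):
        # Continuous actions: a Gaussian distribution over actions.
        return GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    raise NotImplementedError(
        "No policy mapping for action space type: %s" % type(action_space))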
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DummyPolicy(env_spec=env)

    baseline = LinearFeatureBaseline(env_spec=env)

    algo = InstrumentedNOP(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        step_size=0.01,
        plot=True)
    algo.train()
def run_task(*_): with LocalRunner() as runner: env = TfEnv(gym.make("CartPole-v0")) policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=[32, 32]) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = REPS(env=env, policy=policy, baseline=baseline, max_path_length=100, discount=0.99) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=4000, plot=False)
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        step_size=0.01,
        plot=True)
    algo.train()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=500,
            discount=0.99,
            step_size=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
def run_task(*_): """Wrap REPS training task in the run_task function.""" env = TfEnv(gym.make("CartPole-v0")) policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=[32, 32]) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = REPS(env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=100, discount=0.99, plot=False) algo.train()
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianGRUPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()
def run_task(*_): """Wrap ERWR training task in the run_task function.""" env = TfEnv(normalize(CartpoleEnv())) policy = GaussianMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = ERWR(env=env, policy=policy, baseline=baseline, batch_size=10000, max_path_length=100, n_itr=40, discount=0.99) algo.train()
def run_pick_and_place(*_):
    initial_goal = np.array([0.6, -0.1, 0.80])
    env = TheanoEnv(PickAndPlaceEnv(initial_goal))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=2000,
        baseline=baseline,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()
def run_task(*_): """Wrap ERWR training task in the run_task function.""" env = TfEnv(env_name="CartPole-v1") policy = CategoricalMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = ERWR(env=env, policy=policy, baseline=baseline, batch_size=10000, max_path_length=100, n_itr=100, plot=True, discount=0.99) algo.train()
def run(*_):
    """Stub method for running TRPO."""
    env = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=100,
        baseline=baseline,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()