def run_task(*_):
    env_name = "HumanoidStandup-v2"
    hidden_sizes = (100, 50, 25)
    env = TheanoEnv(normalize(gym.make(env_name)))
    print(env.spec.observation_space, env.spec.action_space)
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
        n_timestep=5e6,
        learning_rate=0.05,
        batch_size=5000,
        minibatch_size=500,
        n_sub_itr=10,
        baseline=baseline,
        max_path_length=500,
        discount=0.99,
        decay_learing_rate=True,
        log_dir='./logs/' + env_name,
    )
    algo.train()

def run_task(*_):
    env_name = "Ant"
    hidden_sizes = (32, 32)
    env = TheanoEnv(normalize(SwimmerEnv()))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
        n_timestep=5e6,
        learning_rate=0.01,
        batch_size=5000,
        minibatch_size=500,
        n_sub_itr=10,
        baseline=baseline,
        max_path_length=500,
        discount=0.99,
        decay_learing_rate=True,
        log_dir='./logs/' + env_name,
    )
    algo.train()

def test_trpo_relu_nan(self):
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
        step_size=0.001)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))

def test_trpo_deterministic_nan(self):
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=10,
        batch_size=1000,
        max_path_length=100,
        step_size=0.01)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))

def run_task(*_):
    """Run task function."""
    initial_goal = np.array([0.6, -0.1, 0.30])
    rospy.init_node('trpo_real_sawyer_reacher_exp', anonymous=True)
    env = TheanoEnv(
        ReacherEnv(
            initial_goal,
            initial_joint_pos=INITIAL_ROBOT_JOINT_POS,
            simulated=False,
            robot_control_mode='position'))
    rospy.on_shutdown(env.shutdown)
    env.initialize()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_):
    initial_goal = np.array([0.6, -0.1, 0.80])
    rospy.init_node('trpo_real_sawyer_pnp_exp', anonymous=True)
    pnp_env = TheanoEnv(
        PickAndPlaceEnv(
            initial_goal,
            initial_joint_pos=INITIAL_ROBOT_JOINT_POS,
            simulated=False))
    rospy.on_shutdown(pnp_env.shutdown)
    pnp_env.initialize()
    env = pnp_env
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, with a Box action space a
    # GaussianMLPPolicy works, but a Discrete action space may need a
    # CategoricalMLPPolicy (see the trpo_gym_cartpole.py example and the
    # sketch below).
    env = TheanoEnv(normalize(gym.make("Pendulum-v0")))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        # plot=True,
    )
    algo.train()

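# --- Illustrative sketch of the Discrete-action case mentioned in the comment
# above (not part of the original snippet). It assumes CategoricalMLPPolicy is
# exported from garage.theano.policies, as in the trpo_gym_cartpole.py example
# referenced there; the only real change is the policy class. ---
import gym

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.theano.algos import TRPO
from garage.theano.envs import TheanoEnv
from garage.theano.policies import CategoricalMLPPolicy


def run_cartpole_task(*_):
    # CartPole-v0 has a Discrete(2) action space, so a categorical policy is
    # used instead of a Gaussian one.
    env = TheanoEnv(normalize(gym.make("CartPole-v0")))
    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
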
def run_task(v):
    env = TheanoEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()

def test_baseline(self, baseline_cls):
    env = TheanoEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6, ))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100)
    algo.train()

def test_adaptive_std():
    """Check that the adaptive_std parameter works."""
    env = TheanoEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        n_itr=1)
    algo.train()

def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        step_size=0.01,
        plot=True)
    algo.train()

def run_pick_and_place(*_):
    initial_goal = np.array([0.6, -0.1, 0.80])
    env = TheanoEnv(PickAndPlaceEnv(initial_goal))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=2000,
        baseline=baseline,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(v):
    env = normalize(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True,
    )
    algo.train()

def run(*_):
    """Stub method for running trpo."""
    env = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=100,
        baseline=baseline,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_):
    env = TheanoEnv(normalize(gym.make("Pendulum-v0")))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.max_episode_steps,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()

def run_block_stacking(*_):
    """Run TRPO with block stacking."""
    env = TheanoEnv(BlockStackingEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=2000,
        baseline=baseline,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        # plot=True
    )
    algo.train()

def test_dm_control_theano_policy(self):
    task = ALL_TASKS[0]
    env = TheanoEnv(DmControlEnv(domain_name=task[0], task_name=task[1]))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10,
        max_path_length=5,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.theano.algos.capg import CAPG
from garage.theano.envs import TheanoEnv
from garage.theano.baselines import GaussianMLPBaseline
from garage.theano.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment
from garage.misc.ext import set_seed
import numpy as np

for batchsize in [5000]:
    for learning_rate in [0.05, 0.01]:
        for i in range(3):
            seed = np.random.randint(1, 10000)
            env_name = "SGD_Swimmer_-t"
            hidden_sizes = (32, 32)
            env = TheanoEnv(normalize(SwimmerEnv()))
            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes)
            backup_policy = GaussianMLPPolicy(env.spec,
                                              hidden_sizes=hidden_sizes)
            mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
            pos_eps_policy = GaussianMLPPolicy(env.spec,
                                               hidden_sizes=hidden_sizes)
            neg_eps_policy = GaussianMLPPolicy(env.spec,
                                               hidden_sizes=hidden_sizes)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = CAPG(
                env=env,
                policy=policy,
                backup_policy=backup_policy,
                mix_policy=mix_policy,
                # The original snippet is cut off at this point; the remaining
                # arguments follow the pattern of the other CAPG launchers
                # above, with the swept batchsize and learning_rate plugged in.
                pos_eps_policy=pos_eps_policy,
                neg_eps_policy=neg_eps_policy,
                n_timestep=5e6,
                learning_rate=learning_rate,
                batch_size=batchsize,
                minibatch_size=500,
                n_sub_itr=10,
                baseline=baseline,
                max_path_length=500,
                discount=0.99,
                decay_learing_rate=True,
                log_dir='./logs/' + env_name,
            )
            algo.train()

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.point_env import PointEnv
from garage.theano.algos import TRPO
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy

env = TheanoEnv(normalize(PointEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()

import numpy as np
import theano
import theano.tensor as TT

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions)
env = TheanoEnv(normalize(CartpoleEnv()))
# Initialize a neural network policy with a single hidden layer of 8 hidden
# units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph

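# --- Illustrative continuation (not part of the original snippet): a minimal
# sketch of the computation graph the comment above refers to, assuming the
# usual rllab/garage Theano APIs (new_tensor_variable, dist_info_sym,
# log_likelihood_sym, get_params). Variable names are illustrative. ---

# Symbolic batch of flattened observations/actions; extra_dims=1 adds the
# leading time-step dimension.
observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')

# The policy's action distribution, conditioned on the observations.
dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var)

# REINFORCE surrogate: differentiating mean(log pi(a|s) * return) with respect
# to the policy parameters gives the likelihood-ratio policy gradient.
surr = TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)

params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)

# One compiled function that applies a plain gradient-ascent step.
f_train = theano.function(
    inputs=[observations_var, actions_var, returns_var],
    outputs=None,
    updates=[(p, p + learning_rate * g) for p, g in zip(params, grads)],
    allow_input_downcast=True)
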
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.algos import TRPO
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy

env = TheanoEnv(normalize(CartpoleEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    # plot=True
)
algo.train()

import numpy as np
import theano
import theano.tensor as TT

from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy
from garage.sampler import parallel_sampler

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions)
env = TheanoEnv(normalize(CartpoleEnv()))
# Initialize a neural network policy with a single hidden layer of 8 hidden
# units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))

parallel_sampler.populate_task(env, policy)
parallel_sampler.initialize(10)
paths = parallel_sampler.sample_paths(policy.get_param_values(), 100)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.01

# Construct the computation graph

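# --- Illustrative continuation (not part of the original snippet): how the
# sampled paths are typically flattened into training arrays. Assumes the
# standard rllab/garage path dict layout with "observations", "actions" and
# "rewards" entries. ---

def discounted_returns(rewards, discount):
    # Discounted return-to-go for every time step of a single trajectory.
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# Truncate each path to at most T steps and stack everything into the flat
# batch format the computation graph below consumes.
observations = np.concatenate([p["observations"][:T] for p in paths])
actions = np.concatenate([p["actions"][:T] for p in paths])
returns = np.concatenate(
    [discounted_returns(p["rewards"][:T], discount) for p in paths])
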
import theano
import theano.tensor as TT

from garage.baselines import LinearFeatureBaseline
from garage.baselines import ZeroBaseline
from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.misc.instrument import run_experiment
from garage.theano.algos.capg_re import CAPG
from garage.theano.envs import TheanoEnv
from garage.theano.misc import tensor_utils
from garage.theano.policies import GaussianMLPPolicy

env_name = "Swimmer"
hidden_sizes = (32, 32)
env = TheanoEnv(normalize(SwimmerEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
rewards_var = tensor_utils.new_tensor(
    'rewards', ndim=1, dtype=theano.config.floatX)

dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var)
old_dist_info_vars = backup_policy.dist_info_sym(observations_var)

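# --- Illustrative sketch (not part of the original snippet): the kind of
# surrogate these symbolic variables usually feed. The actual CAPG objective
# is implemented in garage.theano.algos.capg_re and may differ. ---

# Importance ratio between the current policy and the backup (old) policy,
# plus the mean KL divergence used to monitor or constrain the update.
lr = dist.likelihood_ratio_sym(actions_var, old_dist_info_vars, dist_info_vars)
mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

# Importance-weighted surrogate objective over the sampled batch.
surr = TT.mean(lr * rewards_var)

f_surr_and_kl = theano.function(
    inputs=[observations_var, actions_var, rewards_var],
    outputs=[surr, mean_kl],
    allow_input_downcast=True)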