def run_task(*_):
    env_name = "Swimmer"
    hidden_sizes = (32, 32)
    env = TheanoEnv(normalize(SwimmerEnv()))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    # CAPG keeps several auxiliary copies of the policy for its update scheme.
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
        n_timestep=5e6,
        learning_rate=0.01,
        batch_size=5000,
        minibatch_size=500,
        n_sub_itr=10,
        baseline=baseline,
        max_path_length=500,
        discount=0.99,
        decay_learing_rate=True,  # (sic) keyword spelling follows the CAPG signature
        log_dir='./logs/' + env_name,
    )
    algo.train()
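# run_task accepts and ignores positional arguments so it can be handed to an
# experiment launcher; a minimal sketch of invoking it directly:
if __name__ == "__main__":
    run_task()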
def test_polopt_algo(self, algo_cls, env_cls, policy_cls):
    print("Testing %s, %s, %s" % (algo_cls.__name__, env_cls.__name__,
                                  policy_cls.__name__))
    env = TheanoEnv(env_cls())
    policy = policy_cls(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = algo_cls(env=env,
                    policy=policy,
                    baseline=baseline,
                    **(algo_args.get(algo_cls, dict())))
    algo.train()
    # Training should leave the policy parameters finite.
    assert not np.any(np.isnan(policy.get_param_values()))
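# The test above pulls per-algorithm constructor overrides from a
# module-level algo_args table. A hypothetical sketch of such a table (the
# keys and values here are illustrative, not copied from the real suite):
algo_args = {
    TRPO: dict(n_itr=1, batch_size=100, max_path_length=100),
}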
def test_adaptive_std():
    """Check that the adaptive_std parameter works."""
    env = TheanoEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env, policy=policy, baseline=baseline, batch_size=100,
                n_itr=1)
    algo.train()
def test_trpo_relu_nan(self):
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=1,
                batch_size=1000,
                max_path_length=100,
                step_size=0.001)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def test_trpo_deterministic_nan():
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    # Force a near-deterministic policy by driving the log std towards -inf;
    # TRPO should still leave the parameters finite.
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=10,
                batch_size=1000,
                max_path_length=100,
                step_size=0.01)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def test_issue_3():
    """
    As reported in https://github.com/garage/garage/issues/3, the
    adaptive_std parameter was not functioning properly.
    """
    env = TheanoEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env, policy=policy, baseline=baseline, batch_size=100,
                n_itr=1)
    algo.train()
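# For reference, adaptive_std=True makes the policy learn its std with a
# separate network head. A sketch assuming the rllab-era std_hidden_sizes
# keyword (this keyword is an assumption, not used in the tests above):
policy = GaussianMLPPolicy(env_spec=env.spec,
                           adaptive_std=True,
                           std_hidden_sizes=(8, 8))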
# Hyperparameter values for the sweep; the concrete numbers here are assumed
# for illustration (the original snippet referenced learning_rate and
# batch_size without defining them).
learning_rate = 0.01
batch_size = 5000

for _ in range(10):
    seed = np.random.randint(1, 10000)
    env_name = "SGD_nn_CartPole"
    hidden_sizes = (8, )
    env = TheanoEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
        n_timestep=5e5,
        learning_rate=learning_rate,
        batch_size=batch_size,
        minibatch_size=500,
        n_sub_itr=0,
        center_adv=True,
        baseline=baseline,
        # The original snippet was truncated here; the remaining arguments
        # are assumed, following the Swimmer example above.
        max_path_length=100,
        discount=0.99,
        log_dir='./logs/' + env_name,
    )
    algo.train()
from garage.baselines import LinearFeatureBaseline
from garage.baselines import ZeroBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.envs.mujoco import SwimmerEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions)
env = TheanoEnv(normalize(SwimmerEnv()))
# Initialize a neural network policy with two hidden layers of 32 units each
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32))
# Initialize a zero baseline (predicts zero return for every state)
baseline = ZeroBaseline(env.spec)
# We will collect 10 trajectories per iteration
N = 10
# Each trajectory will have at most 500 time steps
T = 500
# Number of iterations
n_itr = 1000
# Set the discount factor for the problem
discount = 0.995
# Learning rate for the gradient update
learning_rate = 0.005

# Construct the computation graph

# Create a Theano variable for storing the observations. We could have
# simply written `observations_var = TT.matrix('observations')`; delegating
# to the environment instead lets it pick the correct data type for the
# variable.
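# A sketch of how the graph construction might continue, assuming the
# rllab-style Theano API (observation_space.new_tensor_variable,
# policy.dist_info_sym, distribution.log_likelihood_sym); the variable names
# are illustrative, not taken from this file.
import theano.tensor as TT

observations_var = env.observation_space.new_tensor_variable(
    'observations',
    extra_dims=1  # one extra dimension for the batch of observations
)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')

# Log-likelihood of the taken actions under the current policy
dist_info_vars = policy.dist_info_sym(observations_var)
logli = policy.distribution.log_likelihood_sym(actions_var, dist_info_vars)

# REINFORCE surrogate objective: its gradient is the policy gradient estimate
surr = TT.mean(logli * returns_var)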