def run_task(*_):
    env = normalize(SimpleHumanoidEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden-layer sizes are swept over the grids defined in the
        # configuration snippet below (two layers of 32 units in the default grid).
        hidden_sizes=(H_layer_first[h], H_layer_second[h]),
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
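# --- Hypothetical driver loop (a sketch, not from the source): the indices
# h, r, and c used in run_task above are assumed to be module-level globals
# bound by an outer grid search over the hyperparameter lists defined in the
# configuration snippet below (H_layer_first, reward_scaling,
# critic_learning_rate, actor_learning_rate). A minimal sketch using rllab's
# run_experiment_lite:
from rllab.misc.instrument import run_experiment_lite

for h in range(len(H_layer_first)):
    for r in range(len(reward_scaling)):
        for c in range(len(critic_learning_rate)):
            run_experiment_lite(
                run_task,
                n_parallel=1,          # number of parallel sampling workers
                snapshot_mode="last",  # keep only the final iteration's snapshot
                seed=1,
                # plot=True,  # uncomment together with plot=True in DDPG above
            )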
def run_task(*_):
    env = normalize(SimpleHumanoidEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 100 hidden units.
        hidden_sizes=(100, 100),
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
    )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(SimpleHumanoidEnv())
    # env = SimpleHumanoidEnv()

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(32, 32))

    # Persistence Length Exploration
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )

    # DDPG
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
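# --- Hypothetical sweep driver (a sketch, not from the source): the indices
# l_p_ind, b_ind, and s_ind above are assumed to be module-level globals
# indexing lists of persistence-length settings (L_p_param, b_step_size,
# sigma_param) defined alongside the other experiment settings, analogous to
# the grid search sketched earlier:
from rllab.misc.instrument import run_experiment_lite

for l_p_ind in range(len(L_p_param)):
    for b_ind in range(len(b_step_size)):
        for s_ind in range(len(sigma_param)):
            run_experiment_lite(run_task, n_parallel=1, snapshot_mode="last", seed=1)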
def get(perm):
    # Build and normalize the environment named by perm["problem"].
    name = perm["problem"]
    key = name.lower()
    if key == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())
    elif key == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())
    elif key == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))
    elif key == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0", record_video=False))
    elif key == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))
    elif key == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())
    elif key == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())
    elif key == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())
    elif key == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())
    elif key == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())
    elif key == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())
    elif key == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())
    elif key == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())
    elif key == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())
    else:
        raise NotImplementedError(f"Environment {name} unknown")
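# Example usage of the factory above; perm is expected to be a dict with a
# "problem" key naming the environment (the specific name here is just for
# illustration):
env = get({"problem": "half cheetah"})  # -> normalized HalfCheetahEnv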
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv

env = normalize(SimpleHumanoidEnv())

# Hyperparameter grids; the commented-out lists give the full search space.
# H_layer_first = [32, 100, 400]
# H_layer_second = [32, 100, 300]
H_layer_first = [32]
H_layer_second = [32]

# reward_scaling = [0.01, 0.1, 1.0]
reward_scaling = [0.01]

# critic_learning_rate = [1e-3, 10e-3]
# actor_learning_rate = [1e-4, 10e-4]
critic_learning_rate = [0.001]
actor_learning_rate = [0.0001]

# 0.99 was originally set by rllab
discount_factor = 0.99

# originally 32, set by rllab
size_of_batch = 64