def __init__(self, env=None, f_se=None, f_actor=None, f_critic=None,
             episode_n=2000,
             discount_factor=0.9,
             network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
             # ou_params=(0, 0.2, [hrl.utils.CappedExp(1e5, 0.5, 0.02),
             #                     hrl.utils.CappedExp(1e6, 2.0, 0.02)]),
             ou_params=(0, 0.2, hrl.utils.CappedExp(1e5, 0.5, 0.02)),
             target_sync_interval=10,
             target_sync_rate=0.01,
             batch_size=128,
             replay_capacity=100000,
             **kwargs):
    if env is None:
        env = gym.make("RoboschoolReacher-v1")
        # env = StateStack(env, k=2)
        # env = MaxAndSkipEnv(env, max_len=1, skip=2)
        env = ReacherEndTorch(env)
        # env = ScalePenalty(env, scale=2.0)
        env = envs.ScaledRewards(env, 0.2)
    super(DPGReacher, self).__init__(
        env, f_se, f_actor, f_critic, episode_n, discount_factor,
        network_optimizer_ctor, ou_params, target_sync_interval,
        target_sync_rate, batch_size, replay_capacity, **kwargs)
def __init__(self, env=None, f_se=None, f_actor=None, f_critic=None,
             episode_n=10000,
             discount_factor=0.9,
             network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
             ou_params=(0, 0.2, hrl.utils.CappedExp(2e5, 0.5, 0.01)),
             target_sync_interval=10,
             target_sync_rate=0.01,
             batch_size=128,
             replay_capacity=100000,
             **kwargs):
    if env is None:
        env = gym.make("RoboschoolAnt-v1")
        env = MaxAndSkipEnv(env, max_len=1, skip=2)
        env = envs.ScaledRewards(env, 0.1)
    state_shape = list(env.observation_space.shape)
    dim_action = env.action_space.shape[-1]
    l2 = 1e-8
    if f_se is None:
        # Identity state encoder: pass the raw observation through unchanged.
        def f(inputs):
            return {"se": inputs[0]}
        f_se = f
    if f_actor is None:
        # Deterministic policy head: two fully connected layers,
        # tanh output keeps actions in [-1, 1].
        def f(inputs):
            se = inputs[0]
            actor = hrl.network.Utils.layer_fcs(se, [200, 100], dim_action,
                                                activation_out=tf.nn.tanh,
                                                l2=l2, var_scope="action")
            return {"action": actor}
        f_actor = f
    if f_critic is None:
        # Q head: concatenate state encoding and action, output a scalar Q value.
        def f(inputs):
            se, action = inputs[0], inputs[1]
            se = tf.concat([se, action], axis=-1)
            q = hrl.network.Utils.layer_fcs(se, [100], 1,
                                            activation_out=None,
                                            l2=l2, var_scope="q")
            q = tf.squeeze(q, axis=1)
            return {"q": q}
        f_critic = f
    super(DPGAnt, self).__init__(
        env, f_se, f_actor, f_critic, episode_n, discount_factor,
        network_optimizer_ctor, ou_params, target_sync_interval,
        target_sync_rate, batch_size, replay_capacity, **kwargs)
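# Side note on ou_params: the tuple appears to configure Ornstein-Uhlenbeck
# exploration noise with a CappedExp-scheduled sigma. Below is a minimal,
# illustrative numpy sketch of such noise, assuming the (mu, theta, sigma)
# ordering; the ordering and the class itself are assumptions for clarity,
# not read from the hrl implementation.
import numpy as np

class OUNoise(object):
    """Illustrative Ornstein-Uhlenbeck process for DPG-style exploration.

    sigma would be the scheduled value (e.g. hrl.utils.CappedExp) evaluated
    at the current training step.
    """
    def __init__(self, dim_action, mu=0.0, theta=0.2, sigma=0.5):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(dim_action) * mu

    def sample(self):
        # Mean-reverting step plus Gaussian perturbation.
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state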
def wrap_car(env, steer_n, speed_n):
    """Apply a common set of wrappers for the car racing environment,
    discretizing the continuous controls into steer_n steering bins
    and speed_n speed bins."""
    env = CarDiscreteWrapper(env, steer_n, speed_n)
    env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
    # env = ProcessFrame96H(env)
    env = envs.FrameStack(env, 4)
    env = envs.ScaledRewards(env, 0.1)
    env = envs.ScaledFloatFrame(env)
    return env
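# Illustrative usage sketch for wrap_car above (not part of the original file):
# the env id "CarRacing-v0" and the steer/speed bin counts are assumptions,
# chosen only to show the calling convention.
def _demo_wrap_car_discrete():
    import gym
    env = wrap_car(gym.make("CarRacing-v0"), steer_n=5, speed_n=3)
    obs = env.reset()
    # One random step through the wrapped (discrete-action) environment.
    obs, reward, done, info = env.step(env.action_space.sample())
    env.close()
    return obs, reward, done, info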
def __init__(self, env=None, f_create_net=None,
             episode_n=1000000,
             learning_rate=5e-5,
             discount_factor=0.95,
             entropy=hrl.utils.CappedLinear(1e6, 1e-3, 1e-4),
             batch_size=64):
    if env is None:
        env = gym.make("RoboschoolHumanoid-v1")
        env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
        # env = ProcessFrame96H(env)
        # env = envs.FrameStack(env, 4)
        env = envs.ScaledRewards(env, 0.2)
        # env = envs.ScaledFloatFrame(env)
    if f_create_net is None:
        dim_action = env.action_space.shape[-1]

        def create_ac_humanoid(inputs):
            l2 = 1e-7
            input_state = inputs[0]
            # Shared state encoder.
            se = hrl.utils.Network.layer_fcs(input_state, [256, 256, 256], 256,
                                             activation_hidden=tf.nn.elu,
                                             l2=l2, var_scope="se")
            # State value head.
            v = hrl.utils.Network.layer_fcs(se, [256], 1,
                                            activation_hidden=tf.nn.elu,
                                            l2=l2, var_scope="v")
            v = tf.squeeze(v, axis=1)
            # Gaussian policy head: mean squashed into (-1, 1),
            # stddev squashed into (0, 4).
            mean = hrl.utils.Network.layer_fcs(se, [256], dim_action,
                                               activation_hidden=tf.nn.elu,
                                               activation_out=None,
                                               l2=l2, var_scope="mean")
            mean = tf.nn.tanh(mean / 4.0)
            stddev = hrl.utils.Network.layer_fcs(
                se, [256], dim_action,
                activation_hidden=tf.nn.elu,
                # activation_out=tf.nn.softplus,
                activation_out=None,
                l2=l2, var_scope="stddev")
            stddev = 4.0 * tf.nn.sigmoid(stddev / 4.0)
            return {"v": v, "mean": mean, "stddev": stddev}

        f_create_net = create_ac_humanoid
    super(A3CHumanoidContinuous, self).__init__(
        env, f_create_net, episode_n, learning_rate, discount_factor,
        entropy, batch_size)
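# The mean/stddev squashing above is the key detail of the continuous policy
# head. Here is a standalone sketch of just that transformation in plain
# TensorFlow, independent of the hrl helpers; the function name is made up
# for illustration and is not part of the original code.
import tensorflow as tf

def bounded_gaussian_head(mean_logits, stddev_logits):
    """Illustrative transform mirroring the squashing used above.

    mean_logits, stddev_logits: unbounded [batch, dim_action] tensors.
    Returns a mean in (-1, 1) and a stddev in (0, 4). Dividing the logits
    before the nonlinearity makes the squashing gentler, so small initial
    logits give a mean near 0 and a stddev near 2.
    """
    mean = tf.nn.tanh(mean_logits / 4.0)                # bounded action mean
    stddev = 4.0 * tf.nn.sigmoid(stddev_logits / 4.0)   # strictly positive, capped
    return mean, stddev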
def wrap_car(env):
    """Apply a common set of wrappers for the Box2D car racing environment,
    keeping the continuous action space."""
    env = CarGrassWrapper(env, grass_penalty=0.5)
    env = CarContinuousWrapper(env)
    env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
    env = envs.FrameStack(env, 4)
    env = envs.ScaledRewards(env, 0.1)
    env = envs.ScaledFloatFrame(env)
    env = envs.AugmentEnvWrapper(env, reward_decay=0.99)
    return env
def __init__(self, env=None, f_create_net=None,
             episode_n=10000,
             discount_factor=0.9,
             entropy=hrl.utils.CappedLinear(1e5, 1e-4, 1e-4),
             clip_epsilon=0.1,
             epoch_per_step=4,
             network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
             batch_size=16,
             horizon=200):
    if env is None:
        env = gym.make("RoboschoolAnt-v1")
        env = envs.ScaledRewards(env, 0.1)
    if f_create_net is None:
        dim_action = env.action_space.shape[-1]

        def f_net(inputs):
            l2 = 1e-8
            state = inputs[0]
            # State value head.
            v = hrl.network.Utils.layer_fcs(state, [200, 100], 1,
                                            l2=l2, var_scope="v")
            v = tf.squeeze(v, axis=1)
            # Gaussian policy head: mean squashed into (-1, 1),
            # stddev squashed into (0, 4).
            mean = hrl.network.Utils.layer_fcs(
                state, [200, 100], dim_action,
                # activation_out=None,
                activation_out=lambda x: tf.tanh(x / 4.0),
                l2=l2, var_scope="mean")
            stddev = hrl.network.Utils.layer_fcs(
                state, [200, 100], dim_action,
                # activation_out=None,
                activation_out=lambda x: 4.0 * tf.sigmoid(x / 8.0),
                l2=l2, var_scope="stddev")
            return {"v": v, "mean": mean, "stddev": stddev}

        f_create_net = f_net
    super(PPOAnt, self).__init__(
        env, f_create_net, episode_n, discount_factor, entropy, clip_epsilon,
        epoch_per_step, network_optimizer_ctor, batch_size, horizon)
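# For context on what clip_epsilon=0.1 controls, here is an illustrative
# sketch of the textbook PPO clipped surrogate loss it parameterizes. This is
# written for clarity only and is not the loss implementation inside hrl.
import tensorflow as tf

def ppo_clipped_surrogate(ratio, advantage, clip_epsilon=0.1):
    """Illustrative PPO clipped surrogate, negated for use as a loss.

    ratio: pi_new(a|s) / pi_old(a|s) for sampled actions, shape [batch].
    advantage: advantage estimates for the same samples, shape [batch].
    """
    unclipped = ratio * advantage
    clipped = tf.clip_by_value(ratio,
                               1.0 - clip_epsilon,
                               1.0 + clip_epsilon) * advantage
    # PPO maximizes E[min(unclipped, clipped)]; return the negation to minimize.
    return -tf.reduce_mean(tf.minimum(unclipped, clipped))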
def __init__(self, env=None, f_create_net=None,
             episode_n=1000000,
             learning_rate=5e-5,
             discount_factor=0.95,
             entropy=hrl.utils.CappedLinear(1e6, 1e-3, 1e-4),
             batch_size=64):
    if env is None:
        env = gym.make("RoboschoolAnt-v1")
        env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
        # env = ProcessFrame96H(env)
        # env = envs.FrameStack(env, 4)
        env = envs.ScaledRewards(env, 0.2)
    super(A3CAnt, self).__init__(
        env, f_create_net, episode_n, learning_rate, discount_factor,
        entropy, batch_size)
def __init__(self, env=None, f_create_net=None,
             episode_n=1000,
             discount_factor=0.9,
             entropy=hrl.utils.CappedLinear(1e5, 1e-6, 1e-8),
             clip_epsilon=0.1,
             epoch_per_step=4,
             network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
             batch_size=16,
             horizon=1000):
    if env is None:
        env = gym.make("RoboschoolReacher-v1")
        env = envs.ScaledRewards(env, 0.2)
    super(PPOReacher, self).__init__(
        env, f_create_net, episode_n, discount_factor, entropy, clip_epsilon,
        epoch_per_step, network_optimizer_ctor, batch_size, horizon)