Example #1: DPGReacher.__init__ (RoboschoolReacher-v1)
    def __init__(
            self,
            env=None,
            f_se=None,
            f_actor=None,
            f_critic=None,
            episode_n=2000,
            discount_factor=0.9,
            network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
            # ou_params=(0, 0.2, [hrl.utils.CappedExp(1e5, 0.5, 0.02),
            #                     hrl.utils.CappedExp(1e6, 2.0, 0.02)]),
            ou_params=(0, 0.2, hrl.utils.CappedExp(1e5, 0.5, 0.02)),
            target_sync_interval=10,
            target_sync_rate=0.01,
            batch_size=128,
            replay_capacity=100000,
            **kwargs):
        if env is None:
            env = gym.make("RoboschoolReacher-v1")
            # env = StateStack(env, k=2)
            # env = MaxAndSkipEnv(env, max_len=1, skip=2)
            env = ReacherEndTorch(env)
            # env = ScalePenalty(env, scale=2.0)
            env = envs.ScaledRewards(env, 0.2)
        super(DPGReacher,
              self).__init__(env, f_se, f_actor, f_critic, episode_n,
                             discount_factor, network_optimizer_ctor,
                             ou_params, target_sync_interval, target_sync_rate,
                             batch_size, replay_capacity, **kwargs)
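The ou_params tuple above configures Ornstein-Uhlenbeck exploration noise, with hrl.utils.CappedExp(1e5, 0.5, 0.02) annealing the noise scale over training. The hobotrl implementation is not shown in this snippet; as a rough standalone illustration, assuming the arguments mean (decay steps, start value, floor), a capped exponential schedule behaves like this:

import math

def capped_exp(step, decay_steps=1e5, start=0.5, floor=0.02):
    # Illustrative only, not the hobotrl implementation: decay the value
    # exponentially from `start` and never let it drop below `floor`.
    return max(start * math.exp(-step / decay_steps), floor)

print(capped_exp(0))      # 0.5
print(capped_exp(2e5))    # ~0.068
print(capped_exp(1e6))    # 0.02 (clamped at the floor)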
Example #2: DPGAnt.__init__ (RoboschoolAnt-v1)
    def __init__(self,
                 env=None,
                 f_se=None,
                 f_actor=None,
                 f_critic=None,
                 episode_n=10000,
                 discount_factor=0.9,
                 network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                     tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
                 ou_params=(0, 0.2, hrl.utils.CappedExp(2e5, 0.5, 0.01)),
                 target_sync_interval=10,
                 target_sync_rate=0.01,
                 batch_size=128,
                 replay_capacity=100000,
                 **kwargs):
        if env is None:
            env = gym.make("RoboschoolAnt-v1")
            env = MaxAndSkipEnv(env, max_len=1, skip=2)
            env = envs.ScaledRewards(env, 0.1)
        state_shape = list(env.observation_space.shape)
        dim_action = env.action_space.shape[-1]
        l2 = 1e-8
        if f_se is None:

            def f(inputs):
                return {"se": inputs[0]}

            f_se = f
        if f_actor is None:

            def f(inputs):
                se = inputs[0]
                actor = hrl.network.Utils.layer_fcs(se, [200, 100],
                                                    dim_action,
                                                    activation_out=tf.nn.tanh,
                                                    l2=l2,
                                                    var_scope="action")
                return {"action": actor}

            f_actor = f
        if f_critic is None:

            def f(inputs):
                se, action = inputs[0], inputs[1]
                se = tf.concat([se, action], axis=-1)
                q = hrl.network.Utils.layer_fcs(se, [100],
                                                1,
                                                activation_out=None,
                                                l2=l2,
                                                var_scope="q")
                q = tf.squeeze(q, axis=1)
                return {"q": q}

            f_critic = f
        super(DPGAnt,
              self).__init__(env, f_se, f_actor, f_critic, episode_n,
                             discount_factor, network_optimizer_ctor,
                             ou_params, target_sync_interval, target_sync_rate,
                             batch_size, replay_capacity, **kwargs)
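target_sync_interval and target_sync_rate above control how the target networks track the online networks in DPG-style training. The hobotrl update rule itself is not shown in this snippet; the usual scheme is a Polyak (soft) update applied every few steps, sketched here with plain numpy:

import numpy as np

def soft_update(target_w, online_w, rate=0.01):
    # Move the target weights a small step toward the online weights.
    return (1.0 - rate) * target_w + rate * online_w

target_w, online_w = np.zeros(3), np.ones(3)
for step in range(1, 101):
    if step % 10 == 0:  # every `target_sync_interval` steps
        target_w = soft_update(target_w, online_w, rate=0.01)
print(target_w)  # roughly [0.096 0.096 0.096] after ten soft updates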
Example #3: wrap_car (discrete car-racing wrappers)
def wrap_car(env, steer_n, speed_n):
    """Apply a common set of wrappers for Atari games."""
    env = CarDiscreteWrapper(env, steer_n, speed_n)
    env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
    # env = ProcessFrame96H(env)
    env = envs.FrameStack(env, 4)
    env = envs.ScaledRewards(env, 0.1)
    env = envs.ScaledFloatFrame(env)
    return env
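A hypothetical usage sketch for the helper above, assuming the classic gym id CarRacing-v0 and that CarDiscreteWrapper and the envs module are importable; the steer_n and speed_n values are illustrative, not taken from the original code:

import gym

env = gym.make("CarRacing-v0")
env = wrap_car(env, steer_n=3, speed_n=3)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())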
Example #4: A3CHumanoidContinuous.__init__ (RoboschoolHumanoid-v1)
    def __init__(self,
                 env=None,
                 f_create_net=None,
                 episode_n=1000000,
                 learning_rate=5e-5,
                 discount_factor=0.95,
                 entropy=hrl.utils.CappedLinear(1e6, 1e-3, 1e-4),
                 batch_size=64):
        if env is None:
            env = gym.make("RoboschoolHumanoid-v1")
            env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
            # env = ProcessFrame96H(env)
            # env = envs.FrameStack(env, 4)
            env = envs.ScaledRewards(env, 0.2)
            # env = envs.ScaledFloatFrame(env)
        if f_create_net is None:
            dim_action = env.action_space.shape[-1]

            def create_ac_car(inputs):
                l2 = 1e-7
                input_state = inputs[0]
                se = hrl.utils.Network.layer_fcs(input_state, [256, 256, 256],
                                                 256,
                                                 activation_hidden=tf.nn.elu,
                                                 l2=l2,
                                                 var_scope="se")

                v = hrl.utils.Network.layer_fcs(se, [256],
                                                1,
                                                activation_hidden=tf.nn.elu,
                                                l2=l2,
                                                var_scope="v")
                v = tf.squeeze(v, axis=1)
                mean = hrl.utils.Network.layer_fcs(se, [256],
                                                   dim_action,
                                                   activation_hidden=tf.nn.elu,
                                                   activation_out=None,
                                                   l2=l2,
                                                   var_scope="mean")
                mean = tf.nn.tanh(mean / 4.0)
                stddev = hrl.utils.Network.layer_fcs(
                    se,
                    [256],
                    dim_action,
                    activation_hidden=tf.nn.elu,
                    # activation_out=tf.nn.softplus,
                    activation_out=None,
                    l2=l2,
                    var_scope="stddev")
                stddev = 4.0 * tf.nn.sigmoid(stddev / 4.0)
                return {"v": v, "mean": mean, "stddev": stddev}

            f_create_net = create_ac_car
        super(A3CHumanoidContinuous,
              self).__init__(env, f_create_net, episode_n, learning_rate,
                             discount_factor, entropy, batch_size)
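The policy head above squashes its raw outputs so the Gaussian action distribution stays well behaved: tanh(x / 4.0) bounds the mean to (-1, 1), and 4.0 * sigmoid(x / 4.0) keeps the standard deviation in (0, 4). A small numpy illustration of those ranges (not part of the original code):

import numpy as np

raw = np.array([-20.0, -4.0, 0.0, 4.0, 20.0])   # pre-activation outputs
mean = np.tanh(raw / 4.0)                        # bounded to (-1, 1)
stddev = 4.0 / (1.0 + np.exp(-raw / 4.0))        # sigmoid scaled to (0, 4)
print(mean)    # approx. [-1.   -0.76  0.    0.76  1.  ]
print(stddev)  # approx. [ 0.03  1.08  2.    2.92  3.97]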
Example #5: wrap_car (continuous car-racing wrappers)
def wrap_car(env):
    """Apply a common set of wrappers for Box2d games."""
    env = CarGrassWrapper(env, grass_penalty=0.5)
    env = CarContinuousWrapper(env)
    env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
    env = envs.FrameStack(env, 4)
    env = envs.ScaledRewards(env, 0.1)
    env = envs.ScaledFloatFrame(env)
    env = envs.AugmentEnvWrapper(env, reward_decay=0.99)
    return env
Example #6: PPOAnt.__init__ (RoboschoolAnt-v1)
    def __init__(self,
                 env=None,
                 f_create_net=None,
                 episode_n=10000,
                 discount_factor=0.9,
                 entropy=hrl.utils.CappedLinear(1e5, 1e-4, 1e-4),
                 clip_epsilon=0.1,
                 epoch_per_step=4,
                 network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                     tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
                 batch_size=16,
                 horizon=200):

        if env is None:
            env = gym.make("RoboschoolAnt-v1")
            env = envs.ScaledRewards(env, 0.1)
        if f_create_net is None:
            dim_action = env.action_space.shape[-1]

            def f_net(inputs):
                l2 = 1e-8
                state = inputs[0]
                v = hrl.network.Utils.layer_fcs(state, [200, 100],
                                                1,
                                                l2=l2,
                                                var_scope="v")
                v = tf.squeeze(v, axis=1)
                mean = hrl.network.Utils.layer_fcs(
                    state,
                    [200, 100],
                    dim_action,
                    # activation_out=None,
                    activation_out=lambda x: tf.tanh(x / 4.0),
                    l2=l2,
                    var_scope="mean")
                stddev = hrl.network.Utils.layer_fcs(
                    state,
                    [200, 100],
                    dim_action,
                    # activation_out=None,
                    activation_out=lambda x: 4.0 * tf.sigmoid(x / 8.0),
                    l2=l2,
                    var_scope="stddev")
                return {"v": v, "mean": mean, "stddev": stddev}

            f_create_net = f_net

        super(PPOAnt,
              self).__init__(env, f_create_net, episode_n, discount_factor,
                             entropy, clip_epsilon, epoch_per_step,
                             network_optimizer_ctor, batch_size, horizon)
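clip_epsilon above is the epsilon in PPO's clipped surrogate objective, L = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)], where r_t is the new-to-old action probability ratio. A minimal numpy sketch of that clipping, independent of the hobotrl internals:

import numpy as np

def ppo_clipped_objective(ratio, advantage, clip_epsilon=0.1):
    # Per-sample clipped surrogate objective (to be maximized).
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantage
    return np.minimum(unclipped, clipped)

ratios = np.array([1.0, 1.3, 0.7])
advantages = np.array([1.0, 1.0, -1.0])
print(ppo_clipped_objective(ratios, advantages))  # [ 1.   1.1 -0.9]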
Example #7: A3CAnt.__init__ (RoboschoolAnt-v1)
    def __init__(self,
                 env=None,
                 f_create_net=None,
                 episode_n=1000000,
                 learning_rate=5e-5,
                 discount_factor=0.95,
                 entropy=hrl.utils.CappedLinear(1e6, 1e-3, 1e-4),
                 batch_size=64):
        if env is None:
            env = gym.make("RoboschoolAnt-v1")
            env = envs.MaxAndSkipEnv(env, skip=2, max_len=1)
            # env = ProcessFrame96H(env)
            # env = envs.FrameStack(env, 4)
            env = envs.ScaledRewards(env, 0.2)

        super(A3CAnt,
              self).__init__(env, f_create_net, episode_n, learning_rate,
                             discount_factor, entropy, batch_size)
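The entropy argument above anneals the entropy-bonus coefficient during training. Assuming hrl.utils.CappedLinear(1e6, 1e-3, 1e-4) means (steps, start, end), which this snippet does not confirm, a capped linear schedule behaves like this standalone sketch:

def capped_linear(step, steps=1e6, start=1e-3, end=1e-4):
    # Illustrative only, not the hobotrl implementation: interpolate
    # linearly from `start` to `end`, then hold at `end`.
    frac = min(step / steps, 1.0)
    return start + frac * (end - start)

print(capped_linear(0))      # 0.001
print(capped_linear(5e5))    # ~0.00055
print(capped_linear(2e6))    # ~0.0001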
Example #8: PPOReacher.__init__ (RoboschoolReacher-v1)
    def __init__(self,
                 env=None,
                 f_create_net=None,
                 episode_n=1000,
                 discount_factor=0.9,
                 entropy=hrl.utils.CappedLinear(1e5, 1e-6, 1e-8),
                 clip_epsilon=0.1,
                 epoch_per_step=4,
                 network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                     tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
                 batch_size=16,
                 horizon=1000):
        if env is None:
            env = gym.make("RoboschoolReacher-v1")
            env = envs.ScaledRewards(env, 0.2)
        super(PPOReacher,
              self).__init__(env, f_create_net, episode_n, discount_factor,
                             entropy, clip_epsilon, epoch_per_step,
                             network_optimizer_ctor, batch_size, horizon)