Example #1
                                                  dim_action,
                                                  activation,
                                                  None,
                                                  l2=l2)
                return {"q": out}

            f_create_q = f
        super(OnDQNPendulum,
              self).__init__(env, f_create_q, episode_n, discount_factor, ddqn,
                             target_sync_interval, target_sync_rate,
                             update_interval, replay_size, batch_size,
                             neighbour_size, greedy_epsilon, generation_decay,
                             network_optimizer_ctor, **kwargs)


Experiment.register(OnDQNPendulum, "OnDQNPendulum")


class OnDQNBreakout(OnDQNExperiment):
    def __init__(self,
                 env=None,
                 f_create_q=None,
                 episode_n=10000,
                 discount_factor=0.99,
                 ddqn=False,
                 target_sync_interval=100,
                 target_sync_rate=1.0,
                 update_interval=4,
                 replay_size=1000,
                 batch_size=8,
                 neighbour_size=8,
Example #2
                                                activation_hidden=tf.nn.relu,
                                                l2=l2,
                                                var_scope="v")
                v = tf.squeeze(v, axis=1)
                pi = hrl.utils.Network.layer_fcs(se, [256], dim_action,
                                                 activation_hidden=tf.nn.relu,
                                                 activation_out=tf.nn.softmax,
                                                 l2=l2,
                                                 var_scope="pi")

                return {"v": v, "pi": pi}
            f_create_net = create_ac_car
        super(A3CCarRecordingDiscrete2, self).__init__(env, f_create_net, episode_n, learning_rate,
                                                       discount_factor, entropy, batch_size)


Experiment.register(A3CCarRecordingDiscrete2, "Discrete A3C for CarRacing Recording")


# class ACRecordingExperiment(Experiment):
#     def __init__(self,
#                  env, f_create_net, episode_n=1000,
#                  learning_rate=1e-4,
#                  discount_factor=0.9,
#                  entropy=1e-2,
#                  batch_size=8
#                  ):
#         super(ACRecordingExperiment, self).__init__()
#         self._env, self._f_create_net, self._episode_n, self._learning_rate, \
#             self._discount_factor, self._entropy, self._batch_size = \
#             env, f_create_net, episode_n, learning_rate, \
#             discount_factor, entropy, batch_size
Example #3
File: exp_new.py  Project: hobotrl/hobotrl
        agent = ClusterAgent(create_agent, create_optimizer, args.cluster,
                             args.job, args.index, args.logdir)
        with agent.create_session(config=config) as sess:
            agent.set_session(sess)
            runner = hrl.envs.EnvRunner(
                env,
                agent,
                reward_decay=discount_factor,
                evaluate_interval=sys.maxint,
                render_interval=args.render_interval,
                render_once=True,
                logdir=args.logdir if args.index == 0 else None)
            runner.episode(2000)


Experiment.register(A3CPendulum, "A3C for Pendulum")


class ADQNExperiment(Experiment):
    def run(self, args):
        env = gym.make('Pendulum-v0')
        env = hrl.envs.C2DEnvWrapper(env, [5])
        env = hrl.envs.ScaledRewards(env, 0.1)
        state_shape = list(env.observation_space.shape)

        discount_factor = 0.9

        def f_q(inputs):
            # Q network: fully-connected layers mapping the state to per-action Q values
            q = network.Utils.layer_fcs(inputs[0], [200, 100],
                                        env.action_space.n,
                                        l2=1e-4)
Example #4
                                            var_scope="q")
                q = tf.squeeze(q, axis=1)
                return {"q": q}

            f_create_q = f

        super(SoftQPendulum,
              self).__init__(env, f_create_actor, f_create_q, dim_noise,
                             target_sync_interval, target_sync_rate,
                             alpha_exploration, max_gradient, m_particle_svgd,
                             m_particle_v, episode_n, discount_factor,
                             update_interval, replay_size, batch_size,
                             network_optimizer_ctor)


Experiment.register(SoftQPendulum, "soft q for pendulum")


class SoftQMPCPendulum(SoftQMPCExperiment):
    def __init__(self,
                 env=None,
                 f_create_actor=None,
                 f_create_q=None,
                 f_model=None,
                 dim_noise=2,
                 target_sync_interval=100,
                 target_sync_rate=1.0,
                 greedy_epsilon=utils.CappedExp(1e5, 2.5, 0.05),
                 sample_n=4,
                 horizon_n=4,
                 alpha_exploration=0.1,
Example #5
                                                 activation_out=None,
                                                 l2=l2,
                                                 var_scope="reward")
                reward = tf.squeeze(reward, axis=1)
                return {"goal": goal, "reward": reward}

            f_model = f

        super(MPCPendulum,
              self).__init__(env, f_model, sample_n, horizon_n, episode_n,
                             discount_factor, update_interval, replay_size,
                             batch_size, greedy_epsilon,
                             network_optimizer_ctor)


Experiment.register(MPCPendulum, "MPC for Pendulum")


class MPCPendulumSearch(ParallelGridSearch):
    def __init__(self, parallel=4):
        parameters = {
            "sample_n": [4, 8, 16],
            "horizon_n": [2, 4, 8],
        }
        super(MPCPendulumSearch, self).__init__(MPCPendulum, parameters,
                                                parallel)


Experiment.register(MPCPendulumSearch, "search for MPC for Pendulum")

if __name__ == '__main__':
Example #6
class TabularGrid(Experiment):
    def run(self, args):
        env = hrl.envs.GridworldSink()

        agent = tabular_q.TabularQLearning(
            # TabularQMixin params
            num_action=env.action_space.n,
            discount_factor=0.9,
            # EpsilonGreedyPolicyMixin params
            epsilon_greedy=0.2)
        runner = hrl.envs.EnvRunner(env, agent)
        runner.episode(100)


Experiment.register(TabularGrid, "grid world with tabular-q learning")


class PendulumEnvWrapper(hrl.envs.C2DEnvWrapper):
    """
    Wraps the continuous state space into a discretized observation space,
    on top of the action discretization provided by C2DEnvWrapper.
    """
    def __init__(self, env, quant_list=None, d2c_proc=None, action_n=None):
        super(PendulumEnvWrapper, self).__init__(env, quant_list, d2c_proc,
                                                 action_n)
        self.observation_space = gym.spaces.Box(low=0, high=1024, shape=(1, ))

    def _step(self, *args, **kwargs):
        next_state, reward, done, info = super(PendulumEnvWrapper,
                                               self)._step(*args, **kwargs)
        return [self.state_c2d(next_state)], reward, done, info
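
A minimal usage sketch (not part of the original file), assuming the old gym
API where step() dispatches to _step(), and reusing the action quantization
list [5] from ADQNExperiment above; the step loop and sampled actions are
purely illustrative:

import gym

# Illustrative only: wrap Pendulum-v0 so actions are discretized by
# C2DEnvWrapper and each observation comes back as a single quantized index.
env = PendulumEnvWrapper(gym.make("Pendulum-v0"), quant_list=[5])
state = env.reset()
for _ in range(10):
    action = env.action_space.sample()  # discrete action index
    state, reward, done, info = env.step(action)
    # after a step, `state` is a one-element list holding the discretized observation
    if done:
        state = env.reset()
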
Example #7
            f_actor = f
        if f_critic is None:
            def f(inputs):
                # default critic: Q(s, a) computed from concatenated state features and action
                se, action = inputs[0], inputs[1]
                se = tf.concat([se, action], axis=-1)
                q = hrl.network.Utils.layer_fcs(se, [100], 1,
                                                activation_out=None,
                                                l2=l2,
                                                var_scope="q")
                q = tf.squeeze(q, axis=1)
                return {"q": q}
            f_critic = f
        super(OTDPGPendulum, self).__init__(env, f_se, f_actor, f_critic, lower_weight, upper_weight, neighbour_size,
                                            episode_n, discount_factor, network_optimizer_ctor, ou_params,
                                            target_sync_interval, target_sync_rate, batch_size, replay_capacity)


Experiment.register(OTDPGPendulum, "OTDPG for Pendulum")


class OTDPGBipedal(OTDPGPendulum):
    def __init__(self, env=None, f_se=None, f_actor=None, f_critic=None, lower_weight=4, upper_weight=4,
                 neighbour_size=8, episode_n=4000, discount_factor=0.9,
                 network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(tf.train.AdamOptimizer(1e-4),
                                                                           grad_clip=10.0),
                 ou_params=(0, 0.2, hrl.utils.CappedLinear(1e6, 1.0, 0.1)),
                 target_sync_interval=10, target_sync_rate=0.01, batch_size=8, replay_capacity=40000):
        if env is None:
            env = gym.make("BipedalWalker-v2")
            env = hrl.envs.AugmentEnvWrapper(env, reward_decay=discount_factor, reward_scale=0.02)
        super(OTDPGBipedal, self).__init__(env, f_se, f_actor, f_critic, lower_weight, upper_weight, neighbour_size,
                                           episode_n, discount_factor, network_optimizer_ctor, ou_params,
                                           target_sync_interval, target_sync_rate, batch_size, replay_capacity)
Example #8
                stddev = 2.0 * tf.sigmoid(stddev / 4.0)
                return {"mean": mean, "stddev": stddev}

            f_pi = f

        super(NoisyPendulum, self).__init__(
            env, f_se, f_manager, f_explorer, f_ik, f_value, f_model, f_pi,
            episode_n, discount_factor, noise_dimension, se_dimension,
            manager_horizon, manager_interval, batch_size, batch_horizon,
            noise_stddev, noise_explore_param, worker_explore_param,
            worker_entropy, network_optimizer_ctor, replay_size, act_ac,
            intrinsic_weight, explore_net, abs_goal, manager_ac,
            achievable_weight, disentangle_weight, **kwargs)


Experiment.register(NoisyPendulum, "Noisy explore for pendulum")


class NoisyPendulumSearch(GridSearch):
    """
        round 1:
            "act_ac": [True, False],
            "intrinsic_weight": [0.0, 1.0],
            "explore_net": [True, False],
            "abs_goal": [True, False],
            "manager_ac": [True, False],
            "achievable_weight": [1e-1, 1e-3]

        conclusion: act_ac = False, explore_net = False

        round 2: how to explore via net?