                dim_action, activation, None, l2=l2)
                return {"q": out}
            f_create_q = f
        super(OnDQNPendulum, self).__init__(
            env, f_create_q, episode_n, discount_factor, ddqn,
            target_sync_interval, target_sync_rate, update_interval,
            replay_size, batch_size, neighbour_size, greedy_epsilon,
            generation_decay, network_optimizer_ctor, **kwargs)

Experiment.register(OnDQNPendulum, "OnDQNPendulum")


class OnDQNBreakout(OnDQNExperiment):
    def __init__(self, env=None, f_create_q=None,
                 episode_n=10000,
                 discount_factor=0.99,
                 ddqn=False,
                 target_sync_interval=100,
                 target_sync_rate=1.0,
                 update_interval=4,
                 replay_size=1000,
                 batch_size=8,
                 neighbour_size=8,
                                                 activation_hidden=tf.nn.relu,
                                                 l2=l2, var_scope="v")
                v = tf.squeeze(v, axis=1)
                pi = hrl.utils.Network.layer_fcs(se, [256], dim_action,
                                                 activation_hidden=tf.nn.relu,
                                                 activation_out=tf.nn.softmax,
                                                 l2=l2, var_scope="pi")
                return {"v": v, "pi": pi}
            f_create_net = create_ac_car
        super(A3CCarRecordingDiscrete2, self).__init__(
            env, f_create_net, episode_n, learning_rate,
            discount_factor, entropy, batch_size)

Experiment.register(A3CCarRecordingDiscrete2, "Discrete A3C for CarRacing Recording")


# class ACRecordingExperiment(Experiment):
#     def __init__(self,
#                  env, f_create_net, episode_n=1000,
#                  learning_rate=1e-4,
#                  discount_factor=0.9,
#                  entropy=1e-2,
#                  batch_size=8
#                  ):
#         super(A3CRecordingExperiment, self).__init__()
#         self._env, self._f_create_net, self._episode_n, self._learning_rate, \
#             self._discount_factor, self._entropy, self._batch_size = \
#             env, f_create_net, episode_n, learning_rate, \
#             discount_factor, entropy, batch_size
        agent = ClusterAgent(create_agent, create_optimizer,
                             args.cluster, args.job, args.index, args.logdir)
        with agent.create_session(config=config) as sess:
            agent.set_session(sess)
            runner = hrl.envs.EnvRunner(
                env, agent, reward_decay=discount_factor,
                evaluate_interval=sys.maxint,
                render_interval=args.render_interval,
                render_once=True,
                logdir=args.logdir if args.index == 0 else None)
            runner.episode(2000)

Experiment.register(A3CPendulum, "experiments A3C")


class ADQNExperiment(Experiment):
    def run(self, args):
        env = gym.make('Pendulum-v0')
        env = hrl.envs.C2DEnvWrapper(env, [5])
        env = hrl.envs.ScaledRewards(env, 0.1)
        state_shape = list(env.observation_space.shape)
        discount_factor = 0.9

        def f_q(inputs):
            q = network.Utils.layer_fcs(inputs[0], [200, 100], env.action_space.n,
                                        l2=1e-4)
var_scope="q") q = tf.squeeze(q, axis=1) return {"q": q} f_create_q = f super(SoftQPendulum, self).__init__(env, f_create_actor, f_create_q, dim_noise, target_sync_interval, target_sync_rate, alpha_exploration, max_gradient, m_particle_svgd, m_particle_v, episode_n, discount_factor, update_interval, replay_size, batch_size, network_optimizer_ctor) Experiment.register(SoftQPendulum, "soft q for pendulum") class SoftQMPCPendulum(SoftQMPCExperiment): def __init__(self, env=None, f_create_actor=None, f_create_q=None, f_model=None, dim_noise=2, target_sync_interval=100, target_sync_rate=1.0, greedy_epsilon=utils.CappedExp(1e5, 2.5, 0.05), sample_n=4, horizon_n=4, alpha_exploration=0.1,
                                                   activation_out=None,
                                                   l2=l2, var_scope="reward")
                reward = tf.squeeze(reward, axis=1)
                return {"goal": goal, "reward": reward}
            f_model = f
        super(MPCPendulum, self).__init__(
            env, f_model, sample_n, horizon_n, episode_n, discount_factor,
            update_interval, replay_size, batch_size, greedy_epsilon,
            network_optimizer_ctor)

Experiment.register(MPCPendulum, "MPC for Pendulum")


class MPCPendulumSearch(ParallelGridSearch):
    def __init__(self, parallel=4):
        parameters = {
            "sample_n": [4, 8, 16],
            "horizon_n": [2, 4, 8],
        }
        super(MPCPendulumSearch, self).__init__(MPCPendulum, parameters, parallel)

Experiment.register(MPCPendulumSearch, "search for MPC for Pendulum")


if __name__ == '__main__':
class TabularGrid(Experiment):
    def run(self, args):
        env = hrl.envs.GridworldSink()
        agent = tabular_q.TabularQLearning(
            # TabularQMixin params
            num_action=env.action_space.n,
            discount_factor=0.9,
            # EpsilonGreedyPolicyMixin params
            epsilon_greedy=0.2)
        runner = hrl.envs.EnvRunner(env, agent)
        runner.episode(100)

Experiment.register(TabularGrid, "grid world with tabular-q learning")


class PendulumEnvWrapper(hrl.envs.C2DEnvWrapper):
    """
    Wraps a continuous observation space into a discrete one.
    """
    def __init__(self, env, quant_list=None, d2c_proc=None, action_n=None):
        super(PendulumEnvWrapper, self).__init__(env, quant_list, d2c_proc, action_n)
        self.observation_space = gym.spaces.Box(low=0, high=1024, shape=(1,))

    def _step(self, *args, **kwargs):
        next_state, reward, done, info = super(PendulumEnvWrapper, self)._step(*args, **kwargs)
        return [self.state_c2d(next_state)], reward, done, info
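
# Illustrative sketch only (not part of the original experiments): one way
# PendulumEnvWrapper could be combined with the tabular agent, mirroring the
# TabularGrid pattern above. The class name TabularPendulum and the
# quantization list [5] are assumed placeholders, not confirmed by the source.
#
# class TabularPendulum(Experiment):
#     def run(self, args):
#         # discretize Pendulum-v0 observations so tabular Q-learning applies
#         env = PendulumEnvWrapper(gym.make('Pendulum-v0'), [5])
#         agent = tabular_q.TabularQLearning(
#             num_action=env.action_space.n,
#             discount_factor=0.9,
#             epsilon_greedy=0.2)
#         runner = hrl.envs.EnvRunner(env, agent)
#         runner.episode(100)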
            f_actor = f
        if f_critic is None:
            def f(inputs):
                se, action = inputs[0], inputs[1]
                se = tf.concat([se, action], axis=-1)
                q = hrl.network.Utils.layer_fcs(se, [100], 1,
                                                activation_out=None,
                                                l2=l2, var_scope="q")
                q = tf.squeeze(q, axis=1)
                return {"q": q}
            f_critic = f
        super(OTDPGPendulum, self).__init__(
            env, f_se, f_actor, f_critic, lower_weight, upper_weight,
            neighbour_size, episode_n, discount_factor, network_optimizer_ctor,
            ou_params, target_sync_interval, target_sync_rate,
            batch_size, replay_capacity)

Experiment.register(OTDPGPendulum, "OTDPG for Pendulum")


class OTDPGBipedal(OTDPGPendulum):
    def __init__(self, env=None, f_se=None, f_actor=None, f_critic=None,
                 lower_weight=4, upper_weight=4, neighbour_size=8,
                 episode_n=4000, discount_factor=0.9,
                 network_optimizer_ctor=lambda: hrl.network.LocalOptimizer(
                     tf.train.AdamOptimizer(1e-4), grad_clip=10.0),
                 ou_params=(0, 0.2, hrl.utils.CappedLinear(1e6, 1.0, 0.1)),
                 target_sync_interval=10, target_sync_rate=0.01,
                 batch_size=8, replay_capacity=40000):
        if env is None:
            env = gym.make("BipedalWalker-v2")
            env = hrl.envs.AugmentEnvWrapper(env, reward_decay=discount_factor,
                                             reward_scale=0.02)
        super(OTDPGBipedal, self).__init__(
            env, f_se, f_actor, f_critic, lower_weight, upper_weight,
            neighbour_size, episode_n, discount_factor, network_optimizer_ctor,
            ou_params, target_sync_interval, target_sync_rate,
            batch_size, replay_capacity)
                stddev = 2.0 * tf.sigmoid(stddev / 4.0)
                return {"mean": mean, "stddev": stddev}
            f_pi = f
        super(NoisyPendulum, self).__init__(
            env, f_se, f_manager, f_explorer, f_ik, f_value, f_model, f_pi,
            episode_n, discount_factor, noise_dimension, se_dimension,
            manager_horizon, manager_interval, batch_size, batch_horizon,
            noise_stddev, noise_explore_param, worker_explore_param,
            worker_entropy, network_optimizer_ctor, replay_size,
            act_ac, intrinsic_weight, explore_net, abs_goal, manager_ac,
            achievable_weight, disentangle_weight, **kwargs)

Experiment.register(NoisyPendulum, "Noisy explore for pendulum")


class NoisyPendulumSearch(GridSearch):
    """
    round 1:
        "act_ac": [True, False],
        "intrinsic_weight": [0.0, 1.0],
        "explore_net": [True, False],
        "abs_goal": [True, False],
        "manager_ac": [True, False],
        "achievable_weight": [1e-1, 1e-3]
    conclusion: act_ac = False, explore_net = False

    round 2: how to explore via net?