def __init__(self, env, f_model, sample_n, horizon_n,
             episode_n=1000,
             discount_factor=0.99,
             # sampler arguments
             update_interval=4, replay_size=1000, batch_size=32,
             # epsilon greedy arguments
             greedy_epsilon=0.3,
             network_optimizer_ctor=lambda: network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-3), grad_clip=10.0)):
    self._env, self._f_model, self._episode_n, \
        self._discount_factor, \
        self._update_interval, \
        self._replay_size, \
        self._batch_size, \
        self._greedy_epsilon, \
        self._network_optimizer_ctor = \
        env, f_model, episode_n, \
        discount_factor, \
        update_interval, \
        replay_size, \
        batch_size, \
        greedy_epsilon, \
        network_optimizer_ctor
    self._sample_n, self._horizon_n = sample_n, horizon_n
    super(MPCExperiment, self).__init__()
def __init__(self, env=None, f_create_actor=None, f_create_q=None,
             dim_noise=2,
             target_sync_interval=100,
             target_sync_rate=1.0,
             alpha_exploration=utils.CappedExp(1e5, 0.2, 0.08),
             max_gradient=10.0,
             m_particle_svgd=16,
             m_particle_v=16,
             episode_n=1000,
             discount_factor=0.9,
             update_interval=4,
             replay_size=100000,
             batch_size=32,
             network_optimizer_ctor=lambda: network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0)):
    if env is None:
        env = gym.make("Pendulum-v0")
        env = envs.AugmentEnvWrapper(env, reward_decay=discount_factor, reward_scale=0.1)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]
    l2 = 1e-8
    if f_create_actor is None:
        def f(inputs):
            x = tf.concat(inputs, axis=1)
            action = network.Utils.layer_fcs(x, [256, 256], dim_action,
                                             activation_out=tf.nn.tanh,
                                             l2=l2,
                                             var_scope="actor")
            return {"action": action}
        f_create_actor = f
    if f_create_q is None:
        def f(inputs):
            x = tf.concat(inputs, axis=1)
            q = network.Utils.layer_fcs(x, [256, 256], 1,
                                        activation_out=None,
                                        l2=l2,
                                        var_scope="q")
            q = tf.squeeze(q, axis=1)
            return {"q": q}
        f_create_q = f
    super(SoftQPendulum, self).__init__(
        env, f_create_actor, f_create_q, dim_noise,
        target_sync_interval, target_sync_rate, alpha_exploration,
        max_gradient, m_particle_svgd, m_particle_v,
        episode_n, discount_factor, update_interval,
        replay_size, batch_size, network_optimizer_ctor)
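# A minimal sketch (not the library's implementation) of the soft state value that the
# m_particle_v / alpha_exploration arguments above refer to: soft Q-learning estimates
# V(s) = alpha * log E_a[exp(Q(s, a) / alpha)] from a set of sampled action particles.
# The estimator below assumes the particles are sampled uniformly; q_values would come
# from the critic built by f_create_q.
import numpy as np

def soft_value_estimate(q_values, alpha):
    # numerically stable log-mean-exp over the Q-values of the sampled action particles
    q = np.asarray(q_values, dtype=np.float64) / alpha
    q_max = q.max()
    return alpha * (q_max + np.log(np.mean(np.exp(q - q_max))))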
def __init__(self, env, f_create_actor, f_create_q, f_model,
             dim_noise,
             target_sync_interval,
             target_sync_rate,
             # epsilon greedy arguments
             greedy_epsilon=0.2,
             sample_n=4,
             horizon_n=4,
             alpha_exploration=1.0,
             max_gradient=10.0,
             m_particle_svgd=16,
             m_particle_v=16,
             episode_n=1000,
             discount_factor=0.99,
             # sampler arguments
             update_interval=4, replay_size=100000, batch_size=32,
             network_optimizer_ctor=lambda: network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0)):
    self._env, \
        self._f_create_actor, \
        self._f_create_q, \
        self._dim_noise, \
        self._target_sync_interval, \
        self._target_sync_rate, \
        self._max_gradient, \
        self._m_particle_svgd, \
        self._m_particle_v, \
        self._episode_n, \
        self._discount_factor, \
        self._update_interval, \
        self._replay_size, \
        self._batch_size, \
        self._network_optimizer_ctor = \
        env, \
        f_create_actor, \
        f_create_q, \
        dim_noise, \
        target_sync_interval, \
        target_sync_rate, \
        max_gradient, \
        m_particle_svgd, \
        m_particle_v, \
        episode_n, \
        discount_factor, \
        update_interval, \
        replay_size, \
        batch_size, \
        network_optimizer_ctor
    self._alpha_exploration = alpha_exploration
    self._greedy_epsilon, self._sample_n, self._horizon_n = greedy_epsilon, sample_n, horizon_n
    self._f_model = f_model
    super(SoftQMPCExperiment, self).__init__()
def __init__(self, env=None, f_model=None,
             sample_n=16,
             horizon_n=4,
             episode_n=1000,
             discount_factor=0.99,
             update_interval=4,
             replay_size=100000,
             batch_size=32,
             greedy_epsilon=utils.CappedExp(1e5, 2.5, 0.05),
             network_optimizer_ctor=lambda: network.LocalOptimizer(
                 tf.train.AdamOptimizer(1e-4), grad_clip=10.0)):
    if env is None:
        env = gym.make("Pendulum-v0")
        env = envs.AugmentEnvWrapper(env, reward_decay=0.9, reward_scale=0.1)
    if f_model is None:
        dim_action = env.action_space.shape[0]
        dim_state = env.observation_space.shape[0]
        l2 = 1e-5

        def f(inputs):
            state, action = inputs[0], inputs[1]
            se = tf.concat([state, action], axis=-1)
            se = network.Utils.layer_fcs(se, [256], 256,
                                         activation_out=None, l2=l2, var_scope="se")
            goal = network.Utils.layer_fcs(se, [], dim_state,
                                           activation_out=None, l2=l2, var_scope="goal")
            reward = network.Utils.layer_fcs(se, [], 1,
                                             activation_out=None, l2=l2, var_scope="reward")
            reward = tf.squeeze(reward, axis=1)
            return {"goal": goal, "reward": reward}
        f_model = f
    super(MPCPendulum, self).__init__(
        env, f_model, sample_n, horizon_n, episode_n, discount_factor,
        update_interval, replay_size, batch_size, greedy_epsilon,
        network_optimizer_ctor)
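# Sketch of the regression targets this one-step model is assumed to be trained on:
# the "goal" head predicts the state change (next_state - state) and the "reward" head
# predicts the immediate reward. Illustrative only; the actual loss lives in ModelUpdater.
import numpy as np

def model_targets(states, rewards, next_states):
    goal_target = next_states - states      # [batch, dim_state], target for the "goal" head
    reward_target = rewards                 # [batch], target for the "reward" head
    return goal_target, reward_target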
def __init__(self, f_model, sample_n, horizon_n, dim_state, dim_action, greedy_epsilon,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             update_interval=4, replay_size=1000, batch_size=32, sampler=None,
             **kwargs):
    kwargs.update({
        "sample_n": sample_n,
        "horizon_n": horizon_n,
        "update_interval": update_interval,
        "replay_size": replay_size,
        "batch_size": batch_size,
        "greedy_epsilon": greedy_epsilon
    })
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TransitionSampler(MapPlayback(replay_size), batch_size, update_interval)
    kwargs.update({"sampler": sampler})
    self._network_optimizer = network_optimizer
    self._dim_state, self._dim_action = dim_state, dim_action
    self._f_model, self._sample_n, self._horizon_n = f_model, sample_n, horizon_n
    self._greedy_epsilon = greedy_epsilon
    super(MPCAgent, self).__init__(**kwargs)
    network_optimizer.add_updater(ModelUpdater(self.network), name="model")
    network_optimizer.compile()
    self._policy = WrapEpsilonGreedy(
        MPCPolicy(
            NetworkFunction({
                "goal": self.network["goal"],
                "reward": self.network["reward"]
            }),
            sample_n=self._sample_n,
            horizon_n=self._horizon_n,
        ),
        epsilon=greedy_epsilon,
        num_actions=dim_action,
        is_continuous=True)
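# A minimal random-shooting sketch of what MPCPolicy plausibly does with sample_n and
# horizon_n, assuming the model's "goal" output is a state delta. `predict` is a
# hypothetical stand-in for the compiled NetworkFunction above, returning (goal, reward).
import numpy as np

def mpc_random_shooting(state, predict, action_low, action_high,
                        sample_n=16, horizon_n=4, discount=0.99):
    dim_action = len(action_low)
    best_return, best_action = -np.inf, None
    for _ in range(sample_n):
        # sample one candidate action sequence and evaluate it with the learned model
        actions = np.random.uniform(action_low, action_high, size=(horizon_n, dim_action))
        s, total = np.array(state, dtype=np.float64), 0.0
        for t in range(horizon_n):
            goal, reward = predict(s, actions[t])
            s = s + goal
            total += (discount ** t) * reward
        if total > best_return:
            best_return, best_action = total, actions[0]
    # execute only the first action of the best imagined sequence
    return best_action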
def __init__(self, f_create_q, state_shape,
             # OneStepTD arguments
             num_actions, discount_factor, ddqn,
             # target network sync arguments
             target_sync_interval,
             target_sync_rate,
             # epsilon greedy arguments
             greedy_epsilon,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             update_interval=4, replay_size=1000, batch_size=32,
             sampler=None,
             *args, **kwargs):
    """
    :param f_create_q: function, f_create_q([state, action]) => {"q": op_q}
    :param state_shape: shape of state
    :param num_actions: action count
    :param discount_factor:
    :param ddqn: True if using double DQN
    :param target_sync_interval: interval between syncing weights from the learned network to the target network
    :param target_sync_rate: syncing rate. 1.0 for hard sync, 0 < r < 1.0 for soft sync.
    :param greedy_epsilon: epsilon for epsilon-greedy policy
    :param network_optimizer: NetworkOptimizer instance, default to LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: gradient clip value
    :param update_interval: network update interval between Agent.step()
    :param replay_size: replay memory size
    :param batch_size:
    :param sampler: Sampler, default to TransitionSampler.
        if None, a TransitionSampler is created using update_interval, replay_size, batch_size
    :param args:
    :param kwargs:
    """
    kwargs.update({
        "f_create_q": f_create_q,
        "state_shape": state_shape,
        "num_actions": num_actions,
        "discount_factor": discount_factor,
        "ddqn": ddqn,
        "target_sync_interval": target_sync_interval,
        "target_sync_rate": target_sync_rate,
        "update_interval": update_interval,
        "replay_size": replay_size,
        "batch_size": batch_size,
        "greedy_epsilon": greedy_epsilon,
        "max_gradient": max_gradient
    })
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TransitionSampler(MapPlayback(replay_size), batch_size, update_interval)
    kwargs.update({"sampler": sampler})
    # call super.__init__
    super(DQN, self).__init__(*args, **kwargs)
    self.network_optimizer = network_optimizer
    self._ddqn, self._discount_factor = ddqn, discount_factor
    self.init_updaters_()
    self._target_sync_interval, self._target_sync_rate = target_sync_interval, target_sync_rate
    self._update_count = 0
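# Sketch of the one-step TD target implied by the discount_factor / ddqn arguments;
# illustrative, not the OneStepTD updater itself.
import numpy as np

def dqn_target(reward, next_q_online, next_q_target, episode_done,
               discount_factor=0.99, ddqn=True):
    if ddqn:
        # double DQN: the learned network selects the action, the target network evaluates it
        bootstrap = next_q_target[int(np.argmax(next_q_online))]
    else:
        bootstrap = np.max(next_q_target)
    return reward + (0.0 if episode_done else discount_factor * bootstrap)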
def __init__(self, f_se, f_actor, f_critic, state_shape, dim_action,
             # ACUpdate arguments
             discount_factor, target_estimator=None,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # policy arguments
             ou_params=(0.0, 0.2, 0.2),
             # target network sync arguments
             target_sync_interval=10,
             target_sync_rate=0.01,
             # sampler arguments
             sampler=None, batch_size=32, update_interval=4, replay_size=1000,
             noise_type=OUNoise,
             *args, **kwargs):
    """
    :param f_se: function creating the state-encoder subnetwork
    :param f_actor: function creating the actor subnetwork
    :param f_critic: function creating the critic subnetwork
    :param state_shape: state shape
    :param dim_action: action dimension
    :param discount_factor:
    :param target_estimator: default to target_estimate.ContinuousActionEstimator
    :type target_estimator: target_estimate.TargetEstimator
    :param network_optimizer: default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient:
    :param ou_params: (mu, theta, sigma) of OU noise arguments
    :param target_sync_interval:
    :param target_sync_rate:
    :param sampler: default to sampling.TransitionSampler
    :type sampler: sampling.Sampler
    :param batch_size:
    :param args:
    :param kwargs:
    """
    kwargs.update({
        "f_se": f_se,
        "f_actor": f_actor,
        "f_critic": f_critic,
        "state_shape": state_shape,
        "dim_action": dim_action,
        "discount_factor": discount_factor,
        "target_estimator": target_estimator,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
        "ou_params": ou_params,
    })
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TransitionSampler(hrl.playback.MapPlayback(replay_size), batch_size, update_interval)
    kwargs.update({"sampler": sampler})
    super(DPG, self).__init__(*args, **kwargs)
    self.network_optimizer = network_optimizer
    self._q_function = network.NetworkFunction(self.network["q"])
    self._actor_function = network.NetworkFunction(self.network["action"], inputs=[self.network.inputs[0]])
    net_target = self.network.target
    self._target_q_function = network.NetworkFunction(net_target["q"])
    self._target_actor_function = network.NetworkFunction(net_target["action"], inputs=[self.network.inputs[0]])
    self._target_v_function = network.NetworkFunction(net_target["v"], inputs=[self.network.inputs[0]])
    self._discount_factor = discount_factor
    self.init_updaters_()
    self._policy = OUExplorationPolicy(self._actor_function, *ou_params, noise_type=noise_type)
    self._target_sync_interval = target_sync_interval
    self._target_sync_rate = target_sync_rate
    self._update_count = 0
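# Sketch of the Ornstein-Uhlenbeck exploration noise behind ou_params=(mu, theta, sigma);
# the library's OUNoise may differ in details (e.g. a dt term). The exploration action is
# roughly actor(state) + noise.tick().
import numpy as np

class SimpleOUNoise(object):
    def __init__(self, shape, mu=0.0, theta=0.2, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.x = np.ones(shape) * mu

    def tick(self):
        # mean-reverting random walk: pull toward mu, perturb with Gaussian noise
        self.x += self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.x.shape)
        return self.x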
def __init__(self, f_create_net, state_shape,
             # ACUpdate arguments
             discount_factor, entropy=1e-3,
             target_estimator=None, max_advantage=10.0,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             sampler=None, batch_size=32,
             *args, **kwargs):
    """
    :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
        in which {inputs} is [input_state],
        {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
        {q_values} is Q values with shape [None, num_actions];
        or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
        in which {mean} {stddev} are the mean and stddev of a normal distribution for continuous actions,
        {v} is the state value.
    :param state_shape:
    :param discount_factor:
    :param entropy: entropy regularizer weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args:
    :param kwargs:
    """
    kwargs.update({
        "f_create_net": f_create_net,
        "state_shape": state_shape,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    print "network_optimizer:", network_optimizer
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCritic, self).__init__(*args, **kwargs)
    pi = self.network["pi"]
    # tf.stop_gradient(pi.op)
    if pi is not None:
        # discrete action: pi is a categorical probability distribution
        self._pi_function = network.NetworkFunction(self.network["pi"])
        self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # network outputs q
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # network outputs v
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # continuous action: mean / stddev represent a normal distribution
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(dtype=tf.float32, shape=[None, dim_action], name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={"mean": self.network["mean"], "stddev": self.network["stddev"]},
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])
    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(self._v_function, discount_factor)
        # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(
        ActorCriticUpdater(policy_dist=self._pi_distribution,
                           v_function=self._v_function,
                           target_estimator=target_estimator,
                           entropy=entropy),
        name="ac")
    # network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
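# Sketch of the quantities the ActorCriticUpdater combines: n-step TD targets (the default
# target_estimate.NStepTD) and the per-sample actor-critic loss. Illustrative; the real
# updater presumably stops gradients through the advantage and clips it at max_advantage.
import numpy as np

def nstep_targets(rewards, bootstrap_value, discount_factor):
    # discounted returns computed backwards from the value of the state after the segment
    g, targets = bootstrap_value, []
    for r in reversed(rewards):
        g = r + discount_factor * g
        targets.append(g)
    return list(reversed(targets))

def actor_critic_loss(log_prob, value, target, entropy, entropy_weight=1e-3):
    advantage = target - value
    policy_loss = -log_prob * advantage          # policy gradient term
    value_loss = 0.5 * advantage ** 2            # critic regression term
    return policy_loss + value_loss - entropy_weight * entropy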
def __init__(self, num_action, f_se, f_ac, f_tran, f_decoder, f_rollout, f_encoder, state_shape,
             # ACUpdate arguments
             discount_factor, entropy=1e-3,
             target_estimator=None, max_advantage=10.0,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             sampler=None,
             policy_with_iaa=False,
             compute_with_diff=False,
             with_momentum=True,
             rollout_depth=3,
             rollout_lane=3,
             dynamic_rollout=None,
             dynamic_skip_step=None,
             model_train_depth=3,
             batch_size=32,
             save_image_interval=1000,
             log_dir="./log/img",
             with_ob=False,
             with_goal=True,
             *args, **kwargs):
    """
    :param num_action: action count
    :param f_se: function creating the state-encoder subnetwork ("se")
    :param f_ac: function creating the actor-critic heads ("pi", "v")
    :param f_tran: function creating the transition (environment) model
    :param f_decoder: function creating the state decoder
    :param f_rollout: function creating the imagination rollout policy
    :param f_encoder: function creating the rollout encoder
    :param state_shape:
    :param discount_factor:
    :param entropy: entropy regularizer weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args:
    :param kwargs:
    """
    self.processed_state_shape = []

    def f_iaa(inputs):
        input_observation = inputs[0]
        if compute_with_diff:
            logging.warning("building state encoder input from frame differences")
            diff_ob = []
            for i in range(input_observation.shape[-1] / 3 - 1):
                diff_ob.append(input_observation[:, :, :, (i + 1) * 3:(i + 1) * 3 + 3]
                               - input_observation[:, :, :, i * 3:i * 3 + 3])
            net_se = network.Network([tf.concat(diff_ob[:], axis=3)], f_se, var_scope="se_1")
            self.processed_state_shape = copy.copy(state_shape)
            self.processed_state_shape[-1] = state_shape[-1] - 3
        else:
            net_se = network.Network([input_observation], f_se, var_scope="se_1")
            self.processed_state_shape = state_shape
        input_action = inputs[1]
        action_dim = inputs[2]
        input_action = tf.one_hot(indices=input_action, depth=action_dim,
                                  on_value=1.0, off_value=0.0, axis=-1)
        se = net_se["se"].op
        input_reward = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="input_reward")
        encode_state = tf.placeholder(dtype=tf.float32, shape=[None, se.shape.as_list()[-1]],
                                      name="encode_states")
        input_frame = tf.placeholder(dtype=tf.float32,
                                     shape=[None, state_shape[0], state_shape[1], 3],
                                     name="input_frame")
        rollout = network.Network([se], f_rollout, var_scope="rollout_policy")
        if not with_ob:
            net_model = network.Network([se, input_action], f_tran, var_scope="TranModel")
            net_decoder = network.Network(
                [tf.concat((encode_state, encode_state), axis=-1), input_frame],
                f_decoder, var_scope="Decoder")
        else:
            net_model = network.Network([input_observation, input_action], f_tran,
                                        var_scope="TranModelOB")
            net_decoder = network.Network([input_frame], f_decoder, var_scope="DecoderOB")
        rollout_encoder = network.Network([tf.concat((se, se), axis=-1), input_reward],
                                          f_encoder, var_scope="rollout_encoder")
        current_state = se
        current_ob = input_observation
        for i in range(rollout_lane):
            for j in range(rollout_depth):
                current_rollout = rollout([current_state], name_scope="rollout_%d_%d" % (i, j))
                # rollout_action_dist = tf.contrib.distributions.Categorical(rollout_action_function.output().op)
                # current_action = rollout_action_dist.sample()
                if not with_ob:
                    tran_model = net_model([current_state, current_rollout["rollout_action"].op],
                                           name_scope="env_model_%d_%d" % (i, j))
                else:
                    tran_model = net_model([current_ob, current_rollout["rollout_action"].op],
                                           name_scope="env_model_%d_%d" % (i, j))
                next_goal = tran_model["next_state"].op
                reward = tran_model["reward"].op
                if not with_ob:
                    current_state += next_goal
                else:
                    current_ob = tf.concat([current_ob[:, :, :, 3:], next_goal], axis=-1)
                    next_goal = tf.stop_gradient(net_se([current_ob])["se"].op)
                if j == 0:
                    encode_states = next_goal
                    rollout_reward = reward
                else:
                    encode_states = tf.concat([next_goal, encode_states], axis=-1)
                    rollout_reward = tf.concat([rollout_reward, reward], axis=0)
            current_state = se
            current_ob = input_observation
            input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
            input_reward = tf.split(input_reward, rollout_depth, axis=1)
            encode_state = tf.split(encode_states, rollout_depth, axis=1)
            for m in range(rollout_depth):
                if m == 0:
                    rollout_encoder = rollout_encoder(
                        [tf.concat([encode_state[-(m + 1)], encode_state[-(m + 1)]], axis=-1),
                         input_reward[-(m + 1)]],
                        name_scope="rollout_encoder_%d_%d" % (i, m))
                    re = rollout_encoder["re"].op
                else:
                    rollout_encoder = rollout_encoder(
                        [tf.concat([re, encode_state[-(m + 1)]], axis=-1),
                         input_reward[-(m + 1)]],
                        name_scope="rollout_encoder_%d_%d" % (i, m))
                    re = rollout_encoder["re"].op
            if i == 0:
                path = re
            else:
                path = tf.concat([path, re], axis=1)
        if policy_with_iaa:
            feature = tf.concat([path, se], axis=1)
        else:
            feature = se
        ac = network.Network([feature], f_ac, var_scope='ac')
        v = ac["v"].op
        pi_dist = ac["pi"].op
        return {"v": v, "pi": pi_dist, "rollout_action": None}, \
            {"se": net_se, "transition": net_model, "state_decoder": net_decoder}

    self._log_dir = log_dir
    self._rollout_depth = rollout_depth
    if dynamic_rollout is None:
        self._dynamic_rollout = [1, 3, 5]
        self._dynamic_skip_step = [5000, 15000]
    else:
        self._dynamic_rollout = dynamic_rollout
        self._dynamic_skip_step = dynamic_skip_step
    kwargs.update({
        "f_iaa": f_iaa,
        "state_shape": state_shape,
        "num_action": num_action,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    logging.warning(network_optimizer)
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCriticWithI2A, self).__init__(*args, **kwargs)
    pi = self.network["pi"]
    if pi is not None:
        # discrete action: pi is a categorical probability distribution
        self._pi_function = network.NetworkFunction(self.network["pi"])
        self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # network outputs q
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # network outputs v
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # continuous action: mean / stddev represent a normal distribution
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(dtype=tf.float32, shape=[None, dim_action], name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={"mean": self.network["mean"], "stddev": self.network["stddev"]},
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])
    # self._rollout_action = network.NetworkFunction(self.network["rollout_action"])
    # self._rollout_dist = distribution.DiscreteDistribution(self._rollout_action, self._input_action)
    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(self._v_function, discount_factor)
        # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(
        ActorCriticUpdater(policy_dist=self._pi_distribution,
                           v_function=self._v_function,
                           target_estimator=target_estimator,
                           entropy=entropy),
        name="ac")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    # network_optimizer.add_updater(
    #     PolicyNetUpdater(rollout_dist=self._rollout_dist,
    #                      rollout_action_function=self._rollout_action,
    #                      pi_function=self._pi_function),
    #     name="policy_net"
    # )
    network_optimizer.add_updater(
        EnvModelUpdater(
            net_se=self.network.sub_net("se"),
            net_transition=self.network.sub_net("transition"),
            net_decoder=self.network.sub_net("state_decoder"),
            curriculum=self._dynamic_rollout,
            skip_step=self._dynamic_skip_step,
            state_shape=state_shape,
            dim_action=num_action,
            transition_weight=1.0,
            with_momentum=with_momentum,
            compute_with_diff=compute_with_diff,
            save_image_interval=save_image_interval,
            with_ob=with_ob,
            with_goal=with_goal),
        name="env_model")
    # network_optimizer.freeze(self.network.sub_net("transition").variables)
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
def __init__(self, f_se, f_actor, f_critic, f_noise, state_shape, dim_action, dim_noise,
             # ACUpdate arguments
             discount_factor,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # policy arguments
             ou_params=(0.0, 0.2, 0.2),
             noise_stddev=0.5,
             noise_weight=1.0,
             noise_mean_weight=1e-2,
             noise_stddev_weight=1e-4,
             # target network sync arguments
             target_sync_interval=10,
             target_sync_rate=0.01,
             # sampler arguments
             replay_size=1000, batch_size=32,
             disentangle_with_dpg=True,
             *args, **kwargs):
    """
    :param f_se: function creating the state-encoder subnetwork
    :param f_actor: function creating the actor subnetwork
    :param f_critic: function creating the critic subnetwork
    :param f_noise: function creating the action-noise subnetwork
    :param state_shape: state shape
    :param dim_action: action dimension
    :param discount_factor:
    :param network_optimizer: default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient:
    :param ou_params: (mu, theta, sigma) of OU noise arguments
    :param target_sync_interval:
    :param target_sync_rate:
    :param batch_size:
    :param args:
    :param kwargs:
    """
    kwargs.update({
        "f_se": f_se,
        "f_actor": f_actor,
        "f_critic": f_critic,
        "f_noise": f_noise,
        "state_shape": state_shape,
        "dim_action": dim_action,
        "dim_noise": dim_noise,
        "discount_factor": discount_factor,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
        "replay_size": replay_size,
        "ou_params": ou_params,
        "noise_stddev": noise_stddev,
        "noise_weight": noise_weight
    })
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    super(NoisyDPG, self).__init__(*args, **kwargs)
    self._disentangle_with_dpg = disentangle_with_dpg

    def make_sample(state, action, reward, next_state, episode_done, noise, **kwargs):
        sample = sampling.default_make_sample(state, action, reward, next_state, episode_done)
        sample.update({"noise": noise})
        return sample

    self._sampler = sampling.TransitionSampler(
        hrl.playback.MapPlayback(replay_size), batch_size, 4, sample_maker=make_sample)
    self._q_function = network.NetworkFunction(
        self.network["q"], inputs=[self._input_state, self._input_action])
    self._actor_function = network.NetworkFunction(
        self.network["action"], inputs=[self._input_state, self._input_noise])
    self._actor_mean_function = network.NetworkFunction(
        self.network["action_mean"], inputs=[self._input_state])
    self._noise_function = network.NetworkFunction(
        self.network["action_noise"], inputs=[self._input_state, self._input_noise])
    self._target_q_function = network.NetworkFunction(
        self.network.target["q"], inputs=[self._input_state, self._input_action])
    self._target_actor_function = network.NetworkFunction(
        self.network.target["action"], inputs=[self._input_state, self._input_noise])
    target_estimator = NoisyContinuousActionEstimator(
        self._target_actor_function, self._target_q_function, discount_factor)
    self.network_optimizer = network_optimizer
    if disentangle_with_dpg:
        network_optimizer.add_updater(
            DisentangleNoisyDPGUpdater(
                actor=self._actor_function,
                critic=self._q_function,
                f_noise=self._noise_function,
                target_estimator=target_estimator,
                discount_factor=discount_factor,
                actor_weight=0.02,
                actor_mean=self._actor_mean_function,
                zero_mean_weight=noise_mean_weight,
                stddev_weight=noise_stddev_weight),
            name="ac")
    else:
        network_optimizer.add_updater(
            NoisyDPGUpdater(
                actor=self._actor_function,
                critic=self._q_function,
                target_estimator=target_estimator,
                discount_factor=discount_factor,
                actor_weight=0.02,
                actor_mean=self._actor_mean_function),
            name="ac")
    network_optimizer.add_updater(
        DisentangleUpdater(
            self.network.sub_net("se"), self.network.sub_net("noise"), stddev=noise_stddev),
        weight=noise_weight, name="disentangle")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.compile()
    self._act_all_function = network.NetworkFunction(
        {
            "action": self.network["action"],
            "mean": self.network["action_mean"],
            "noise": self.network["action_noise"]
        },
        inputs=[self._input_state, self._input_noise])
    self._noise_source = OUNoise([dim_noise], *ou_params)
    self._last_input_noise = None
    # self._policy = OUExplorationPolicy(self._actor_function, *ou_params)
    self._target_sync_interval = target_sync_interval
    self._target_sync_rate = target_sync_rate
    self._update_count = 0
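# Sketch of how target_sync_interval / target_sync_rate are conventionally applied: every
# target_sync_interval updates, move the target network toward the learned one. With
# target_sync_rate=1.0 this is a hard copy. Illustrative, not the library's sync code.
import tensorflow as tf

def make_target_sync_op(variables, target_variables, target_sync_rate):
    ops = []
    for v, v_t in zip(variables, target_variables):
        # soft update: target <- (1 - rate) * target + rate * learned
        ops.append(tf.assign(v_t, (1.0 - target_sync_rate) * v_t + target_sync_rate * v))
    return tf.group(*ops)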
def __init__(self, env, f_se, f_ac, f_forward, f_inverse, state_shape,
             # ACUpdate arguments
             discount_factor, entropy=1e-3,
             target_estimator=None, max_advantage=10.0,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             sampler=None, batch_size=32,
             *args, **kwargs):
    """
    :param env: gym environment, used for the action space of the inverse model
    :param f_se: function creating the shared state-encoder subnetwork ("se")
    :param f_ac: function creating the actor-critic heads ("pi", "v")
    :param f_forward: function creating the ICM forward model (predicts phi2_hat)
    :param f_inverse: function creating the ICM inverse model (predicts action logits)
    :param state_shape:
    :param discount_factor:
    :param entropy: entropy regularizer weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args:
    :param kwargs:
    """
    def f_icm(inputs):
        """
        :param inputs: a list, [state, next_state, action]
        :return: a dict of op
        """
        f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
        f_se1 = network.NetworkFunction(f_se1["se"]).output().op
        f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
        f_se2 = network.NetworkFunction(f_se2["se"]).output().op
        f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
        v = network.NetworkFunction(f_ac_out["v"]).output().op
        pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op
        one_hot_action = tf.one_hot(indices=inputs[2], depth=env.action_space.n,
                                    on_value=1.0, off_value=0.0, axis=-1)
        f_forward_out = network.Network([one_hot_action, f_se1], f_forward, var_scope='learn_forward')
        phi2_hat = network.NetworkFunction(f_forward_out["phi2_hat"]).output().op
        f_inverse_out = network.Network([f_se1, f_se2], f_inverse, var_scope='learn_inverse')
        logits = network.NetworkFunction(f_inverse_out["logits"]).output().op
        bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)
        return {
            "pi": pi_dist,
            "v": v,
            "logits": logits,
            "phi1": f_se1,
            "phi2": f_se2,
            "phi2_hat": phi2_hat,
            "bonus": bonus
        }

    kwargs.update({
        "f_icm": f_icm,
        "state_shape": state_shape,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    print "network_optimizer:", network_optimizer
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCriticWithICM, self).__init__(*args, **kwargs)
    pi = self.network["pi"]
    if pi is not None:
        # discrete action: pi is a categorical probability distribution
        self._pi_function = network.NetworkFunction(self.network["pi"])
        # self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # network outputs q
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # network outputs v
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # continuous action: mean / stddev represent a normal distribution
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(dtype=tf.float32, shape=[None, dim_action], name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={"mean": self.network["mean"], "stddev": self.network["stddev"]},
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])
    self._phi2_hat_function = network.NetworkFunction(self.network["phi2_hat"])
    self._phi2_function = network.NetworkFunction(self.network["phi2"])
    self._phi1_function = network.NetworkFunction(self.network["phi1"])
    self._logits = network.NetworkFunction(self.network["logits"])
    self._bonus = network.NetworkFunction(self.network["bonus"])
    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(self._v_function, discount_factor, bonus=self._bonus)
        # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(
        ActorCriticUpdater(policy_dist=self._pi_distribution,
                           v_function=self._v_function,
                           target_estimator=target_estimator,
                           entropy=entropy),
        name="ac")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.add_updater(
        ForwardUpdater(forward_function=self._phi2_hat_function,
                       feature_function=self._phi2_function,
                       policy_dist=self._pi_distribution),
        name="forward")
    network_optimizer.add_updater(
        InverseUpdater(inverse_function=self._logits,
                       policy_dist=self._pi_distribution),
        name="inverse")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
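# Sketch of the ICM terms wired up above: the forward model's prediction error in feature
# space doubles as the curiosity bonus (cf. the 0.05 factor in f_icm), while the inverse
# model is trained to recover the action from (phi1, phi2). Illustrative only.
import numpy as np

def icm_terms(phi2, phi2_hat, logits, action, eta=0.05):
    forward_error = np.sum(np.square(phi2_hat - phi2))
    bonus = eta * forward_error                  # intrinsic reward added to the TD target
    p = np.exp(logits - np.max(logits))
    p /= np.sum(p)
    inverse_loss = -np.log(p[action])            # cross-entropy on the action logits
    return forward_error, inverse_loss, bonus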
def __init__(self, f_se, f_transition, f_decoder,
             # optimality tightening parameters
             state_shape, num_actions,
             # env model parameters
             rollout_depth,
             network_optimizer=None, max_gradient=10.0,
             update_interval=4, replay_size=1000, batch_size=32,
             sampler=None,
             with_momentum=True,
             curriculum=[1, 3, 5],
             skip_step=[10000, 20000],
             save_image_interval=10000,
             log_dir=None,
             with_ob=False,
             with_goal=True,
             *args, **kwargs):
    kwargs.update({
        "f_se": f_se,
        "f_transition": f_transition,
        "f_decoder": f_decoder,
        "state_shape": state_shape,
        "num_actions": num_actions,
        "rollout_depth": rollout_depth,
        "log_dir": log_dir
    })
    self._state_shape, self._num_actions = state_shape, num_actions
    self._rollout_depth = rollout_depth
    self._with_momentum = with_momentum
    self._curriculum, self._skip_step = curriculum, skip_step
    self._save_image_interval = save_image_interval
    self._with_ob = with_ob
    self._with_goal = with_goal
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    self.network_optimizer = network_optimizer
    if sampler is None:
        max_traj_length = 200
        sampler = sampling.TruncateTrajectorySampler2(
            None, replay_size / max_traj_length, max_traj_length,
            batch_size=1, trajectory_length=batch_size, interval=update_interval)
    self._sampler = sampler
    # BaseDeepAgent.__init__(self, *args, **kwargs)
    # kwargs.pop("global_step")
    kwargs.update({"sampler": sampler})
    # sampling.TruncateTrajectorySampler2.__init__(self, *args, **kwargs)
    super(Model, self).__init__(*args, **kwargs)
    self.init_updaters_()
    self._log_dir = log_dir
    self._update_count = 0
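# One plausible reading of the curriculum / skip_step pair (illustrative; the actual
# schedule lives in EnvModelUpdater): train on the shortest rollout depth first and
# unlock deeper rollouts as the update count passes each threshold.
def current_rollout_depth(update_count, curriculum=(1, 3, 5), skip_step=(10000, 20000)):
    depth = curriculum[0]
    for d, threshold in zip(curriculum[1:], skip_step):
        if update_count >= threshold:
            depth = d
    return depth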