def init_updaters_(self):
    self.target_v = GreedyStateValueFunction(self.target_q)
    target_estimator = OptimalityTighteningEstimator(
        self.target_v, self._upper_weight, self._lower_weight,
        discount_factor=self._discount_factor)
    self.network_optimizer.add_updater(
        TrajectoryFitQ(self.learn_q, target_estimator), name="ot")
    self.network_optimizer.add_updater(network.L2(self.network), name="l2")
    self.network_optimizer.add_updater(
        EnvModelUpdater(
            self.network.sub_net("se"),
            self.network.sub_net("transition"),
            self.network.sub_net("decoder"),
            state_shape=self._state_shape,
            dim_action=self._num_actions,
            # curriculum=[1, self._rollout_depth],
            # skip_step=[10000],
            # transition_weight=1.0, with_momentum=True
            curriculum=self._curriculum,
            skip_step=self._skip_step,
            transition_weight=1.0,
            with_momentum=self._with_momentum,
            save_image_interval=self._save_image_interval,
            with_ob=self._with_ob,
            with_goal=self._with_goal),
        name="env")
    self.network_optimizer.compile()
def init_updaters_(self):
    if self._ddqn:
        estimator = target_estimate.DDQNOneStepTD(
            self.learn_q, self.target_q, self._discount_factor)
    else:
        estimator = target_estimate.OneStepTD(self.target_q, self._discount_factor)
    self.network_optimizer.add_updater(
        network.FitTargetQ(self.learn_q, estimator), name="td")
    self.network_optimizer.add_updater(network.L2(self.network), name="l2")
    self.network_optimizer.compile()
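# --- Illustrative sketch, not part of the class above ---
# A plain-numpy rendering of what the two estimators above are assumed to compute:
# OneStepTD bootstraps from the target network's own greedy value, while
# DDQNOneStepTD selects the next action with the learn network and evaluates it
# with the target network. All names below (q_learn_next, q_target_next, ...) are
# hypothetical stand-ins, not the library's API.
import numpy as np

def one_step_td_target(reward, done, q_target_next, gamma):
    # r + gamma * max_a Q_target(s', a), cut off at episode end
    return reward + gamma * (1.0 - done) * np.max(q_target_next, axis=1)

def ddqn_one_step_td_target(reward, done, q_learn_next, q_target_next, gamma):
    # action chosen by the learn network, value read from the target network
    a_star = np.argmax(q_learn_next, axis=1)
    return reward + gamma * (1.0 - done) * q_target_next[np.arange(len(a_star)), a_star]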
def init_updaters_(self):
    target_estimator = target_estimate.ContinuousActionEstimator(
        self._target_v_function, self._discount_factor)
    self.network_optimizer.add_updater(
        DPGUpdater(actor=self._actor_function,
                   critic=self._q_function,
                   target_estimator=target_estimator,
                   discount_factor=self._discount_factor,
                   actor_weight=0.1),
        name="ac")
    self.network_optimizer.add_updater(network.L2(self.network), name="l2")
    self.network_optimizer.compile()
def init_updaters_(self):
    if self._ddqn:
        self.target_v = DoubleQValueFunction(self.learn_q, self.target_q)
    else:
        self.target_v = GreedyStateValueFunction(self.target_q)
    target_estimator = OptimalityTighteningEstimator(
        self.target_v, self._upper_weight, self._lower_weight,
        discount_factor=self._discount_factor)
    self.network_optimizer.add_updater(
        TrajectoryFitQ(self.learn_q, target_estimator), name="ot")
    self.network_optimizer.add_updater(network.L2(self.network), name="l2")
    self.network_optimizer.compile()
def init_updaters_(self):
    target_estimator = OptimalityTighteningEstimator(
        self._target_v_function, self._upper_weight, self._lower_weight,
        discount_factor=self._discount_factor)
    self.network_optimizer.add_updater(
        TrajectoryFitQ(self._actor_function, self._q_function, target_estimator,
                       self._discount_factor, 0.02),
        name="ac")
    self.network_optimizer.add_updater(network.L2(self.network), name="l2")
    self.network_optimizer.compile()
def init_updaters_(self):
    self.network_optimizer.add_updater(network.L2(self.network), name="l2")
    self.network_optimizer.add_updater(
        EnvModelUpdater(
            self.network.sub_net("se"),
            self.network.sub_net("transition"),
            self.network.sub_net("decoder"),
            state_shape=self._state_shape,
            dim_action=self._num_actions,
            # curriculum=[1, self._rollout_depth],
            # skip_step=[10000],
            # transition_weight=1.0, with_momentum=True
            curriculum=self._curriculum,
            skip_step=self._skip_step,
            transition_weight=1.0,
            with_momentum=self._with_momentum,
            save_image_interval=self._save_image_interval,
            with_ob=self._with_ob,
            with_goal=self._with_goal),
        name="env")
    self.network_optimizer.compile()
def __init__(
        self,
        num_action,
        f_se, f_ac, f_tran, f_decoder, f_rollout, f_encoder,
        state_shape,
        # ACUpdate arguments
        discount_factor,
        entropy=1e-3,
        target_estimator=None,
        max_advantage=10.0,
        # optimizer arguments
        network_optimizer=None,
        max_gradient=10.0,
        # sampler arguments
        sampler=None,
        policy_with_iaa=False,
        compute_with_diff=False,
        with_momentum=True,
        rollout_depth=3,
        rollout_lane=3,
        dynamic_rollout=None,
        dynamic_skip_step=None,
        model_train_depth=3,
        batch_size=32,
        save_image_interval=1000,
        log_dir="./log/img",
        with_ob=False,
        with_goal=True,
        *args, **kwargs):
    """
    :param f_create_net: function:
        f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
        in which {inputs} is [input_state],
        {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
        and {q_values} is the Q values with shape [None, num_actions];
        or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
        in which {mean} and {stddev} are the mean and stddev of the normal distribution
        for continuous actions, and {v} is the state value.
    :param state_shape:
    :param discount_factor:
    :param entropy: entropy regularizer weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in the policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param max_rollout: optional, should be an odd number
    :param args:
    :param kwargs:
    """
    self.processed_state_shape = []

    def f_iaa(inputs):
        input_observation = inputs[0]
        if compute_with_diff:
            logging.warning("use diff 2333")
            diff_ob = []
            for i in range(input_observation.shape[-1] / 3 - 1):
                diff_ob.append(
                    input_observation[:, :, :, (i + 1) * 3:(i + 1) * 3 + 3] -
                    input_observation[:, :, :, i * 3:i * 3 + 3])
            net_se = network.Network([tf.concat(diff_ob[:], axis=3)], f_se,
                                     var_scope="se_1")
            self.processed_state_shape = copy.copy(state_shape)
            self.processed_state_shape[-1] = state_shape[-1] - 3
        else:
            net_se = network.Network([input_observation], f_se, var_scope="se_1")
            self.processed_state_shape = state_shape
        input_action = inputs[1]
        action_dim = inputs[2]
        input_action = tf.one_hot(indices=input_action, depth=action_dim,
                                  on_value=1.0, off_value=0.0, axis=-1)
        se = net_se["se"].op
        input_reward = tf.placeholder(dtype=tf.float32, shape=[None, 1],
                                      name="input_reward")
        encode_state = tf.placeholder(dtype=tf.float32,
                                      shape=[None, se.shape.as_list()[-1]],
                                      name="encode_states")
        input_frame = tf.placeholder(dtype=tf.float32,
                                     shape=[None, state_shape[0], state_shape[1], 3],
                                     name="input_frame")
        rollout = network.Network([se], f_rollout, var_scope="rollout_policy")
        if not with_ob:
            net_model = network.Network([se, input_action], f_tran,
                                        var_scope="TranModel")
            net_decoder = network.Network(
                [tf.concat((encode_state, encode_state), axis=-1), input_frame],
                f_decoder, var_scope="Decoder")
        else:
            net_model = network.Network([input_observation, input_action], f_tran,
                                        var_scope="TranModelOB")
            net_decoder = network.Network([input_frame], f_decoder,
                                          var_scope="DecoderOB")
        rollout_encoder = network.Network(
            [tf.concat((se, se), axis=-1), input_reward],
            f_encoder, var_scope="rollout_encoder")
        current_state = se
        current_ob = input_observation
        for i in range(rollout_lane):
            for j in range(rollout_depth):
                current_rollout = rollout([current_state],
                                          name_scope="rollout_%d_%d" % (i, j))
                # rollout_action_dist = tf.contrib.distributions.Categorical(rollout_action_function.output().op)
                # current_action = rollout_action_dist.sample()
                if not with_ob:
                    tran_model = net_model(
                        [current_state, current_rollout["rollout_action"].op],
                        name_scope="env_model_%d_%d" % (i, j))
                else:
                    tran_model = net_model(
                        [current_ob, current_rollout["rollout_action"].op],
                        name_scope="env_model_%d_%d" % (i, j))
                next_goal = tran_model["next_state"].op
                reward = tran_model["reward"].op
                if not with_ob:
                    current_state += next_goal
                else:
                    current_ob = tf.concat([current_ob[:, :, :, 3:], next_goal],
                                           axis=-1)
                    next_goal = tf.stop_gradient(net_se([current_ob])["se"].op)
                if j == 0:
                    encode_states = next_goal
                    rollout_reward = reward
                else:
                    encode_states = tf.concat([next_goal, encode_states], axis=-1)
                    rollout_reward = tf.concat([rollout_reward, reward], axis=0)
            current_state = se
            current_ob = input_observation
            input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
            input_reward = tf.split(input_reward, rollout_depth, axis=1)
            encode_state = tf.split(encode_states, rollout_depth, axis=1)
            for m in range(rollout_depth):
                if m == 0:
                    rollout_encoder = rollout_encoder(
                        [tf.concat([encode_state[-(m + 1)], encode_state[-(m + 1)]],
                                   axis=-1),
                         input_reward[-(m + 1)]],
                        name_scope="rollout_encoder_%d_%d" % (i, m))
                    re = rollout_encoder["re"].op
                else:
                    rollout_encoder = rollout_encoder(
                        [tf.concat([re, encode_state[-(m + 1)]], axis=-1),
                         input_reward[-(m + 1)]],
                        name_scope="rollout_encoder_%d_%d" % (i, m))
                    re = rollout_encoder["re"].op
            if i == 0:
                path = re
            else:
                path = tf.concat([path, re], axis=1)
        if policy_with_iaa:
            feature = tf.concat([path, se], axis=1)
        else:
            feature = se
        ac = network.Network([feature], f_ac, var_scope='ac')
        v = ac["v"].op
        pi_dist = ac["pi"].op
        return {"v": v, "pi": pi_dist, "rollout_action": None}, \
            {
                "se": net_se,
                "transition": net_model,
                "state_decoder": net_decoder
            }

    self._log_dir = log_dir
    self._rollout_depth = rollout_depth
    if dynamic_rollout is None:
        self._dynamic_rollout = [1, 3, 5]
        self._dynamic_skip_step = [5000, 15000]
    else:
        self._dynamic_rollout = dynamic_rollout
        self._dynamic_skip_step = dynamic_skip_step
    kwargs.update({
        "f_iaa": f_iaa,
        "state_shape": state_shape,
        "num_action": num_action,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    logging.warning(network_optimizer)
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCriticWithI2A, self).__init__(*args, **kwargs)
    pi = self.network["pi"]
    if pi is not None:
        # discrete action: pi is a categorical probability distribution
        self._pi_function = network.NetworkFunction(self.network["pi"])
        self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None],
                                            name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(
            self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # network outputs q
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # network outputs v
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # continuous action: mean / stddev represent a normal distribution
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(dtype=tf.float32,
                                            shape=[None, dim_action],
                                            name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={
                "mean": self.network["mean"],
                "stddev": self.network["stddev"]
            },
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(
            self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])
    # self._rollout_action = network.NetworkFunction(self.network["rollout_action"])
    # self._rollout_dist = distribution.DiscreteDistribution(self._rollout_action, self._input_action)
    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(self._v_function, discount_factor)
        # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(ActorCriticUpdater(
        policy_dist=self._pi_distribution,
        v_function=self._v_function,
        target_estimator=target_estimator,
        entropy=entropy), name="ac")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    # network_optimizer.add_updater(
    #     PolicyNetUpdater(rollout_dist=self._rollout_dist,
    #                      rollout_action_function=self._rollout_action,
    #                      pi_function=self._pi_function),
    #     name="policy_net")
    network_optimizer.add_updater(EnvModelUpdater(
        net_se=self.network.sub_net("se"),
        net_transition=self.network.sub_net("transition"),
        net_decoder=self.network.sub_net("state_decoder"),
        curriculum=self._dynamic_rollout,
        skip_step=self._dynamic_skip_step,
        state_shape=state_shape,
        dim_action=num_action,
        transition_weight=1.0,
        with_momentum=with_momentum,
        compute_with_diff=compute_with_diff,
        save_image_interval=save_image_interval,
        with_ob=with_ob,
        with_goal=with_goal), name="env_model")
    # network_optimizer.freeze(self.network.sub_net("transition").variables)
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
def __init__(
        self,
        f_se, f_actor, f_critic, f_noise,
        state_shape, dim_action, dim_noise,
        # ACUpdate arguments
        discount_factor,
        # optimizer arguments
        network_optimizer=None,
        max_gradient=10.0,
        # policy arguments
        ou_params=(0.0, 0.2, 0.2),
        noise_stddev=0.5,
        noise_weight=1.0,
        noise_mean_weight=1e-2,
        noise_stddev_weight=1e-4,
        # target network sync arguments
        target_sync_interval=10,
        target_sync_rate=0.01,
        # sampler arguments
        replay_size=1000,
        batch_size=32,
        disentangle_with_dpg=True,
        *args, **kwargs):
    """
    :param f_create_net: function, f_create_net([state, action]) => {"q": op_q, "action": op_action}
    :param state_shape: state shape
    :param dim_action: action dimension
    :param discount_factor:
    :param target_estimator: default to target_estimate.ContinuousActionEstimator
    :type target_estimator: target_estimate.TargetEstimator
    :param network_optimizer: default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient:
    :param ou_params: (mu, theta, sigma) of OU noise arguments
    :param target_sync_interval:
    :param target_sync_rate:
    :param sampler: default to sampling.TransitionSampler
    :type sampler: sampling.Sampler
    :param batch_size:
    :param args:
    :param kwargs:
    """
    kwargs.update({
        "f_se": f_se,
        "f_actor": f_actor,
        "f_critic": f_critic,
        "f_noise": f_noise,
        "state_shape": state_shape,
        "dim_action": dim_action,
        "dim_noise": dim_noise,
        "discount_factor": discount_factor,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
        "replay_size": replay_size,
        "ou_params": ou_params,
        "noise_stddev": noise_stddev,
        "noise_weight": noise_weight
    })
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    super(NoisyDPG, self).__init__(*args, **kwargs)
    self._disentangle_with_dpg = disentangle_with_dpg

    def make_sample(state, action, reward, next_state, episode_done, noise, **kwargs):
        sample = sampling.default_make_sample(state, action, reward, next_state,
                                              episode_done)
        sample.update({"noise": noise})
        return sample

    self._sampler = sampling.TransitionSampler(
        hrl.playback.MapPlayback(replay_size), batch_size, 4,
        sample_maker=make_sample)
    self._q_function = network.NetworkFunction(
        self.network["q"], inputs=[self._input_state, self._input_action])
    self._actor_function = network.NetworkFunction(
        self.network["action"], inputs=[self._input_state, self._input_noise])
    self._actor_mean_function = network.NetworkFunction(
        self.network["action_mean"], inputs=[self._input_state])
    self._noise_function = network.NetworkFunction(
        self.network["action_noise"], inputs=[self._input_state, self._input_noise])
    self._target_q_function = network.NetworkFunction(
        self.network.target["q"], inputs=[self._input_state, self._input_action])
    self._target_actor_function = network.NetworkFunction(
        self.network.target["action"], inputs=[self._input_state, self._input_noise])
    target_estimator = NoisyContinuousActionEstimator(
        self._target_actor_function, self._target_q_function, discount_factor)
    self.network_optimizer = network_optimizer
    if disentangle_with_dpg:
        network_optimizer.add_updater(DisentangleNoisyDPGUpdater(
            actor=self._actor_function,
            critic=self._q_function,
            f_noise=self._noise_function,
            target_estimator=target_estimator,
            discount_factor=discount_factor,
            actor_weight=0.02,
            actor_mean=self._actor_mean_function,
            zero_mean_weight=noise_mean_weight,
            stddev_weight=noise_stddev_weight), name="ac")
    else:
        network_optimizer.add_updater(NoisyDPGUpdater(
            actor=self._actor_function,
            critic=self._q_function,
            target_estimator=target_estimator,
            discount_factor=discount_factor,
            actor_weight=0.02,
            actor_mean=self._actor_mean_function), name="ac")
    network_optimizer.add_updater(DisentangleUpdater(
        self.network.sub_net("se"),
        self.network.sub_net("noise"),
        stddev=noise_stddev), weight=noise_weight, name="disentangle")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.compile()
    self._act_all_function = network.NetworkFunction(
        {
            "action": self.network["action"],
            "mean": self.network["action_mean"],
            "noise": self.network["action_noise"]
        },
        inputs=[self._input_state, self._input_noise])
    self._noise_source = OUNoise([dim_noise], *ou_params)
    self._last_input_noise = None
    # self._policy = OUExplorationPolicy(self._actor_function, *ou_params)
    self._target_sync_interval = target_sync_interval
    self._target_sync_rate = target_sync_rate
    self._update_count = 0
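# --- Illustrative sketch, not part of the class above ---
# OUNoise([dim_noise], *ou_params) above receives (mu, theta, sigma). A minimal
# Ornstein-Uhlenbeck process with that parameterization is assumed to behave like
# the class below; this is a generic sketch, not the library's implementation.
import numpy as np

class SimpleOUNoise(object):
    def __init__(self, shape, mu=0.0, theta=0.2, sigma=0.2):
        self.shape, self.mu, self.theta, self.sigma = shape, mu, theta, sigma
        self.x = np.ones(shape) * mu

    def tick(self):
        # mean-reverting drift toward mu plus Gaussian diffusion
        self.x += self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.shape)
        return self.x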
def __init__(
        self,
        env,
        f_se, f_ac, f_forward, f_inverse,
        state_shape,
        # ACUpdate arguments
        discount_factor,
        entropy=1e-3,
        target_estimator=None,
        max_advantage=10.0,
        # optimizer arguments
        network_optimizer=None,
        max_gradient=10.0,
        # sampler arguments
        sampler=None,
        batch_size=32,
        *args, **kwargs):
    """
    :param f_create_net: function:
        f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
        in which {inputs} is [input_state],
        {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
        and {q_values} is the Q values with shape [None, num_actions];
        or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
        in which {mean} and {stddev} are the mean and stddev of the normal distribution
        for continuous actions, and {v} is the state value.
    :param state_shape:
    :param discount_factor:
    :param entropy: entropy regularizer weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in the policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args:
    :param kwargs:
    """

    def f_icm(inputs):
        """
        :param inputs: a list, [state, next_state, action]
        :return: a dict of op
        """
        f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
        f_se1 = network.NetworkFunction(f_se1["se"]).output().op
        f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
        f_se2 = network.NetworkFunction(f_se2["se"]).output().op
        f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
        v = network.NetworkFunction(f_ac_out["v"]).output().op
        pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op
        one_hot_action = tf.one_hot(indices=inputs[2], depth=env.action_space.n,
                                    on_value=1.0, off_value=0.0, axis=-1)
        f_forward_out = network.Network([one_hot_action, f_se1], f_forward,
                                        var_scope='learn_forward')
        phi2_hat = network.NetworkFunction(f_forward_out["phi2_hat"]).output().op
        f_inverse_out = network.Network([f_se1, f_se2], f_inverse,
                                        var_scope='learn_inverse')
        logits = network.NetworkFunction(f_inverse_out["logits"]).output().op
        bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)
        return {
            "pi": pi_dist,
            "v": v,
            "logits": logits,
            "phi1": f_se1,
            "phi2": f_se2,
            "phi2_hat": phi2_hat,
            "bonus": bonus
        }

    kwargs.update({
        "f_icm": f_icm,
        "state_shape": state_shape,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    print "network_optimizer:", network_optimizer
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCriticWithICM, self).__init__(*args, **kwargs)
    pi = self.network["pi"]
    if pi is not None:
        # discrete action: pi is a categorical probability distribution
        self._pi_function = network.NetworkFunction(self.network["pi"])
        # self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(
            self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # network outputs q
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # network outputs v
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # continuous action: mean / stddev represent a normal distribution
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(dtype=tf.float32,
                                            shape=[None, dim_action],
                                            name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={
                "mean": self.network["mean"],
                "stddev": self.network["stddev"]
            },
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(
            self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])
    self._phi2_hat_function = network.NetworkFunction(self.network["phi2_hat"])
    self._phi2_function = network.NetworkFunction(self.network["phi2"])
    self._phi1_function = network.NetworkFunction(self.network["phi1"])
    self._logits = network.NetworkFunction(self.network["logits"])
    self._bonus = network.NetworkFunction(self.network["bonus"])
    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(self._v_function, discount_factor,
                                                   bonus=self._bonus)
        # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(ActorCriticUpdater(
        policy_dist=self._pi_distribution,
        v_function=self._v_function,
        target_estimator=target_estimator,
        entropy=entropy), name="ac")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.add_updater(ForwardUpdater(
        forward_function=self._phi2_hat_function,
        feature_function=self._phi2_function,
        policy_dist=self._pi_distribution), name="forward")
    network_optimizer.add_updater(InverseUpdater(
        inverse_function=self._logits,
        policy_dist=self._pi_distribution), name="inverse")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
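# --- Illustrative sketch, not part of the class above ---
# The "bonus" op built inside f_icm is 0.05 * ||phi(s') - phi_hat(s')||^2: the
# forward model's prediction error in feature space, added to the extrinsic
# reward as an intrinsic curiosity signal. A plain-numpy rendering of that
# quantity (function and argument names are hypothetical):
import numpy as np

def curiosity_bonus(phi2, phi2_hat, scale=0.05):
    # per-sample squared prediction error of the forward model
    return scale * np.sum(np.square(phi2 - phi2_hat), axis=1)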
def __init__(
        self,
        f_create_net,
        state_shape,
        # PPO arguments
        discount_factor,
        entropy=1e-3,
        clip_epsilon=0.2,
        # update arguments
        epoch_per_step=4,
        # target estimate
        target_estimator=None,
        # optimizer arguments
        network_optimizer=None,
        max_gradient=10.0,
        # sampler arguments
        sampler=None,
        batch_size=32,
        horizon=1024,
        *args, **kwargs):
    """
    :param f_create_net: function:
        f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
        in which {inputs} is [input_state],
        {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
        and {q_values} is the Q values with shape [None, num_actions];
        or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
        in which {mean} and {stddev} are the mean and stddev of the normal distribution
        for continuous actions, and {v} is the state value.
    :param state_shape:
    :param discount_factor:
    :param entropy: entropy regularizer weight.
    :param target_estimator: optional, default to target_estimate.GAENStep
    :type target_estimator: target_estimate.TargetEstimator
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args:
    :param kwargs:
    """
    kwargs.update({
        "f_create_net": f_create_net,
        "state_shape": state_shape,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
        "horizon": horizon,
        "clip_epsilon": clip_epsilon,
        "epoch_per_step": epoch_per_step,
    })
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=horizon,
                                               check_episode_done=False)
    kwargs.update({"sampler": sampler})
    super(PPO, self).__init__(*args, **kwargs)
    self._epoch_py_step = epoch_per_step
    self._batch_size = batch_size
    pi = self.network["pi"]
    if pi is not None:
        # discrete action: pi is a categorical probability distribution
        self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None],
                                            name="input_action")
        self._pi_function = network.NetworkFunction(self.network["pi"])
        self._pi_distribution = distribution.DiscreteDistribution(
            self._pi_function, self._input_action)
        self._old_pi_function = network.NetworkFunction(self._old_network["pi"])
        self._old_pi_distribution = distribution.DiscreteDistribution(
            self._old_pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # network outputs q
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
            self._old_q_function = network.NetworkFunction(self._old_network["q"])
            self._old_v_function = GreedyStateValueFunction(self._old_q_function)
        else:
            # network outputs v
            self._v_function = network.NetworkFunction(self.network["v"])
            self._old_v_function = network.NetworkFunction(self._old_network["v"])
    else:
        # continuous action: mean / stddev represent a normal distribution
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(dtype=tf.float32,
                                            shape=[None, dim_action],
                                            name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={
                "mean": self.network["mean"],
                "stddev": self.network["stddev"]
            },
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(
            self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])
        self._old_pi_function = network.NetworkFunction(
            outputs={
                "mean": self._old_network["mean"],
                "stddev": self._old_network["stddev"]
            },
            inputs=self._old_network.inputs)
        self._old_pi_distribution = distribution.NormalDistribution(
            self._old_pi_function, self._input_action)
        self._old_v_function = network.NetworkFunction(self._old_network["v"])
    if target_estimator is None:
        target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(PPOUpdater(
        policy_dist=self._pi_distribution,
        old_dist=self._old_pi_distribution,
        v_function=self._v_function,
        old_v_function=self._old_v_function,
        target_estimator=target_estimator,
        entropy=entropy,
        clip_epsilon=clip_epsilon), name="ppo")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
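# --- Illustrative sketch, not part of the class above ---
# PPOUpdater is handed the new and old policy distributions plus clip_epsilon;
# the standard clipped surrogate objective it is assumed to optimize is written
# out below in numpy (as a quantity to maximize). Names are hypothetical
# stand-ins, not the updater's internals.
import numpy as np

def clipped_surrogate(logp_new, logp_old, advantage, clip_epsilon=0.2):
    # probability ratio pi_new(a|s) / pi_old(a|s), clipped to [1-eps, 1+eps]
    ratio = np.exp(logp_new - logp_old)
    clipped = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    return np.mean(np.minimum(ratio * advantage, clipped * advantage))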