def __init__(self,
             f_create_net, state_shape,
             # ACUpdate arguments
             discount_factor, entropy=1e-3, target_estimator=None,
             max_advantage=10.0,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             sampler=None, batch_size=32,
             *args, **kwargs):
    """Actor-critic agent: builds policy/value heads and wires up the updater.

    :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
        in which {inputs} is [input_state],
        {dist_pi} is probability distribution of policy with shape [None, num_actions],
        {q_values} is Q values with shape [None, num_actions];
        or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
        in which {mean} {stddev} are mean and stddev of a normal distribution
        for continuous actions, {v} is state value.
    :param state_shape: shape of the state input (list of ints).
    :param discount_factor: reward discount factor.
    :param entropy: entropy regulator weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args: passed through to the superclass.
    :param kwargs: passed through to the superclass.
    """
    kwargs.update({
        "f_create_net": f_create_net,
        "state_shape": state_shape,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    # Use logging (as the rest of this file does) instead of a
    # Python-2-only `print` statement.
    logging.warning("network_optimizer: %s", network_optimizer)
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCritic, self).__init__(*args, **kwargs)

    pi = self.network["pi"]
    if pi is not None:
        # Discrete action: "pi" is a categorical probability distribution.
        self._pi_function = network.NetworkFunction(self.network["pi"])
        self._input_action = tf.placeholder(
            dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(
            self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # Network outputs q; v is derived greedily from q.
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # Network outputs v directly.
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # Continuous action: mean / stddev represent a normal distribution.
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(
            dtype=tf.float32, shape=[None, dim_action], name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={
                "mean": self.network["mean"],
                "stddev": self.network["stddev"],
            },
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(
            self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])

    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(
            self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(
        ActorCriticUpdater(policy_dist=self._pi_distribution,
                           v_function=self._v_function,
                           target_estimator=target_estimator,
                           entropy=entropy),
        name="ac")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
def __init__(self,
             env, f_se, f_ac, f_forward, f_inverse, state_shape,
             # ACUpdate arguments
             discount_factor, entropy=1e-3, target_estimator=None,
             max_advantage=10.0,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             sampler=None, batch_size=32,
             *args, **kwargs):
    """Actor-critic agent with an Intrinsic Curiosity Module (ICM).

    :param env: environment; env.action_space.n supplies the discrete action count.
    :param f_se: state-encoder network factory producing {"se": ...}.
    :param f_ac: actor-critic head factory producing {"pi": ..., "v": ...}.
    :param f_forward: forward-model factory predicting phi2_hat from (action, phi1).
    :param f_inverse: inverse-model factory predicting action logits from (phi1, phi2).
    :param state_shape: shape of the state input.
    :param discount_factor: reward discount factor.
    :param entropy: entropy regulator weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param batch_size: optional, batch_size when creating sampler
    :param args: passed through to the superclass.
    :param kwargs: passed through to the superclass.
    """

    def f_icm(inputs):
        """Build the combined AC + ICM graph.

        :param inputs: a list, [state, next_state, action]
        :return: a dict of op
        """
        f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
        f_se1 = network.NetworkFunction(f_se1["se"]).output().op
        f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
        f_se2 = network.NetworkFunction(f_se2["se"]).output().op
        f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
        v = network.NetworkFunction(f_ac_out["v"]).output().op
        pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op
        one_hot_action = tf.one_hot(indices=inputs[2],
                                    depth=env.action_space.n,
                                    on_value=1.0, off_value=0.0, axis=-1)
        f_forward_out = network.Network([one_hot_action, f_se1], f_forward,
                                        var_scope='learn_forward')
        phi2_hat = network.NetworkFunction(
            f_forward_out["phi2_hat"]).output().op
        f_inverse_out = network.Network([f_se1, f_se2], f_inverse,
                                        var_scope='learn_inverse')
        logits = network.NetworkFunction(
            f_inverse_out["logits"]).output().op
        # Curiosity bonus: scaled forward-model prediction error.
        bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)
        return {
            "pi": pi_dist,
            "v": v,
            "logits": logits,
            "phi1": f_se1,
            "phi2": f_se2,
            "phi2_hat": phi2_hat,
            "bonus": bonus
        }

    kwargs.update({
        "f_icm": f_icm,
        "state_shape": state_shape,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    # Use logging (as the rest of this file does) instead of a
    # Python-2-only `print` statement.
    logging.warning("network_optimizer: %s", network_optimizer)
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCriticWithICM, self).__init__(*args, **kwargs)

    pi = self.network["pi"]
    if pi is not None:
        # Discrete action: pi is a categorical probability distribution.
        self._pi_function = network.NetworkFunction(self.network["pi"])
        # FIX: this placeholder creation was commented out, but
        # self._input_action is used immediately below (and by the updaters),
        # which raised AttributeError for discrete-action networks.
        self._input_action = tf.placeholder(
            dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(
            self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # Network outputs q; v is derived greedily from q.
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # Network outputs v directly.
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # Continuous action: mean / stddev represent a normal distribution.
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(
            dtype=tf.float32, shape=[None, dim_action], name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={
                "mean": self.network["mean"],
                "stddev": self.network["stddev"],
            },
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(
            self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])

    # ICM heads exposed as callable functions.
    self._phi2_hat_function = network.NetworkFunction(
        self.network["phi2_hat"])
    self._phi2_function = network.NetworkFunction(self.network["phi2"])
    self._phi1_function = network.NetworkFunction(self.network["phi1"])
    self._logits = network.NetworkFunction(self.network["logits"])
    self._bonus = network.NetworkFunction(self.network["bonus"])

    if target_estimator is None:
        # The intrinsic bonus is folded into the TD target.
        target_estimator = target_estimate.NStepTD(
            self._v_function, discount_factor, bonus=self._bonus)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(
        ActorCriticUpdater(policy_dist=self._pi_distribution,
                           v_function=self._v_function,
                           target_estimator=target_estimator,
                           entropy=entropy),
        name="ac")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.add_updater(
        ForwardUpdater(forward_function=self._phi2_hat_function,
                       feature_function=self._phi2_function,
                       policy_dist=self._pi_distribution),
        name="forward")
    network_optimizer.add_updater(
        InverseUpdater(inverse_function=self._logits,
                       policy_dist=self._pi_distribution),
        name="inverse")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
def __init__(self,
             num_action, f_se, f_ac, f_tran, f_decoder, f_rollout,
             f_encoder, state_shape,
             # ACUpdate arguments
             discount_factor, entropy=1e-3, target_estimator=None,
             max_advantage=10.0,
             # optimizer arguments
             network_optimizer=None, max_gradient=10.0,
             # sampler arguments
             sampler=None,
             policy_with_iaa=False,
             compute_with_diff=False,
             with_momentum=True,
             rollout_depth=3,
             rollout_lane=3,
             dynamic_rollout=None,
             dynamic_skip_step=None,
             model_train_depth=3,
             batch_size=32,
             save_image_interval=1000,
             log_dir="./log/img",
             with_ob=False,
             with_goal=True,
             *args, **kwargs):
    """Actor-critic agent with Imagination-Augmented Agents (I2A) style rollouts.

    :param num_action: number of discrete actions.
    :param f_se: state-encoder factory producing {"se": ...}.
    :param f_ac: actor-critic head factory producing {"pi": ..., "v": ...}.
    :param f_tran: transition (environment) model factory producing
        {"next_state": ..., "reward": ...}.
    :param f_decoder: state decoder factory (used by the env-model updater).
    :param f_rollout: rollout policy factory producing {"rollout_action": ...}.
    :param f_encoder: rollout encoder factory producing {"re": ...}.
    :param state_shape: shape of the state input, [H, W, C].
    :param discount_factor: reward discount factor.
    :param entropy: entropy regulator weight.
    :param target_estimator: optional, default to target_estimate.NStepTD
    :type target_estimator: target_estimate.TargetEstimator
    :param max_advantage: advantage regulation: max advantage value in policy gradient step
    :param network_optimizer: optional, default to network.LocalOptimizer
    :type network_optimizer: network.NetworkOptimizer
    :param max_gradient: optional, max gradient clip value
    :param sampler: optional, default to sampling.TrajectoryOnSampler.
        if None, a TrajectoryOnSampler will be created using batch_size.
    :type sampler: sampling.Sampler
    :param policy_with_iaa: if True, feed the concatenated rollout paths
        together with the encoded state into the AC head.
    :param compute_with_diff: if True, encode frame-to-frame differences
        instead of raw frames (assumes channels come in groups of 3 — RGB).
    :param rollout_depth: imagined steps per rollout lane.
    :param rollout_lane: number of parallel imagined rollouts.
    :param dynamic_rollout: optional curriculum of rollout depths;
        defaults to [1, 3, 5] with skip steps [5000, 15000].
    :param batch_size: optional, batch_size when creating sampler
    :param args: passed through to the superclass.
    :param kwargs: passed through to the superclass.
    """
    self.processed_state_shape = []

    def f_iaa(inputs):
        """Build the I2A graph: encoder, imagined rollouts, and AC head.

        :param inputs: [input_observation, input_action, action_dim]
        :return: (head dict, sub-network dict)
        """
        input_observation = inputs[0]
        if compute_with_diff:
            logging.warning("use diff 2333")
            diff_ob = []
            # Use floor division: with true division (`/`) this breaks on
            # Python 3 because range() requires an integer.
            for i in range(input_observation.shape[-1] // 3 - 1):
                diff_ob.append(
                    input_observation[:, :, :, (i + 1) * 3:(i + 1) * 3 + 3]
                    - input_observation[:, :, :, i * 3:i * 3 + 3])
            net_se = network.Network([tf.concat(diff_ob[:], axis=3)], f_se,
                                     var_scope="se_1")
            self.processed_state_shape = copy.copy(state_shape)
            # One frame (3 channels) is consumed by differencing.
            self.processed_state_shape[-1] = state_shape[-1] - 3
        else:
            net_se = network.Network([input_observation], f_se,
                                     var_scope="se_1")
            self.processed_state_shape = state_shape
        input_action = inputs[1]
        action_dim = inputs[2]
        input_action = tf.one_hot(indices=input_action, depth=action_dim,
                                  on_value=1.0, off_value=0.0, axis=-1)
        se = net_se["se"].op
        input_reward = tf.placeholder(dtype=tf.float32, shape=[None, 1],
                                      name="input_reward")
        encode_state = tf.placeholder(
            dtype=tf.float32, shape=[None, se.shape.as_list()[-1]],
            name="encode_states")
        input_frame = tf.placeholder(
            dtype=tf.float32,
            shape=[None, state_shape[0], state_shape[1], 3],
            name="input_frame")
        rollout = network.Network([se], f_rollout,
                                  var_scope="rollout_policy")
        if not with_ob:
            # Transition model operates in latent (encoded) space.
            net_model = network.Network([se, input_action], f_tran,
                                        var_scope="TranModel")
            net_decoder = network.Network(
                [tf.concat((encode_state, encode_state), axis=-1),
                 input_frame],
                f_decoder, var_scope="Decoder")
        else:
            # Transition model operates directly on observations.
            net_model = network.Network([input_observation, input_action],
                                        f_tran, var_scope="TranModelOB")
            net_decoder = network.Network([input_frame], f_decoder,
                                          var_scope="DecoderOB")
        rollout_encoder = network.Network(
            [tf.concat((se, se), axis=-1), input_reward],
            f_encoder, var_scope="rollout_encoder")
        current_state = se
        current_ob = input_observation
        for i in range(rollout_lane):
            for j in range(rollout_depth):
                # Imagine one step: pick an action, predict next state/reward.
                current_rollout = rollout([current_state],
                                          name_scope="rollout_%d_%d" % (i, j))
                if not with_ob:
                    tran_model = net_model(
                        [current_state, current_rollout["rollout_action"].op],
                        name_scope="env_model_%d_%d" % (i, j))
                else:
                    tran_model = net_model(
                        [current_ob, current_rollout["rollout_action"].op],
                        name_scope="env_model_%d_%d" % (i, j))
                next_goal = tran_model["next_state"].op
                reward = tran_model["reward"].op
                if not with_ob:
                    # Latent model predicts a residual ("goal") added to state.
                    current_state += next_goal
                else:
                    # Slide the frame window and re-encode (no gradient into
                    # the encoder from imagined frames).
                    current_ob = tf.concat(
                        [current_ob[:, :, :, 3:], next_goal], axis=-1)
                    next_goal = tf.stop_gradient(
                        net_se([current_ob])["se"].op)
                if j == 0:
                    encode_states = next_goal
                    rollout_reward = reward
                else:
                    encode_states = tf.concat([next_goal, encode_states],
                                              axis=-1)
                    rollout_reward = tf.concat([rollout_reward, reward],
                                               axis=0)
            # Reset the imagined trajectory for the next lane.
            current_state = se
            current_ob = input_observation
            input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
            input_reward = tf.split(input_reward, rollout_depth, axis=1)
            encode_state = tf.split(encode_states, rollout_depth, axis=1)
            # Encode the imagined trajectory back-to-front (LSTM-style).
            for m in range(rollout_depth):
                if m == 0:
                    rollout_encoder = rollout_encoder(
                        [tf.concat([encode_state[-(m + 1)],
                                    encode_state[-(m + 1)]], axis=-1),
                         input_reward[-(m + 1)]],
                        name_scope="rollout_encoder_%d_%d" % (i, m))
                    re = rollout_encoder["re"].op
                else:
                    rollout_encoder = rollout_encoder(
                        [tf.concat([re, encode_state[-(m + 1)]], axis=-1),
                         input_reward[-(m + 1)]],
                        name_scope="rollout_encoder_%d_%d" % (i, m))
                    re = rollout_encoder["re"].op
            if i == 0:
                path = re
            else:
                path = tf.concat([path, re], axis=1)
        if policy_with_iaa:
            feature = tf.concat([path, se], axis=1)
        else:
            feature = se
        ac = network.Network([feature], f_ac, var_scope='ac')
        v = ac["v"].op
        pi_dist = ac["pi"].op
        return {"v": v, "pi": pi_dist, "rollout_action": None}, \
            {
                "se": net_se,
                "transition": net_model,
                "state_decoder": net_decoder
            }

    self._log_dir = log_dir
    self._rollout_depth = rollout_depth
    if dynamic_rollout is None:
        self._dynamic_rollout = [1, 3, 5]
        self._dynamic_skip_step = [5000, 15000]
    else:
        self._dynamic_rollout = dynamic_rollout
        self._dynamic_skip_step = dynamic_skip_step
    kwargs.update({
        "f_iaa": f_iaa,
        "state_shape": state_shape,
        "num_action": num_action,
        "discount_factor": discount_factor,
        "entropy": entropy,
        "target_estimator": target_estimator,
        "max_advantage": max_advantage,
        "max_gradient": max_gradient,
        "batch_size": batch_size,
    })
    logging.warning(network_optimizer)
    if network_optimizer is None:
        network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
    if sampler is None:
        sampler = sampling.TrajectoryOnSampler(interval=batch_size)
    kwargs.update({"sampler": sampler})
    super(ActorCriticWithI2A, self).__init__(*args, **kwargs)

    pi = self.network["pi"]
    if pi is not None:
        # Discrete action: pi is a categorical probability distribution.
        self._pi_function = network.NetworkFunction(self.network["pi"])
        self._input_action = tf.placeholder(
            dtype=tf.uint8, shape=[None], name="input_action")
        self._pi_distribution = distribution.DiscreteDistribution(
            self._pi_function, self._input_action)
        q = self.network["q"]
        if q is not None:
            # Network outputs q; v is derived greedily from q.
            self._q_function = network.NetworkFunction(q)
            self._v_function = GreedyStateValueFunction(self._q_function)
        else:
            # Network outputs v directly.
            self._v_function = network.NetworkFunction(self.network["v"])
    else:
        # Continuous action: mean / stddev represent a normal distribution.
        dim_action = self.network["mean"].op.shape.as_list()[-1]
        self._input_action = tf.placeholder(
            dtype=tf.float32, shape=[None, dim_action], name="input_action")
        self._pi_function = network.NetworkFunction(
            outputs={
                "mean": self.network["mean"],
                "stddev": self.network["stddev"],
            },
            inputs=self.network.inputs)
        self._pi_distribution = distribution.NormalDistribution(
            self._pi_function, self._input_action)
        self._v_function = network.NetworkFunction(self.network["v"])

    if target_estimator is None:
        target_estimator = target_estimate.NStepTD(
            self._v_function, discount_factor)
    self.network_optimizer = network_optimizer
    network_optimizer.add_updater(
        ActorCriticUpdater(policy_dist=self._pi_distribution,
                           v_function=self._v_function,
                           target_estimator=target_estimator,
                           entropy=entropy),
        name="ac")
    network_optimizer.add_updater(network.L2(self.network), name="l2")
    network_optimizer.add_updater(
        EnvModelUpdater(net_se=self.network.sub_net("se"),
                        net_transition=self.network.sub_net("transition"),
                        net_decoder=self.network.sub_net("state_decoder"),
                        curriculum=self._dynamic_rollout,
                        skip_step=self._dynamic_skip_step,
                        state_shape=state_shape,
                        dim_action=num_action,
                        transition_weight=1.0,
                        with_momentum=with_momentum,
                        compute_with_diff=compute_with_diff,
                        save_image_interval=save_image_interval,
                        with_ob=with_ob,
                        with_goal=with_goal),
        name="env_model")
    network_optimizer.compile()
    self._policy = StochasticPolicy(self._pi_distribution)
class PPO(sampling.TrajectoryBatchUpdate, BaseDeepAgent):
    """Proximal Policy Optimization agent.

    Keeps a frozen "old" copy of the policy network for the clipped
    probability-ratio objective, collects horizon-sized trajectory batches,
    and runs several update epochs of mini-batch SGD per batch.
    """

    def __init__(self,
                 f_create_net, state_shape,
                 # PPO arguments
                 discount_factor, entropy=1e-3, clip_epsilon=0.2,
                 # update arguments
                 epoch_per_step=4,
                 # target estimate
                 target_estimator=None,
                 # optimizer arguments
                 network_optimizer=None, max_gradient=10.0,
                 # sampler arguments
                 sampler=None, batch_size=32, horizon=1024,
                 *args, **kwargs):
        """Build policy/value heads for both the learner and the old network.

        :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
            in which {inputs} is [input_state],
            {dist_pi} is probability distribution of policy with shape [None, num_actions],
            {q_values} is Q values with shape [None, num_actions];
            or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
            in which {mean} {stddev} are mean and stddev of a normal distribution
            for continuous actions, {v} is state value.
        :param state_shape: shape of the state input.
        :param discount_factor: reward discount factor.
        :param entropy: entropy regulator weight.
        :param clip_epsilon: PPO surrogate-objective clip range.
        :param epoch_per_step: update epochs to run over each horizon batch.
        :param target_estimator: optional, default to target_estimate.GAENStep
        :type target_estimator: target_estimate.TargetEstimator
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, max gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
            if None, a TrajectoryOnSampler will be created using horizon.
        :type sampler: sampling.Sampler
        :param batch_size: optional, mini-batch size used during updates
        :param horizon: trajectory length collected per update step
        :param args: passed through to the superclass.
        :param kwargs: passed through to the superclass.
        """
        kwargs.update({
            "f_create_net": f_create_net,
            "state_shape": state_shape,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
            "horizon": horizon,
            "clip_epsilon": clip_epsilon,
            "epoch_per_step": epoch_per_step,
        })
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=horizon,
                                                   check_episode_done=False)
        kwargs.update({"sampler": sampler})
        super(PPO, self).__init__(*args, **kwargs)
        self._epoch_py_step = epoch_per_step
        self._batch_size = batch_size

        pi_head = self.network["pi"]
        if pi_head is None:
            # Continuous action: mean / stddev represent a normal distribution.
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={"mean": self.network["mean"],
                         "stddev": self.network["stddev"]},
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            self._old_pi_function = network.NetworkFunction(
                outputs={"mean": self._old_network["mean"],
                         "stddev": self._old_network["stddev"]},
                inputs=self._old_network.inputs)
            self._old_pi_distribution = distribution.NormalDistribution(
                self._old_pi_function, self._input_action)
            self._old_v_function = network.NetworkFunction(
                self._old_network["v"])
        else:
            # Discrete action: pi is a categorical probability distribution.
            self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(self.network["pi"])
            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            self._old_pi_function = network.NetworkFunction(
                self._old_network["pi"])
            self._old_pi_distribution = distribution.DiscreteDistribution(
                self._old_pi_function, self._input_action)
            q_head = self.network["q"]
            if q_head is None:
                # Network outputs v directly.
                self._v_function = network.NetworkFunction(self.network["v"])
                self._old_v_function = network.NetworkFunction(
                    self._old_network["v"])
            else:
                # Network outputs q; v is derived greedily from q.
                self._q_function = network.NetworkFunction(q_head)
                self._v_function = GreedyStateValueFunction(self._q_function)
                self._old_q_function = network.NetworkFunction(
                    self._old_network["q"])
                self._old_v_function = GreedyStateValueFunction(
                    self._old_q_function)

        if target_estimator is None:
            target_estimator = target_estimate.GAENStep(self._v_function,
                                                        discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(
            PPOUpdater(policy_dist=self._pi_distribution,
                       old_dist=self._old_pi_distribution,
                       v_function=self._v_function,
                       old_v_function=self._old_v_function,
                       target_estimator=target_estimator,
                       entropy=entropy,
                       clip_epsilon=clip_epsilon),
            name="ppo")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.compile()
        self._policy = StochasticPolicy(self._pi_distribution)

    def init_network(self, f_create_net, state_shape, *args, **kwargs):
        """Create the learner network plus a synced 'old' copy for the PPO ratio."""
        state_ph = tf.placeholder(dtype=tf.float32,
                                  shape=[None] + list(state_shape),
                                  name="input_state")
        learn_net = network.Network([state_ph], f_create_net,
                                    var_scope="learn")
        self._old_network = network.Network([state_ph], f_create_net,
                                            var_scope="old")
        self._old_network_syncer = network.NetworkSyncer(learn_net,
                                                         self._old_network)
        return learn_net

    def update_on_trajectory(self, batch):
        """Run epoch_per_step epochs of mini-batch PPO updates over one
        horizon-sized batch, then hard-sync the old network."""
        step_infos = []
        step_info = {}
        for _ in range(self._epoch_py_step):
            for sub_batch in BatchIterator(batch, self._batch_size,
                                           check_episode_done=True):
                self.network_optimizer.update("ppo", self.sess, sub_batch)
                self.network_optimizer.update("l2", self.sess)
                step_info = self.network_optimizer.optimize_step(self.sess)
                step_info = {k: np.mean(v) for k, v in step_info.items()}
                step_infos.append(step_info)
        self._old_network_syncer.sync(self.sess, 1.0)
        return to_columnwise(step_infos), {}

    def set_session(self, sess):
        """Propagate the TF session to both networks and both distributions."""
        super(PPO, self).set_session(sess)
        for member in (self.network, self._old_network,
                       self._pi_distribution, self._old_pi_distribution):
            member.set_session(sess)

    def act(self, state, **kwargs):
        """Sample an action from the current stochastic policy."""
        return self._policy.act(state, **kwargs)