def __init__(self, actor, critic, target_estimator, discount_factor,
             actor_weight, actor_mean):
    """
    :param actor:
    :type actor: network.NetworkFunction
    :param critic:
    :type critic: network.NetworkFunction
    :param target_estimator:
    :type target_estimator: target_estimate.TargetEstimator
    """
    super(NoisyDPGUpdater, self).__init__(actor, critic, target_estimator,
                                          discount_factor, actor_weight)
    self._actor_mean = actor_mean
    with tf.name_scope("NoisyDPGUpdater"):
        with tf.name_scope("action_mean"):
            self._input_action_mean_gradient = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self._dim_action],
                name="input_action_mean_gradient")
            self._actor_mean_loss = tf.reduce_sum(
                actor_mean.output().op * self._input_action_mean_gradient, axis=1)
            self._actor_mean_loss = -tf.reduce_mean(self._actor_mean_loss)
        self._op_loss = (self._actor_loss + self._actor_mean_loss) * actor_weight \
            + self._critic_loss
        # self._op_loss = self._actor_mean_loss * actor_weight + self._critic_loss
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._actor.variables + self._critic.variables +
        self._actor_mean.variables)
def __init__(self, rollout_dist, rollout_action_function, pi_function, entropy=1e-3):
    """
    Policy net updater: computes the loss between the action distribution
    derived from A3C and the rollout policy net.
    :param rollout_dist:
    :param rollout_action_function:
    :param pi_function:
    :param entropy:
    """
    super(PolicyNetUpdater, self).__init__()
    self._rollout_dist, self._pi_function, self._rollout_action_function = \
        rollout_dist, pi_function, rollout_action_function
    self._entropy = entropy
    with tf.name_scope("PolicyNetUpdater"):
        with tf.name_scope("input"):
            self._input_action = self._rollout_dist.input_sample()
        op_pi = self._pi_function.output().op
        op_mimic_pi = self._rollout_action_function.output().op
        with tf.name_scope("rollout"):
            self._rollout_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=op_pi,
                                                        logits=op_mimic_pi),
                name="rollout_loss")
            self._entropy_loss = self._rollout_action_function
        self._op_loss = self._rollout_loss
    self._update_operation = network.MinimizeLoss(
        self._op_loss, var_list=self._rollout_action_function.variables)
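# Illustration only (not part of the class above): a minimal NumPy sketch of the rollout
# loss, i.e. softmax cross-entropy between the A3C policy output (used as soft labels)
# and the rollout/mimic net's logits. All names below are hypothetical.
import numpy as np

def rollout_loss_sketch(op_pi, op_mimic_pi_logits):
    # op_pi: [batch, n_actions] probabilities from the A3C policy head (labels)
    # op_mimic_pi_logits: [batch, n_actions] logits from the rollout/mimic net
    z = op_mimic_pi_logits - np.max(op_mimic_pi_logits, axis=-1, keepdims=True)
    log_q = z - np.log(np.sum(np.exp(z), axis=-1, keepdims=True))   # log-softmax
    return np.mean(-np.sum(op_pi * log_q, axis=-1))

print(rollout_loss_sketch(np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]]),
                          np.array([[2.0, 0.5, -1.0], [0.0, 1.5, 0.2]])))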
def __init__(self, net_se, func, stddev=1.0, stddev_weight=1e-3):
    super(CenterDisentangleUpdater, self).__init__()
    self._stddev = stddev
    state_shape = net_se.inputs[0].shape.as_list()
    se_dimension = net_se["se"].op.shape.as_list()[-1]
    noise_shape = func.inputs[1].shape.as_list()
    with tf.name_scope("input"):
        self._input_state = tf.placeholder(dtype=tf.float32, shape=state_shape,
                                           name="St")
        self._input_noise = tf.placeholder(dtype=tf.float32, shape=noise_shape,
                                           name="Nt")
        self._input_stddev = tf.placeholder(dtype=tf.float32, name="stddev")
    with tf.name_scope("disentangle"):
        net_se_off = net_se([self._input_state], "off_se")
        net_noise_off = func(
            [tf.stop_gradient(net_se_off["se"].op), self._input_noise], "off_noise")
        self._noise_op = net_noise_off["noise"].op
        mean = tf.reduce_mean(self._noise_op, axis=0, keep_dims=True)
        mean_loss = tf.reduce_sum(Utils.clipped_square(mean))
        stddev = tf.reduce_mean(
            tf.sqrt(tf.reduce_sum(tf.square(self._noise_op - mean), axis=-1)))
        stddev_loss = Utils.clipped_square(
            stddev - self._input_stddev * np.sqrt(se_dimension))
        self._op_loss = mean_loss + stddev_loss * stddev_weight
        self._mean_op, self._stddev_op, self._mean_loss, self._stddev_loss = \
            mean, stddev, mean_loss, stddev_loss
    self._update_operation = network.MinimizeLoss(self._op_loss,
                                                  var_list=func.variables)
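# Illustration only: a NumPy sketch of the two penalties above -- pull the batch mean of
# the noise code toward zero, and pull its spread toward stddev * sqrt(se_dimension).
# clipped_square is approximated by a plain square here; all names are hypothetical.
import numpy as np

def center_disentangle_loss_sketch(noise, target_stddev, stddev_weight=1e-3):
    # noise: [batch, se_dimension] output of the noise branch
    mean = np.mean(noise, axis=0, keepdims=True)          # per-dimension batch mean
    mean_loss = np.sum(np.square(mean))                   # zero-mean penalty
    spread = np.mean(np.sqrt(np.sum(np.square(noise - mean), axis=-1)))
    stddev_loss = np.square(spread - target_stddev * np.sqrt(noise.shape[-1]))
    return mean_loss + stddev_loss * stddev_weight

print(center_disentangle_loss_sketch(np.random.randn(32, 16), target_stddev=1.0))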
def __init__(self, inverse_function, policy_dist):
    super(InverseUpdater, self).__init__()
    self._inverse_function, self._policy_dist = inverse_function, policy_dist
    with tf.name_scope("InverseUpdater"):
        with tf.name_scope("input"):
            self._input_action = policy_dist.input_sample()
        op_action_hat = inverse_function.output().op
        # inverse loss calculation
        with tf.name_scope("inverse"):
            depth = np.shape(op_action_hat)[1]
            inverse_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=tf.one_hot(indices=self._input_action, depth=depth,
                                      on_value=1.0, off_value=0.0, axis=-1),
                    logits=op_action_hat))
            # inverse_loss = tf.reduce_mean(
            #     tf.square(
            #         op_action_hat -
            #         tf.one_hot(indices=self._input_action, depth=depth,
            #                    on_value=1.0, off_value=0.0, axis=-1))
            # )
            self._inverse_loss = inverse_loss
        self._op_loss = self._inverse_loss
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._inverse_function.variables +
        self._policy_dist._dist_function.variables)
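# Illustration only: a NumPy sketch of the inverse-dynamics loss -- cross-entropy between
# the action actually taken (as a one-hot target) and the action logits predicted by the
# inverse model. All names below are hypothetical.
import numpy as np

def inverse_loss_sketch(action_logits, action_taken):
    # action_logits: [batch, num_actions]; action_taken: [batch] integer action ids
    depth = action_logits.shape[-1]
    one_hot = np.eye(depth)[action_taken]                      # one-hot targets
    z = action_logits - np.max(action_logits, axis=-1, keepdims=True)
    log_p = z - np.log(np.sum(np.exp(z), axis=-1, keepdims=True))   # log-softmax
    return np.mean(-np.sum(one_hot * log_p, axis=-1))

print(inverse_loss_sketch(np.array([[2.0, -1.0, 0.3], [0.1, 0.2, 0.3]]),
                          np.array([0, 2])))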
def __init__(self, policy_dist, v_function, target_estimator, entropy=1e-3,
             actor_weight=1.0):
    """
    Actor-critic updater, for both continuous and discrete action spaces.
    :param policy_dist:
    :type policy_dist: distribution.NNDistribution
    :param v_function: function calculating state value
    :type v_function: network.NetworkFunction
    :param target_estimator:
    :type target_estimator: target_estimate.TargetEstimator
    """
    super(ActorCriticUpdater, self).__init__()
    self._policy_dist, self._v_function = policy_dist, v_function
    self._target_estimator = target_estimator
    self._entropy = entropy
    with tf.name_scope("ActorCriticUpdater"):
        with tf.name_scope("input"):
            self._input_target_v = tf.placeholder(dtype=tf.float32, shape=[None],
                                                  name="input_target_v")
            self._input_action = policy_dist.input_sample()
            self._input_entropy = tf.placeholder(dtype=tf.float32, shape=[],
                                                 name="input_entropy")
        op_v = v_function.output().op
        with tf.name_scope("value"):
            td = self._input_target_v - op_v
            self._q_loss = tf.reduce_mean(network.Utils.clipped_square(td))
        with tf.name_scope("policy"):
            advantage = self._input_target_v - op_v
            self._advantage = advantage
            _mean, _var = tf.nn.moments(advantage, axes=[0])
            self._std_advantage = advantage / (tf.sqrt(_var) + 1.0)
            # self._std_advantage = self._advantage
            pi_loss = tf.reduce_mean(self._policy_dist.log_prob() *
                                     tf.stop_gradient(self._std_advantage))
            entropy_loss = tf.reduce_mean(self._input_entropy *
                                          self._policy_dist.entropy())
            self._pi_loss = pi_loss
        self._op_loss = self._q_loss - (self._pi_loss + entropy_loss)
        logging.warning("advantage: %s, entropy: %s, log_prob: %s",
                        advantage, self._policy_dist.entropy(),
                        self._policy_dist.log_prob())
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._v_function.variables +
        self._policy_dist._dist_function.variables)
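# Illustration only: a NumPy sketch of the actor-critic objective assembled above --
# a squared TD error for the value head, a normalized-advantage policy surrogate, and an
# entropy bonus. clipped_square is approximated by a plain square; names are hypothetical.
import numpy as np

def actor_critic_loss_sketch(target_v, v, log_prob, entropy, entropy_weight=1e-3):
    td = target_v - v
    q_loss = np.mean(np.square(td))                          # critic loss
    adv = td / (np.sqrt(np.var(td)) + 1.0)                   # normalized advantage
    pi_loss = np.mean(log_prob * adv)                        # policy surrogate (maximized)
    entropy_loss = np.mean(entropy_weight * entropy)
    return q_loss - (pi_loss + entropy_loss)                 # loss to be minimized

print(actor_critic_loss_sketch(target_v=np.array([1.0, 0.5]), v=np.array([0.8, 0.7]),
                               log_prob=np.array([-0.2, -1.1]),
                               entropy=np.array([0.6, 0.9])))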
def __init__(self, policy_dist, v_function, target_estimator, entropy=1e-3,
             max_advantage=10.0):
    """
    :param policy_dist:
    :type policy_dist: distribution.DiscreteDistribution
    :param v_function:
    :type v_function: network.NetworkFunction
    :param target_estimator:
    :type target_estimator:
    """
    super(DiscretePGUpdater, self).__init__()
    self._policy_dist, self._v_function = policy_dist, v_function
    self._target_estimator = target_estimator
    self._entropy = entropy
    self._num_actions = v_function.output().op.shape.as_list()[-1]
    with tf.name_scope("DiscreteActorCriticUpdate"):
        with tf.name_scope("input"):
            self._input_target_v = tf.placeholder(dtype=tf.float32, shape=[None],
                                                  name="input_target_q")
            self._input_action = policy_dist.input_sample()
            self._input_entropy = tf.placeholder(dtype=tf.float32, shape=[],
                                                 name="input_entropy")
        op_v = v_function.output().op
        with tf.name_scope("policy"):
            advantage = self._input_target_v - op_v
            advantage = tf.clip_by_value(advantage, -max_advantage, max_advantage,
                                         name="advantage")
            self._pi_loss = tf.reduce_mean(self._policy_dist.log_prob() *
                                           tf.stop_gradient(advantage))
            entropy_loss = self._input_entropy * tf.reduce_mean(
                self._policy_dist.entropy())
            self._op_loss = -(self._pi_loss + entropy_loss)
    self._update_operation = network.MinimizeLoss(
        self._op_loss, var_list=self._policy_dist.dist_function().variables)
def __init__(self, forward_function, feature_function, policy_dist):
    super(ForwardUpdater, self).__init__()
    self._forward_function, self._feature_function, self._policy_dist = \
        forward_function, feature_function, policy_dist
    with tf.name_scope("ForwardUpdater"):
        op_phi_next_state_hat = forward_function.output().op
        op_phi_next_state = feature_function.output().op
        # forward loss calculation
        with tf.name_scope("forward"):
            forward_loss = 0.05 * tf.reduce_mean(
                tf.square(tf.subtract(op_phi_next_state_hat, op_phi_next_state)),
                name="forward_loss")
            self._forward_loss = forward_loss
        self._op_loss = self._forward_loss
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._forward_function.variables +
        self._feature_function.variables)
def __init__(self, actor, critic, target_estimator, discount_factor, actor_weight):
    """
    :param actor:
    :type actor: network.NetworkFunction
    :param critic:
    :type critic: network.NetworkFunction
    :param target_estimator:
    :type target_estimator: target_estimate.TargetEstimator
    """
    super(DPGUpdater, self).__init__()
    self._actor, self._critic, self._target_estimator = \
        actor, critic, target_estimator
    self._dim_action = actor.output().op.shape.as_list()[-1]
    op_q = critic.output().op
    with tf.name_scope("DPGUpdater"):
        with tf.name_scope("input"):
            self._input_target_q = tf.placeholder(dtype=tf.float32, shape=[None],
                                                  name="input_target_q")
            self._input_action_gradient = tf.placeholder(
                dtype=tf.float32, shape=[None, self._dim_action],
                name="input_action_gradient")
        with tf.name_scope("critic"):
            self._critic_loss = tf.reduce_mean(network.Utils.clipped_square(
                self._input_target_q - op_q))
        with tf.name_scope("actor"):
            # critic.inputs[1] is input_action
            self._action_gradient = tf.gradients(critic.output().op,
                                                 critic.inputs[1])[0]
            self._gradient_func = network.NetworkFunction(
                outputs=network.NetworkSymbol(self._action_gradient, "gradient",
                                              critic.network),
                inputs=critic.inputs)
            self._actor_loss = tf.reduce_sum(
                actor.output().op * self._input_action_gradient, axis=1)
            self._actor_loss = -tf.reduce_mean(self._actor_loss)
            self._op_loss = self._actor_loss * actor_weight + self._critic_loss
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._actor.variables + self._critic.variables)
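# Illustration only: a NumPy sketch of the deterministic-policy-gradient trick used above.
# The critic's action gradient dQ/da is computed separately and fed back in, and the actor
# is trained on the surrogate sum(a * dQ/da), whose gradient with respect to the actor
# output equals dQ/da. Names below are hypothetical.
import numpy as np

def dpg_actor_surrogate_sketch(actor_action, action_gradient):
    # actor_action: [batch, dim_action] actor output a = pi(s)
    # action_gradient: [batch, dim_action] dQ(s, a)/da at a, treated as a constant
    surrogate = np.sum(actor_action * action_gradient, axis=1)
    return -np.mean(surrogate)   # minimized, i.e. gradient ascent on Q through the actor

print(dpg_actor_surrogate_sketch(np.array([[0.2, -0.1]]), np.array([[0.5, 1.0]])))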
def __init__(self, actor, critic, f_noise, target_estimator, discount_factor,
             actor_weight, actor_mean, zero_mean_weight=1e-2, stddev_weight=1e-4):
    super(DisentangleNoisyDPGUpdater, self).__init__(
        actor, critic, target_estimator, discount_factor, actor_weight, actor_mean)
    self._f_noise = f_noise
    self._zero_mean_weight, self._stddev_weight = zero_mean_weight, stddev_weight
    with tf.name_scope("disentangle"):
        self._input_weight_mean = tf.placeholder(dtype=tf.float32,
                                                 name="weight_mean")
        self._input_weight_stddev = tf.placeholder(dtype=tf.float32,
                                                   name="weight_stddev")
        op_a, op_a_mean = self._actor.output().op, self._actor_mean.output().op
        # pull action mean close to noisy action
        self._zero_mean_loss = network.Utils.clipped_square(
            tf.stop_gradient(op_a) - op_a_mean)
        # push noisy action away from action mean
        self._stddev_loss = -network.Utils.clipped_square(f_noise.output().op)
        self._disentangle_loss = \
            tf.reduce_mean(self._zero_mean_loss) * self._input_weight_mean + \
            tf.reduce_mean(self._stddev_loss) * self._input_weight_stddev
        self._op_loss = (self._actor_loss + self._actor_mean_loss) * actor_weight + \
            self._critic_loss + self._disentangle_loss
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._actor.variables + self._critic.variables +
        self._actor_mean.variables)
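# Illustration only: a NumPy sketch of the two extra penalties above -- pull the
# deterministic action mean toward the (gradient-detached) noisy action, and push the
# noise-branch output away from zero by rewarding its squared magnitude. clipped_square
# is approximated by a plain square; names below are hypothetical.
import numpy as np

def disentangle_loss_sketch(noisy_action, action_mean, noise,
                            weight_mean=1e-2, weight_stddev=1e-4):
    zero_mean_loss = np.mean(np.square(noisy_action - action_mean))  # noisy_action detached in TF
    stddev_loss = -np.mean(np.square(noise))                         # negative: encourages spread
    return zero_mean_loss * weight_mean + stddev_loss * weight_stddev

print(disentangle_loss_sketch(np.array([[0.3]]), np.array([[0.1]]), np.array([[0.2]])))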
def __init__(self, net_se, net_transition, net_decoder, state_shape, dim_action,
             curriculum=None, skip_step=None, transition_weight=0.0,
             with_momentum=True, compute_with_diff=False, save_image_interval=1000,
             detailed_decoder=False, with_ob=False, with_goal=True):
    super(EnvModelUpdater, self).__init__()
    if curriculum is None:
        self._curriculum = [1, 3, 5]
        self._skip_step = [5000, 15000]
    else:
        self._curriculum = curriculum
        self._skip_step = skip_step
    curriculum = self._curriculum
    self._depth = self._curriculum[-1]
    self.save_image_interval = save_image_interval
    self._detailed_decoder = detailed_decoder
    if with_ob:
        with_momentum = False
    with tf.name_scope("EnvModelUpdater"):
        with tf.name_scope("input"):
            self._input_action = tf.placeholder(dtype=tf.uint8, shape=[None],
                                                name="input_action")
            self._input_state = tf.placeholder(dtype=tf.float32,
                                               shape=[None] + list(state_shape),
                                               name="input_state")
            self._input_reward = tf.placeholder(dtype=tf.float32, shape=[None],
                                                name="input_reward")
            self._count = tf.placeholder(dtype=tf.int32, name="count")
        with tf.name_scope("inputs"):
            s0 = self._input_state[:-1]
            state_shape = tf.shape(self._input_state)[1:]
            f0 = s0[:, :, :, -3:]
            logging.warning("s0:%s, f0:%s", s0.shape, f0.shape)
            sn, an, rn, fn = [], [], [], []
            cur_ob = self._input_state[0:-1]
            for i in range(self._depth):
                sn.append(self._input_state[i + 1:])
                an.append(self._input_action[i:])
                rn.append(self._input_reward[i:])
                fn.append(sn[-1][:, :, :, -3:])
        with tf.name_scope("rollout"):
            ses_predict = []
            goalfrom0_predict = []
            momfrom0_predict = []
            action_relatedfrom0_predict = []
            r_predict = []
            r_predict_loss = []
            f_predict = []
            image_channel = None
            f_predict_loss = []
            transition_loss = []
            relative_transition_loss = []
            momentum_loss = []
            mom_decoder_predict = []
            action_related_decoder_predict = []
            if compute_with_diff:
                diff_ob = []
                for i in range(self._input_state.shape[-1] / 3 - 1):
                    diff_ob.append(
                        self._input_state[:, :, :, (i + 1) * 3:(i + 1) * 3 + 3] -
                        self._input_state[:, :, :, i * 3:i * 3 + 3])
                ses = net_se([tf.concat(diff_ob[:], axis=3)])["se"].op
            else:
                ses = net_se([self._input_state])["se"].op
            se0 = ses[:-1]
            sen = []
            for i in range(self._depth):
                sen.append(ses[i + 1:])
            cur_se = se0
            cur_goal = None
            cur_mom = None
            cur_action_related = None
            se0_truncate, f0_truncate = se0, f0
            flows = []
            flow_regulations = []
            for i in range(self._depth):
                logging.warning("[%s]: state:%s, action:%s",
                                i, cur_se.shape, an[i].shape)
                input_action = tf.one_hot(indices=an[i], depth=dim_action,
                                          on_value=1.0, off_value=0.0, axis=-1)
                if not with_ob:
                    net_trans = net_transition([cur_se, input_action],
                                               name_scope="transition_%d" % i)
                else:
                    net_trans = net_transition([cur_ob, input_action],
                                               name_scope="transition_%d" % i)
                if with_momentum:
                    TM_goal = net_trans["momentum"].op
                    action_related = net_trans["action_related"].op
                    cur_mom = TM_goal if cur_goal is None else cur_goal + TM_goal
                    momfrom0_predict.append(cur_mom)
                    cur_action_related = action_related if cur_goal is None \
                        else cur_goal + action_related
                    action_relatedfrom0_predict.append(cur_action_related)
                    cur_se_mom = se0_truncate + cur_mom
                    cur_se_action_related = se0_truncate + cur_action_related
                    momentum_loss.append(
                        tf.reduce_mean(
                            network.Utils.clipped_square(cur_se_mom - sen[i])))
                goal = net_trans["next_state"].op
                if not with_ob and with_goal:
                    # socalled_state = net_trans["action_related"].op
                    cur_goal = goal if cur_goal is None \
                        else tf.stop_gradient(cur_goal) + goal
                    goalfrom0_predict.append(cur_goal)
                    cur_se = se0_truncate + cur_goal
                    # cur_se = socalled_state
                elif not with_ob and not with_goal:
                    cur_goal = goal
                    cur_se = cur_goal
                else:
                    cur_se = goal
                    cur_ob = tf.concat([cur_ob[:, :, :, 3:], goal], axis=-1)
                ses_predict.append(cur_se)
                r_predict.append(net_trans["reward"].op)
                r_predict_loss.append(
                    tf.reduce_mean(
                        network.Utils.clipped_square(r_predict[-1] - rn[i])))
                # f_predict.append(net_decoder([tf.concat([se0, cur_goal], axis=1), f0],
                #                              name_scope="frame_decoder%d" % i)["next_frame"].op)
                if detailed_decoder:
                    mom_decoder_predict.append(
                        net_decoder(
                            [tf.concat([se0_truncate, cur_se_mom], axis=1),
                             f0_truncate],
                            name_scope="mom_decoder%d" % i)["next_frame"].op)
                    action_related_decoder_predict.append(
                        net_decoder(
                            [tf.concat([se0_truncate, cur_se_action_related], axis=1),
                             f0_truncate],
                            name_scope="action_related_decoder%d" % i)["next_frame"].op)
                if not with_ob:
                    net_decoded = net_decoder(
                        [tf.concat([se0_truncate, cur_goal], axis=1), f0_truncate],
                        name_scope="frame_decoder%d" % i)
                else:
                    net_decoded = net_decoder([cur_se],
                                              name_scope="frame_decoder%d" % i)
                f_predict.append(net_decoded["next_frame"].op)
                predicted_channel = net_decoded["image_channel"]
                if predicted_channel is not None and image_channel is None:
                    image_channel = predicted_channel.op
                frame_2 = net_decoded["frame_2"]
                frame_losses = []
                if frame_2 is not None:
                    sub_i = 1
                    while True:
                        sub = "frame_%d" % (2 ** sub_i)
                        sub_frame = net_decoded[sub]
                        if sub_frame is None:
                            break
                        sub_frame = sub_frame.op
                        frame_losses.append(
                            tf.reduce_mean(
                                network.Utils.clipped_square(
                                    sub_frame - tf.image.resize_images(
                                        fn[i], sub_frame.shape.as_list()[1:3]))))
                        sub_i = sub_i + 1
                flow = net_decoded["flow"]
                if flow is not None:
                    flow = flow.op
                    flows.append(flow)
                    o1_y = flow[:, :-1, :, :] - flow[:, 1:, :, :]
                    o2_y = o1_y[:, :-1, :, :] - o1_y[:, 1:, :, :]
                    o1_x = flow[:, :, :-1, :] - flow[:, :, 1:, :]
                    o2_x = o1_x[:, :, :-1, :] - o1_x[:, :, 1:, :]
                    l1_y = tf.reduce_mean(tf.abs(o2_y))
                    l1_x = tf.reduce_mean(tf.abs(o2_x))
                    flow_regulations.append(l1_x + l1_y)
                frame_losses.append(
                    tf.reduce_mean(
                        network.Utils.clipped_square(f_predict[-1] - fn[i])))
                f_predict_loss.append(frame_losses)
                if not with_ob:
                    mean_se = tf.reduce_mean(sen[i], axis=0)
                    self._se_norm = tf.sqrt(tf.reduce_sum(tf.square(mean_se)))
                    transition_loss.append(
                        tf.reduce_mean(
                            network.Utils.clipped_square(ses_predict[-1] - sen[i])))
                    relative_transition_loss.append(
                        tf.reduce_mean(
                            tf.sqrt(tf.reduce_sum(tf.square(ses_predict[-1] - sen[i]),
                                                  axis=-1)) /
                            # tf.sqrt(tf.reduce_sum(tf.square(sen[i] - mean_se), axis=-1))))
                            tf.sqrt(tf.reduce_sum(tf.square(sen[i]), axis=-1))))
                    cur_goal = cur_goal[:-1]
                    cur_se = cur_se[:-1]
                else:
                    cur_ob = cur_ob[:-1]
                f0_truncate = f0_truncate[:-1]
                se0_truncate = se0_truncate[:-1]
        self._reward_loss = []
        self._env_loss = []
        self._transition_loss = []
        self._relative_transition_loss = []
        self._momentum_loss = []
        self._flow_regulation_loss = []
        for i in range(len(curriculum)):
            self._reward_loss.append(
                tf.reduce_mean(
                    tf.add_n(r_predict_loss[0:curriculum[i]]) / float(curriculum[i]),
                    name="reward_loss%d" % curriculum[i]) / 2.0)
            self._env_loss.append(
                tf.reduce_mean(
                    tf.add_n(reduce(operator.add,
                                    f_predict_loss[0:curriculum[i]], [])) /
                    float(curriculum[i]),
                    name="env_loss%d" % curriculum[i]) / 2.0 * 255.0)
            if not with_ob:
                self._transition_loss.append(
                    tf.reduce_mean(
                        tf.add_n(transition_loss[0:curriculum[i]]) /
                        float(curriculum[i]),
                        name="transition_loss%d" % curriculum[i]))
                self._relative_transition_loss.append(
                    tf.reduce_mean(
                        tf.add_n(relative_transition_loss[0:curriculum[i]]) /
                        float(curriculum[i]),
                        name="transition_loss%d" % curriculum[i]))
            else:
                self._transition_loss.append(0.0)
                self._relative_transition_loss.append(0.0)
            if with_momentum:
                self._momentum_loss.append(
                    tf.reduce_mean(
                        tf.add_n(momentum_loss[0:curriculum[i]]) /
                        float(curriculum[i]),
                        name="momentum_loss%d" % curriculum[i]))
            else:
                self._momentum_loss.append(0.0)
            if len(flow_regulations) > 0:
                self._flow_regulation_loss.append(
                    tf.reduce_mean(
                        tf.add_n(flow_regulations[0:curriculum[i]]) /
                        float(curriculum[i]),
                        name="flow_loss%d" % curriculum[i]) * 1e-1)
            else:
                self._flow_regulation_loss.append(0.0)

        def loss_assign(index):
            return tf.gather(self._env_loss, index), \
                   tf.gather(self._reward_loss, index), \
                   tf.gather(self._transition_loss, index), \
                   tf.gather(self._relative_transition_loss, index), \
                   tf.gather(self._momentum_loss, index), \
                   tf.gather(self._flow_regulation_loss, index), \
                   self._count

        self._env_loss, self._reward_loss, self._transition_loss, \
            self._relative_transition_loss, self._momentum_loss, \
            self._flow_regulation_loss, self._num = \
            loss_assign(tf.where(tf.equal(self._curriculum, self._count)))
        self._op_loss = self._env_loss \
            + self._reward_loss \
            + self._transition_loss \
            + self._momentum_loss \
            + self._flow_regulation_loss
        self._s0, self._f0, self._fn, self._f_predict = s0, f0, fn, f_predict
        self._mom_decoder_predict, self._action_related_decoder_predict = \
            mom_decoder_predict, action_related_decoder_predict
        self._flows = flows
        self._image_channel = image_channel
        self._update_operation = network.MinimizeLoss(
            self._op_loss,
            var_list=net_transition.variables + net_se.variables +
            net_decoder.variables)
    self.imshow_count = 0
    self.num = 1
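# Illustration only: the updater above builds one loss per curriculum depth and picks one
# at run time through self._count. The schedule mapping training steps to a depth lives in
# the calling code; below is a plausible sketch of such a schedule, assuming _skip_step
# holds the step thresholds at which the rollout depth advances (hypothetical helper, not
# taken from the source).
def curriculum_depth(step, curriculum=(1, 3, 5), skip_step=(5000, 15000)):
    idx = sum(1 for s in skip_step if step >= s)   # number of thresholds already passed
    return curriculum[min(idx, len(curriculum) - 1)]

print([curriculum_depth(s) for s in (0, 5000, 20000)])   # -> [1, 3, 5]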
def __init__(self, policy_dist, old_dist, v_function, old_v_function,
             target_estimator, entropy=1e-1, clip_epsilon=0.1, value_weight=1.0):
    """
    :param policy_dist:
    :type policy_dist: distribution.NNDistribution
    :param old_dist:
    :type old_dist: distribution.NNDistribution
    :param v_function: function calculating state value
    :type v_function: network.NetworkFunction
    :param old_v_function: function calculating old state value
    :type old_v_function: network.NetworkFunction
    :param target_estimator:
    :type target_estimator:
    :param entropy: entropy weight, c2 in the paper
    :param value_weight: value function loss weight, c1 in the paper
    :param clip_epsilon: clip range of the probability ratio
    """
    super(PPOUpdater, self).__init__()
    self._policy_dist, self._old_dist = policy_dist, old_dist
    self._v_function, self._old_v_function = v_function, old_v_function
    self._target_estimator = target_estimator
    self._entropy = entropy
    with tf.name_scope("PPOUpdater"):
        with tf.name_scope("input"):
            self._input_target_v = tf.placeholder(dtype=tf.float32, shape=[None],
                                                  name="input_target_v")
            self._input_action = policy_dist.input_sample()
            self._input_entropy = tf.placeholder(dtype=tf.float32, shape=[],
                                                 name="input_entropy")
        op_v = v_function.output().op
        old_op_v = tf.stop_gradient(old_v_function.output().op)
        with tf.name_scope("value"):
            td = self._input_target_v - op_v
            org_v_loss = network.Utils.clipped_square(td)
            clipped_v = old_op_v + tf.clip_by_value(op_v - old_op_v,
                                                    -clip_epsilon, clip_epsilon)
            clip_v_loss = network.Utils.clipped_square(
                self._input_target_v - clipped_v)
            self._v_loss = tf.reduce_mean(tf.maximum(org_v_loss, clip_v_loss))
            self._org_v_loss, self._clip_v_loss = org_v_loss, clip_v_loss
        with tf.name_scope("policy"):
            advantage = self._input_target_v - op_v
            self._advantage = advantage
            _mean, _var = tf.nn.moments(advantage, axes=[0])
            self._std_advantage = tf.stop_gradient(advantage /
                                                   (tf.sqrt(_var) + 1.0))
            ratio = tf.exp(policy_dist.log_prob() -
                           tf.stop_gradient(old_dist.log_prob()))
            clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_epsilon,
                                             1.0 + clip_epsilon)
            pi_loss = tf.reduce_mean(
                tf.minimum(ratio * self._std_advantage,
                           clipped_ratio * self._std_advantage))
            entropy_loss = tf.reduce_mean(self._policy_dist.entropy())
            self._pi_loss = pi_loss
            self._ratio, self._clipped_ratio = ratio, clipped_ratio
        self._op_loss = value_weight * self._v_loss - (
            self._pi_loss + self._input_entropy * entropy_loss)
    self._update_operation = network.MinimizeLoss(
        self._op_loss,
        var_list=self._v_function.variables +
        self._policy_dist._dist_function.variables)
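# Illustration only: a NumPy sketch of the clipped PPO surrogate assembled above -- the
# probability ratio against the old policy is clipped to [1 - eps, 1 + eps] and the
# pessimistic (minimum) objective is taken. Names below are hypothetical.
import numpy as np

def ppo_surrogate_sketch(log_prob, old_log_prob, advantage, clip_epsilon=0.1):
    ratio = np.exp(log_prob - old_log_prob)
    clipped = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    return np.mean(np.minimum(ratio * advantage, clipped * advantage))  # maximized

print(ppo_surrogate_sketch(np.array([-0.9, -1.2]), np.array([-1.0, -1.0]),
                           np.array([0.5, -0.3])))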