import tensorflow as tf


def rrm_loss(regrets, action_utilities, ignore_negative_regrets=True):
  regrets = tf.convert_to_tensor(regrets)
  return rrm_loss_given_policy(
      regrets,
      rm_policy(regrets),
      action_utilities,
      ignore_negative_regrets=ignore_negative_regrets)

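# A minimal sketch of `rm_policy`, which is not defined in this excerpt; it
# assumes standard regret matching: keep the positive part of the regrets,
# normalize it to a distribution, and fall back to uniform when no regret is
# positive. Illustrative only; the name `_rm_policy_sketch` is not part of
# the module.
def _rm_policy_sketch(regrets):
  positive = tf.nn.relu(regrets)
  total = tf.reduce_sum(positive, axis=1, keepdims=True)
  num_actions = tf.cast(tf.shape(regrets)[1], regrets.dtype)
  all_nonpositive = tf.cast(tf.equal(total, 0.0), regrets.dtype)
  matched = positive / tf.maximum(total, 1e-15)
  uniform = tf.ones_like(regrets) / num_actions
  return (1.0 - all_nonpositive) * matched + all_nonpositive * uniform
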
def __init__(self, *args, softmax_temperatures=[], use_cumulative_values=False,
             **kwargs):
  def f(temp):
    def g(z):
      # Softmax over the q-value columns at the given temperature.
      return tf.nn.softmax(z[:, :-1] / self._adjusted_temperature(temp))
    return g

  # The first policy is regret matching over the advantages q - v; the rest
  # are softmax policies, one per temperature.
  policies = ([lambda z: cpea.rm_policy(z[:, :-1] - z[:, -1:])] +
              list(map(f, softmax_temperatures)))
  super(SplitRrm, self).__init__(policies, *args, **kwargs)

def loss(self, predictions, policy, cfv):
  # `predictions` is [batch, num_actions + 1]: per-action values q in the
  # first columns and a scalar baseline v in the last; `cfv` is
  # [batch, num_actions].
  q, v = predictions[:, :-1], predictions[:, -1:]
  r = q - v
  pi_rm = cpea.rm_policy(r)

  # Regress q toward the observed counterfactual values.
  q_diffs = tf.square(q - cfv)
  q_loss = tf.reduce_mean(tf.reduce_sum(q_diffs, axis=1)) / 2.0

  # Regress v toward the expected value under the regret-matching policy,
  # without letting gradients flow through the target.
  ev = tf.stop_gradient(tf.reduce_sum(cfv * pi_rm, axis=1, keepdims=True))
  v_loss = tf.reduce_mean(tf.square(v - ev)) / 2.0
  return q_loss + v_loss

def rrm_utilities(model, contexts, action_utilities):
  return utility(rm_policy(model(contexts)), action_utilities)

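# A minimal sketch of `utility` as used above and in the losses below; an
# assumption, not this module's definition: the expected utility of `policy`
# under per-action utilities, kept 2-D so it broadcasts against
# [batch, num_actions] tensors.
def _utility_sketch(policy, action_utilities):
  return tf.reduce_sum(policy * action_utilities, axis=1, keepdims=True)
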
def policy_activation(self, pre_activations):
  return rm_policy(pre_activations)

def meta_policy(self):
  return cpea.rm_policy(self.meta_qregrets)

def loss(self, predictions, policy, cfv):
  # Target: the regret-matching policy built from the instantaneous regrets
  # cfv - E_policy[cfv]; gradients do not flow through the target.
  r = tf.stop_gradient(
      cpea.rm_policy(
          cfv - tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
  error = tf.square(r - predictions) / 2.0
  return tf.reduce_mean(tf.reduce_sum(error, axis=1))

def loss(self, predictions, policy, cfv):
  # Same regret-matching target as above, but trained with cross-entropy
  # against the current policy instead of squared error.
  r = tf.stop_gradient(
      cpea.rm_policy(
          cfv - tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
  log_policy = tf.log(tf.clip_by_value(policy, 1e-15, 1 - 1e-15))
  return -tf.reduce_mean(tf.reduce_sum(r * log_policy, axis=1))

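# A toy check of the regret target shared by the two losses above, under the
# standard regret-matching assumption sketched earlier; values are
# illustrative only.
cfv_example = tf.constant([[1.0, 0.0, -1.0]])
policy_example = tf.constant([[0.2, 0.5, 0.3]])
baseline = tf.reduce_sum(cfv_example * policy_example, axis=1, keepdims=True)
# baseline = [[-0.1]], so the instantaneous regrets are [[1.1, 0.1, -0.9]]
# and the regret-matching target is [[1.1/1.2, 0.1/1.2, 0.0]].
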
def loss(self, predictions, policy, cfv):
  pi = cpea.rm_policy(predictions)
  # Instantaneous regrets relative to the value of the induced policy.
  inst_r = cfv - cpea.utility(pi, cfv)
  # Floor the regret target at -relu(predictions); gradients do not flow
  # through the target.
  inst_q = tf.stop_gradient(tf.maximum(inst_r, -tf.nn.relu(predictions)))
  return tf.reduce_mean(
      tf.reduce_sum(tf.square(predictions - inst_q), axis=1)) / 2.0

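# A toy check of the floored target above; values are illustrative only.
# For non-positive predictions the floor -relu(predictions) is zero; for
# positive predictions it is -predictions.
preds_example = tf.constant([[0.5, -0.2]])
inst_r_example = tf.constant([[-1.0, 0.3]])
target_example = tf.maximum(inst_r_example, -tf.nn.relu(preds_example))
# -relu(preds_example) = [[-0.5, 0.0]], so target_example == [[-0.5, 0.3]].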