import tensorflow as tf


def focal_loss_fixed(y_true, y_pred, gamma=2., alpha=.25):
    # Collect predicted probabilities for positive and negative targets separately.
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
    # Clip away from 0 and 1 to avoid log(0).
    pt_1 = tf.clip_by_value(pt_1, 1e-3, .999)
    pt_0 = tf.clip_by_value(pt_0, 1e-3, .999)
    return -tf.reduce_sum(alpha * tf.pow(1. - pt_1, gamma) * tf.math.log(pt_1)) \
        - tf.reduce_sum((1 - alpha) * tf.pow(pt_0, gamma) * tf.math.log(1. - pt_0))
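A minimal usage sketch, assuming a Keras binary classifier (the model below is hypothetical, not from the original source); the function can be passed to compile() directly, with gamma and alpha keeping their defaults.

model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(10,)),  # hypothetical model
])
model.compile(optimizer='adam', loss=focal_loss_fixed)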
def recall(y_true, y_pred):
    """Recall metric.

    Computes the recall over the whole batch using `threshold`, which is
    expected to be defined in the enclosing scope (e.g. supplied by a closure).
    """
    threshold_value = threshold
    # Adaptation of the "round()" used before to get the predictions.
    # Clip so the predicted raw values are between 0 and 1, then threshold.
    y_pred = tf.cast(tf.greater(tf.clip_by_value(y_pred, 0, 1), threshold_value),
                     tf.keras.backend.floatx())
    # Number of true positives. Round as a precaution so we get an integer count.
    true_positives = tf.round(tf.reduce_sum(tf.clip_by_value(y_true * y_pred, 0, 1)))
    # Number of positive targets.
    possible_positives = tf.reduce_sum(tf.clip_by_value(y_true, 0, 1))
    recall_ratio = true_positives / (possible_positives + tf.keras.backend.epsilon())
    return recall_ratio
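One way to make the metric self-contained is to bind `threshold` with a closure; a sketch under that assumption (the wrapper name `recall_at` is hypothetical, not from the original source):

def recall_at(threshold):
    # Hypothetical wrapper that supplies `threshold` to the metric above.
    def recall(y_true, y_pred):
        y_pred = tf.cast(tf.greater(tf.clip_by_value(y_pred, 0, 1), threshold),
                         tf.keras.backend.floatx())
        true_positives = tf.reduce_sum(tf.clip_by_value(y_true * y_pred, 0, 1))
        possible_positives = tf.reduce_sum(tf.clip_by_value(y_true, 0, 1))
        return true_positives / (possible_positives + tf.keras.backend.epsilon())
    return recall

# e.g. model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[recall_at(0.5)])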
def loss_(y_true, y_pred):
    # `weights` (per-class weight vector) is expected to come from the enclosing scope.
    # Scale predictions so that the class probabilities of each sample sum to 1.
    y_pred /= tf.reduce_sum(y_pred, axis=-1, keepdims=True)
    # Clip to prevent NaNs and Infs.
    y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(),
                              1 - tf.keras.backend.epsilon())
    # Weighted categorical cross-entropy.
    loss = y_true * tf.math.log(y_pred) * weights
    loss = -tf.reduce_sum(loss, -1)
    return loss
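A sketch of how `weights` might be bound via a closure so the loss can be handed to Keras directly (the wrapper name `weighted_categorical_crossentropy` is hypothetical, not from the original source):

def weighted_categorical_crossentropy(weights):
    # Hypothetical wrapper; `weights` is a per-class weight vector.
    weights = tf.constant(weights, dtype=tf.float32)
    def loss_(y_true, y_pred):
        y_pred /= tf.reduce_sum(y_pred, axis=-1, keepdims=True)
        y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(),
                                  1 - tf.keras.backend.epsilon())
        return -tf.reduce_sum(y_true * tf.math.log(y_pred) * weights, -1)
    return loss_

# e.g. model.compile(optimizer='adam', loss=weighted_categorical_crossentropy([1., 2., 0.5]))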
def fade_block(x, block_num):
    # Inputs: [small-res (weighted by alpha), big-res (weighted by 1 - alpha), alpha]
    sr = x[0]
    br = x[1]
    alpha = x[2]
    alpha = tf.reshape(alpha, [-1, 1, 1, 1])
    alpha = tf.clip_by_value(alpha - block_num, 0, 1)
    return (sr * alpha) + (br * (1 - alpha))
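A small usage sketch with assumed shapes and values (nothing here comes from the original source): blend an upsampled low-resolution branch with the new high-resolution branch using a per-sample alpha that fades block 1 in.

small = tf.random.uniform([4, 32, 32, 3])          # hypothetical upsampled low-res branch
big = tf.random.uniform([4, 32, 32, 3])            # hypothetical new high-res branch
alpha = tf.constant([[1.2], [1.5], [1.8], [2.0]])  # hypothetical fade-in progress per sample
mixed = fade_block([small, big, alpha], block_num=1)  # effective alphas: 0.2, 0.5, 0.8, 1.0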
def build_graph(self):
    with tf.variable_scope('ppo_core'):
        self.state = tf.placeholder(tf.float32, shape=[None, None, self.state_size])
        n_hid = 128

        # Init https://arxiv.org/pdf/1901.03611.pdf
        fan_in = 4
        fan_out = 128
        w_mean = 0
        w_stddev = 2 / fan_out
        W1 = tf.get_variable("W1", [fan_in, n_hid], tf.float32,
                             tf.random_normal_initializer(w_mean, w_stddev))
        b1 = tf.get_variable("b1", [fan_out], tf.float32, tf.constant_initializer(0.))
        a1 = tf.matmul(self.state, W1) + b1

        fan_in = 128
        fan_out = 128
        w_mean = 0
        w_stddev = 2 / fan_out
        W2 = tf.get_variable("W2", [n_hid, n_hid], tf.float32,
                             tf.random_normal_initializer(w_mean, w_stddev))
        b2 = tf.get_variable("b2", [fan_out], tf.float32, tf.constant_initializer(0.))
        a2 = tf.matmul(a1, W2) + b2

    with tf.variable_scope('ppo_policy_head'):
        fan_in = 128
        fan_out = 2
        w_mean = 0
        w_stddev = 2 / fan_out
        W_act = tf.get_variable("W_act", [n_hid, fan_out], tf.float32,
                                tf.random_normal_initializer(w_mean, w_stddev))
        b_act = tf.get_variable("b_act", [fan_out], tf.float32, tf.constant_initializer(0.))
        a_act = tf.matmul(a2, W_act) + b_act
        self.policy = tf.nn.softmax(a_act, axis=1)
        self.act_pred = tf.argmax(self.policy, axis=1)

    with tf.variable_scope('ppo_value_f_head'):
        fan_in = 128
        fan_out = 1
        w_mean = 0
        w_stddev = 2 / fan_out
        W_val = tf.get_variable("W_val", [n_hid, fan_out], tf.float32,
                                tf.random_normal_initializer(w_mean, w_stddev))
        b_val = tf.get_variable("b_val", [fan_out], tf.float32, tf.constant_initializer(0.))
        self.val_pred = tf.matmul(a2, W_val) + b_val

    with tf.variable_scope('train'):
        self.ex_rewards = tf.placeholder(tf.float32, shape=[None, None, 1])
        self.policy_old = tf.placeholder(tf.float32, shape=[None, None, 2])
        self.advantages = tf.placeholder(tf.float32, shape=[None, None, 1])

        # Value-function loss.
        loss_vf = 1 / 2 * tf.reduce_mean(tf.square(self.val_pred - self.ex_rewards))

        # Probability ratio between the current and old policy for the chosen action.
        r_t = self.policy[self.act_pred] / self.policy_old[self.act_pred]

        # PPO clipped surrogate objective.
        clipped_rt = tf.clip_by_value(r_t, 1 - self.epsilon, 1 + self.epsilon)
        loss_clip = tf.reduce_mean(tf.minimum(r_t * self.advantages, clipped_rt * self.advantages))
        loss = loss_clip - self.alpha_1 * loss_vf
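For reference, a standalone numeric sketch of the clipped surrogate term used above (all values and epsilon = 0.2 are made-up assumptions, not from the original source):

ratio = tf.constant([0.8, 1.0, 1.4])        # hypothetical probability ratios r_t
advantage = tf.constant([1.0, -0.5, 2.0])   # hypothetical advantage estimates
eps = 0.2
clipped = tf.clip_by_value(ratio, 1.0 - eps, 1.0 + eps)
loss_clip = tf.reduce_mean(tf.minimum(ratio * advantage, clipped * advantage))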
def build_train(make_obs_ph, model, num_actions, optimizer_f, grad_norm_clipping=None, gamma=1.0,
                double_q=False, scope="deepq", reuse=None, param_noise=False,
                param_noise_filter_func=None, test_eps=0.05, learning_rate=0.001,
                learning_rate_decay_factor=0.99, learning_rate_growth_factor=1.001):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    model: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_f: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
""" if param_noise: act_f = build_act_with_param_noise(make_obs_ph, model, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, model, num_actions, scope=scope, reuse=reuse) act_greedy = build_act_greedy(make_obs_ph, model, num_actions, scope=scope, reuse=True, eps=test_eps) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # Learning rate adjustment lr = tf.Variable(float(learning_rate), trainable=False, dtype = tf.float32) learning_rate_decay_op = lr.assign(tf.clip_by_value(lr*learning_rate_decay_factor, 1e-5, 1e-3)) learning_rate_growth_op = lr.assign(tf.clip_by_value(lr*learning_rate_growth_factor, 1e-5, 1e-3)) optimizer = optimizer_f(learning_rate = lr) # q network evaluation atom_t = model(obs_t_input.get(), num_outputs, scope="atom_func", reuse=True) # reuse parameters from act atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/atom_func") atom_p_t = tf.nn.softmax(atom_t) # target q network evalution atom_tp1 = model(obs_tp1_input.get(), num_outputs, scope="target_atom_func") target_atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_atom_func") atom_p_tp1 = tf.nn.softmax(atom_tp1) m_vec = tf.constant(0.0, dtype=tf.float32, shape=(num_atoms)) for j in range(num_atoms): Tz_j = tf.clip(rew_t_ph + gamma * (V_min + j * del_z), V_min, V_max) b_j = (Tz_j - V_min)/del_z l = tf.astype(tf.math.floor(b_j), tf.int32) u = tf.astype(tf.math.ceil(b_j), tf.int32) m_vec[l] = m_vec[l] + cem_loss = tf.reduce_sum(tf.math.multiply(m, tf.log(atom_p))) # q scores for actions which we know were selected in the given state. 
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, lr],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function(inputs=[obs_t_input], outputs=q_t)

        return act_f, act_greedy, q_values, train, update_target, \
            learning_rate_decay_op, learning_rate_growth_op, {'q_values': q_values}
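A usage sketch of the returned callables, shown as comments because the observation builders and batch arrays are assumptions rather than part of the source:

# act, act_greedy, q_values, train, update_target, lr_decay_op, lr_growth_op, debug = build_train(
#     make_obs_ph, model, num_actions, optimizer_f=tf.train.AdamOptimizer)
# td_err, current_lr = train(obs_t, actions, rewards, obs_tp1, dones, weights)
# update_target()  # periodically copy the online network into the target network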
def _inverse(self, y):
    # We perform clipping in the _inverse function, as is done in TF-Agents.
    y = jnp.where(jnp.less_equal(jnp.abs(y), 1.),
                  jnp.clip(y, -0.99999997, 0.99999997), y)
    return jnp.arctanh(y)
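A quick numerical check written as an illustrative sketch (the inputs are made up): float32 rounds tanh of large arguments to exactly ±1, and the clipping keeps arctanh finite there.

import jax.numpy as jnp

y = jnp.tanh(jnp.array([-10.0, -0.5, 0.0, 0.5, 10.0], dtype=jnp.float32))
y_clipped = jnp.where(jnp.less_equal(jnp.abs(y), 1.),
                      jnp.clip(y, -0.99999997, 0.99999997), y)
print(jnp.arctanh(y_clipped))  # finite everywhere instead of +/-inf for the saturated entries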