Example #1
def focal_loss_fixed(y_true, y_pred, gamma=2., alpha=.25):
    # Predicted probability for positive targets (1 elsewhere) and for negative targets (0 elsewhere).
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

    # Clip away from 0 and 1 to keep the logs finite.
    pt_1 = tf.clip_by_value(pt_1, 1e-3, .999)
    pt_0 = tf.clip_by_value(pt_0, 1e-3, .999)

    return -tf.reduce_sum(alpha * tf.pow(1. - pt_1, gamma) * tf.log(pt_1)) - tf.reduce_sum(
        (1 - alpha) * tf.pow(pt_0, gamma) * tf.log(1. - pt_0))
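A minimal usage sketch, assuming a TF 1.x-style environment (where tf.log is available) and a hypothetical tf.keras binary classifier named model; the function can be passed to compile as a loss directly:

import tensorflow as tf

# Hypothetical model; any Keras model with sigmoid outputs would do.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss=focal_loss_fixed)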
Example #2
def recall(y_true, y_pred):
    """Recall metric.

    Computes the recall over the whole batch using threshold_value.
    """
    # `threshold` is expected to come from the enclosing scope.
    threshold_value = threshold
    # Adaptation of the "round()" used before to get the predictions.
    # Clipping makes sure the predicted raw values are between 0 and 1.
    y_pred = tf.cast(tf.greater(tf.clip_by_value(y_pred, 0, 1), threshold_value), tf.keras.backend.floatx())
    # Count the true positives; round so the count is an integer.
    true_positives = tf.round(tf.reduce_sum(tf.clip_by_value(y_true * y_pred, 0, 1)))
    # Count the positive targets.
    possible_positives = tf.reduce_sum(tf.clip_by_value(y_true, 0, 1))
    recall_ratio = true_positives / (possible_positives + tf.keras.backend.epsilon())
    return recall_ratio

def loss_(y_true, y_pred):
    # `weights` is expected to come from the enclosing scope.
    # Scale predictions so that the class probabilities of each sample sum to 1.
    y_pred /= tf.reduce_sum(y_pred, axis=-1, keepdims=True)
    # Clip to prevent NaNs and Infs.
    y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1 - tf.keras.backend.epsilon())
    # Weighted categorical cross-entropy.
    loss = y_true * tf.log(y_pred) * weights
    loss = -tf.reduce_sum(loss, -1)
    return loss
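Both functions rely on free variables (threshold, weights) defined in an enclosing scope in the original code. A minimal sketch of how the loss could be closed over instead, with a hypothetical factory name make_weighted_loss (same TF 1.x-style API as above):

import numpy as np
import tensorflow as tf

def make_weighted_loss(class_weights):
    # class_weights: per-class weight vector, e.g. np.array([1.0, 2.0, 0.5]).
    w = tf.constant(np.asarray(class_weights), dtype=tf.float32)
    def loss_(y_true, y_pred):
        y_pred /= tf.reduce_sum(y_pred, axis=-1, keepdims=True)
        y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1 - tf.keras.backend.epsilon())
        return -tf.reduce_sum(y_true * tf.log(y_pred) * w, -1)
    return loss_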
Example #4
def fade_block(x, block_num):
    #Inputs: [small-res (a), big-res (1-a), alpha]
    sr = x[0]
    br = x[1]
    alpha = x[2]

    alpha = tf.reshape(alpha, [-1, 1, 1, 1])
    alpha = tf.clip_by_value(alpha - block_num, 0, 1)

    return (sr * alpha) + (br * (1 - alpha))
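A sketch of how a fade block like this might be wired into a Keras model, assuming two equally shaped feature maps and a per-sample alpha input (all layer shapes and names below are hypothetical):

import tensorflow as tf
from functools import partial

small_res = tf.keras.Input(shape=(32, 32, 3))
big_res = tf.keras.Input(shape=(32, 32, 3))
alpha_in = tf.keras.Input(shape=(1,))

# Lambda passes the input list through as the single `x` argument of fade_block.
mixed = tf.keras.layers.Lambda(partial(fade_block, block_num=0))([small_res, big_res, alpha_in])
model = tf.keras.Model([small_res, big_res, alpha_in], mixed)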
Example #5
    def build_graph(self):
        with tf.variable_scope('ppo_core'):
            self.state = tf.placeholder(tf.float32, shape=[None, None, self.state_size])

            n_hid = 128

            # Init https://arxiv.org/pdf/1901.03611.pdf
            fan_in = 4
            fan_out = 128
            w_mean = 0
            w_stddev = 2 / fan_out
            W1 = tf.get_variable(
                "W1",
                [fan_in, n_hid],
                tf.float32,
                tf.random_normal_initializer(w_mean, w_stddev)
            )
            b1 = tf.get_variable("b1", [fan_out], tf.float32, tf.constant_initializer(0.))
            # import pdb; pdb.set_trace()
            a1 = tf.matmul(self.state, W1) + b1

            fan_in = 128
            fan_out = 128
            w_mean = 0
            w_stddev = 2 / fan_out
            W2 = tf.get_variable(
                "W2",
                [n_hid, n_hid],
                tf.float32,
                tf.random_normal_initializer(w_mean, w_stddev)
            )
            b2 = tf.get_variable("b2", [fan_out], tf.float32, tf.constant_initializer(0.))
            a2 = tf.matmul(a1, W2) + b2

        with tf.variable_scope('ppo_policy_head'):
            fan_in = 128
            fan_out = 2
            w_mean = 0
            w_stddev = 2 / fan_out
            W_act = tf.get_variable(
                "W_act",
                [n_hid, fan_out],
                tf.float32,
                tf.random_normal_initializer(w_mean, w_stddev)
            )
            b_act = tf.get_variable("b_act", [fan_out], tf.float32, tf.constant_initializer(0.))
            a_act = tf.matmul(a2, W_act) + b_act
            self.policy = tf.nn.softmax(a_act, axis=1)
            self.act_pred = tf.argmax(self.policy, axis=1)

        with tf.variable_scope('ppo_value_f_head'):
            fan_in = 128
            fan_out = 1
            w_mean = 0
            w_stddev = 2 / fan_out
            W_val = tf.get_variable(
                "W_val",
                [n_hid, fan_out],
                tf.float32,
                tf.random_normal_initializer(w_mean, w_stddev)
            )
            b_val = tf.get_variable("b_val", [fan_out], tf.float32, tf.constant_initializer(0.))
            self.val_pred = tf.matmul(a2, W_val) + b_val

        with tf.variable_scope('train'):
            self.ex_rewards = tf.placeholder(tf.float32, shape=[None, None, 1])
            self.policy_old = tf.placeholder(tf.float32, shape=[None, None, 2])
            self.advantages = tf.placeholder(tf.float32, shape=[None, None, 1])

            loss_vf = 1 / 2 * tf.reduce_mean(tf.square(self.val_pred - self.ex_rewards))

            # Probability of the chosen action under the new and old policies.
            act_one_hot = tf.one_hot(self.act_pred, 2)
            pi_a = tf.reduce_sum(self.policy * act_one_hot, axis=-1, keepdims=True)
            pi_old_a = tf.reduce_sum(self.policy_old * act_one_hot, axis=-1, keepdims=True)
            r_t = pi_a / pi_old_a
            clipped_rt = tf.clip_by_value(r_t, 1 - self.epsilon, 1 + self.epsilon)
            loss_clip = tf.reduce_mean(tf.minimum(r_t * self.advantages, clipped_rt * self.advantages))

            loss = loss_clip - self.alpha_1 * loss_vf
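For reference, loss_clip is the clipped surrogate objective from the PPO paper (Schulman et al., 2017), with r_t the probability ratio between the new and old policies:

$$L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big], \qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}$$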
Example #6
def build_train(make_obs_ph, model, num_actions, optimizer_f, grad_norm_clipping=None, gamma=1.0,
    double_q=False, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, test_eps=0.05, 
    learning_rate = 0.001, learning_rate_decay_factor=0.99, learning_rate_growth_factor=1.001):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    model: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_f: float -> tf.train.Optimizer
        factory that builds the optimizer for the Q-learning objective, given a learning rate.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, model, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, model, num_actions, scope=scope, reuse=reuse)

    act_greedy = build_act_greedy(make_obs_ph, model, num_actions, scope=scope, reuse=True, eps=test_eps)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # Learning rate adjustment
        lr = tf.Variable(float(learning_rate), trainable=False, dtype = tf.float32)
        learning_rate_decay_op = lr.assign(tf.clip_by_value(lr*learning_rate_decay_factor, 1e-5, 1e-3))
        learning_rate_growth_op = lr.assign(tf.clip_by_value(lr*learning_rate_growth_factor, 1e-5, 1e-3))
        optimizer = optimizer_f(learning_rate = lr)

        # q network evaluation
        atom_t = model(obs_t_input.get(), num_outputs, scope="atom_func", reuse=True)  # reuse parameters from act
        atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/atom_func")
        atom_p_t = tf.nn.softmax(atom_t)

        # target q network evaluation
        atom_tp1 = model(obs_tp1_input.get(), num_outputs, scope="target_atom_func")
        target_atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_atom_func")
        atom_p_tp1 = tf.nn.softmax(atom_tp1)

        m_vec = tf.constant(0.0, dtype=tf.float32, shape=[num_atoms])
        for j in range(num_atoms):
            Tz_j = tf.clip_by_value(rew_t_ph + gamma * (V_min + j * del_z), V_min, V_max)
            b_j = (Tz_j - V_min) / del_z
            l = tf.cast(tf.math.floor(b_j), tf.int32)
            u = tf.cast(tf.math.ceil(b_j), tf.int32)
            # Standard categorical (C51) projection step: split the probability mass of
            # atom j between the two neighbouring support indices l and u.
            m_vec = m_vec + tf.reduce_sum(
                tf.one_hot(l, num_atoms) * tf.expand_dims(atom_p_tp1[:, j] * (tf.cast(u, tf.float32) - b_j), -1)
                + tf.one_hot(u, num_atoms) * tf.expand_dims(atom_p_tp1[:, j] * (b_j - tf.cast(l, tf.float32)), -1),
                axis=0)

        cem_loss = tf.reduce_sum(tf.math.multiply(m_vec, tf.log(atom_p_t)))
        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, lr],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function(inputs=[obs_t_input], outputs=q_t)

        return act_f, act_greedy, q_values, train, update_target, learning_rate_decay_op, learning_rate_growth_op, {'q_values': q_values}
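For intuition, here is a small self-contained NumPy sketch of the categorical (C51) projection that the atom loop above performs; all names and settings below are illustrative, not taken from the example:

import numpy as np

V_min, V_max, num_atoms = -10.0, 10.0, 51
del_z = (V_max - V_min) / (num_atoms - 1)
support = V_min + del_z * np.arange(num_atoms)

def project(reward, gamma, probs):
    """Project `probs` over `support` onto the support of reward + gamma * z."""
    m = np.zeros(num_atoms)
    for j in range(num_atoms):
        Tz_j = np.clip(reward + gamma * support[j], V_min, V_max)
        b_j = (Tz_j - V_min) / del_z
        l, u = int(np.floor(b_j)), int(np.ceil(b_j))
        if l == u:
            m[l] += probs[j]
        else:
            # Split atom j's mass between the two neighbouring support indices.
            m[l] += probs[j] * (u - b_j)
            m[u] += probs[j] * (b_j - l)
    return m

# Example: projecting a uniform distribution after a reward of 1.0.
m = project(reward=1.0, gamma=0.99, probs=np.full(num_atoms, 1.0 / num_atoms))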
Example #7
def _inverse(self, y):
    # We perform clipping in the _inverse function, as is done in TF-Agents.
    y = jnp.where(jnp.less_equal(jnp.abs(y), 1.),
                  jnp.clip(y, -0.99999997, 0.99999997), y)
    return jnp.arctanh(y)
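A quick standalone check of the same clip-then-arctanh pattern in plain jax.numpy (no bijector class assumed):

import jax.numpy as jnp

y = jnp.array([-1.0, 0.5, 1.0])
# Without clipping, arctanh(+/-1.0) is +/-inf; clipping keeps the inverse finite.
safe = jnp.where(jnp.less_equal(jnp.abs(y), 1.), jnp.clip(y, -0.99999997, 0.99999997), y)
print(jnp.arctanh(safe))  # finite values, no inf at the clipped endpoints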