Example #1
import numpy as np
from cleanlab.latent_estimation import estimate_py_and_noise_matrices_from_probabilities

def generate_noise_matrix(s, Y):
    # Build one-hot "predicted probabilities" psx from the reference labels Y
    # (a 2-class problem is assumed by the hard-coded width of 2).
    psx = np.zeros((Y.shape[0], 2))
    for i in range(Y.shape[0]):
        psx[i, int(Y[i])] = 1.
    # Estimate the noise matrix P(s=k_s|y=k_y) from the noisy labels s.
    py, noise_matrix, inverse_noise_matrix, _ = estimate_py_and_noise_matrices_from_probabilities(
        s, psx)
    print(noise_matrix)
    return noise_matrix
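
A minimal usage sketch for this helper (assuming cleanlab 0.x is installed; the synthetic data below is an illustration, not part of the original): simulate reference labels Y, flip a fraction of them to get noisy labels s, and recover the 2x2 noise matrix.

import numpy as np

# Hypothetical data: 1000 examples, 2 classes, ~10% of labels flipped.
rng = np.random.RandomState(0)
Y = rng.randint(0, 2, size=1000)            # reference labels
flips = rng.rand(1000) < 0.1                # simulate 10% label noise
s = np.where(flips, 1 - Y, Y)               # noisy labels
noise_matrix = generate_noise_matrix(s, Y)  # prints and returns P(s=k_s|y=k_y)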
Example #2
    def fit(
        self, 
        X,
        s,
        psx = None,
        thresholds = None,
        noise_matrix = None,
        inverse_noise_matrix = None, 
    ):
        '''This method implements confident learning. It counts examples that
        are likely labeled correctly and incorrectly and uses their ratio to
        create a predicted confusion matrix.
        This function fits the classifier (self.clf) to (X, s), accounting for
        the noise in both the positive and negative sets.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A binary vector of labels, s, which may contain mislabeling.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx should
          have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. This value should be between 0 and 1. Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1. 

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s) representing
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Output
        ------
          Returns the tuple (noise_mask, noise_matrix, inverse_noise_matrix,
          confident_joint, psx).'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
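        # Note: trace(noise_matrix) sums the K per-class correct-labeling
        # rates P(s=k|y=k); requiring it to exceed 1 means the labels must
        # be better than chance on average.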
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError("Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError("Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes. 
        # Also, if needed, compute P(s=k|x), denoted psx.
        
        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(self.ps, self.noise_matrix)

        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(self.ps, self.inverse_noise_matrix)

        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X = X,
                    s = s,
                    clf = self.clf,
                    cv_n_folds = self.cv_n_folds,
                    thresholds = thresholds,
                    converge_latent_estimates = self.converge_latent_estimates,
                    seed = self.seed,
                )
            else: # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s = s, 
                    psx = psx,
                    thresholds = thresholds,
                    converge_latent_estimates = self.converge_latent_estimates,
                )

        if psx is None: 
            psx = estimate_cv_predicted_probabilities(
                X = X,
                labels = s,
                clf = self.clf,
                cv_n_folds = self.cv_n_folds,
                seed = self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None: # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix,
                class_without_noise=self.pulearning,
            )
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix = self.inverse_noise_matrix,
            confident_joint = self.confident_joint,
            prune_method = self.prune_method,
        ) 
        if self.pulearning is not None:
            self.noise_mask[s != self.pulearning] = False
        return self.noise_mask, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx
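
This variant of fit returns the diagnostics rather than the fitted classifier. A hedged call sketch (the wrapper instance lnl and its class name are assumptions; the source does not name the enclosing class):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Assumed: lnl is an instance of the class defining this fit(), built
# around any sklearn-style classifier, e.g.
# lnl = SomeNoisyLabelWrapper(clf=LogisticRegression())  # hypothetical name
X, s = make_classification(n_samples=500, n_classes=2, random_state=0)
noise_mask, noise_matrix, inv_nm, confident_joint, psx = lnl.fit(X, s)

# Drop the examples flagged as likely label errors before a final fit.
X_clean, s_clean = X[~noise_mask], s[~noise_mask]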
Example #3
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        """This method implements the confident learning. It counts examples
        that are likely labeled correctly and incorrectly and uses their ratio
        to create a predicted confusion matrix.
        This function fits the classifier (self.clf) to (X, s) accounting for
        the noise in both the positive and negative sets.

        Parameters
        ----------
        X : :obj:`np.array`
          Input feature matrix (N, D), 2D numpy array

        s : :obj:`np.array`
          A binary vector of labels, s, which may contain mislabeling.

        psx : :obj:`np.array` (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx
          should have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : :obj:`iterable` (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). List of probabilities used to determine the cutoff
          predicted probability necessary to consider an example as a given
          class label.
          Default is ``None``. These are computed for you automatically.
          If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. Values in list should be between 0 and 1.

        noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1.

        inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s). Contains
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Returns
        -------
        clf : :obj:`object`
          The wrapped classifier ``self.clf``, fitted on the pruned (X, s)."""

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and (np.trace(inverse_noise_matrix)
                                                 <= 1):
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = (
                    compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps,
                    self.inverse_noise_matrix,
                )
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                        X=X,
                        s=s,
                        clf=self.clf,
                        cv_n_folds=self.cv_n_folds,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                        seed=self.seed,
                    )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                        s=s,
                        psx=psx,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                    )

        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # If pulearning is set (the integer specifying the class without
        # noise), hard-code zero noise for that class in the matrices.
        if self.K == 2 and self.pulearning is not None:  # pragma: no cover
            # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
            self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
            self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
            self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
            self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
            self.confident_joint[self.pulearning][1 - self.pulearning] = 0
            self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            n_jobs=self.n_jobs,
        )

        x_mask = ~self.noise_mask
        x_pruned = X[x_mask]
        s_pruned = s[x_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if hasattr(inspect, 'getfullargspec') and \
                'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
                or hasattr(inspect, 'getargspec') and \
                'sample_weight' in inspect.getargspec(self.clf.fit).args:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                sample_weight_k = 1.0 / self.noise_matrix[k][k]
                self.sample_weight[s_pruned == k] = sample_weight_k

            self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but best we can do if no sample_weight.
            self.clf.fit(x_pruned, s_pruned)

        return self.clf
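
Since this version trains self.clf on the pruned, re-weighted data and returns it, the wrapper behaves like an sklearn estimator after fitting. A sketch under the assumption that the enclosing class is cleanlab 0.x's LearningWithNoisyLabels:

from cleanlab.classification import LearningWithNoisyLabels
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, s = make_classification(n_samples=500, random_state=0)  # s: noisy labels
lnl = LearningWithNoisyLabels(clf=LogisticRegression())
lnl.fit(X, s)            # prunes likely label errors, re-weights, fits clf
preds = lnl.predict(X)   # delegates to the fitted self.clf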
Example #4
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        '''This method implements confident learning. It counts examples that
        are likely labeled correctly and incorrectly and uses their ratio to
        create a predicted confusion matrix.
        This function fits the classifier (self.clf) to (X, s), accounting for
        the noise in both the positive and negative sets.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A binary vector of labels, s, which may contain mislabeling. 

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx should
          have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
          P(s^=k|s=k). If an example has a predicted probability "greater" than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. This value should be between 0 and 1. Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1. 
    
        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s) representing
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the 
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Output
        ------
          Returns the fitted classifier (self.clf).'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(
                inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(
                    t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use prune_count_method == 'calibrate_confident_joint'"
                w += "\nwhen calling .fit(noise_matrix=something), because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s', which will find label errors"
                w += "\nusing the noise matrix you provide."
                warnings.warn(w)
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                    self.ps, self.noise_matrix)
        if inverse_noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use prune_count_method == 'calibrate_confident_joint'"
                w += "\nwhen calling .fit(inverse_noise_matrix=something), because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your inverse noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s', which will find label errors"
                w += "\nusing the inverse noise matrix you provide."
                warnings.warn(w)
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps, self.inverse_noise_matrix)
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx = estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint = estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None:  # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix, class_without_noise=self.pulearning)
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            prune_count_method=self.prune_count_method,
            converge_latent_estimates=self.converge_latent_estimates,
        )

        X_mask = ~self.noise_mask
        X_pruned = X[X_mask]
        s_pruned = s[X_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if hasattr(inspect, 'getfullargspec') and \
                'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
                or hasattr(inspect, 'getargspec') and \
                'sample_weight' in inspect.getargspec(self.clf.fit).args:
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                self.sample_weight[s_pruned == k] = 1.0 / self.noise_matrix[k][k]

            self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # This is less accurate, but it's all we can do if sample_weight isn't available.
            self.clf.fit(X_pruned, s_pruned)

        return self.clf
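
The auto-formatted sample_weight introspection above is hard to read; the same Python 2/3 check can be factored into a small helper, sketched here (the helper name is hypothetical):

import inspect

def fit_accepts_sample_weight(clf):
    """True if clf.fit() accepts a sample_weight argument (Python 2 or 3)."""
    argspec = getattr(inspect, 'getfullargspec', inspect.getargspec)  # Py3 / Py2
    return 'sample_weight' in argspec(clf.fit).args

With it, the branch above reduces to: if fit_accepts_sample_weight(self.clf): ...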
Example #5
def get_noise_indices(
    s,
    psx,
    inverse_noise_matrix=None,
    confident_joint=None,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    prune_method='prune_by_noise_rate',
    prune_count_method='inverse_nm_dot_s',
    converge_latent_estimates=False,
    return_sorted_index=False,
    multi_label=False,
):
    '''Returns the indices of most likely (confident) label errors in s. The
    number of indices returned is specified by frac_noise. When
    frac_noise = 1.0, all "confidently" estimated noise indices are returned.

    Parameters
    ----------

    s : np.array
      A binary vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.
    
    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x). psx should
      have been computed using 3 (or higher) fold cross-validation.
      
    inverse_noise_matrix : np.array of shape (K, K), K = number of classes 
      A conditional probability matrix of the form P(y=k_y|s=k_s) representing
      the estimated fraction of observed examples in each class k_s that are
      mislabeled examples from every other class k_y. If None, the 
      inverse_noise_matrix will be computed from psx and s.
      Assumes columns of inverse_noise_matrix sum to 1.
        
    confident_joint : np.array (shape (K, K), type int) (default: None)
      A K,K integer matrix of count(s=k, y=k). Estimates a confident subset of
      the joint distribution of the noisy and true labels, P_{s,y}.
      Each entry in the matrix contains the number of examples confidently
      counted into every pair of (s=j, y=k) classes.
  
    frac_noise : float
      When frac_noise = 1.0, return all "confidently" estimated noise indices.
      Value in range (0, 1] that determines the fraction of noisy example
      indices to return based on the following formula for example class k:
      frac_noise * number_of_mislabeled_examples_in_class_k, or equivalently,
      frac_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k.
      
    num_to_remove_per_class : list of int of length K (# of classes)
      e.g. if K = 3, num_to_remove_per_class = [5, 0, 1] would return
      the indices of the 5 most likely mislabeled examples in class s = 0,
      and the most likely mislabeled example in class s = 2.
      ***Only set this parameter if prune_method == 'prune_by_class'

    prune_method : str (default: 'prune_by_noise_rate')
      'prune_by_class', 'prune_by_noise_rate', or 'both'. Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with *high probability* of 
      being mislabeled for every non-diagonal in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest probability* of
      belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and removes only the examples flagged by both.

    prune_count_method : str (default 'inverse_nm_dot_s')
      Options are 'inverse_nm_dot_s' or 'calibrate_confident_joint'. 
        !DO NOT USE! 'calibrate_confident_joint' if you already know the noise matrix
      and will call .fit(noise_matrix = known_noise_matrix) or
      .fit(inverse_noise_matrix = known_inverse_noise_matrix) because
      'calibrate_confident_joint' will estimate the noise without using this information.
        !IN ALL OTHER CASES! We recommend always using 'calibrate_confident_joint'
      because it is faster and more robust when no noise matrix info is given.
        Determines the method used to estimate the counts of the joint P(s, y) that will 
      be used to determine how many examples to prune
      for every class that are flipped to every other class, as follows:
        if prune_count_method == 'inverse_nm_dot_s':
          prune_count_matrix = inverse_noise_matrix * s_counts # Matrix of counts(y=k and s=l)
        elif prune_count_method == 'calibrate_confident_joint':  # calibrate
          prune_count_matrix = confident_joint.T / float(confident_joint.sum()) * len(s)

    converge_latent_estimates : bool (Default: False)
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed-form
      equivalences. This will iteratively enforce mathematical consistency.
      
    return_sorted_index : bool
      If true, returns an array of the label error indices (instead of a bool mask)
      where error indices are ordered by the normalized margin (p(s = k) - max(p(s != k)))'''

    # Number of examples in each class of s
    if multi_label:
        s_counts = value_counts([i for lst in s for i in lst])  # flatten the label lists
    else:
        s_counts = value_counts(s)
    # 'ps' is p(s=k)
    ps = s_counts / float(sum(s_counts))
    # Number of classes s
    K = len(psx.T)

    # Ensure labels are of type np.array()
    s = np.asarray(s)

    # Estimate the number of examples to confidently prune for each (s=j, y=k) pair.
    if (inverse_noise_matrix is None
            and prune_count_method == 'inverse_nm_dot_s') or (
                confident_joint is None
                and prune_count_method == 'calibrate_confident_joint'):
        from cleanlab.latent_estimation import estimate_py_and_noise_matrices_from_probabilities
        _, _, inverse_noise_matrix, confident_joint = estimate_py_and_noise_matrices_from_probabilities(
            s,
            psx,
            converge_latent_estimates=converge_latent_estimates,
        )
    if prune_count_method == 'inverse_nm_dot_s':
        prune_count_matrix = inverse_noise_matrix * s_counts  # Matrix of counts(y=k and s=l)
    elif prune_count_method == 'calibrate_confident_joint':
        prune_count_matrix = confident_joint.T / float(
            confident_joint.sum()) * len(s)  # calibrate
    else:
        raise ValueError(
            "prune_count_method should be 'inverse_nm_dot_s' or " +
            "'calibrate_confident_joint', but '" + prune_count_method +
            "' was given.")

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=prune_count_matrix,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = np.round(tmp).astype(int)

    # Initialize the boolean mask of noise indices.
    noise_mask = np.zeros(len(psx), dtype=bool)

    # Perform pruning with threshold probabilities from BFPRT algorithm in O(n)

    if prune_method == 'prune_by_class' or prune_method == 'both':
        for k in range(K):
            # Don't prune classes with fewer than MIN_NUM_PER_CLASS examples.
            if s_counts[k] > MIN_NUM_PER_CLASS:
                num2prune = s_counts[k] - prune_count_matrix[k][k]
                # Threshold is the num2prune'th smallest probability of class k
                # among examples with noisy label k.
                s_filter = np.array(
                    [k in lst for lst in s]) if multi_label else s == k
                threshold = np.partition(psx[:, k][s_filter], num2prune)[num2prune]
                noise_mask = noise_mask | (s_filter & (psx[:, k] < threshold))

    if prune_method == 'both':
        noise_mask_by_class = noise_mask

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        noise_mask = np.zeros(len(psx), dtype=bool)
        for k in range(K):  # true hidden label index
            # Don't prune classes with fewer than MIN_NUM_PER_CLASS examples.
            if s_counts[k] > MIN_NUM_PER_CLASS:
                for j in range(K):  # noisy label index
                    if k != j:  # Only prune the off-diagonal noise rates
                        num2prune = prune_count_matrix[k][j]
                        if num2prune > 0:
                            # Threshold is the num2prune'th largest margin
                            # p(class k) - p(class j) among examples with
                            # noisy label j.
                            margin = psx[:, k] - psx[:, j]
                            s_filter = np.array(
                                [j in lst for lst in s]) if multi_label else s == j
                            threshold = -np.partition(
                                -margin[s_filter], num2prune - 1)[num2prune - 1]
                            noise_mask = noise_mask | (s_filter & (margin >= threshold))

    noise_mask = noise_mask & noise_mask_by_class if prune_method == 'both' else noise_mask

    if return_sorted_index:
        return order_label_errors(noise_mask, psx, s)

    return noise_mask
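
A minimal call sketch (s and psx are assumed to have been prepared as the docstring describes; in cleanlab 0.x this function is importable from cleanlab.pruning):

from cleanlab.pruning import get_noise_indices

# s: noisy labels of shape (N,); psx: out-of-sample predicted
# probabilities of shape (N, K) from cross-validation.
ordered_errors = get_noise_indices(
    s=s,
    psx=psx,
    prune_method='prune_by_noise_rate',
    return_sorted_index=True,  # indices sorted by normalized margin
)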