Example #1
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        """This method implements the confident learning. It counts examples
        that are likely labeled correctly and incorrectly and uses their ratio
        to create a predicted confusion matrix.
        This function fits the classifier (self.clf) to (X, s) accounting for
        the noise in both the positive and negative sets.

        Parameters
        ----------
        X : :obj:`np.array`
          Input feature matrix (N, D), 2D numpy array

        s : :obj:`np.array`
          A discrete vector of labels, s, which may contain mislabeling.

        psx : :obj:`np.array` (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
          examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx
          should have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : :obj:`iterable` (list or np.array) of shape (K, 1) or (K,)
          P(s^=k|s=k). List of probabilities used to determine the cutoff
          predicted probability necessary to consider an example as having
          a given class label. Default is ``None``, in which case they are
          computed for you automatically. If an example has a predicted
          probability greater than this threshold, it is counted as having
          hidden label y = k. This is not used for pruning, only for
          estimating the noise rates using confident counts. Values in the
          list should be between 0 and 1.

        noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1.

        inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s). Contains
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Returns
        -------
        clf
          The fitted classifier ``self.clf``, trained on the pruned data."""

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and (np.trace(inverse_noise_matrix)
                                                 <= 1):
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = (
                    compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps,
                    self.inverse_noise_matrix,
                )
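        # (By Bayes' rule, P(y=k_y|s=k_s) P(s=k_s) = P(s=k_s|y=k_y) P(y=k_y),
        # so given the class priors, either matrix determines the other;
        # the two compute_* helpers above rely on this relationship.)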
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                        X=X,
                        s=s,
                        clf=self.clf,
                        cv_n_folds=self.cv_n_folds,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                        seed=self.seed,
                    )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                        s=s,
                        psx=psx,
                        thresholds=thresholds,
                        converge_latent_estimates=(
                            self.converge_latent_estimates),
                    )

        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # If pulearning is set, it is the integer specifying the class without noise.
        if self.K == 2 and self.pulearning is not None:  # pragma: no cover
            # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
            self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
            self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
            self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
            self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
            # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
            self.confident_joint[self.pulearning][1 - self.pulearning] = 0
            self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            n_jobs=self.n_jobs,
        )
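        # noise_mask is a boolean array aligned with s: True marks an example
        # flagged as a likely label error, to be pruned before the final fit.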

        x_mask = ~self.noise_mask
        x_pruned = X[x_mask]
        s_pruned = s[x_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if ((hasattr(inspect, 'getfullargspec') and
             'sample_weight' in inspect.getfullargspec(self.clf.fit).args) or
                (hasattr(inspect, 'getargspec') and
                 'sample_weight' in inspect.getargspec(self.clf.fit).args)):
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                sample_weight_k = 1.0 / self.noise_matrix[k][k]
                self.sample_weight[s_pruned == k] = sample_weight_k
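                # Hypothetical numbers: if noise_matrix[k][k] == 0.8 (80% of
                # examples labeled k are truly class k), each retained
                # class-k example gets weight 1 / 0.8 = 1.25, restoring the
                # class's apparent size after pruning.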

            self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # Less accurate, but the best we can do when clf.fit() lacks sample_weight.
            self.clf.fit(x_pruned, s_pruned)

        return self.clf
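
A minimal sketch (not part of the class above) of how psx could be
precomputed out-of-sample before calling fit, as the docstring recommends.
It assumes X and s are already loaded; the estimator choice and the name
'model' (an instance of the class above) are illustrative.

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict

    # Out-of-sample predicted probabilities P(s=k|x) via 5-fold CV
    # (the docstring recommends 3 or more folds).
    psx = cross_val_predict(
        LogisticRegression(), X, s, cv=5, method='predict_proba',
    )
    model.fit(X, s, psx=psx)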
Example #2
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        '''This method implements confident learning: it counts examples that are likely
        labeled correctly and incorrectly and uses their ratio to create a predicted
        confusion matrix.
        Unlike the variants that refit the classifier, this one stops after estimating the
        noise: it computes the noise mask and related estimates for (X, s) but does not
        refit self.clf.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A discrete vector of labels, s, which may contain mislabeling.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx should
          have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
          P(s^=k|s=k). If an example has a predicted probability greater than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. Values in the list should be between 0 and 1.
          Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1. 

        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s) representing
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Output
        ------
          Returns (noise_mask, noise_matrix, inverse_noise_matrix, confident_joint, psx)'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError("Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if inverse_noise_matrix is not None and np.trace(inverse_noise_matrix) <= 1:
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError("Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes. 
        # Also, if needed, compute P(s=k|x), denoted psx.
        
        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(self.ps, self.noise_matrix)

        if inverse_noise_matrix is not None:
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(self.ps, self.inverse_noise_matrix)

        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
            else: # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

        if psx is None: 
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None: # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix,
                class_without_noise=self.pulearning,
            )
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
        ) 
        if self.pulearning is not None:
            self.noise_mask[s != self.pulearning] = False
        return self.noise_mask, self.noise_matrix, self.inverse_noise_matrix, self.confident_joint, psx
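
Hypothetical usage of this variant's five return values (the variable names
and 'model', an instance of the class above, are assumed):

    noise_mask, noise_matrix, inv_noise_matrix, confident_joint, psx = \
        model.fit(X, s)
    # Drop the examples flagged as likely label errors.
    X_clean, s_clean = X[~noise_mask], s[~noise_mask]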
Example #3
    def fit(
        self,
        X,
        s,
        psx=None,
        thresholds=None,
        noise_matrix=None,
        inverse_noise_matrix=None,
    ):
        '''This method implements confident learning: it counts examples that are likely
        labeled correctly and incorrectly and uses their ratio to create a predicted
        confusion matrix.
        This function fits the classifier (self.clf) to (X, s), accounting for the label
        noise in every class.

        Parameters
        ----------
        X : np.array
          Input feature matrix (N, D), 2D numpy array

        s : np.array
          A discrete vector of labels, s, which may contain mislabeling.

        psx : np.array (shape (N, K))
          P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
          This is the probability distribution over all K classes, for each
          example, regarding whether the example has label s==k P(s=k|x). psx should
          have been computed using 3 (or higher) fold cross-validation.
          If you are not sure, leave psx = None (default) and
          it will be computed for you using cross-validation.

        thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
          P(s^=k|s=k). If an example has a predicted probability greater than
          this threshold, it is counted as having hidden label y = k. This is
          not used for pruning, only for estimating the noise rates using
          confident counts. Values in the list should be between 0 and 1.
          Default is None.

        noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(s=k_s|y=k_y) containing
          the fraction of examples in every class, labeled as every other class.
          Assumes columns of noise_matrix sum to 1. 
    
        inverse_noise_matrix : np.array of shape (K, K), K = number of classes
          A conditional probability matrix of the form P(y=k_y|s=k_s) representing
          the estimated fraction of observed examples in each class k_s that are
          mislabeled examples from every other class k_y. If None, the 
          inverse_noise_matrix will be computed from psx and s.
          Assumes columns of inverse_noise_matrix sum to 1.

        Output
        ------
          Returns the fitted classifier (self.clf).'''

        # Check inputs
        assert_inputs_are_valid(X, s, psx)
        if noise_matrix is not None and np.trace(noise_matrix) <= 1:
            t = np.round(np.trace(noise_matrix), 2)
            raise ValueError(
                "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
        if (inverse_noise_matrix is not None
                and np.trace(inverse_noise_matrix) <= 1):
            t = np.round(np.trace(inverse_noise_matrix), 2)
            raise ValueError(
                "Trace(inverse_noise_matrix) is {}, but must exceed 1."
                .format(t))

        # Number of classes
        self.K = len(np.unique(s))

        # 'ps' is p(s=k)
        self.ps = value_counts(s) / float(len(s))

        self.confident_joint = None
        # If needed, compute noise rates (fraction of mislabeling) for all classes.
        # Also, if needed, compute P(s=k|x), denoted psx.

        # Set / re-set noise matrices / psx; estimate if not provided.
        if noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "Y\nou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the noise matrix you provde."
                warnings.warn(w)
            self.noise_matrix = noise_matrix
            if inverse_noise_matrix is None:
                self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                    self.ps, self.noise_matrix)
        if inverse_noise_matrix is not None:
            if self.prune_count_method == 'calibrate_confident_joint':
                w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
                w += "\nwhen .fit(inverse_noise_matrix = something) because"
                w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
                w += "\nnot use your 'something' inv noise matrix information. Instead, use"
                w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
                w += "\nby using the inverse noise matrix you provde."
                warnings.warn(w)
            self.inverse_noise_matrix = inverse_noise_matrix
            if noise_matrix is None:
                self.noise_matrix = compute_noise_matrix_from_inverse(
                    self.ps, self.inverse_noise_matrix)
        if noise_matrix is None and inverse_noise_matrix is None:
            if psx is None:
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint, psx = \
                    estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
            else:  # psx is provided by user (assumed holdout probabilities)
                self.py, self.noise_matrix, self.inverse_noise_matrix, \
                    self.confident_joint = \
                    estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

        if psx is None:
            psx = estimate_cv_predicted_probabilities(
                X=X,
                labels=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                seed=self.seed,
            )

        # Zero out noise matrix entries if pulearning = the integer specifying the class without noise.
        if self.pulearning is not None:  # pragma: no cover
            self.noise_matrix = remove_noise_from_class(
                self.noise_matrix, class_without_noise=self.pulearning)
            # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

        # This is the actual work of this function.

        # Get the indices of the examples we wish to prune
        self.noise_mask = get_noise_indices(
            s,
            psx,
            inverse_noise_matrix=self.inverse_noise_matrix,
            confident_joint=self.confident_joint,
            prune_method=self.prune_method,
            prune_count_method=self.prune_count_method,
            converge_latent_estimates=self.converge_latent_estimates,
        )

        X_mask = ~self.noise_mask
        X_pruned = X[X_mask]
        s_pruned = s[X_mask]

        # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
        if ((hasattr(inspect, 'getfullargspec') and
             'sample_weight' in inspect.getfullargspec(self.clf.fit).args) or
                (hasattr(inspect, 'getargspec') and
                 'sample_weight' in inspect.getargspec(self.clf.fit).args)):
            # Re-weight examples in the loss function for the final fitting
            # s.t. the "apparent" original number of examples in each class
            # is preserved, even though the pruned sets may differ.
            self.sample_weight = np.ones(np.shape(s_pruned))
            for k in range(self.K):
                self.sample_weight[s_pruned == k] = \
                    1.0 / self.noise_matrix[k][k]

            self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
        else:
            # Less accurate, but it's all we can do if clf.fit() does not accept sample_weight.
            self.clf.fit(X_pruned, s_pruned)

        return self.clf
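
For reference, a sketch of the per-class thresholds the docstrings denote
P(s^=k|s=k): the average self-confidence psx[:, k] over the examples with
given label k. This mirrors how confident learning defines them; the exact
library internals may differ (e.g., in calibration).

    import numpy as np

    # thresholds[k] = mean of P(s=k|x) over examples whose given label is k.
    thresholds = np.asarray(
        [psx[s == k, k].mean() for k in range(psx.shape[1])]
    )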