def test_fit_psx():
    from cleanlab.latent_estimation import estimate_cv_predicted_probabilities
    lnl = LearningWithNoisyLabels()
    psx = estimate_cv_predicted_probabilities(
        X=data['X_train'],
        labels=data['y_train'],
    )
    lnl.fit(X=data['X_train'], s=data['y_train'], psx=psx)
    score_with_psx = lnl.score(data['X_test'], data['y_test'])
    lnl = LearningWithNoisyLabels()
    lnl.fit(
        X=data['X_train'],
        s=data['y_train'],
    )
    score_no_psx = lnl.score(data['X_test'], data['y_test'])
    assert abs(score_with_psx - score_no_psx) < 1e-6
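# A minimal end-to-end sketch of the pattern exercised by test_fit_psx above:
# precompute out-of-sample predicted probabilities with cross-validation and
# pass them to fit() via psx, rather than letting fit() compute them
# internally. The synthetic dataset and LogisticRegression classifier are
# illustrative assumptions, not part of the test suite; the names assume
# cleanlab < 2.0, where LearningWithNoisyLabels and
# estimate_cv_predicted_probabilities are the public API.
def example_fit_with_precomputed_psx():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from cleanlab.classification import LearningWithNoisyLabels
    from cleanlab.latent_estimation import estimate_cv_predicted_probabilities

    X, y = make_classification(n_samples=400, n_features=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Out-of-sample P(s=k|x) from cross-validation.
    psx = estimate_cv_predicted_probabilities(
        X=X_train, labels=y_train, clf=LogisticRegression(), seed=0)

    lnl = LearningWithNoisyLabels(clf=LogisticRegression(), seed=0)
    lnl.fit(X=X_train, s=y_train, psx=psx)  # skips internal cross-validation
    print('held-out accuracy:', lnl.score(X_test, y_test))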
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    """This method implements confident learning. It counts examples
    that are likely labeled correctly and incorrectly and uses their
    ratio to create a predicted confusion matrix.
    This function fits the classifier (self.clf) to (X, s) accounting
    for the noise in both the positive and negative sets.

    Parameters
    ----------
    X : :obj:`np.array`
        Input feature matrix (N, D), 2D numpy array

    s : :obj:`np.array`
        A binary vector of labels, s, which may contain mislabeling.

    psx : :obj:`np.array` (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K
        classes, for each example, regarding whether the example has
        label s==k P(s=k|x). psx should have been computed using 3 (or
        higher) fold cross-validation. If you are not sure, leave
        psx = None (default) and it will be computed for you using
        cross-validation.

    thresholds : :obj:`iterable` (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). List of probabilities used to determine the cutoff
        predicted probability necessary to consider an example as a given
        class label. Default is ``None``. These are computed for you
        automatically. If an example has a predicted probability greater
        than this threshold, it is counted as having hidden label y = k.
        This is not used for pruning, only for estimating the noise rates
        using confident counts. Values in the list should be between
        0 and 1.

    noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s).
        Contains the estimated fraction of observed examples in each
        class k_s that are mislabeled examples from every other class
        k_y. If None, the inverse_noise_matrix will be computed from psx
        and s. Assumes columns of inverse_noise_matrix sum to 1.

    Returns
    -------
    clf : :obj:`object`
        self.clf, fitted on the pruned (cleaned) data."""

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and (
            np.trace(inverse_noise_matrix) <= 1):
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    self.confident_joint = None
    # If needed, compute noise rates (mislabeling) for all classes.
    # Also, if needed, compute P(s=k|x), denoted psx.

    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = (
                compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
    if inverse_noise_matrix is not None:
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps,
                self.inverse_noise_matrix,
            )
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=(
                        self.converge_latent_estimates),
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=(
                        self.converge_latent_estimates),
                )

    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # If pulearning == the integer specifying the class without noise.
    if self.K == 2 and self.pulearning is not None:  # pragma: no cover
        # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
        self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
        self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
        # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
        self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
        self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
        # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
        self.confident_joint[self.pulearning][1 - self.pulearning] = 0
        self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1

    # This is the actual work of this function.

    # Get the indices of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        n_jobs=self.n_jobs,
    )

    x_mask = ~self.noise_mask
    x_pruned = X[x_mask]
    s_pruned = s[x_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    if hasattr(inspect, 'getfullargspec') and \
            'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
            or hasattr(inspect, 'getargspec') and \
            'sample_weight' in inspect.getargspec(self.clf.fit).args:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            sample_weight_k = 1.0 / self.noise_matrix[k][k]
            self.sample_weight[s_pruned == k] = sample_weight_k
        self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but the best we can do if the classifier
        # does not support sample_weight.
        self.clf.fit(x_pruned, s_pruned)

    return self.clf
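# A standalone sketch of the re-weighting rule used in the final fitting step
# above. Every value below is fabricated for illustration. Each kept example
# with label k is up-weighted by 1 / P(s=k|y=k), the diagonal of the noise
# matrix, so the total weight per class roughly restores that class's
# pre-pruning size in the loss function.
import numpy as np

# noise_matrix[i][j] = P(s=i|y=j); columns sum to 1.
noise_matrix = np.array([[0.9, 0.2],
                         [0.1, 0.8]])
s_pruned = np.array([0] * 9 + [1] * 8)  # labels that survived pruning

sample_weight = np.ones(np.shape(s_pruned))
for k in range(2):
    sample_weight[s_pruned == k] = 1.0 / noise_matrix[k][k]

# Effective (re-weighted) class sizes seen by the final clf.fit() call:
for k in range(2):
    print(k, sample_weight[s_pruned == k].sum())  # 0: 10.0, 1: 10.0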
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''This method implements confident learning. It counts examples that
    are likely labeled correctly and incorrectly and uses their ratio to
    create a predicted confusion matrix.
    This function fits the classifier (self.clf) to (X, s) accounting for
    the noise in both the positive and negative sets.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A binary vector of labels, s, which may contain mislabeling.

    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K
        classes, for each example, regarding whether the example has label
        s==k P(s=k|x). psx should have been computed using 3 (or higher)
        fold cross-validation. If you are not sure, leave psx = None
        (default) and it will be computed for you using cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability greater
        than this threshold, it is counted as having hidden label y = k.
        This is not used for pruning, only for estimating the noise rates
        using confident counts. Values should be between 0 and 1. Default
        is None.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s)
        representing the estimated fraction of observed examples in each
        class k_s that are mislabeled examples from every other class k_y.
        If None, the inverse_noise_matrix will be computed from psx and s.
        Assumes columns of inverse_noise_matrix sum to 1.

    Returns
    -------
    tuple
        (noise_mask, noise_matrix, inverse_noise_matrix, confident_joint,
        psx)'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and (
            np.trace(inverse_noise_matrix) <= 1):
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    self.confident_joint = None
    # If needed, compute noise rates (fraction of mislabeling) for all
    # classes. Also, if needed, compute P(s=k|x), denoted psx.

    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                self.ps, self.noise_matrix)
    if inverse_noise_matrix is not None:
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps, self.inverse_noise_matrix)
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if self.pulearning is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix,
            class_without_noise=self.pulearning,
        )
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(
        #     self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function.

    # Get the indices of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
    )

    if self.pulearning is not None:
        self.noise_mask[s != self.pulearning] = False

    return (self.noise_mask, self.noise_matrix, self.inverse_noise_matrix,
            self.confident_joint, psx)
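# Mechanics of the PU-learning masking step at the end of the variant above,
# shown standalone with fabricated values: after the assignment
# noise_mask[s != pulearning] = False, only examples whose observed label
# equals the pulearning class can remain flagged as label errors; every
# other example is forced to count as clean.
import numpy as np

pulearning = 1
s = np.array([0, 0, 1, 1, 1, 0, 1])                   # observed labels
noise_mask = np.array([True, False, True, False, True, True, False])

noise_mask[s != pulearning] = False
print(noise_mask)  # [False False  True False  True False False]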
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''This method implements confident learning. It counts examples that
    are likely labeled correctly and incorrectly and uses their ratio to
    create a predicted confusion matrix.
    This function fits the classifier (self.clf) to (X, s) accounting for
    the noise in both the positive and negative sets.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A binary vector of labels, s, which may contain mislabeling.

    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K
        classes, for each example, regarding whether the example has label
        s==k P(s=k|x). psx should have been computed using 3 (or higher)
        fold cross-validation. If you are not sure, leave psx = None
        (default) and it will be computed for you using cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability greater
        than this threshold, it is counted as having hidden label y = k.
        This is not used for pruning, only for estimating the noise rates
        using confident counts. Values should be between 0 and 1. Default
        is None.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s)
        representing the estimated fraction of observed examples in each
        class k_s that are mislabeled examples from every other class k_y.
        If None, the inverse_noise_matrix will be computed from psx and s.
        Assumes columns of inverse_noise_matrix sum to 1.

    Returns
    -------
    clf : object
        self.clf, fitted on the pruned (cleaned) data.'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and (
            np.trace(inverse_noise_matrix) <= 1):
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    self.confident_joint = None
    # If needed, compute noise rates (fraction of mislabeling) for all
    # classes. Also, if needed, compute P(s=k|x), denoted psx.

    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            w = "\nYou should not use prune_count_method == 'calibrate_confident_joint'"
            w += "\nwhen .fit(noise_matrix=something) because"
            w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
            w += "\nnot use your 'something' noise matrix information. Instead, use"
            w += "\nprune_count_method == 'inverse_nm_dot_s', which will find label errors"
            w += "\nby using the noise matrix you provide."
            warnings.warn(w)
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                self.ps, self.noise_matrix)
    if inverse_noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            w = "\nYou should not use prune_count_method == 'calibrate_confident_joint'"
            w += "\nwhen .fit(inverse_noise_matrix=something) because"
            w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
            w += "\nnot use your 'something' inverse noise matrix information. Instead, use"
            w += "\nprune_count_method == 'inverse_nm_dot_s', which will find label errors"
            w += "\nby using the inverse noise matrix you provide."
            warnings.warn(w)
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps, self.inverse_noise_matrix)
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if self.pulearning is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix, class_without_noise=self.pulearning)
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(
        #     self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function.

    # Get the indices of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        prune_count_method=self.prune_count_method,
        converge_latent_estimates=self.converge_latent_estimates,
    )

    X_mask = ~self.noise_mask
    X_pruned = X[X_mask]
    s_pruned = s[X_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    if hasattr(inspect, 'getfullargspec') and \
            'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
            or hasattr(inspect, 'getargspec') and \
            'sample_weight' in inspect.getargspec(self.clf.fit).args:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            self.sample_weight[s_pruned == k] = 1.0 / self.noise_matrix[k][k]
        self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but it's all we can do if sample_weight
        # isn't supported by the classifier.
        self.clf.fit(X_pruned, s_pruned)

    return self.clf
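# Standalone sketch of the introspection check used above to decide whether a
# classifier's fit() accepts sample_weight. getfullargspec is the Python 3
# spelling; getargspec was its Python 2 counterpart, hence the hasattr()
# guards. The two classifiers below are just convenient examples of each case.
import inspect
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

def fit_accepts_sample_weight(clf):
    if hasattr(inspect, 'getfullargspec'):
        return 'sample_weight' in inspect.getfullargspec(clf.fit).args
    return 'sample_weight' in inspect.getargspec(clf.fit).args

print(fit_accepts_sample_weight(LogisticRegression()))    # True
print(fit_accepts_sample_weight(KNeighborsClassifier()))  # False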