def estimation(X, Y, A, ngroups=2):
    """Estimate label error rates separately for each group.

    For every group id ``z`` in ``range(ngroups)``, the rows of ``X`` and
    ``Y`` selected by the mask ``A == z`` are handed to
    ``estimate_py_noise_matrices_and_cv_pred_proba``. Only the estimated
    noise matrix is used from the result.

    Parameters
    ----------
    X : mask-indexable
        Features; indexed with the boolean mask ``A == z``.
    Y : mask-indexable
        Observed labels aligned with ``X``; indexed with the same mask.
    A : array-like
        Group membership, compared elementwise against each group id.
    ngroups : int
        Number of groups; ids are ``0 .. ngroups - 1``.

    Returns
    -------
    list
        One ``[1 - nm[0][0], 1 - nm[1][1]]`` pair per group, where ``nm`` is
        the estimated noise matrix of that group (only the first two
        diagonal entries are read).
    """
    est_error_rates = []
    for z in range(ngroups):
        print(f"[DEBUG][EST] Estimating Group {z}")
        in_group = A == z
        group_features = X[in_group]
        group_labels = Y[in_group]
        # The helper returns five values; only the noise matrix is needed.
        _, est_nm, _, _, _ = estimate_py_noise_matrices_and_cv_pred_proba(
            X=group_features,
            s=group_labels,
        )
        print(f"[DEBUG] Estimated Noise Matrix {est_nm}.")
        flip_rate_0 = 1 - est_nm[0][0]
        flip_rate_1 = 1 - est_nm[1][1]
        est_error_rates.append([flip_rate_0, flip_rate_1])
    return est_error_rates
def make_data(
    sparse=False,
    means=None,
    covs=None,
    sizes=None,
    avg_trace=0.8,
    seed=1,  # set to None for non-reproducible randomness
):
    """Build a synthetic Gaussian classification dataset with noisy labels.

    One multivariate normal cluster is sampled per class for both a train
    and a test split. Train labels are then corrupted with a generated noise
    matrix, and the latent quantities are estimated from the noisy data.

    Parameters
    ----------
    sparse : bool
        If True, ``X_train`` and ``X_test`` are converted to
        ``scipy.sparse.csr_matrix``.
    means : list of list of float, optional
        Per-class mean vectors. Defaults to ``[[3, 2], [7, 7], [0, 8]]``.
    covs : list of 2D list of float, optional
        Per-class covariance matrices. Defaults to
        ``[[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]]``.
    sizes : list of int, optional
        Number of samples per class (used for both train and test).
        Defaults to ``[80, 40, 40]``.
    avg_trace : float
        Average diagonal value of the generated noise matrix; the trace
        passed to the generator is ``avg_trace * m``.
    seed : int or None
        Passed to ``np.random.seed`` (global NumPy RNG state is reset) and
        to ``generate_noise_matrix_from_trace``.

    Returns
    -------
    dict
        Keys: ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``s`` (noisy
        train labels), ``ps``, ``py``, ``noise_matrix``,
        ``inverse_noise_matrix``, ``est_py``, ``est_nm``, ``est_inv``,
        ``cj``, ``psx`` (the five outputs of the latent estimation, in
        order), ``m`` (number of classes) and ``n`` (total train size).
    """
    # Defaults are materialised per call so that no list object is shared
    # between invocations (mutable default arguments are evaluated once).
    if means is None:
        means = [[3, 2], [7, 7], [0, 8]]
    if covs is None:
        covs = [[[5, -1.5], [-1.5, 1]], [[1, 0.5], [0.5, 4]], [[5, 1], [1, 5]]]
    if sizes is None:
        sizes = [80, 40, 40]

    np.random.seed(seed=seed)

    m = len(means)  # number of classes
    n = sum(sizes)
    data = []
    labels = []
    test_data = []
    test_labels = []

    for idx in range(m):
        # Sampling order (train, then test, per class) is kept so that a
        # fixed seed reproduces the same draws as before.
        data.append(
            np.random.multivariate_normal(
                mean=means[idx], cov=covs[idx], size=sizes[idx]))
        test_data.append(
            np.random.multivariate_normal(
                mean=means[idx], cov=covs[idx], size=sizes[idx]))
        # Explicit integer dtype: an empty class must not degrade the
        # stacked label vector to float, which np.bincount rejects.
        labels.append(np.full(sizes[idx], idx, dtype=int))
        test_labels.append(np.full(sizes[idx], idx, dtype=int))

    X_train = np.vstack(data)
    y_train = np.hstack(labels)
    X_test = np.vstack(test_data)
    y_test = np.hstack(test_labels)

    if sparse:
        X_train = scipy.sparse.csr_matrix(X_train)
        X_test = scipy.sparse.csr_matrix(X_test)

    # Compute p(y=k)
    py = np.bincount(y_train) / float(len(y_train))

    noise_matrix = generate_noise_matrix_from_trace(
        m,
        trace=avg_trace * m,
        py=py,
        valid_noise_matrix=True,
        seed=seed,
    )

    # Generate our noisy labels using the noise_matrix.
    s = generate_noisy_labels(y_train, noise_matrix)
    ps = np.bincount(s) / float(len(s))

    # Compute inverse noise matrix
    inv = compute_inv_noise_matrix(py, noise_matrix, ps)

    # Estimate psx (and the other latent quantities) from the noisy data.
    latent = latent_estimation.estimate_py_noise_matrices_and_cv_pred_proba(
        X=X_train,
        s=s,
        cv_n_folds=3,
    )

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "s": s,
        "ps": ps,
        "py": py,
        "noise_matrix": noise_matrix,
        "inverse_noise_matrix": inv,
        "est_py": latent[0],
        "est_nm": latent[1],
        "est_inv": latent[2],
        "cj": latent[3],
        "psx": latent[4],
        "m": m,
        "n": n,
    }
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''Estimate label noise with confident learning and flag label errors.

    Noise matrices and out-of-sample predicted probabilities are taken from
    the arguments when given and estimated otherwise; they are then used to
    compute a mask over the examples to prune. This method does not perform
    a final fit of ``self.clf``; the classifier is only passed to the
    cross-validation helpers.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      Observed labels, which may contain mislabeling. The number of classes
      K is taken as the number of unique values in s.

    psx : np.array (shape (N, K))
      P(s=k|x): for each of the N examples, the (noisy) predicted
      probability of each of the K classes. It should have been computed
      out-of-sample using 3 (or higher) fold cross-validation. Leave as
      None (default) to have it computed via cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
      P(s^=k|s=k). An example whose predicted probability exceeds this
      threshold is counted as having hidden label y = k. Only used for
      estimating the noise rates, not for pruning. Values should be between
      0 and 1. Default is None.

    noise_matrix : np.array of shape (K, K), K = number of classes
      Conditional probability matrix P(s=k_s|y=k_y): the fraction of
      examples in every class labeled as every other class. Columns are
      assumed to sum to 1. Its trace must exceed 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      Conditional probability matrix P(y=k_y|s=k_s): the estimated fraction
      of observed examples in each class k_s that are mislabeled examples
      from every other class k_y. Columns are assumed to sum to 1. Its trace
      must exceed 1. If both matrices are None they are estimated.

    Output
    ------
      Returns the tuple (noise_mask, noise_matrix, inverse_noise_matrix,
      confident_joint, psx). confident_joint is None when a noise matrix
      was supplied by the caller.

    Raises
    ------
      ValueError if a supplied matrix has trace <= 1.'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)

    nm_given = noise_matrix is not None
    inv_given = inverse_noise_matrix is not None

    if nm_given:
        nm_trace = np.trace(noise_matrix)
        if nm_trace <= 1:
            t = np.round(nm_trace, 2)
            raise ValueError("Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inv_given:
        inv_trace = np.trace(inverse_noise_matrix)
        if inv_trace <= 1:
            t = np.round(inv_trace, 2)
            raise ValueError("Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    self.confident_joint = None

    # Adopt whatever the caller supplied, then derive or estimate the rest.
    if nm_given:
        self.noise_matrix = noise_matrix
    if inv_given:
        self.inverse_noise_matrix = inverse_noise_matrix

    if nm_given and not inv_given:
        self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
            self.ps,
            self.noise_matrix,
        )
    elif inv_given and not nm_given:
        self.noise_matrix = compute_noise_matrix_from_inverse(
            self.ps,
            self.inverse_noise_matrix,
        )
    elif not (nm_given or inv_given):
        if psx is None:
            # Nothing supplied: estimate the matrices and psx in one pass.
            estimates = estimate_py_noise_matrices_and_cv_pred_proba(
                X=X,
                s=s,
                clf=self.clf,
                cv_n_folds=self.cv_n_folds,
                thresholds=thresholds,
                converge_latent_estimates=self.converge_latent_estimates,
                seed=self.seed,
            )
            (self.py, self.noise_matrix, self.inverse_noise_matrix,
             self.confident_joint, psx) = estimates
        else:
            # psx is provided by user (assumed holdout probabilities)
            estimates = estimate_py_and_noise_matrices_from_probabilities(
                s=s,
                psx=psx,
                thresholds=thresholds,
                converge_latent_estimates=self.converge_latent_estimates,
            )
            (self.py, self.noise_matrix, self.inverse_noise_matrix,
             self.confident_joint) = estimates

    # A matrix was supplied but psx was not: compute psx on its own.
    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    pu_class = self.pulearning

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if pu_class is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix,
            class_without_noise=pu_class,
        )
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function:
    # get the mask of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
    )

    # Under PU learning only examples labeled as the pulearning class may
    # stay flagged.
    if pu_class is not None:
        self.noise_mask[s != pu_class] = False

    return (
        self.noise_mask,
        self.noise_matrix,
        self.inverse_noise_matrix,
        self.confident_joint,
        psx,
    )
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    """Prune likely label errors from (X, s), then fit ``self.clf``.

    This method implements confident learning. Noise matrices and
    out-of-sample predicted probabilities are taken from the arguments when
    given and estimated otherwise. Examples flagged as label errors are
    removed, and the classifier (``self.clf``) is fit on the remaining data,
    re-weighted per class when ``self.clf.fit`` accepts ``sample_weight``.

    Parameters
    ----------
    X : :obj:`np.array`
      Input feature matrix (N, D), 2D numpy array

    s : :obj:`np.array`
      Observed labels, which may contain mislabeling. The number of classes
      K is taken as the number of unique values in s.

    psx : :obj:`np.array` (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x. This is the probability distribution over all K classes,
      for each example, regarding whether the example has label s==k
      P(s=k|x). psx should have been computed using 3 (or higher) fold
      cross-validation. If you are not sure, leave psx = None (default) and
      it will be computed for you using cross-validation.

    thresholds : :obj:`iterable` (list or np.array) of shape (K, 1) or (K,)
      P(s^=k|s=k). List of probabilities used to determine the cutoff
      predicted probability necessary to consider an example as a given
      class label. Default is ``None``, in which case they are computed
      automatically. If an example has a predicted probability "greater"
      than this threshold, it is counted as having hidden label y = k. This
      is not used for pruning, only for estimating the noise rates using
      confident counts. Values in list should be between 0 and 1.

    noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(s=k_s|y=k_y) containing
      the fraction of examples in every class, labeled as every other class.
      Assumes columns of noise_matrix sum to 1. Its trace must exceed 1.

    inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s). Contains
      the estimated fraction observed examples in each class k_s, that are
      mislabeled examples from every other class k_y. If None, the
      inverse_noise_matrix will be computed from psx and s. Assumes columns
      of inverse_noise_matrix sum to 1. Its trace must exceed 1.

    Returns
    -------
    object
      ``self.clf`` after it has been fit on the pruned data.

    Raises
    ------
    ValueError
      If a supplied noise_matrix or inverse_noise_matrix has trace <= 1."""

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and (np.trace(inverse_noise_matrix) <= 1):
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    # Stays None unless the noise is estimated from scratch below.
    self.confident_joint = None

    # If needed, compute noise rates (mislabeling) for all classes.
    # Also, if needed, compute P(s=k|x), denoted psx.

    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = (
                compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
    if inverse_noise_matrix is not None:
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps,
                self.inverse_noise_matrix,
            )
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=(
                        self.converge_latent_estimates),
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=(
                        self.converge_latent_estimates),
                )

    # A matrix was supplied but psx was not: compute psx on its own.
    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # if pulearning == the integer specifying the class without noise.
    if self.K == 2 and self.pulearning is not None:  # pragma: no cover
        pu = self.pulearning
        other = 1 - pu
        # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
        self.noise_matrix[pu][other] = 0
        self.noise_matrix[other][other] = 1
        # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
        self.inverse_noise_matrix[other][pu] = 0
        self.inverse_noise_matrix[pu][pu] = 1
        # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
        # The confident joint only exists when the noise was estimated here;
        # with a caller-supplied matrix it is None and cannot be indexed.
        if self.confident_joint is not None:
            self.confident_joint[pu][other] = 0
            self.confident_joint[other][other] = 1

    # This is the actual work of this function.

    # Get the indices of the examples we wish to prune
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        n_jobs=self.n_jobs,
    )

    x_mask = ~self.noise_mask
    x_pruned = X[x_mask]
    s_pruned = s[x_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    # Use exactly one introspection function: falling through to the
    # deprecated getargspec raises ValueError on annotated or keyword-only
    # signatures even though getfullargspec already gave the answer.
    fit_argspec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    if 'sample_weight' in fit_argspec(self.clf.fit).args:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            sample_weight_k = 1.0 / self.noise_matrix[k][k]
            self.sample_weight[s_pruned == k] = sample_weight_k

        self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but best we can do if no sample_weight.
        self.clf.fit(x_pruned, s_pruned)

    return self.clf
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''Prune likely label errors from (X, s), then fit ``self.clf``.

    This method implements confident learning. Noise matrices and
    out-of-sample predicted probabilities are taken from the arguments when
    given and estimated otherwise. Examples flagged as label errors are
    removed, and the classifier (self.clf) is fit on the remaining data,
    re-weighted per class when ``self.clf.fit`` accepts ``sample_weight``.

    A warning is issued when a noise matrix is supplied while
    ``self.prune_count_method == 'calibrate_confident_joint'``.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      Observed labels, which may contain mislabeling. The number of classes
      K is taken as the number of unique values in s.

    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x. This is the probability distribution over all K classes,
      for each example, regarding whether the example has label s==k
      P(s=k|x). psx should have been computed using 3 (or higher) fold
      cross-validation. If you are not sure, leave psx = None (default) and
      it will be computed for you using cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is
      None.

    noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(s=k_s|y=k_y) containing
      the fraction of examples in every class, labeled as every other class.
      Assumes columns of noise_matrix sum to 1. Its trace must exceed 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
      A conditional probability matrix of the form P(y=k_y|s=k_s)
      representing the estimated fraction observed examples in each class
      k_s, that are mislabeled examples from every other class k_y. If None,
      the inverse_noise_matrix will be computed from psx and s. Assumes
      columns of inverse_noise_matrix sum to 1. Its trace must exceed 1.

    Output
    ------
      Returns self.clf after it has been fit on the pruned data.

    Raises
    ------
      ValueError if a supplied matrix has trace <= 1.'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and np.trace(
            inverse_noise_matrix) <= 1:
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(
                t))

    # Number of classes
    self.K = len(np.unique(s))

    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))

    # Stays None unless the noise is estimated from scratch below.
    self.confident_joint = None

    # If needed, compute noise rates (fraction of mislabeling) for all classes.
    # Also, if needed, compute P(s=k|x), denoted psx.

    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            # Leading newline restored: the message used to begin "Y\nou".
            w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
            w += "\nwhen .fit(noise_matrix = something) because"
            w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
            w += "\nnot use your 'something' noise matrix information. Instead, use"
            w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
            w += "\nby using the noise matrix you provde."
            warnings.warn(w)
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                self.ps, self.noise_matrix)
    if inverse_noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            w = "\nYou should not use self.prune_count_method == 'calibrate_confident_joint'."
            w += "\nwhen .fit(inverse_noise_matrix = something) because"
            w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
            w += "\nnot use your 'something' inv noise matrix information. Instead, use"
            w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
            w += "\nby using the inverse noise matrix you provde."
            warnings.warn(w)
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps, self.inverse_noise_matrix)
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            (self.py, self.noise_matrix, self.inverse_noise_matrix,
             self.confident_joint, psx) = (
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                ))
        else:  # psx is provided by user (assumed holdout probabilities)
            (self.py, self.noise_matrix, self.inverse_noise_matrix,
             self.confident_joint) = (
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                ))

    # A matrix was supplied but psx was not: compute psx on its own.
    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if self.pulearning is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix, class_without_noise=self.pulearning)
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function.

    # Get the indices of the examples we wish to prune
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        prune_count_method=self.prune_count_method,
        converge_latent_estimates=self.converge_latent_estimates,
    )

    X_mask = ~self.noise_mask
    X_pruned = X[X_mask]
    s_pruned = s[X_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    # Use exactly one introspection function: falling through to the
    # deprecated getargspec raises ValueError on annotated or keyword-only
    # signatures even though getfullargspec already gave the answer.
    fit_argspec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    if 'sample_weight' in fit_argspec(self.clf.fit).args:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            self.sample_weight[s_pruned == k] = 1.0 / self.noise_matrix[k][k]

        self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but its all we can do if sample_weight isn't available.
        self.clf.fit(X_pruned, s_pruned)

    return self.clf
# load data if not os.path.isfile(train_dataset): make_training_dataset() else: X_train = pd.read_hdf(train_dataset, 'X_train') train_true_labels = pd.read_hdf(train_dataset, 'train_true_labels') train_labels_with_errors = pd.read_hdf(train_dataset, 'train_labels_with_errors') # build models and estimate latent variables if not os.path.isfile(result_latent_vars): # start training est_py, est_nm, est_inv, confident_joint, psx = estimate_py_noise_matrices_and_cv_pred_proba( X=X_train.values, s=train_labels_with_errors, # clf=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ) # save results if not os.path.exists('results'): os.makedirs('results') with open(result_latent_vars, 'wb') as output: pickle.dump(est_py, output, pickle.HIGHEST_PROTOCOL) pickle.dump(est_nm, output, pickle.HIGHEST_PROTOCOL) pickle.dump(est_inv, output, pickle.HIGHEST_PROTOCOL) pickle.dump(confident_joint, output, pickle.HIGHEST_PROTOCOL) pickle.dump(psx, output, pickle.HIGHEST_PROTOCOL) else: with open(result_latent_vars, 'rb') as inf: est_py = pickle.load(inf)