def test_pruning_both(n_jobs):
    remove = 5
    s = data['s']
    class_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
        prune_method='prune_by_class',
        n_jobs=n_jobs,
    )
    nr_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
        prune_method='prune_by_noise_rate',
        n_jobs=n_jobs,
    )
    both_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
        prune_method='both',
        n_jobs=n_jobs,
    )
    assert all(s[both_idx] == s[class_idx & nr_idx])
def test_prune_count_err():
    try:
        pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            prune_count_method='INVALID_METHOD',
        )
    except ValueError as e:
        assert 'should be' in str(e)
    with pytest.raises(ValueError) as e:
        pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            prune_count_method='INVALID_METHOD',
        )
def test_exact_prune_count():
    remove = 5
    s = data['s']
    noise_idx = pruning.get_noise_indices(
        s=s,
        psx=data['psx'],
        num_to_remove_per_class=remove,
    )
    assert all(value_counts(s[noise_idx]) == remove)
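# A hedged, self-contained sketch of the call pattern these tests exercise
# (cleanlab < 2.0, where get_noise_indices lives in cleanlab.pruning). The
# toy s/psx arrays below are invented for illustration; note that classes
# with fewer than 5 examples are never pruned (see test_prune_on_small_data
# below), hence 6 examples per class here.
import numpy as np
from cleanlab.pruning import get_noise_indices

s = np.array([0] * 6 + [1] * 6)                # given (noisy) labels
p1 = np.array([.10, .20, .10, .15, .05, .90,   # index 5: labeled 0, predicted 1
               .80, .90, .70, .85, .95, .10])  # index 11: labeled 1, predicted 0
psx = np.column_stack([1 - p1, p1])            # out-of-sample P(s=k|x)

mask = get_noise_indices(s=s, psx=psx)         # boolean mask of suspects
print(np.where(mask)[0])                       # expect something near [5, 11]

ordered = get_noise_indices(s=s, psx=psx,
                            sorted_index_method='normalized_margin')
print(ordered)                                 # same indices, worst first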
def find_noise(self, labeled_sample_list):
    # Keep only samples that actually have a human label, so that the
    # features, labels, and the index lookup at the end all stay aligned.
    labeled_sample_list = [i for i in labeled_sample_list if i.human_label]
    labeled_data_label = [i.human_label for i in labeled_sample_list]
    labeled_data_feature = [
        i.feature.toarray().tolist()[0] for i in labeled_sample_list
    ]
    # Find noise (indices that are likely label errors).
    s = np.array([self.label_id[i] for i in labeled_data_label])
    X = np.array(labeled_data_feature)
    psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
        X, s,
        clf=LogisticRegression(max_iter=1000, multi_class='auto',
                               solver='lbfgs'))
    ordered_label_errors = get_noise_indices(
        s=s,
        psx=psx,
        sorted_index_method='normalized_margin',  # Orders label errors
    )
    logger.debug(
        '[find_noise] ordered_label_errors index: {}, size: {}'.format(
            ordered_label_errors, len(ordered_label_errors)))
    noise_samples = [labeled_sample_list[i] for i in ordered_label_errors]
    return noise_samples
def test_prune_on_small_data():
    data = make_data(sizes=[4, 4, 4])
    for pm in ['prune_by_noise_rate', 'prune_by_class', 'both']:
        noise_idx = pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            prune_method=pm,
        )
        # Num in each class < 5. Nothing should be pruned.
        assert not any(noise_idx)
def test_pruning_order_method():
    order_methods = ["prob_given_label", "normalized_margin"]
    results = []
    for method in order_methods:
        results.append(pruning.get_noise_indices(
            s=data['s'],
            psx=data['psx'],
            sorted_index_method=method,
        ))
    assert len(results[0]) == len(results[1])
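# For reference, 'normalized_margin' orders the flagged examples by
# p(given label) - max over other classes of p(class); the smallest (most
# negative) margin comes first. A hedged stand-alone sketch of that score,
# written from memory of cleanlab's ranking logic rather than copied from it:
import numpy as np

def normalized_margin(psx, s):
    """Margin of the given label over the best competing label."""
    idx = np.arange(len(s))
    self_confidence = psx[idx, s]
    competing = psx.copy()
    competing[idx, s] = -np.inf          # mask out the given label
    return self_confidence - competing.max(axis=1)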
def main(output_filepath, noice_pct):
    '''Evaluation script; writes the noisy indices to a separate csv file.'''
    logging.info(f"Default config : {json.dumps(config)}")

    # Loading the data
    logging.info('Loading the data...')
    source, df = load_data(URLs.IMAGENETTE)

    # DataLoaders using noice_pct as the noise percentage
    dls: DataLoaders = get_dls(df, noice_pct=noice_pct, pref=source, size=224)
    learn: Learner = train(dls, f'resnet34_5ep_3frzep_1e_3_{noice_pct}np.pkl')

    # Predict using a single image file
    # learn = load_learner('export.pkl')
    # learn.predict('TEST_IMAGE_FILE')

    # Determine the noisy indices on the training set
    logging.info('Getting the predictions for training data...')
    train_preds = learn.get_preds(ds_idx=0, with_decoded=True)

    # Decode predictions and compute a confidence score per example
    decoded_train_preds = get_inverse_transform(
        L(list(train_preds[2])).map(dls.vocab))
    confidence = torch.max(train_preds[0], axis=-1).values

    # Noisy indices from the training dataset.
    # 'prune_by_noise_rate': removes examples with *high probability* of being
    # mislabeled for every non-diagonal entry in the prune_counts_matrix
    # (see pruning.py).
    # 'prune_by_class': removes the examples with *smallest probability* of
    # belonging to their given class label, for every class.
    # 'both': the intersection of the two.
    train_ordered_label_errors = get_noise_indices(
        s=train_preds[1].numpy(),     # targets
        psx=train_preds[0].numpy(),   # predicted probabilities
        prune_method='both',
        sorted_index_method='normalized_margin')

    # Actual noise in the training dataset
    train_df = df[df.is_valid == False].copy()
    print("We found {} label errors in the training dataset of size {}.".format(
        len(train_ordered_label_errors), len(train_df)))
    train_df['predictions'] = np.array(decoded_train_preds)
    train_df['confidence'] = confidence
    noisy_train = train_df.iloc[train_ordered_label_errors]

    PREDICTIONS_NAME = f'noisy{noice_pct}_train_predictions.csv'
    logging.info(f'Saving the noisy training data with predictions and '
                 f'confidence at {output_filepath}/{PREDICTIONS_NAME}')
    noisy_train.to_csv(f'{output_filepath}/{PREDICTIONS_NAME}', index=False)
def test_get_noise_indices_multi_label():
    s_ml = [[z, data['y_train'][i]] for i, z in enumerate(data['s'])]
    for multi_label in [True, False]:
        for prune_method in ['prune_by_class', 'prune_by_noise_rate']:
            noise_idx = pruning.get_noise_indices(
                s=s_ml if multi_label else data['s'],
                psx=data['psx'],
                prune_method=prune_method,
                multi_label=multi_label,
            )
            acc = np.mean((data['s'] != data['y_train']) == noise_idx)
            # Make sure cleanlab does reasonably well finding the errors.
            # acc is the accuracy of detecting a label error.
            assert acc > 0.85
def evaluate(df, data_path):
    # Preprocess test data
    test_df = remove_stop_add_hashtag(df)
    # Tokenize (returns a tuple)
    tokenized_df = tokenize_df(test_df, text_cols=["HashTag", "Text"],
                               mark_fields=True, tok_text_col='text')
    # Load the saved model
    learn = load_learner(f'{MODELS}/{cfg.model_name}')
    # Test dataloader
    test_dl = learn.dls.test_dl(tokenized_df[0])
    # Predictions
    result = learn.get_preds(dl=test_dl)
    confidence = torch.max(result[0], axis=1).values
    _, y = learn.dls.valid.vocab
    y_predicted = np.array(y[result[0].argmax(axis=1)])
    test_df['predicted'] = y_predicted
    test_df['confidence'] = confidence

    # Metrics
    # _, metric_value = learn.validate(dl=test_dl)  # loss, metrics used
    # metrics = {each.name: metric_value[idx]
    #            for idx, each in enumerate(learn.metrics)}
    # print(f"Metrics on the test dataset : {metrics}")

    # Noisy labels on the test dataframe.
    # 'prune_by_noise_rate': removes examples with *high probability* of being
    # mislabeled for every non-diagonal entry in the prune_counts_matrix
    # (see pruning.py).
    # 'prune_by_class': removes the examples with *smallest probability* of
    # belonging to their given class label, for every class.
    test_ordered_label_errors = get_noise_indices(
        s=Numericalize(vocab=['INFORMATIVE', 'UNINFORMATIVE'])(
            test_df.Label).numpy(),
        psx=result[0].numpy(),
        prune_method='both',
        sorted_index_method='normalized_margin')
    print(test_ordered_label_errors)
    test_df.iloc[test_ordered_label_errors].to_csv(
        f'{data_path}/noisy_text.csv')
def del_mis_label(use_cuda, testloader):
    model.eval()
    correctAll = 0
    totalAll = 0
    with torch.no_grad():
        for batch, (inputs, targets) in enumerate(testloader):
            pred = np.zeros(shape=(targets.shape[0], 10))
            label = targets.numpy()
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            inputs, targets = torch.autograd.Variable(inputs), \
                torch.autograd.Variable(targets)
            # Run the model in chunks of 100 examples. Note: any remainder
            # (targets.shape[0] % 100) is skipped.
            for i in range(targets.shape[0] // 100):
                outputs = model(inputs[i * 100:(i + 1) * 100, :, :, :])
                pred[i * 100:(i + 1) * 100, :] = outputs.data.cpu()
                # loss = criterion(outputs, targets) + mcloss(feature, targets) * 10
                # loss = bi_tempered_logistic_loss(activations=activations, labels=labels, t1=0.7, t2=1.3)
                correct, total = accuracy(
                    outputs.data, targets[i * 100:(i + 1) * 100].data)
                correctAll += correct
                totalAll += total
                print('using: batch=%04d' % batch,
                      ' accuracy=%.2f' % (correct / total * 100.0), end='\r')
            print('USING: batch=%04d' % batch,
                  ' accuracy=%.2f' % (correctAll / totalAll * 100.0))
            with open(log_txt, 'a+') as f:
                f.write(str(batch) + " " +
                        str(correctAll / totalAll * 100.0) + '\n')
            print('')
    from cleanlab.pruning import get_noise_indices
    # Note: label and pred here come from the last batch of the loop only.
    ordered_label_errors = get_noise_indices(
        s=label,
        psx=pred,
        sorted_index_method='normalized_margin',  # Orders label errors
        n_jobs=1)
    return correctAll / totalAll * 100.0, ordered_label_errors
def baseline_argmax_confusion_matrix(
    psx,
    s,
    calibrate=False,
    prune_method='prune_by_noise_rate',
):
    '''This is a baseline approach. It uses the confusion matrix of
    argmax(psx) and s as the confident joint, then uses cleanlab
    (confident learning) to find the label errors from that matrix.

    Parameters
    ----------
    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k
        P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation.

    Returns
    -------
        A boolean mask that is True at each index whose example is estimated
        to be a label error.'''

    confident_joint = confusion_matrix(np.argmax(psx, axis=1), s).T
    if calibrate:
        confident_joint = calibrate_confident_joint(confident_joint, s)
    return get_noise_indices(
        s=s,
        psx=psx,
        confident_joint=confident_joint,
        prune_method=prune_method,
    )
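# A hedged usage sketch for the baseline above. The synthetic s/psx are
# invented; the imports mirror what the function body needs (sklearn's
# confusion_matrix, plus calibrate_confident_joint and get_noise_indices
# from cleanlab < 2.0).
import numpy as np
from sklearn.metrics import confusion_matrix
from cleanlab.latent_estimation import calibrate_confident_joint
from cleanlab.pruning import get_noise_indices

rng = np.random.RandomState(0)
n = 200
y = rng.randint(2, size=n)                  # hidden "true" labels
s = y.copy()
flip = rng.choice(n, size=20, replace=False)
s[flip] = 1 - s[flip]                       # inject 10% label noise
conf_y = rng.uniform(0.7, 0.95, size=n)     # confidence given to the true class
psx = np.column_stack([np.where(y == 0, conf_y, 1 - conf_y),
                       np.where(y == 1, conf_y, 1 - conf_y)])

mask = baseline_argmax_confusion_matrix(psx=psx, s=s, calibrate=True)
print('examples flagged:', mask.sum(), '/ injected errors:', len(flip))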
if not os.path.exists('results'):
    os.makedirs('results')
    with open(result_latent_vars, 'wb') as output:
        pickle.dump(est_py, output, pickle.HIGHEST_PROTOCOL)
        pickle.dump(est_nm, output, pickle.HIGHEST_PROTOCOL)
        pickle.dump(est_inv, output, pickle.HIGHEST_PROTOCOL)
        pickle.dump(confident_joint, output, pickle.HIGHEST_PROTOCOL)
        pickle.dump(psx, output, pickle.HIGHEST_PROTOCOL)
else:
    with open(result_latent_vars, 'rb') as inf:
        est_py = pickle.load(inf)
        est_nm = pickle.load(inf)
        est_inv = pickle.load(inf)
        confident_joint = pickle.load(inf)
        psx = pickle.load(inf)

# Print flipped labels
label_errors = get_noise_indices(
    s=train_labels_with_errors,       # required
    psx=psx,                          # required
    inverse_noise_matrix=est_inv,     # not required, include to avoid recomputing
    confident_joint=confident_joint,  # not required, include to avoid recomputing
)
print(
    pd.concat([
        train_labels_with_errors,
        train_true_labels,
        pd.DataFrame(data=label_errors, columns=['flipped_label'])
    ], axis=1))
os.system(f'mv {old_path} {new_path}')

# Cell
mnist = DataBlock(blocks=(ImageBlock(cls=PILImageBW), CategoryBlock),
                  get_items=get_image_files,
                  splitter=GrandparentSplitter(train_name='training',
                                               valid_name='testing'),
                  get_y=parent_label)
dls = mnist.dataloaders(path, bs=16)
dls.show_batch(max_n=36, figsize=(6, 6))

# Cell
learn = cnn_learner(dls, resnet18, metrics=accuracy,
                    loss_func=LabelSmoothingCrossEntropyFlat())

# Cell
learn.fine_tune(1, 1e-3)

# Cell
val_preds = learn.get_preds(ds_idx=1, with_decoded=True)

# Cell
from cleanlab.pruning import get_noise_indices

# Cell
val_ordered_label_errors = get_noise_indices(
    s=val_preds[1].numpy(),
    psx=val_preds[0].numpy(),
    sorted_index_method='normalized_margin')
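# A hedged follow-up sketch: the returned indices are positional, so they can
# be mapped back to the validation items to inspect the suspect images.
# (dls.valid_ds indexing is assumed here; adapt to however your data is set up.)
for i in val_ordered_label_errors[:5]:
    img, lbl = dls.valid_ds[int(i)]
    print(int(i), lbl)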
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''This method implements confident learning. It counts examples that
    are likely labeled correctly and incorrectly and uses their ratio to
    create a predicted confusion matrix.
    This function fits the classifier (self.clf) to (X, s) accounting for
    the noise in both the positive and negative sets.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A binary vector of labels, s, which may contain mislabeling.

    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
        examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k
        P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation. If you are not sure, leave psx = None (default)
        and it will be computed for you using cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater"
        than this threshold, it is counted as having hidden label y = k.
        This is not used for pruning, only for estimating the noise rates
        using confident counts. This value should be between 0 and 1.
        Default is None.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s)
        representing the estimated fraction of observed examples in each
        class k_s that are mislabeled examples from every other class k_y.
        If None, the inverse_noise_matrix will be computed from psx and s.
        Assumes columns of inverse_noise_matrix sum to 1.

    Output
    ------
        Returns (noise_mask, sample_weight)'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and \
            np.trace(inverse_noise_matrix) <= 1:
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))
    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))
    self.confident_joint = None

    # If needed, compute noise rates (fraction of mislabeling) for all
    # classes. Also, if needed, compute P(s=k|x), denoted psx.
    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = \
                compute_py_inv_noise_matrix(self.ps, self.noise_matrix)
    if inverse_noise_matrix is not None:
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps, self.inverse_noise_matrix)
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if self.pulearning is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix, class_without_noise=self.pulearning)
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(
        #     self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function.
    # Get the indices of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
    )
    if self.pulearning is not None:
        self.noise_mask[s != self.pulearning] = False

    return self.noise_mask, self.noise_matrix, \
        self.inverse_noise_matrix, self.confident_joint, psx
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    """This method implements confident learning. It counts examples that
    are likely labeled correctly and incorrectly and uses their ratio to
    create a predicted confusion matrix.
    This function fits the classifier (self.clf) to (X, s) accounting for
    the noise in both the positive and negative sets.

    Parameters
    ----------
    X : :obj:`np.array`
        Input feature matrix (N, D), 2D numpy array

    s : :obj:`np.array`
        A binary vector of labels, s, which may contain mislabeling.

    psx : :obj:`np.array` (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
        examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k
        P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation. If you are not sure, leave psx = None (default)
        and it will be computed for you using cross-validation.

    thresholds : :obj:`iterable` (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). List of probabilities used to determine the cutoff
        predicted probability necessary to consider an example as a given
        class label. Default is ``None``. These are computed for you
        automatically. If an example has a predicted probability "greater"
        than this threshold, it is counted as having hidden label y = k.
        This is not used for pruning, only for estimating the noise rates
        using confident counts. Values in list should be between 0 and 1.

    noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : :obj:`np.array` of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s).
        Contains the estimated fraction of observed examples in each class
        k_s that are mislabeled examples from every other class k_y. If
        None, the inverse_noise_matrix will be computed from psx and s.
        Assumes columns of inverse_noise_matrix sum to 1.

    Returns
    -------
    tuple
        (noise_mask, sample_weight)"""

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and (np.trace(inverse_noise_matrix)
                                             <= 1):
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}. Must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))
    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))
    self.confident_joint = None

    # If needed, compute noise rates (mislabeling) for all classes.
    # Also, if needed, compute P(s=k|x), denoted psx.
    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = (
                compute_py_inv_noise_matrix(self.ps, self.noise_matrix))
    if inverse_noise_matrix is not None:
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps,
                self.inverse_noise_matrix,
            )
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=(
                        self.converge_latent_estimates),
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=(
                        self.converge_latent_estimates),
                )

    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # if pulearning == the integer specifying the class without noise.
    if self.K == 2 and self.pulearning is not None:  # pragma: no cover
        # pulearning = 1 (no error in 1 class) implies p(s=1|y=0) = 0
        self.noise_matrix[self.pulearning][1 - self.pulearning] = 0
        self.noise_matrix[1 - self.pulearning][1 - self.pulearning] = 1
        # pulearning = 1 (no error in 1 class) implies p(y=0|s=1) = 0
        self.inverse_noise_matrix[1 - self.pulearning][self.pulearning] = 0
        self.inverse_noise_matrix[self.pulearning][self.pulearning] = 1
        # pulearning = 1 (no error in 1 class) implies p(s=1,y=0) = 0
        self.confident_joint[self.pulearning][1 - self.pulearning] = 0
        self.confident_joint[1 - self.pulearning][1 - self.pulearning] = 1

    # This is the actual work of this function.
    # Get the indices of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        n_jobs=self.n_jobs,
    )

    x_mask = ~self.noise_mask
    x_pruned = X[x_mask]
    s_pruned = s[x_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    if hasattr(inspect, 'getfullargspec') and \
            'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
            or hasattr(inspect, 'getargspec') and \
            'sample_weight' in inspect.getargspec(self.clf.fit).args:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            sample_weight_k = 1.0 / self.noise_matrix[k][k]
            self.sample_weight[s_pruned == k] = sample_weight_k
        self.clf.fit(x_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but best we can do if no sample_weight.
        self.clf.fit(x_pruned, s_pruned)

    return self.clf
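# For orientation, a hedged end-to-end sketch of how this fit() is usually
# reached through cleanlab's wrapper class (LearningWithNoisyLabels in
# cleanlab < 2.0; if this snippet comes from a fork the name may differ).
# The synthetic data and injected noise below are illustrative only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from cleanlab.classification import LearningWithNoisyLabels

X, y = make_classification(n_samples=300, n_classes=3, n_informative=4,
                           random_state=0)
s = y.copy()
rng = np.random.RandomState(0)
flip = rng.choice(len(s), size=30, replace=False)
s[flip] = (s[flip] + 1) % 3            # inject 10% label noise

lnl = LearningWithNoisyLabels(clf=LogisticRegression(max_iter=1000))
lnl.fit(X, s)                          # psx and noise matrices estimated via CV
print('examples flagged as noisy:', lnl.noise_mask.sum())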
#!/usr/bin/env python
# -*- coding: gb18030 -*-
"""
File  : mark_data_clean.py
Author: [email protected]
Date  : 20/09/24 15:02:29
Desc  : Detect likely mislabeled examples with cleanlab.
"""
import sys

from cleanlab.pruning import get_noise_indices


def wrong_label_detect(pred_prob, given_label):
    wrong_label_indexs = get_noise_indices(
        s=given_label,
        psx=pred_prob,
        sorted_index_method='normalized_margin',  # Orders label errors
    )
    return wrong_label_indexs


if __name__ == "__main__":
    pass
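# A hedged usage sketch for wrong_label_detect above. The toy arrays are
# invented; note cleanlab leaves classes with fewer than 5 examples unpruned,
# so each class gets 6 examples here.
import numpy as np

given_label = np.array([0] * 6 + [1] * 6)
p1 = np.array([.05, .10, .15, .10, .05, .90,   # index 5: labeled 0, looks like 1
               .90, .85, .80, .95, .90, .10])  # index 11: labeled 1, looks like 0
pred_prob = np.column_stack([1 - p1, p1])
print(wrong_label_detect(pred_prob, given_label))  # likely flags 5 and 11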
def fit(
    self,
    X,
    s,
    psx=None,
    thresholds=None,
    noise_matrix=None,
    inverse_noise_matrix=None,
):
    '''This method implements confident learning. It counts examples that
    are likely labeled correctly and incorrectly and uses their ratio to
    create a predicted confusion matrix.
    This function fits the classifier (self.clf) to (X, s) accounting for
    the noise in both the positive and negative sets.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A binary vector of labels, s, which may contain mislabeling.

    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
        examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k
        P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation. If you are not sure, leave psx = None (default)
        and it will be computed for you using cross-validation.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater"
        than this threshold, it is counted as having hidden label y = k.
        This is not used for pruning, only for estimating the noise rates
        using confident counts. This value should be between 0 and 1.
        Default is None.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s)
        representing the estimated fraction of observed examples in each
        class k_s that are mislabeled examples from every other class k_y.
        If None, the inverse_noise_matrix will be computed from psx and s.
        Assumes columns of inverse_noise_matrix sum to 1.

    Output
    ------
        Returns (noise_mask, sample_weight)'''

    # Check inputs
    assert_inputs_are_valid(X, s, psx)
    if noise_matrix is not None and np.trace(noise_matrix) <= 1:
        t = np.round(np.trace(noise_matrix), 2)
        raise ValueError(
            "Trace(noise_matrix) is {}, but must exceed 1.".format(t))
    if inverse_noise_matrix is not None and np.trace(
            inverse_noise_matrix) <= 1:
        t = np.round(np.trace(inverse_noise_matrix), 2)
        raise ValueError(
            "Trace(inverse_noise_matrix) is {}, but must exceed 1.".format(t))

    # Number of classes
    self.K = len(np.unique(s))
    # 'ps' is p(s=k)
    self.ps = value_counts(s) / float(len(s))
    self.confident_joint = None

    # If needed, compute noise rates (fraction of mislabeling) for all
    # classes. Also, if needed, compute P(s=k|x), denoted psx.
    # Set / re-set noise matrices / psx; estimate if not provided.
    if noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            w = "\nYou should not use prune_count_method == 'calibrate_confident_joint'"
            w += "\nwhen calling .fit(noise_matrix=something) because"
            w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
            w += "\nnot use your 'something' noise matrix information. Instead, use"
            w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
            w += "\nby using the noise matrix you provide."
            warnings.warn(w)
        self.noise_matrix = noise_matrix
        if inverse_noise_matrix is None:
            self.py, self.inverse_noise_matrix = compute_py_inv_noise_matrix(
                self.ps, self.noise_matrix)
    if inverse_noise_matrix is not None:
        if self.prune_count_method == 'calibrate_confident_joint':
            w = "\nYou should not use prune_count_method == 'calibrate_confident_joint'"
            w += "\nwhen calling .fit(inverse_noise_matrix=something) because"
            w += "\n'calibrate_confident_joint' estimates the noise from scratch and will"
            w += "\nnot use your 'something' inverse noise matrix information. Instead, use"
            w += "\nprune_count_method == 'inverse_nm_dot_s' which will find label errors"
            w += "\nby using the inverse noise matrix you provide."
            warnings.warn(w)
        self.inverse_noise_matrix = inverse_noise_matrix
        if noise_matrix is None:
            self.noise_matrix = compute_noise_matrix_from_inverse(
                self.ps, self.inverse_noise_matrix)
    if noise_matrix is None and inverse_noise_matrix is None:
        if psx is None:
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint, psx = \
                estimate_py_noise_matrices_and_cv_pred_proba(
                    X=X,
                    s=s,
                    clf=self.clf,
                    cv_n_folds=self.cv_n_folds,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                    seed=self.seed,
                )
        else:  # psx is provided by user (assumed holdout probabilities)
            self.py, self.noise_matrix, self.inverse_noise_matrix, \
                self.confident_joint = \
                estimate_py_and_noise_matrices_from_probabilities(
                    s=s,
                    psx=psx,
                    thresholds=thresholds,
                    converge_latent_estimates=self.converge_latent_estimates,
                )

    if psx is None:
        psx = estimate_cv_predicted_probabilities(
            X=X,
            labels=s,
            clf=self.clf,
            cv_n_folds=self.cv_n_folds,
            seed=self.seed,
        )

    # Zero out noise matrix entries if pulearning = the integer specifying
    # the class without noise.
    if self.pulearning is not None:  # pragma: no cover
        self.noise_matrix = remove_noise_from_class(
            self.noise_matrix, class_without_noise=self.pulearning)
        # TODO: self.inverse_noise_matrix = remove_noise_from_class(
        #     self.inverse_noise_matrix, class_without_noise=self.pulearning)

    # This is the actual work of this function.
    # Get the indices of the examples we wish to prune.
    self.noise_mask = get_noise_indices(
        s,
        psx,
        inverse_noise_matrix=self.inverse_noise_matrix,
        confident_joint=self.confident_joint,
        prune_method=self.prune_method,
        prune_count_method=self.prune_count_method,
        converge_latent_estimates=self.converge_latent_estimates,
    )

    X_mask = ~self.noise_mask
    X_pruned = X[X_mask]
    s_pruned = s[X_mask]

    # Check if sample_weight in clf.fit(). Compatible with Python 2/3.
    if hasattr(inspect, 'getfullargspec') and \
            'sample_weight' in inspect.getfullargspec(self.clf.fit).args \
            or hasattr(inspect, 'getargspec') and \
            'sample_weight' in inspect.getargspec(self.clf.fit).args:
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in each class
        # is preserved, even though the pruned sets may differ.
        self.sample_weight = np.ones(np.shape(s_pruned))
        for k in range(self.K):
            self.sample_weight[s_pruned == k] = \
                1.0 / self.noise_matrix[k][k]
        self.clf.fit(X_pruned, s_pruned, sample_weight=self.sample_weight)
    else:
        # This is less accurate, but it's all we can do if sample_weight
        # isn't available.
        self.clf.fit(X_pruned, s_pruned)
    return self.clf
    label_id_map[l]: v for l, v in label_map.items() if l in label_id_map
}

# Be sure you compute probs in a holdout/out-of-sample manner
# (e.g. cross-validation).
# Now getting label errors is trivial with cleanlab... it's one line of code.
# Label errors are ordered by likelihood of being an error.
# The first index is the most likely error.
if preds.shape[0] > 100000:
    print('Large predictions take a long time. Only using top 100,000.')
    preds = preds[:100000, :]
    labels = labels[:100000]
ordered_label_errors = pruning.get_noise_indices(
    s=labels,
    psx=preds,
    sorted_index_method='normalized_margin',  # Orders label errors
)

data_path = Path(args.data_dir)
text_path = data_path / 'txt'


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
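# The comment above stresses out-of-sample probabilities. A hedged sketch of
# one standard way to get them with scikit-learn's cross_val_predict; X and
# labels here are placeholder data.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.RandomState(0)
X = rng.randn(200, 5)                 # placeholder features
labels = rng.randint(2, size=200)     # placeholder noisy labels
preds = cross_val_predict(
    LogisticRegression(), X, labels, cv=5, method='predict_proba',
)  # shape (N, K); each row is predicted by a model that never saw that row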
noise_matrix = generate_noise_matrix_from_trace(
    K,
    trace=1.5,
    py=py,
    valid_noise_matrix=True,
)

# Generate our noisy labels using the noise_matrix.
s = generate_noisy_labels(y_train, noise_matrix)
ps = np.bincount(s) / float(len(s))

confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
    X_train, s, seed=seed)
est_py, est_noise_matrix, est_inverse_noise_matrix = estimate_latent(
    confident_joint, s)
idx_errors = get_noise_indices(s, psx)

# #### To show off the power of **cleanlab**, we've chosen an example of
# multiclass learning with noisy labels in which over 50% of the training
# labels are wrong.
# Toggle the ```trace``` parameter in ```generate_noise_matrix_from_trace```
# above to try out different amounts of noise. Note, as we prove in our paper,
# learning becomes impossible if the ```trace <= 1```, so choose a value
# greater than 1, but less than, or equal to, the number of classes (3).

# In[4]:

est_joint = cleanlab.latent_estimation.estimate_joint(
    s=s,
    psx=psx,
    confident_joint=confident_joint,
)
true_joint_distribution_of_label_errors = (noise_matrix * py)
percent_error_str = (
    'Percent of training examples that have wrong labels: ' +
    str(int(round(
        100 - 100 * true_joint_distribution_of_label_errors.trace()))) + '%')
# Cell
learn_5.save('learn_5')

# Cell
train_preds = learn_5.get_preds(ds_idx=0, with_decoded=True)

# Cell
val_preds = learn_5.get_preds(ds_idx=1, with_decoded=True)

# Cell
from cleanlab.pruning import get_noise_indices

# Cell
train_ordered_label_errors = get_noise_indices(
    s=train_preds[1].numpy(),    # targets
    psx=train_preds[0].numpy(),  # predicted probabilities
    sorted_index_method='normalized_margin')

# Internal Cell
print("We found {} label errors.".format(len(train_ordered_label_errors)))

# Cell
# The returned indices are positional, so use .iloc rather than .loc.
noisy_train = train_df.iloc[train_ordered_label_errors]

# Cell
train_preds = learn_50.get_preds(ds_idx=0, with_decoded=True)
from cleanlab.pruning import get_noise_indices

# Cell
# Get the noisy indices
train_ordered_label_errors = get_noise_indices(
    s=train_preds[1].numpy(),    # targets
# That's all we need for confident learning.

# STEP 1 - Compute confident joint

# Verify inputs
s = np.asarray(s)
psx = np.asarray(psx)

# Find the number of unique classes if K is not given
K = len(np.unique(s))

from cleanlab.pruning import get_noise_indices

ordered_label_errors = get_noise_indices(
    s=s,
    psx=psx,
    sorted_index_method='normalized_margin',  # Orders label errors
)
print('ordered_label_errors:', ordered_label_errors)
print(np.array(sorted(ordered_label_errors)))

idx_errors = ordered_label_errors
label_errors_idx = np.array(sorted(ordered_label_errors))
# Recall: the fraction of actual label errors that confident learning found.
score = sum([e in label_errors_idx
             for e in actual_label_errors]) / actual_num_errors
print('% actual errors that confident learning found: {:.0%}'.format(score))
# Precision: the fraction of flagged examples that are actual label errors.
score = sum([e in actual_label_errors
             for e in label_errors_idx]) / len(label_errors_idx)
print(
kv = line.split("\t") label = 0 if float(kv[3]) > 3: #if float(kv[2]) > 3: label = 1 y_true.append(label) y_scores.append([float(kv[5].split('|')[0]), float(kv[5].split('|')[1])]) #y_scores.append([float(kv[4].split('|')[0]), float(kv[4].split('|')[1])]) lines.append(line) numpy_array_of_noisy_labels = np.array(y_true) numpy_array_of_predicted_probabilities = np.array(y_scores) #print (numpy_array_of_noisy_labels) #print (numpy_array_of_predicted_probabilities) ordered_label_errors = get_noise_indices( s=numpy_array_of_noisy_labels, psx=numpy_array_of_predicted_probabilities, sorted_index_method='normalized_margin', # Orders label errors ) print(ordered_label_errors[:10]) #index_dict = {} #for i in np.arange(len(lines)): # index_dict[i] = lines[i].strip("\n") #for i in ordered_label_errors: # line = index_dict[i] # kv = line.split("\t") # #line = "%s\t%s\t%s\t%s"%(kv[0], kv[1], kv[2], float(kv[4].split('|')[1])) # print (line) #exit() #for i in np.arange(len(lines)):