Пример #1
0
        LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto'),
    ),
]:
    # Print a banner: the value of `name` framed above and below by '=' rules
    # of the same length.
    print("\n", "=" * len(name), "\n", name, '\n', "=" * len(name))
    # Re-seed NumPy's global RNG so every pass through this body starts from
    # the same random state.
    np.random.seed(seed=0)
    # NOTE(review): clf_copy is not referenced again in the visible part of
    # this body -- the grid search below wraps `clf` itself. Confirm whether
    # the copy is used further down or was meant to be passed to the model.
    clf_copy = copy.deepcopy(clf)
    # Compute p(y=k), the ground truth class prior on the labels.
    py = np.bincount(y_train) / float(len(y_train))
    # Generate the noisy channel to characterize the label errors.
    # The requested trace is num_classes * avg_trace.
    noise_matrix = generate_noise_matrix_from_trace(
        K=num_classes,
        trace=num_classes * avg_trace,
        py=py,
        frac_zero_noise_rates=frac_zero_noise_rates,
    )
    print_noise_matrix(noise_matrix)
    # Create the noisy labels. This method is exact w.r.t. the noise_matrix.
    y_train_with_errors = generate_noisy_labels(y_train, noise_matrix)
    # Set up a grid search over `param_grid` for a LearningWithNoisyLabels
    # model that wraps `clf`, using 4 threads and a fixed seed.
    lnl_cv = GridSearch(
        model=LearningWithNoisyLabels(clf),
        param_grid=param_grid,
        num_threads=4,
        seed=0,
    )
    # Fit on the noisy training labels; X_val / y_val are passed as the
    # validation data.
    lnl_cv.fit(
        X_train=X_train,
        y_train=y_train_with_errors,
        X_val=X_val,
        y_val=y_val,
        verbose=False,
    )
Пример #2
0
            linewidth=1)
    # Among the rows where idx_errors is False, pick those whose entry in `s`
    # differs from `y_train`, and draw a large hollow black circle around each
    # one (x = column 0 of X_train, y = column 1).
    # NOTE(review): presumably `s` holds the noisy labels and idx_errors marks
    # the detected (pruned) errors, so these are the errors that remain after
    # pruning -- confirm against the code above this fragment.
    _ = plt.scatter(
        X_train[~idx_errors][:, 0][s[~idx_errors] != y_train[~idx_errors]],
        X_train[~idx_errors][:, 1][s[~idx_errors] != y_train[~idx_errors]],
        s=400,
        facecolors='none',
        edgecolors='black',
        linewidth=2,
        alpha=0.5)
    _ = plt.title('Dataset after pruning detected label errors.', fontsize=30)
    plt.show()
except:
    # NOTE(review): this bare except catches every exception raised in the
    # try body (including KeyboardInterrupt), not only a missing plotting
    # backend, and reports all of them with the same message below.
    print("Plotting is only supported in an iPython interface.")

# --- Noise matrices: ground truth first, then the estimate. ---
print('The actual, latent, underlying noise matrix.')
print_noise_matrix(noise_matrix)
print('Our estimate of the noise matrix.')
print_noise_matrix(est_noise_matrix)
print()

# --- Joint distribution matrices: ground truth first, then the estimate. ---
print('The actual, latent, underlying joint distribution matrix.')
cleanlab.util.print_joint_matrix(true_joint_distribution_of_label_errors)
print('Our estimate of the joint distribution matrix.')
cleanlab.util.print_joint_matrix(est_joint)

# --- Accuracy on the test set, with and without learning with noisy labels. ---
print("Accuracy Comparison", "-------------------", sep="\n")

# Baseline: plain logistic regression trained directly on the labels `s`.
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
fitted_baseline = clf.fit(X_train, s)
baseline_predictions = fitted_baseline.predict(X_test)
baseline_score = accuracy_score(y_test, baseline_predictions)
print("Logistic regression:", baseline_score)

# Same data, but trained through LearningWithNoisyLabels using the
# precomputed `psx`; predictions come from whatever object fit() returns.
rp = LearningWithNoisyLabels(seed=seed)
fitted_rp = rp.fit(X_train, s, psx=psx)
rp_predictions = fitted_rp.predict(X_test)
rp_score = accuracy_score(y_test, rp_predictions)
print("Logistic regression (+rankpruning):", rp_score)
Пример #3
0
def test_print_noise_matrix():
    """Smoke-test ``util.print_noise_matrix`` on every fixture matrix.

    The test passes when printing each of ``noise_matrix``,
    ``noise_matrix_2`` and ``single_element`` raises no exception; the
    printed output itself is not inspected.
    """
    for m in [noise_matrix, noise_matrix_2, single_element]:
        # Pass the loop variable: the earlier version printed `noise_matrix`
        # on every iteration, so the other two fixtures were never exercised.
        util.print_noise_matrix(m)
# In[17]:

# Load the saved noise matrix for each CIFAR noise setting and print it.
# (CIFAR-100 is currently disabled in the loop below.)
import numpy as np
import pickle
from cleanlab import util
# Print the stored noise matrix for every (noise amount, sparsity) setting.
# Each matrix is read from
#   <data_path>noisy_labels/
#       <dataset>_noise_matrix__frac_zero_noise_rates__<f>__noise_amount__<a>.pickle
for cifar_dataset in ["cifar10"]:  # "cifar100" is currently disabled
    data_path = '/datasets/datasets/{}/{}/'.format(cifar_dataset,
                                                   cifar_dataset)
    for noise_amount in np.arange(0.2, 0.61, 0.2):
        for frac_zero_noise_rates in np.arange(0, 0.61, 0.2):
            # Both settings appear in the file name rounded to one decimal;
            # values below 1e-4 are spelled as the literal "0.0".
            rfn_base = (
                '{}_noise_matrix__frac_zero_noise_rates__{}__noise_amount__{}'
            ).format(
                cifar_dataset,
                "0.0" if frac_zero_noise_rates < 1e-4 else round(
                    frac_zero_noise_rates, 1),
                "0.0" if noise_amount < 1e-4 else round(noise_amount, 1),
            )
            rfn = data_path + "noisy_labels/" + rfn_base
            # NOTE(security): pickle.load can execute arbitrary code while
            # unpickling; only read files from a trusted location.
            with open(rfn + ".pickle", 'rb') as rf:
                nm = pickle.load(rf)
            # The 0.6 setting is reported as 0.7.
            # NOTE(review): presumably 0.7 is the noise level actually
            # realised for files saved under 0.6 -- confirm.
            actual_noise = 0.7 if abs(noise_amount -
                                      0.6) < 1e-3 else noise_amount
            print('Noise amount:', round(actual_noise, 3), "| Sparsity:",
                  round(frac_zero_noise_rates, 3))
            util.print_noise_matrix(nm)