import warnings

import numpy as np

# Helper functions clip_values, clip_noise_rates, and value_counts are assumed
# to come from cleanlab.util; compute_inv_noise_matrix, converge_estimates, and
# compute_confident_joint are defined elsewhere in this module.
from cleanlab.util import value_counts, clip_values, clip_noise_rates


def compute_py_inv_noise_matrix(ps, noise_matrix):
    '''Compute py := P(y=k) and the inverse noise matrix.

    Parameters
    ----------
    ps : np.array (shape (K, 1))
        The fraction (prior probability) of each observed, noisy class label, P(s = k).

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y) containing
        the fraction of examples in every class, labeled as every other class.
        Assumes columns of noise_matrix sum to 1.'''

    # Number of classes
    K = len(ps)

    # 'py' is p(y=k) = noise_matrix^(-1) * p(s=k)
    # because in *vector computation*: P(s=k|y=k) * p(y=k) = P(s=k)
    # Note: np.linalg.inv assumes noise_matrix is invertible; np.linalg.pinv
    # could be substituted if noise_matrix may be singular.
    py = np.linalg.inv(noise_matrix).dot(ps)

    # No class should have probability 0, so we clip at .001.
    # Make sure py contains valid probabilities that sum to 1.0.
    py = clip_values(py, low=0.001, high=1.0, new_sum=1.0)

    # All the work is done in this function (below)
    return py, compute_inv_noise_matrix(py, noise_matrix, ps)
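
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library API): shows how
# compute_py_inv_noise_matrix recovers p(y) and P(y|s) from p(s) and a known
# noise matrix. The 3-class values below are hypothetical toy numbers chosen
# only for demonstration, and the call assumes compute_inv_noise_matrix is
# available in this module.
def _example_compute_py_inv_noise_matrix():
    # Noisy-label prior p(s=k) for K=3 classes (toy values).
    ps = np.array([0.4, 0.35, 0.25])
    # Column-stochastic noise matrix P(s=k_s | y=k_y) (toy values).
    noise_matrix = np.array([
        [0.90, 0.10, 0.00],
        [0.05, 0.80, 0.10],
        [0.05, 0.10, 0.90],
    ])
    py, inv_noise_matrix = compute_py_inv_noise_matrix(ps, noise_matrix)
    # py sums to 1; each column of inv_noise_matrix sums to 1.
    print(py)
    print(inv_noise_matrix)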
def estimate_latent(
    confident_joint,
    s,
    py_method='cnt',
    converge_latent_estimates=False,
):
    '''Computes the latent prior p(y), the noise matrix P(s|y), and the
    inverse noise matrix P(y|s) from the `confident_joint` count(s, y). The
    `confident_joint` is estimated by `compute_confident_joint` by counting
    confident examples.

    Parameters
    ----------
    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \\tilde(y), for ASCII encoding reasons.

    confident_joint : np.array (shape (K, K), type int)
        A K,K integer matrix of count(s=k, y=k). Estimates a confident subset
        of the joint distribution of the noisy and true labels P_{s,y}.
        Each entry in the matrix contains the number of examples confidently
        counted into every pair of (s=j, y=k) classes.

    py_method : str (Options: ["cnt", "eqn", "marginal", "marginal_ps"])
        How to compute the latent prior p(y=k). Default is "cnt" as it often
        works well even when the noise matrices are estimated poorly by using
        the matrix diagonals instead of all the probabilities.

    converge_latent_estimates : bool
        If true, forces numerical consistency of estimates. Each is estimated
        independently, but they are related mathematically with closed form
        equivalences. This will iteratively make them mathematically consistent.

    Returns
    -------
    A tuple containing (py, noise_matrix, inv_noise_matrix).'''

    # Number of classes
    K = len(np.unique(s))
    # 'ps' is p(s=k)
    ps = value_counts(s) / float(len(s))
    # Ensure labels are of type np.array()
    s = np.asarray(s)
    # Number of training examples confidently counted from each noisy class
    s_count = confident_joint.sum(axis=1).astype(float)
    # Number of training examples confidently counted into each true class
    y_count = confident_joint.sum(axis=0).astype(float)
    # Confident Counts Estimator for p(s=k_s|y=k_y) ~ |s=k_s and y=k_y| / |y=k_y|
    noise_matrix = confident_joint / y_count
    # Confident Counts Estimator for p(y=k_y|s=k_s) ~ |y=k_y and s=k_s| / |s=k_s|
    inv_noise_matrix = confident_joint.T / s_count
    # Compute the prior p(y), the latent (uncorrupted) class distribution.
    py = compute_py(ps, noise_matrix, inv_noise_matrix, py_method, y_count)
    # Clip noise rates to be valid probabilities.
    noise_matrix = clip_noise_rates(noise_matrix)
    inv_noise_matrix = clip_noise_rates(inv_noise_matrix)
    # Make latent estimates mathematically agree in their algebraic relations.
    if converge_latent_estimates:
        py, noise_matrix, inv_noise_matrix = converge_estimates(
            ps, py, noise_matrix, inv_noise_matrix)
        # Again clip py and noise rates into proper range [0,1)
        py = clip_values(py, low=1e-5, high=1.0, new_sum=1.0)
        noise_matrix = clip_noise_rates(noise_matrix)
        inv_noise_matrix = clip_noise_rates(inv_noise_matrix)

    return py, noise_matrix, inv_noise_matrix
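
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library API): builds a small
# hypothetical confident joint for K=2 classes and recovers the latent
# estimates with estimate_latent. The counts and labels below are made up for
# demonstration only, and the call assumes value_counts and converge_estimates
# behave as described in this module.
def _example_estimate_latent():
    # Hypothetical confident joint count(s, y) for two classes:
    # rows index the noisy label s, columns index the latent label y.
    confident_joint = np.array([
        [45, 5],
        [10, 40],
    ])
    # Noisy labels for 100 toy examples: 50 labeled 0 and 50 labeled 1.
    s = np.array([0] * 50 + [1] * 50)
    py, noise_matrix, inv_noise_matrix = estimate_latent(confident_joint, s)
    # py estimates p(y=k); each column of noise_matrix approximates P(s|y=k).
    print(py)
    print(noise_matrix)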
def compute_py(ps, noise_matrix, inverse_noise_matrix, py_method='cnt',
               y_count=None):
    '''Compute py := P(y=k) from ps := P(s=k), the noise matrix, and the
    inverse noise matrix.

    This method is ** ROBUST ** when py_method = 'cnt'.
    It may work well even when the noise matrices are estimated poorly by
    using the diagonals of the matrices instead of all the probabilities in
    the entire matrix.

    Parameters
    ----------
    ps : np.array (shape (K, ) or (1, K))
        The fraction (prior probability) of each observed, noisy class label, P(s = k).

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y) containing
        the fraction of examples in every class, labeled as every other class.
        Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s) representing
        the estimated fraction of observed examples in each class k_s that are
        mislabeled examples from every other class k_y. If None, the
        inverse_noise_matrix will be computed from psx and s.
        Assumes columns of inverse_noise_matrix sum to 1.

    py_method : str (Options: ["cnt", "eqn", "marginal", "marginal_ps"])
        How to compute the latent prior p(y=k). Default is "cnt" as it often
        works well even when the noise matrices are estimated poorly by using
        the matrix diagonals instead of all the probabilities.

    y_count : np.array (shape (K, ) or (1, K))
        The marginal counts of the confident joint (like cj.sum(axis=0)).

    Output
    ------
    py : np.array (shape (K, ) or (1, K))
        The fraction (prior probability) of each true, uncorrupted class label, P(y = k).'''

    if len(np.shape(ps)) > 2 or (len(np.shape(ps)) == 2 and np.shape(ps)[0] != 1):
        w = 'Input parameter np.array ps has shape ' + str(np.shape(ps))
        w += ', but shape should be (K, ) or (1, K)'
        warnings.warn(w)

    if py_method == 'marginal' and y_count is None:
        err = 'py_method == "marginal" requires y_count, but y_count is None.'
        err += ' Provide parameter y_count.'
        raise ValueError(err)

    if py_method == 'cnt':
        # Computing py this way avoids dividing by zero noise rates.
        # More robust because the error in est_p(y|s) / est_p(s|y) ~ p(y|s) / p(s|y)
        py = inverse_noise_matrix.diagonal() / noise_matrix.diagonal() * ps
        # Equivalently,
        # py = (y_count / s_count) * ps
    elif py_method == 'eqn':
        py = np.linalg.inv(noise_matrix).dot(ps)
    elif py_method == 'marginal':
        py = y_count / float(sum(y_count))
    elif py_method == 'marginal_ps':
        py = np.dot(inverse_noise_matrix, ps)
    else:
        err = 'py_method {}'.format(py_method)
        err += ' should be in [cnt, eqn, marginal, marginal_ps]'
        raise ValueError(err)

    # Clip py into (0, 1), s.t. no class has probability 0, hence the 1e-5 lower bound.
    py = clip_values(py, low=1e-5, high=1.0, new_sum=1.0)
    return py
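
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library API): compares the "cnt"
# and "eqn" strategies of compute_py on hypothetical 2-class inputs. All values
# below are toy assumptions chosen only for demonstration.
def _example_compute_py():
    ps = np.array([0.6, 0.4])                     # noisy-label prior p(s=k) (toy)
    noise_matrix = np.array([[0.8, 0.2],          # P(s|y), columns sum to 1 (toy)
                             [0.2, 0.8]])
    inverse_noise_matrix = np.array([[0.9, 0.1],  # P(y|s), columns sum to 1 (toy)
                                     [0.1, 0.9]])
    py_cnt = compute_py(ps, noise_matrix, inverse_noise_matrix, py_method='cnt')
    py_eqn = compute_py(ps, noise_matrix, inverse_noise_matrix, py_method='eqn')
    # Both return valid priors that sum to 1; 'cnt' uses only the matrix
    # diagonals, while 'eqn' inverts the full noise matrix.
    print(py_cnt)
    print(py_eqn)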