def _kl_divergence(self, X, W, H):
    """
    Calculate the generalized Kullback-Leibler divergence (also called
    Information Divergence or I-Divergence) between two matrices.
    """
    B = W.dot(H)
    return np.sum(kl_div(X, B))

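# Minimal sketch (not from the source; matrix shapes are made up): verifies that
# summing the element-wise scipy.special.kl_div terms, kl_div(x, y) = x*log(x/y) - x + y,
# reproduces the generalized KL divergence used as the NMF objective above.
import numpy as np
from scipy.special import kl_div

rng = np.random.default_rng(0)
X = rng.random((4, 6)) + 0.1   # non-negative data matrix
W = rng.random((4, 2)) + 0.1   # hypothetical factor matrices
H = rng.random((2, 6)) + 0.1
B = W.dot(H)

manual = np.sum(X * np.log(X / B) - X + B)
assert np.allclose(np.sum(kl_div(X, B)), manual)
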
def compute_kl_div(X, X_prime):
    """ Compute K-L divergence. """
    prior_vector = compute_prior_vector()
    p_prime = compute_p(prior_vector, X_prime)
    p = compute_p(prior_vector, X)
    flat_div = kl_div(p_prime, p)
    return flat_div.sum()

def cat_kl(p, q):
    # the following version has problems for p near 0
    # return (p*np.log(p/q)).sum(axis=1)
    return special.kl_div(p, q).sum(axis=1)  # pylint: disable=E1101

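# Hedged illustration (not from the source): for rows that sum to 1 the extra
# "-p + q" terms in scipy.special.kl_div cancel, so the result equals the
# textbook categorical KL, but it stays finite where the naive formula breaks
# at p == 0 (0 * log(0) -> nan), since kl_div(0, q) == q by convention.
import numpy as np
from scipy import special

p = np.array([[0.0, 0.5, 0.5]])   # a zero entry in p
q = np.array([[0.2, 0.3, 0.5]])

with np.errstate(divide="ignore", invalid="ignore"):
    naive = (p * np.log(p / q)).sum(axis=1)    # -> nan
robust = special.kl_div(p, q).sum(axis=1)      # -> finite value

print(naive, robust)
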
r0_twolevel2[i] = len(sim3.people.transtree.targets[0])

print('R0 constant viral load: ', np.mean(r0_const), ' +- ', np.std(r0_const))
print('R0 two level viral load: ', np.mean(r0_twolevel), ' +- ', np.std(r0_twolevel))
print('R0 two level diff params: ', np.mean(r0_twolevel2), ' +- ', np.std(r0_twolevel2))

import matplotlib.pyplot as plt
hist1 = plt.hist(r0_const, bins=np.arange(-0.5, 10.5), density=True)
hist2 = plt.hist(r0_twolevel, bins=np.arange(-0.5, 10.5), density=True)
hist3 = plt.hist(r0_twolevel2, bins=np.arange(-0.5, 10.5), density=True)
plt.show()

# Test that R0 did not change substantially, though the std is large
assert abs(np.mean(r0_const) - np.mean(r0_twolevel)) < np.std(r0_const)
assert abs(np.mean(r0_const) - np.mean(r0_twolevel2)) < np.std(r0_const)
assert abs(np.mean(r0_twolevel) - np.mean(r0_twolevel2)) < np.std(r0_twolevel)

# Add some pseudo counts to the distributions, because a 0 in the second
# distribution where there isn't a 0 in the first distribution gives inf.
hist1[0][hist1[0] == 0] = 1e-10
hist1[0][:] = hist1[0] / sum(hist1[0])
hist2[0][hist2[0] == 0] = 1e-10
hist2[0][:] = hist2[0] / sum(hist2[0])
hist3[0][hist3[0] == 0] = 1e-10
hist3[0][:] = hist3[0] / sum(hist3[0])

# Test the KL divergence of the distributions of R0. Since the std is large,
# this is likely a stronger test than the above R0 comparisons.
assert sum(kl_div(hist1[0], hist2[0])) < 1
assert sum(kl_div(hist1[0], hist3[0])) < 1
assert sum(kl_div(hist2[0], hist3[0])) < 1

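# Hedged illustration (not from the source; toy vectors): scipy.special.kl_div(p, q)
# is infinite wherever p > 0 but q == 0, which is why the histogram bins above are
# floored with a tiny pseudo count before the distributions are compared.
import numpy as np
from scipy.special import kl_div

p = np.array([0.5, 0.5, 0.0])
q = np.array([0.5, 0.0, 0.5])      # zero where p is positive -> inf term

print(kl_div(p, q))                 # [0., inf, 0.5]
print(kl_div(p, q + 1e-10).sum())   # finite after pseudo-count smoothing
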
from typing import List, Tuple, Union

import numpy as np
import scipy.special as ssp
import tqdm
from pandas import DataFrame


def get_closest_topics(
        *matrices: List[Union[np.ndarray, DataFrame]],
        ref: int = 0,
        method: str = "klb",
        thres: float = 0.9,
        top_words: int = 100,
        verbose: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """Find the closest topics across models.

    Parameters
    ----------
    *matrices : List[Union[DataFrame, np.ndarray]]
        Sequence of topics vs. words matrices (T x W). Such a matrix can be
        accessed through the ``matrix_words_topics_`` model attribute.
    ref : int = 0
        Index of the reference matrix (zero-based indexing).
    method : str = "klb"
        Comparison method. Possible variants:
        1) "klb" - Kullback-Leibler divergence. Topics are compared by their
           word probability distributions.
        2) "jaccard" - Jaccard index. Topics are compared by their sets of
           top words.
    thres : float = 0.9
        Threshold for topic filtering.
    top_words : int = 100
        Number of top words in each topic to use in the Jaccard index
        calculation.
    verbose : bool = True
        Verbose output (progress bar).

    Returns
    -------
    closest_topics : np.ndarray
        Closest topics indices in one two-dimensional array. Columns
        correspond to the compared matrices (their indices), rows are the
        closest topic pairs.
    dist : np.ndarray
        Kullback-Leibler divergence (if ``method`` is set to ``klb``) or
        Jaccard index values corresponding to the matrix of the closest
        topics.
    """
    matrices_num = len(matrices)
    ref = matrices_num - 1 if ref >= matrices_num else ref
    matrix_ref = matrices[ref]
    topics_num = matrix_ref.shape[0]
    closest_topics = np.zeros(shape=(topics_num, matrices_num), dtype=int)
    closest_topics[:, ref] = np.arange(topics_num)

    def enum_func(x):
        return enumerate(tqdm.tqdm(x)) if verbose else enumerate(x)

    if method == "klb":
        kldiv = np.zeros(shape=(topics_num, matrices_num), dtype=float)

        for mid, matrix in enum_func(matrices):
            if mid == ref:
                continue

            kld_values = np.zeros((topics_num, topics_num))

            for t_ref in range(topics_num):
                for t in range(topics_num):
                    # kld_raw = 0.5 * (
                    #     ssp.kl_div(matrix[t, :], matrix_ref[t_ref, :]) +
                    #     ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :]))
                    kld_raw = ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])
                    kld_values[t_ref, t] = kld_raw[np.isfinite(kld_raw)].sum()

            closest_topics[:, mid] = np.argmin(kld_values, axis=1)
            kldiv[:, mid] = np.min(kld_values, axis=1)

        return closest_topics, kldiv

    elif method == "jaccard":
        jaccard = np.zeros(shape=(topics_num, matrices_num), dtype=float)

        for mid, matrix in enum_func(matrices):
            if mid == ref:
                continue

            # T x T matrix of pairwise Jaccard indices
            # (np.zeros_like(matrix_ref) would give the wrong shape, T x W)
            jaccard_values = np.zeros((topics_num, topics_num))

            for t_ref in range(topics_num):
                for t in range(topics_num):
                    a = np.argsort(matrix_ref[t_ref, :])[:-top_words-1:-1]
                    b = np.argsort(matrix[t, :])[:-top_words-1:-1]
                    j_num = np.intersect1d(a, b, assume_unique=False).size
                    j_den = np.union1d(a, b).size
                    jaccard_values[t_ref, t] = j_num / j_den

            closest_topics[:, mid] = np.argmax(jaccard_values, axis=1)
            jaccard[:, mid] = np.max(jaccard_values, axis=1)

        return closest_topics, jaccard

    return None

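# Hypothetical usage sketch (the matrices here are made up): match the topics of
# two models with 5 topics over a 50-word vocabulary, where each row of a
# topics-vs-words matrix is a probability distribution over words.
import numpy as np

rng = np.random.default_rng(42)
phi_a = rng.random((5, 50))
phi_a /= phi_a.sum(axis=1, keepdims=True)
phi_b = rng.random((5, 50))
phi_b /= phi_b.sum(axis=1, keepdims=True)

pairs, dist = get_closest_topics(phi_a, phi_b, ref=0, method="klb", verbose=False)
print(pairs)   # rows: (topic index in phi_a, closest topic index in phi_b)
print(dist)    # KL divergence of each matched pair
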
def evaluate(self, f):
    return self.scale * np.sum(kl_div(f, self.fdes))

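# Minimal self-contained sketch (class name, constructor, and the example values
# are assumptions, not from the source): a scaled KL-divergence penalty between a
# candidate vector ``f`` and a desired target distribution ``fdes``.
import numpy as np
from scipy.special import kl_div


class KLLoss:
    def __init__(self, fdes, scale=1.0):
        self.fdes = np.asarray(fdes, dtype=float)
        self.scale = scale

    def evaluate(self, f):
        return self.scale * np.sum(kl_div(f, self.fdes))


loss = KLLoss(fdes=[0.25, 0.25, 0.5], scale=2.0)
print(loss.evaluate(np.array([0.2, 0.3, 0.5])))  # small positive penalty
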