Example No. 1
def _kl_divergence(self, X, W, H):
    """
    Calculate the generalized Kullback-Leibler divergence (also called
    information divergence or I-divergence) between X and its
    reconstruction W @ H.
    """
    # Assumes: import numpy as np; from scipy.special import kl_div
    B = W.dot(H)
    return np.sum(kl_div(X, B))
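For reference, scipy.special.kl_div is elementwise: kl_div(x, y) returns
x*log(x/y) - x + y, so summing the result gives the generalized (I-)divergence
used above. A minimal check of that identity on two small probability vectors:

import numpy as np
from scipy.special import kl_div

x = np.array([0.2, 0.5, 0.3])
y = np.array([0.25, 0.25, 0.5])

# Elementwise generalized KL term: x*log(x/y) - x + y
assert np.allclose(kl_div(x, y), x * np.log(x / y) - x + y)

# For proper probability vectors the -x + y terms cancel in the sum,
# so the total equals the ordinary KL divergence sum(x*log(x/y)).
assert np.isclose(kl_div(x, y).sum(), np.sum(x * np.log(x / y)))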
Example No. 2
def compute_kl_div(X, X_prime):
    """
    Compute the Kullback-Leibler divergence between the distributions
    derived from X_prime and X.
    """
    # compute_prior_vector and compute_p are helpers defined elsewhere in
    # the source module; kl_div is scipy.special.kl_div.
    prior_vector = compute_prior_vector()
    p_prime = compute_p(prior_vector, X_prime)
    p = compute_p(prior_vector, X)
    flat_div = kl_div(p_prime, p)
    return flat_div.sum()
Example No. 3
def cat_kl(p, q):
    # The naive version below produces NaN for p near 0:
    #   return (p * np.log(p / q)).sum(axis=1)
    # scipy.special.kl_div handles exact zeros in p correctly.
    return special.kl_div(p, q).sum(axis=1)  # pylint: disable=E1101
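A quick demonstration of the p-near-0 problem the comment refers to: the naive
formula yields NaN when p has exact zeros, while scipy.special.kl_div defines
kl_div(0, y) = y and stays finite. A minimal sketch (toy values, not from the
original source):

import numpy as np
from scipy import special

p = np.array([[0.0, 0.5, 0.5]])
q = np.array([[0.25, 0.25, 0.5]])

naive = (p * np.log(p / q)).sum(axis=1)  # 0 * log(0) -> 0 * (-inf) -> nan
safe = special.kl_div(p, q).sum(axis=1)  # kl_div(0, y) is defined as y

print(naive)  # [nan]
print(safe)   # [0.3465...]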
Example No. 4
    r0_twolevel2[i] = len(sim3.people.transtree.targets[0])

print('R0 constant viral load: ', np.mean(r0_const), ' +- ', np.std(r0_const))
print('R0 two level viral load: ', np.mean(r0_twolevel), ' +- ',
      np.std(r0_twolevel))
print('R0 two level diff params: ', np.mean(r0_twolevel2), ' +- ',
      np.std(r0_twolevel2))

import matplotlib.pyplot as plt
hist1 = plt.hist(r0_const, bins=np.arange(-0.5, 10.5), density=True)
hist2 = plt.hist(r0_twolevel, bins=np.arange(-0.5, 10.5), density=True)
hist3 = plt.hist(r0_twolevel2, bins=np.arange(-0.5, 10.5), density=True)
plt.show()
# Test that the R0 did not change substantially, though the std is large
assert (abs(np.mean(r0_const) - np.mean(r0_twolevel)) < np.std(r0_const))
assert (abs(np.mean(r0_const) - np.mean(r0_twolevel2)) < np.std(r0_const))
assert (abs(np.mean(r0_twolevel) - np.mean(r0_twolevel2)) <
        np.std(r0_twolevel))
# Add small pseudo-counts to the distributions: a 0 in the second distribution
# where there isn't a 0 in the first distribution makes kl_div return inf.
hist1[0][hist1[0] == 0] = 1e-10
hist1[0][:] = hist1[0] / sum(hist1[0])
hist2[0][hist2[0] == 0] = 1e-10
hist2[0][:] = hist2[0] / sum(hist2[0])
hist3[0][hist3[0] == 0] = 1e-10
hist3[0][:] = hist3[0] / sum(hist3[0])
# Test the KL divergence of the R0 distributions. Since the std is large,
# this is likely a stronger test than the mean comparisons above.
# (kl_div here is scipy.special.kl_div; np is numpy.)
assert (sum(kl_div(hist1[0], hist2[0])) < 1)
assert (sum(kl_div(hist1[0], hist3[0])) < 1)
assert (sum(kl_div(hist2[0], hist3[0])) < 1)
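The pseudo-count trick above matters because kl_div(p, q) is infinite wherever
q is 0 and p is not. A small illustration with toy histograms (hypothetical
values, not from the test above):

import numpy as np
from scipy.special import kl_div

h1 = np.array([0.4, 0.6, 0.0])            # has a zero bin
h2 = np.array([0.5, 0.4, 0.1])            # positive in that bin

print(kl_div(h2, h1).sum())               # inf: h1 is 0 where h2 is not

h1_smooth = np.where(h1 == 0, 1e-10, h1)  # add pseudo-counts...
h1_smooth /= h1_smooth.sum()              # ...and renormalize
print(kl_div(h2, h1_smooth).sum())        # large but finite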
Example No. 5
def get_closest_topics(
        *matrices: List[Union[np.ndarray, DataFrame]],
        ref: int = 0,
        method: str = "klb",
        thres: float = 0.9,
        top_words: int = 100,
        verbose: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """Finding closest topics in models.

    Parameters
    ----------
    *matrices : List[Union[DataFrame, np.ndarray]]
        Sequence of topics vs. words matrices (T x W). Such a matrix
        can be accessed through a model's ``matrix_words_topics_``
        attribute.
    ref : int = 0
        Index of reference matrix (zero-based indexing).
    method : str = "klb"
        Comparison method. Possible variants:
        1) "klb" - Kullback-Leibler divergence. Topics are compared by
        their word probability distributions.
        2) "jaccard" - Jaccard index. Topics are compared by their sets
        of top words.
    thres : float = 0.9
        Threshold for topic filtering.
    top_words : int = 100
        Number of top words in each topic to use in Jaccard index calculation.
    verbose : bool = True
        Verbose output (progress bar).

    Returns
    -------
    closest_topics : np.ndarray
        Indices of the closest topics in one two-dimensional array.
        Columns correspond to the compared matrices (by their indices),
        rows are the closest topic pairs.
    dist : np.ndarray
        Kullback-Leibler divergence (if ``method`` is set to "klb") or
        Jaccard index values corresponding to the closest topic pairs.
    """
    matrices_num = len(matrices)
    ref = matrices_num - 1 if ref >= matrices_num else ref
    matrix_ref = matrices[ref]
    topics_num = matrix_ref.shape[0]
    closest_topics = np.zeros(shape=(topics_num, matrices_num), dtype=int)
    closest_topics[:, ref] = np.arange(topics_num)

    def enum_func(x):
        return enumerate(tqdm.tqdm(x)) if verbose else enumerate(x)

    if method == "klb":
        kldiv = np.zeros(shape=(topics_num, matrices_num), dtype=float)

        for mid, matrix in enum_func(matrices):
            if mid == ref:
                continue
            kld_values = np.zeros((topics_num, topics_num))

            for t_ref in range(topics_num):
                for t in range(topics_num):
                    # kld_raw = 0.5 * (
                    #     ssp.kl_div(matrix[t, :], matrix_ref[t_ref, :]) +
                    #     ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :]))
                    kld_raw = ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])
                    kld_values[t_ref, t] = kld_raw[np.isfinite(kld_raw)].sum()

            closest_topics[:, mid] = np.argmin(kld_values, axis=1)
            kldiv[:, mid] = np.min(kld_values, axis=1)

        return closest_topics, kldiv
    elif method == "jaccard":
        jaccard = np.zeros(shape=(topics_num, matrices_num), dtype=float)

        for mid, matrix in enum_func(matrices):
            if mid == ref:
                continue
            # Pairwise Jaccard values need a T x T matrix, not the
            # T x W shape that zeros_like(matrix_ref) would give.
            jaccard_values = np.zeros((topics_num, topics_num))

            for t_ref in range(topics_num):
                for t in range(topics_num):
                    a = np.argsort(matrix_ref[t_ref, :])[:-top_words-1:-1]
                    b = np.argsort(matrix[t, :])[:-top_words-1:-1]
                    j_num = np.intersect1d(a, b, assume_unique=False).size
                    j_den = np.union1d(a, b).size
                    jaccard_value = j_num / j_den
                    jaccard_values[t_ref, t] = jaccard_value

            closest_topics[:, mid] = np.argmax(jaccard_values, axis=1)
            jaccard[:, mid] = np.max(jaccard_values, axis=1)

        return closest_topics, jaccard
    raise ValueError(f"Unknown method: '{method}'")
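A minimal usage sketch, assuming the source module's own imports are in place
(numpy as np, scipy.special as ssp, tqdm) and that the inputs are
row-normalized topic-word matrices; the matrices here are made up:

import numpy as np

rng = np.random.default_rng(0)
T, W = 5, 50

# Two hypothetical topic-word matrices (T x W) with rows normalized
# to probability distributions.
m1 = rng.random((T, W))
m1 /= m1.sum(axis=1, keepdims=True)
m2 = m1[::-1].copy()  # the same topics in reversed order

pairs, dist = get_closest_topics(m1, m2, ref=0, method="klb", verbose=False)
print(pairs)  # row i pairs topic i of m1 with its closest topic in m2
print(dist)   # KL divergence of each matched pair (the ref column stays 0)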
Example No. 6
def evaluate(self, f):
    # Scaled generalized KL divergence between f and the desired
    # distribution self.fdes (kl_div is scipy.special.kl_div).
    return self.scale * np.sum(kl_div(f, self.fdes))
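This reads as a penalty term: scale times the generalized KL divergence between
a candidate distribution f and a desired one, self.fdes. A hypothetical
stand-in class, just to exercise the method:

import numpy as np
from scipy.special import kl_div

class KLPenalty:  # hypothetical container for the method above
    def __init__(self, fdes, scale=1.0):
        self.fdes = fdes    # desired distribution
        self.scale = scale  # penalty weight

    def evaluate(self, f):
        return self.scale * np.sum(kl_div(f, self.fdes))

penalty = KLPenalty(fdes=np.array([0.3, 0.3, 0.4]), scale=2.0)
print(penalty.evaluate(np.array([0.3, 0.3, 0.4])))  # 0.0 at the target
print(penalty.evaluate(np.array([0.5, 0.2, 0.3])))  # positive away from it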