示例#1
0
def significance(
    TTM: sp.csc_matrix,
    metric: Union[Callable, KeynessMetric],
    normalize: bool = False,
    n_contexts=None,
    n_words=None,
) -> "tuple[np.ndarray, tuple[np.ndarray, np.ndarray]]":
    """Compute a significance weight for each non-zero cell of `TTM` using `metric`.

    Args:
        TTM (sp.csc_matrix): Sparse matrix of observation counts. Only its
            non-zero cells are scored.
        metric (Union[Callable, KeynessMetric]): Either a callable, used as-is,
            or a key that is looked up in `METRIC_FUNCTION` (falling back to
            `_undefined` when the key is missing). The resolved callable is
            invoked with the keyword arguments `Cij`, `Z`, `Zr`, `ii`, `jj`,
            `K`, `N` and `normalize`.
        normalize (bool, optional): Forwarded unchanged to the metric.
            Defaults to False.
        n_contexts (optional): Forwarded to the metric as `K`. Defaults to None.
        n_words (optional): Forwarded to the metric as `N`. Defaults to None.

    Returns:
        tuple: `(weights, (rows, cols))` where `weights` holds the non-zero,
        finite metric values and `rows`/`cols` are the matching cell indices.
        This is NOT a sparse matrix; it is the `(data, (row, col))` triplet
        layout that the scipy sparse constructors accept.
    """
    metric_fn = (
        metric if callable(metric) else METRIC_FUNCTION.get(metric, _undefined)
    )

    # Total number of observations (sum of all counts).
    total = float(TTM.sum())
    # Number of observations per row (row sums), flattened to a 1-D float array.
    row_totals = np.array(TTM.sum(axis=1), dtype=np.float64).flatten()
    # Row and column indices of the non-zero cells, and the counts stored there.
    ii, jj = TTM.nonzero()
    counts = np.array(TTM[ii, jj], dtype=np.float64).flatten()

    weights = metric_fn(
        Cij=counts,
        Z=total,
        Zr=row_totals,
        ii=ii,
        jj=jj,
        K=n_contexts,
        N=n_words,
        normalize=normalize,
    )

    # Replace NaN and +/-inf with 0 in place so that the non-zero filter below
    # discards undefined weights together with genuine zeros.
    np.nan_to_num(weights, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

    keep = weights.nonzero()

    return weights[keep], (ii[keep], jj[keep])
示例#2
0
def get_distances(node: Tuple[int, int],
                  nodes: sparse.csc_matrix) -> List[float]:
    """Return the cityblock distance from `node` to each non-zero cell of `nodes`.

    Args:
        node: (row, column) coordinate to measure from.
        nodes: Sparse matrix whose non-zero cells are the target coordinates.

    Returns:
        One distance per non-zero cell, in the order reported by
        `nodes.nonzero()`. An empty list when `nodes` has no non-zero cells.
    """
    non_zero = list(zip(*nodes.nonzero()))
    if not non_zero:
        # cdist rejects an empty second operand (it is not 2-dimensional),
        # so there is nothing to measure against.
        return []
    return spatial.distance.cdist([node], non_zero,
                                  metric='cityblock').flatten().tolist()