Exemplo n.º 1
0
def recall(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """Compute recall@R row by row.

    Args:
        X_true: Sparse matrix of true interactions, one row per user.
            Presumably binary, so that summing picked values counts hits.
        X_top_k: Integer matrix of column indices chosen by the model,
            one row per user; only the first R columns are used.
        R: Number of leading picks taken into consideration.

    Returns:
        1-D array with, per row, the sum of X_true values at the picked
        indices divided by min(number of stored entries in the row, R).
    """
    # Upper bound on the number of hits a row can reach within R picks.
    best_possible = np.minimum(X_true.getnnz(axis=1), R)

    top_r = X_top_k[:, :R]
    picked = np.take_along_axis(X_true, top_r, axis=-1)

    # The sparse row-sum is a column matrix; flatten it to one value per row.
    hits_per_user = np.asarray(picked.sum(axis=-1)).squeeze()

    return hits_per_user / best_possible
Exemplo n.º 2
0
def _trim_empty(X: sparse.csr_matrix,
                rows: bool = True,
                cols: bool = True) -> sparse.csr_matrix:
    """Cut off trailing rows and/or columns that hold no stored entries.

    Only the tail is trimmed: empty rows/columns located before the last
    occupied one are kept. Explicitly stored zeros count as entries.

    Args:
        X: Matrix in CSR format (``X.indices`` is read as column indices).
        rows: If True, drop all rows after the last row with a stored entry.
        cols: If True, drop all columns after the highest stored column index.

    Returns:
        The trimmed matrix. If X has no stored entries at all, every
        requested axis is trimmed to length 0 instead of raising.
    """
    if cols:
        # Guard the empty case: max() of an empty index array would raise.
        n_cols = int(X.indices.max()) + 1 if X.indices.size else 0
        X = X[:, :n_cols]
    if rows:
        occupied_rows = np.flatnonzero(X.getnnz(axis=1))
        # Guard the empty case: there is no "last occupied row" to index.
        n_rows = int(occupied_rows[-1]) + 1 if occupied_rows.size else 0
        X = X[:n_rows, :]
    return X
Exemplo n.º 3
0
def recall(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """Calculate recall@R for each user in X_true and X_top_k.

    Args:
        X_true: Matrix containing true values for user-item interactions
        X_top_k: Matrix containing the item indices picked by the model
        R: Number of picked elements taken into consideration

    Returns:
        Numpy array containing the recall@R of each user, i.e. the summed
        true values at the first R picked indices divided by
        min(number of stored true entries, R).
    """
    candidates = X_top_k[:, :R]

    # Gather the true values at the picked positions and total them per user.
    n_hits = np.take_along_axis(X_true, candidates, axis=-1).sum(axis=-1)
    n_hits = np.squeeze(np.asarray(n_hits))

    # A user cannot score more hits than R or than their stored entries.
    ceiling = np.minimum(X_true.getnnz(axis=1), R)

    return n_hits / ceiling
Exemplo n.º 4
0
def collect_basic_statistics(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """Collect basic statistics by running calc_basic_stat once per cluster in parallel.

    Args:
        X: Sparse matrix; its raw CSR buffers and shape are handed to the workers.
        cluster_labels: Cluster assignment; must expose ``.categories``
            (presumably a pandas Categorical -- confirm against callers).
        cond_labels: Condition labels, or None. When None, the per-column
            sums and per-column stored-entry counts of X are precomputed
            once and passed to every worker.
        gene_names: Forwarded unchanged to calc_basic_stat.
        n_jobs: Number of parallel jobs given to ``Parallel``.
        temp_folder: Temporary folder given to ``Parallel``.
        verbose: If True, log the elapsed time; also forwarded to the workers.

    Returns:
        List with one calc_basic_stat result per cluster category.
    """
    t_begin = time.perf_counter()

    if cond_labels is None:
        # Without conditions, column totals are shared by all clusters.
        sum_vec = X.sum(axis=0).A1
        cnt_vec = X.getnnz(axis=0)
    else:
        sum_vec = None
        cnt_vec = None

    executor = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)
    jobs = (
        delayed(calc_basic_stat)(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            sum_vec,
            cnt_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )
    result_list = executor(jobs)

    elapsed = time.perf_counter() - t_begin
    if verbose:
        message = "Collecting basic statistics is done. Time spent = {:.2f}s.".format(
            elapsed
        )
        logger.info(message)

    return result_list
Exemplo n.º 5
0
def fisher_test(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """Run Fisher's exact test by running calc_fisher once per cluster in parallel.

    Args:
        X: Sparse matrix; its raw CSR buffers and shape are handed to the workers.
        cluster_labels: Cluster assignment; must expose ``.categories``
            (presumably a pandas Categorical -- confirm against callers).
        cond_labels: Condition labels, or None. When None, the per-column
            stored-entry counts of X are precomputed once and passed to
            every worker.
        gene_names: Forwarded unchanged to calc_fisher.
        n_jobs: Number of parallel jobs given to ``Parallel``.
        temp_folder: Temporary folder given to ``Parallel``.
        verbose: If True, log the elapsed time; also forwarded to the workers.

    Returns:
        List with one calc_fisher result per cluster category.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments (unlike time.time()).
    start = time.perf_counter()

    cnt_vec = None
    if cond_labels is None:
        cnt_vec = X.getnnz(axis=0)

    result_list = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)(
        delayed(calc_fisher)(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            cnt_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )

    end = time.perf_counter()
    if verbose:
        logger.info(
            "Fisher's exact test is done. Time spent = {:.2f}s.".format(end - start)
        )

    return result_list
Exemplo n.º 6
0
Arquivo: ndcg.py Projeto: tgrel/VAE-CF
def ndcg(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """Calculate ndcg@R for each user in X_true and X_top_k.

    Args:
        X_true: Matrix containing true values for user-item interactions
        X_top_k: Matrix containing the item indices picked by the model,
            best pick first; it needs at least R columns for the
            discount product below to be shape-compatible
        R: Number of picked elements taken into consideration

    Returns:
        Numpy array containing the ndcg@R of each user
    """
    # Discount 1 / log2(rank + 1) for ranks 1..R.
    discounts = 1. / np.log2(np.arange(2, R + 2))

    # ideal_by_hits[k] is the DCG of a ranking whose first k picks all hit.
    ideal_by_hits = np.concatenate(([0.], np.cumsum(discounts)))
    best_hits = np.minimum(X_true.getnnz(axis=1), R)
    IDCG = ideal_by_hits[best_hits]

    picked = np.take_along_axis(X_true, X_top_k[:, :R], axis=-1)
    # `picked` is a sparse matrix, for which `*` with a 1-D array is a
    # matrix-vector product: it sums the discounted true values per user.
    DCG = picked * discounts

    return DCG / IDCG
Exemplo n.º 7
0
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     min_nnz=0,
                                     verbose: int = 0,
                                     log=None,
                                     n_jobs=None):
    """MP empiric for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_empiric()

    Args:
        S: Sparse similarity matrix. Its sparsity structure (indices and
            indptr) is reused for the intermediate result matrix.
        test_set_ind: Not referenced anywhere in this function body.
        min_nnz: Rows of S with at most this many stored entries get their
            original values from S copied back into the result. The value
            is also forwarded to the worker function.
        verbose: Progress messages are emitted only when both this and
            `log` are truthy; also forwarded to the worker function.
        log: Optional object providing a ``message(str)`` method.
        n_jobs: Number of worker processes. A falsy value means 1,
            -1 means ``cpu_count()``, any other value is used as given.

    Returns:
        The resulting matrix in CSR format, with the diagonal set to 1.
    """
    if verbose and log:
        log.message("Starting MP empiric for sparse matrices.")
    self_value = 1.  # similarity matrix
    n = S.shape[0]
    # Normalize n_jobs: falsy -> single process, -1 -> one per CPU.
    if not n_jobs:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()
    else:
        pass

    # This will become S_mp.data
    # One shared double per stored entry of S, plus a NumPy view onto the
    # same buffer for assembling the result in this process.
    shared_data = Array(ctypes.c_double, S.data.size)
    shared_data_np = np.ctypeslib.as_array(shared_data.get_obj())

    if verbose and log:
        log.message("Spawning processes and starting MP computation.")
    # Each worker is initialized with S and the shared output buffer.
    with Pool(processes=n_jobs,
              initializer=_mpes_init,
              initargs=(S, shared_data)) as pool:
        # Keep only index pairs (i, j) with i <= j, i.e. the upper triangle
        # including the diagonal.
        S_nonzero = filterfalse(lambda ij: ij[0] > ij[1], zip(*S.nonzero()))
        # Iterating drives the computation; the yielded values are discarded.
        for _ in pool.imap(func=partial(_mpes_sec_dist,
                                        args=(verbose, log, n, min_nnz)),
                           iterable=S_nonzero,
                           chunksize=int(1e5)):
            pass  # output stored by function in shared array
    # Called after the with-block has already exited (and thereby shut the
    # pool down); waits for the worker processes to finish exiting.
    pool.join()
    if verbose and log:
        log.message("Assemble upper-triangular MP matrix.")
    # Wrap the shared buffer with S's sparsity structure, then convert to
    # LIL, which the row and element assignments below operate on.
    S_mp = csr_matrix((shared_data_np, S.indices, S.indptr),
                      shape=S.shape,
                      copy=False).tolil()
    del shared_data, shared_data_np
    if verbose and log:
        log.message("Symmetrizing matrix.")
    S_mp += S_mp.T
    # Retain original distances for objects with too few neighbors.
    # That is, keep distances FROM these objects to others (rows), but
    # set distances of other objects TO them to NaN (columns).
    # Returned matrix is thus NOT SYMMETRIC.
    # NOTE(review): no NaN is written in this function itself; presumably
    # _mpes_sec_dist does that using min_nnz -- confirm.
    if verbose and log:
        log.message(("Retain original similarities for objects with too few "
                     "neighbors. If there are any, the output matrix will "
                     "not be symmetric anymore! (Rows corresponding to these "
                     "objects will be in original space; corresponding "
                     "columns will contain NaN)."))
    for row in np.argwhere(S.getnnz(axis=1) <= min_nnz):
        row = row[0]  # use scalar for indexing instead of array
        S_mp[row, :] = S.getrow(row)
    if verbose and log:
        log.message("Setting self similarities.")
    for i in range(n):
        S_mp[i, i] = self_value  # overwrite the diagonal with the self similarity
    if verbose and log:
        log.message("Converting to CSR matrix and returning.")
    return S_mp.tocsr()