def recall(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """
    Compute recall@R per row of ranked picks against a sparse ground truth.

    Args:
        X_true: Sparse ground-truth matrix; the stored entries of a row are
            treated as that row's relevant columns.
        X_top_k: Integer array of ranked column indices, aligned row by row
            with X_true. Only its first R columns are looked at.
        R: Cut-off rank.

    Returns:
        One value per row: the sum of the X_true entries found among the
        first R picks, divided by min(stored entries in that row, R).
        A row without stored entries divides by zero (NaN).
    """
    # Best achievable number of hits: a row cannot score more than R hits,
    # nor more than the number of entries it actually stores.
    best_possible = np.minimum(X_true.getnnz(axis=1), R)

    top_r = X_top_k[:, :R]
    picked = np.take_along_axis(X_true, top_r, axis=-1)

    # Sparse matrices return an (n, 1) np.matrix from sum(); flatten it.
    hits_per_row = np.squeeze(np.asarray(picked.sum(axis=-1)))

    return hits_per_row / best_possible
def _trim_empty(X: sparse.csr_matrix, rows: bool = True, cols: bool = True) -> sparse.csr_matrix: if cols: X = X[:, :np.max(X.indices) + 1] if rows: X = X[:np.where(X.getnnz(axis=1))[0][-1] + 1, :] return X
def recall(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """
    Calculate recall@R for every user (row) of X_true.

    Args:
        X_true: Sparse matrix of true user-item interactions.
        X_top_k: Item indices picked by the model, one row per user, ordered
            by rank. Columns beyond the first R are ignored.
        R: Number of top-ranked picks taken into consideration.

    Returns:
        Numpy array with one recall@R value per user. The denominator is
        min(number of stored interactions of the user, R), so a user without
        any stored interaction yields a division by zero (NaN).
    """
    considered = X_top_k[:, :R]

    # Look up the ground-truth value of every considered pick, then add them
    # up per user to obtain the hit count.
    hits = np.take_along_axis(X_true, considered, axis=-1).sum(axis=-1)
    hits = np.squeeze(np.asarray(hits))

    # A user can have at most R hits, and no more than their stored items.
    ceiling = np.minimum(X_true.getnnz(axis=1), R)

    return hits / ceiling
def collect_basic_statistics(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """
    Run calc_basic_stat once per cluster category, in parallel.

    Args:
        X: Sparse matrix; handed to the workers as its raw ``data``,
            ``indices``, ``indptr`` and ``shape`` components.
        cluster_labels: Cluster assignment. Must expose ``.categories``
            (one job is scheduled per category), so a plain list is not
            enough despite the annotation.
        cond_labels: Condition labels, or None. When None, column sums and
            per-column stored-entry counts of X are precomputed once and
            shared with every job; otherwise None is passed for both.
        gene_names: Forwarded unchanged to calc_basic_stat.
        n_jobs: Number of joblib workers.
        temp_folder: Forwarded to joblib.Parallel as ``temp_folder``.
        verbose: Forwarded to the workers; also enables the timing log line.

    Returns:
        The results gathered by joblib.Parallel, one per cluster category.
    """
    t0 = time.perf_counter()

    # Column totals do not depend on the cluster, so compute them only once.
    if cond_labels is None:
        sum_vec = X.sum(axis=0).A1
        cnt_vec = X.getnnz(axis=0)
    else:
        sum_vec = None
        cnt_vec = None

    run_parallel = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)
    jobs = (
        delayed(calc_basic_stat)(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            sum_vec,
            cnt_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )
    per_cluster = run_parallel(jobs)

    t1 = time.perf_counter()
    if verbose:
        message = "Collecting basic statistics is done. Time spent = {:.2f}s.".format(
            t1 - t0
        )
        logger.info(message)

    return per_cluster
def fisher_test(
    X: csr_matrix,
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    n_jobs: int,
    temp_folder: str,
    verbose: bool,
) -> List[pd.DataFrame]:
    """
    Run Fisher's exact test, triggering calc_fisher in parallel.

    Args:
        X: Sparse matrix; handed to the workers as its raw ``data``,
            ``indices``, ``indptr`` and ``shape`` components.
        cluster_labels: Cluster assignment. Must expose ``.categories``
            (one job is scheduled per category), so a plain list is not
            enough despite the annotation.
        cond_labels: Condition labels, or None. When None, the per-column
            count of stored entries of X is precomputed once and shared
            with every job; otherwise None is passed instead.
        gene_names: Forwarded unchanged to calc_fisher.
        n_jobs: Number of joblib workers.
        temp_folder: Forwarded to joblib.Parallel as ``temp_folder``.
        verbose: Forwarded to the workers; also enables the timing log line.

    Returns:
        The results gathered by joblib.Parallel, one per cluster category.
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments (and matches collect_basic_statistics).
    start = time.perf_counter()

    # The column counts do not depend on the cluster: compute them once.
    cnt_vec = None
    if cond_labels is None:
        cnt_vec = X.getnnz(axis=0)

    result_list = Parallel(n_jobs=n_jobs, max_nbytes=1e7, temp_folder=temp_folder)(
        delayed(calc_fisher)(
            clust_id,
            X.data,
            X.indices,
            X.indptr,
            X.shape,
            cluster_labels,
            cond_labels,
            gene_names,
            cnt_vec,
            verbose,
        )
        for clust_id in cluster_labels.categories
    )

    end = time.perf_counter()
    if verbose:
        logger.info(
            "Fisher's exact test is done. Time spent = {:.2f}s.".format(end - start)
        )

    return result_list
def ndcg(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """
    Calculate ndcg@R for each user in X_true and X_top_k.

    Args:
        X_true: Sparse matrix containing true values for user-item
            interactions.
        X_top_k: Matrix containing item indices picked by the model, one row
            per user, ordered by rank. It may hold fewer than R columns; the
            missing ranks then simply contribute nothing to the DCG.
        R: Number of elements taken into consideration.

    Returns:
        Numpy array containing the calculated ndcg@R for each user. The
        ideal DCG assumes min(stored interactions of the user, R) hits at
        the top ranks, so a user without any stored interaction yields a
        division by zero (NaN).
    """
    # Discount of rank r (1-based) is 1 / log2(r + 1).
    penalties = 1. / np.log2(np.arange(2, R + 2))

    selected = np.take_along_axis(X_true, X_top_k[:, :R], axis=-1)

    # Explicit matrix-vector product: `*` only means matmul for the legacy
    # spmatrix classes. Discounts are cut to the number of available picks so
    # that an X_top_k narrower than R does not raise a dimension mismatch.
    DCG = selected @ penalties[:selected.shape[1]]

    # cpenalties[m] is the DCG of a ranking whose first m picks are all hits.
    cpenalties = np.empty(R + 1)
    np.cumsum(penalties, out=cpenalties[1:])
    cpenalties[0] = 0

    maxhit = np.minimum(X_true.getnnz(axis=1), R)
    IDCG = cpenalties[maxhit]

    return DCG / IDCG
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     min_nnz=0,
                                     verbose: int = 0,
                                     log=None,
                                     n_jobs=None):
    """MP empiric for sparse similarity matrices.

    Please do not directly use this function, but invoke via
    mutual_proximity_empiric()

    Parameters
    ----------
    S : csr_matrix
        Sparse similarity matrix. Its sparsity pattern (``indices``,
        ``indptr``) is reused for the result.
    test_set_ind : np.ndarray, optional
        Not referenced anywhere in this function body.
    min_nnz : int
        Forwarded to the worker function. In addition, every row of S with
        at most ``min_nnz`` stored entries gets its original row copied back
        into the result.
    verbose : int
        Progress messages are emitted only if both ``verbose`` and ``log``
        are truthy. Also forwarded to the worker function.
    log : object, optional
        Object providing ``message(str)``. Also forwarded to the worker
        function.
    n_jobs : int, optional
        Number of worker processes: a falsy value means 1, -1 means
        ``cpu_count()``, any other value is used as given.

    Returns
    -------
    csr_matrix
        The assembled matrix, with every diagonal entry set to 1.
    """
    if verbose and log:
        log.message("Starting MP empiric for sparse matrices.")
    self_value = 1.  # similarity matrix
    n = S.shape[0]
    if not n_jobs:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()
    else:
        pass

    # This will become S_mp.data: a shared buffer of doubles with one slot
    # per stored entry of S, plus a NumPy wrapper around the same buffer.
    shared_data = Array(ctypes.c_double, S.data.size)
    shared_data_np = np.ctypeslib.as_array(shared_data.get_obj())

    if verbose and log:
        log.message("Spawning processes and starting MP computation.")
    with Pool(processes=n_jobs, initializer=_mpes_init,
              initargs=(S, shared_data)) as pool:
        # Only index pairs with i <= j are processed (upper triangle,
        # diagonal included); the lower triangle is filled in further below.
        S_nonzero = filterfalse(lambda ij: ij[0] > ij[1], zip(*S.nonzero()))
        # The iteration only drives the computation; yielded values are
        # discarded.
        for _ in pool.imap(func=partial(_mpes_sec_dist,
                                        args=(verbose, log, n, min_nnz)),
                           iterable=S_nonzero,
                           chunksize=int(1e5)):
            pass  # output stored by function in shared array
    # Leaving the with-block terminates the pool; join() is only valid
    # afterwards and waits for the worker processes to exit.
    pool.join()
    if verbose and log:
        log.message("Assemble upper-triangular MP matrix.")
    # Reuse the sparsity pattern of S with the shared buffer as values, then
    # convert to LIL for the row/element assignments that follow.
    S_mp = csr_matrix((shared_data_np, S.indices, S.indptr),
                      shape=S.shape, copy=False).tolil()
    del shared_data, shared_data_np

    if verbose and log:
        log.message("Symmetrizing matrix.")
    S_mp += S_mp.T

    # Retain original distances for objects with too few neighbors.
    # That is, keep distances FROM these objects to others (rows), but
    # set distances of other objects TO them to NaN (columns).
    # Returned matrix is thus NOT SYMMETRIC.
    # NOTE(review): nothing in this function writes NaN; presumably the
    # worker function does so for these columns -- confirm.
    if verbose and log:
        log.message(("Retain original similarities for objects with too few "
                     "neighbors. If there are any, the output matrix will "
                     "not be symmetric anymore! (Rows corresponding to these "
                     "objects will be in original space; corresponding "
                     "columns will contain NaN)."))
    for row in np.argwhere(S.getnnz(axis=1) <= min_nnz):
        row = row[0]  # use scalar for indexing instead of array
        S_mp[row, :] = S.getrow(row)

    if verbose and log:
        log.message("Setting self similarities.")
    # The symmetrization above added the diagonal to itself, so it is
    # overwritten here with the fixed self-similarity value.
    for i in range(n):
        S_mp[i, i] = self_value  # need to set self values

    if verbose and log:
        log.message("Converting to CSR matrix and returning.")
    return S_mp.tocsr()