import numpy as np


def sub_routine(vector_u, matrix_V, vector_train, bias, topK=500, gpu=False):
    # Rank all items for one user and return the topK, excluding training items.

    train_index = vector_train.nonzero()[1]

    vector_predict = matrix_V.dot(vector_u)

    if bias is not None:
        if gpu:
            import cupy as cp
            vector_predict = vector_predict + cp.array(bias)
        else:
            vector_predict = vector_predict + bias

    if gpu:
        import cupy as cp
        candidate_index = cp.argpartition(
            -vector_predict, topK + len(train_index))[:topK + len(train_index)]
        vector_predict = candidate_index[
            vector_predict[candidate_index].argsort()[::-1]]
        vector_predict = cp.asnumpy(vector_predict).astype(np.float32)
    else:
        candidate_index = np.argpartition(
            -vector_predict, topK + len(train_index))[:topK + len(train_index)]
        vector_predict = candidate_index[
            vector_predict[candidate_index].argsort()[::-1]]
    vector_predict = np.delete(
        vector_predict,
        np.isin(vector_predict, train_index).nonzero()[0])

    return vector_predict[:topK]
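A minimal usage sketch for the CPU path, assuming NumPy and a SciPy CSR row for vector_train (shapes are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

matrix_V = np.random.rand(6, 8).astype(np.float32)         # 6 items x 8 factors
vector_u = np.random.rand(8).astype(np.float32)            # one user's factors
vector_train = csr_matrix(np.array([[0, 1, 0, 0, 1, 0]]))  # items 1 and 4 already rated

top_items = sub_routine(vector_u, matrix_V, vector_train, bias=None, topK=3)
print(top_items)  # 3 item indices, with training items 1 and 4 filtered out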
Example No. 2
import numpy as np


def batch_sub_routine(subset_U,
                      matrix_V,
                      subset_Train,
                      bias,
                      measure,
                      topK=50,
                      gpu=False):
    # Rank items for a batch of users and return the topK per user,
    # excluding each user's training items.
    train_indices = [i.nonzero()[1] for i in subset_Train]
    train_num_ratings = [i.nnz for i in subset_Train]
    if measure == "Cosine":
        # raw dot-product scores (vectors are assumed pre-normalised for cosine)
        batch_predict = subset_U.dot(matrix_V.T)
    else:
        # score by negative Euclidean distance (larger means closer)
        if gpu:
            import cupy as cp
            batch_predict = (subset_U**2).sum(
                axis=-1)[:, np.newaxis] + (matrix_V**2).sum(axis=-1)
            batch_predict -= 2 * cp.squeeze(
                subset_U.dot(matrix_V[..., np.newaxis]), axis=-1)
            batch_predict **= 0.5
            batch_predict = -batch_predict
        else:
            batch_predict = (subset_U**2).sum(
                axis=-1)[:, np.newaxis] + (matrix_V**2).sum(axis=-1)
            batch_predict -= 2 * np.squeeze(
                subset_U.dot(matrix_V[..., np.newaxis]), axis=-1)
            batch_predict **= 0.5
            batch_predict = -batch_predict
    if bias is not None:
        if gpu:
            import cupy as cp
            batch_predict = batch_predict + cp.array(bias)
        else:
            batch_predict = batch_predict + bias

    if gpu:
        import cupy as cp
        # kth=range(k) fully sorts the first k positions, so the candidates
        # come out already ordered by descending score; slice columns, not rows
        candidate_indices = cp.argpartition(
            -batch_predict,
            range(topK + max(train_num_ratings)))[:, :topK +
                                                  max(train_num_ratings)]
        candidate_indices = cp.asnumpy(candidate_indices)
    else:
        candidate_indices = np.argpartition(
            -batch_predict,
            range(topK + max(train_num_ratings)))[:, :topK +
                                                  max(train_num_ratings)]

    batch_predict = []
    for i, vector_predict in enumerate(candidate_indices):

        if train_num_ratings[i] > 0:
            batch_predict.append(
                np.delete(
                    vector_predict,
                    np.isin(vector_predict,
                            train_indices[i]).nonzero()[0])[:topK])
        else:
            batch_predict.append(np.zeros(topK))

    return np.array(batch_predict)
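A matching sketch for the batched cosine path, again assuming NumPy/SciPy (illustrative shapes):

import numpy as np
from scipy.sparse import csr_matrix

subset_U = np.random.rand(2, 4).astype(np.float32)  # 2 users x 4 factors
matrix_V = np.random.rand(5, 4).astype(np.float32)  # 5 items x 4 factors
subset_Train = csr_matrix(np.array([[1, 0, 0, 0, 1],
                                    [0, 1, 0, 0, 0]]))

recs = batch_sub_routine(subset_U, matrix_V, subset_Train, bias=None,
                         measure="Cosine", topK=2)
print(recs)  # shape (2, 2): top-2 unseen items per user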
Example No. 3
    # Assumes `import cupy as cp` and `import itertools` at module level.
    def recommend(self,
                  userid,
                  user_items,
                  N=10,
                  filter_already_liked_items=True,
                  filter_items=None,
                  recalculate_user=False):
        if recalculate_user:
            raise NotImplementedError(
                "recalculate_user isn't supported on GPU yet")

        user = self.user_factors[userid]

        liked = set()
        if filter_already_liked_items:
            liked.update(user_items[userid].indices)
        if filter_items:
            liked.update(filter_items)

        # calculate the top N items, removing the user's own liked items from the results
        scores = self.item_factors.dot(user)

        count = N + len(liked)
        if count < len(scores):
            ids = cp.argpartition(scores, -count)[-count:]
            best = sorted(zip(ids.tolist(), scores[ids].tolist()),
                          key=lambda x: -x[1])
        else:
            best = sorted(enumerate(scores.tolist()), key=lambda x: -x[1])

        return list(
            itertools.islice((rec for rec in best if rec[0] not in liked), N))
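The same filter-then-rank idiom in plain NumPy, as a standalone sketch of the idea (not the class's actual API):

import numpy as np

scores = np.random.rand(100)  # one score per item
liked = {3, 17, 42}           # items to exclude from the results
N = 10

count = N + len(liked)
ids = np.argpartition(scores, -count)[-count:]  # unordered top candidates
best = sorted(zip(ids.tolist(), scores[ids].tolist()), key=lambda x: -x[1])
top_n = [rec for rec in best if rec[0] not in liked][:N]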
import numpy as np
from tqdm import tqdm


def predict_items(prediction_scores, topK, matrix_Train, gpu=False):
    # Rank items per user from precomputed scores, excluding training items.
    prediction = []

    for user_index in tqdm(range(prediction_scores.shape[0])):
        vector_u = prediction_scores[user_index]
        vector_train = matrix_Train[user_index]
        if len(vector_train.nonzero()[0]) > 0:
            train_index = vector_train.nonzero()[1]

            if gpu:
                import cupy as cp
                candidate_index = cp.argpartition(
                    -vector_u,
                    topK + len(train_index))[:topK + len(train_index)]
                vector_u = candidate_index[vector_u[candidate_index].argsort()
                                           [::-1]]
                vector_u = cp.asnumpy(vector_u).astype(np.float32)
            else:
                candidate_index = np.argpartition(
                    -vector_u,
                    topK + len(train_index))[:topK + len(train_index)]
                vector_u = candidate_index[vector_u[candidate_index].argsort()
                                           [::-1]]
            vector_u = np.delete(vector_u,
                                 np.isin(vector_u, train_index).nonzero()[0])

            vector_predict = vector_u[:topK]
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)

    return np.vstack(prediction)
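A quick usage sketch (assuming NumPy/SciPy; shapes are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

prediction_scores = np.random.rand(3, 6).astype(np.float32)  # 3 users x 6 items
matrix_Train = csr_matrix((np.ones(2), ([0, 1], [2, 5])), shape=(3, 6))

recs = predict_items(prediction_scores, topK=2, matrix_Train=matrix_Train)
print(recs.shape)  # (3, 2); user 2 has no training items and gets a zero row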
Example No. 5
 # Assumes `import cupy as cp` at module level.
 def similar_users(self, userid, N=10):
     factor = self.user_factors[userid]
     norm = self.user_norms[userid]
     scores = self.user_factors.dot(factor) / (norm * self.user_norms)
     best = cp.argpartition(scores, -N)[-N:]
     return sorted(zip(best.tolist(), scores[best].tolist()),
                   key=lambda x: -x[1])
Example No. 6
def top(x, a):
    # Keep the `a` smallest entries of x and zero out the rest, in place.
    # Assumes `xp` is bound to numpy or cupy at module level.
    dim = x.shape[0]
    if a == 0:
        return 0
    if a >= dim:
        return x
    index_array = xp.argpartition(x, kth=a, axis=0)[a:]
    xp.put_along_axis(x, index_array, 0, axis=0)
    return x
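Note that argpartition with kth=a puts the a smallest values first, so the [a:] indices cover everything else: despite the name, this keeps the a smallest entries of x and zeroes the rest (on negated scores that amounts to keeping the top a). A quick check with xp bound to NumPy:

import numpy as xp

x = xp.array([5.0, 1.0, 4.0, 2.0, 3.0])
print(top(x, 2))  # [0. 1. 0. 2. 0.] -- only the 2 smallest survive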
Example No. 7
def top_k(array, k, axis=0, biggest=True):
    """ Return the topK index along the specified dimension,
        The returned indices are such that their array values are sorted
        
        -Input:
        array: 1d or 2d array
        k: the top `k` (k>0, integer)
        axis: futile if array is 1d, otherwise sorting along the specified axis
              default to 0
        biggest: whether the top-k biggest or smallest, default to True

        -Output:
        inds: indices
        vals: array values at the indices
    """
    assert array.ndim == 1 or array.ndim == 2
    assert axis == 0 or axis == 1
    if biggest:
        array = -array
    
    if array.ndim == 1:
        inds = xp.argpartition(array, k)[:k]
        vals = array[inds]
        sort_inds = xp.argsort(vals)
        inds = inds[sort_inds]
        vals = vals[sort_inds]

    elif axis == 0:
        inds = xp.argpartition(array, k, axis=0)[:k, :]
        vals = array[inds, xp.arange(array.shape[1])[None, :]]
        sort_inds = xp.argsort(vals, axis=0)
        inds = inds[sort_inds, xp.arange(array.shape[1])[None, :]]
        vals = vals[sort_inds, xp.arange(array.shape[1])[None, :]]

    else:
        inds = xp.argpartition(array, k, axis=1)[:, :k]
        vals = array[xp.arange(array.shape[0])[:, None], inds]
        sort_inds = xp.argsort(vals, axis=1)
        inds = inds[xp.arange(array.shape[0])[:, None], sort_inds]
        vals = vals[xp.arange(array.shape[0])[:, None], sort_inds]

    if biggest:
        vals = -vals
    return inds, vals
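A usage sketch with xp bound to NumPy (illustrative values):

import numpy as xp

scores = xp.array([[0.1, 0.9, 0.4],
                   [0.7, 0.2, 0.8]])
inds, vals = top_k(scores, k=2, axis=1, biggest=True)
print(inds)  # [[1 2]
             #  [2 0]]
print(vals)  # [[0.9 0.4]
             #  [0.8 0.7]]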
Example No. 8
def bottom_k_indices(tensor, k, axis=-1):
    """
    Finds the indices of the k smallest entries along an axis.

    Args:
        tensor (ndarray): Tensor with a last dimension of at least k size.

        k (int): number of elements to return.

    Note: `axis` is accepted but not used; the implementation assumes a
    1-d input and that `cp` (cupy) is imported at module level.
    """
    idxs = cp.argpartition(tensor, k)[:k]
    return idxs[cp.argsort(tensor[idxs])]
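A quick check, assuming CuPy is available:

import cupy as cp

t = cp.array([0.5, 0.1, 0.9, 0.3])
print(bottom_k_indices(t, 2))  # [1 3] -- the two smallest, ordered by value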
Example No. 9
import numpy as np


def sub_routine(vector_predict,
                train_index,
                active_index,
                sample_from_all,
                iterative,
                history_item,
                topK=500,
                gpu=False):

    #    sort_length = topK + len(train_index)

    #    if sort_length + 1 > len(vector_predict) or not sample_from_all:
    #        sort_length = len(vector_predict) - 1

    sort_length = len(vector_predict) - 1
    if gpu:
        import cupy as cp
        candidate_index = cp.argpartition(-vector_predict,
                                          sort_length)[:sort_length]
        vector_predict = candidate_index[
            vector_predict[candidate_index].argsort()[::-1]]
        vector_predict = cp.asnumpy(vector_predict).astype(np.float32)
    else:
        candidate_index = np.argpartition(-vector_predict,
                                          sort_length)[:sort_length]
        vector_predict = candidate_index[
            vector_predict[candidate_index].argsort()[::-1]]

    vector_predict = np.delete(
        vector_predict,
        np.isin(vector_predict, train_index).nonzero()[0])

    if history_item.size != 0 and iterative:
        vector_predict = np.delete(
            vector_predict,
            np.isin(vector_predict, history_item).nonzero()[0])

    if not sample_from_all:
        vector_predict, index, _ = np.intersect1d(vector_predict,
                                                  active_index,
                                                  return_indices=True)
        vector_predict = vector_predict[index.argsort()]

#    predict_items = vector_predict[:topK]
#    history_item = np.concatenate([history_item, predict_items])

    return vector_predict[:topK]
Example No. 10
import numpy as np


def knn(x, k=20, axis=None, gpu=False):
    # Return the indices of the k nearest neighbours along the last axis.
    # Assumes x is a chainer-style Variable (x.array) holding a 3-d array;
    # `axis` is accepted but unused.
    x_arr = x.array
    if gpu:
        import cupy as cp
        res = cp.argpartition(x_arr, kth=k)
    else:
        res = np.argpartition(x_arr, kth=k)

    return res[:, :, 0:k]
Example No. 11
 # Assumes `import cupy as cp` at module level.
 def similar_items(self,
                   itemid,
                   N=10,
                   react_users=None,
                   recalculate_item=False):
     if recalculate_item:
         raise NotImplementedError(
             "recalculate_item isn't support on GPU yet")
     factor = self.item_factors[itemid]
     norm = self.item_norms[itemid]
     scores = self.item_factors.dot(factor) / (norm * self.item_norms)
     best = cp.argpartition(scores, -N)[-N:]
     return sorted(zip(best.tolist(), scores[best].tolist()),
                   key=lambda x: -x[1])
Example No. 12
def bottom_k_indices(tensor, k, axis=-1):
    """
    Finds the indices of the k smallest entries along an axis.

    Args:
        tensor (ndarray): Tensor with a last dimension of at least k size.

        k (int): number of elements to return.

        axis (int or None): Axis along which to sort. Default is -1,
        the last axis. If None, the array is flattened before sorting.

    Note: `axis` is accepted but not used; the implementation assumes a
    1-d input and that `cp` (cupy) is imported at module level.
    """
    idxs = cp.argpartition(tensor, k)[:k]
    return idxs[cp.argsort(tensor[idxs])]
Example No. 13
    def predict_proba_cuda(self, dist_mat, val_ground, metric='cosine'):
        # Estimate P(y=1) per row from the labels of the n nearest neighbours.
        # Assumes `import cupy as cp` at module level.
        # dist_mat = cp.array(dist_mat)
        val_ground = cp.array(val_ground)
        if metric == 'cosine':
            neigh = cp.argpartition(dist_mat, self.n, axis=1)[:, :self.n]
            neigh_ground = val_ground[neigh]

        else:
            neigh_ground = val_ground[cp.argsort(dist_mat, axis=1)[:, -self.n:]]

        marginals = cp.ones((dist_mat.shape[0], 2))
        marginals[:, 1] = cp.sum(neigh_ground == 1, axis=1) / self.n

        return marginals[:, 1]
import numpy as np


def sub_routine(vector_u,
                matrix_V,
                vector_train,
                bias,
                measure,
                topK=500,
                gpu=False):

    train_index = vector_train.nonzero()[1]
    if measure == "Cosine":
        if len(vector_u.shape) > 1:
            vector_predict = np.max(matrix_V.dot(vector_u.T), axis=1)
        else:
            vector_predict = matrix_V.dot(vector_u)
    else:
        if gpu:
            import cupy as cp
            vector_predict = -cp.sum(cp.square(matrix_V - vector_u), axis=1)
        else:
            vector_predict = -np.sum(np.square(matrix_V - vector_u), axis=1)
    if bias is not None:
        if gpu:
            import cupy as cp
            vector_predict = vector_predict + cp.array(bias)
        else:
            vector_predict = vector_predict + bias

    if gpu:
        import cupy as cp
        candidate_index = cp.argpartition(
            -vector_predict, topK + len(train_index))[:topK + len(train_index)]
        vector_predict = candidate_index[
            vector_predict[candidate_index].argsort()[::-1]]
        vector_predict = cp.asnumpy(vector_predict).astype(np.float32)
    else:
        candidate_index = np.argpartition(
            -vector_predict, topK + len(train_index))[:topK + len(train_index)]
        vector_predict = candidate_index[
            vector_predict[candidate_index].argsort()[::-1]]
    vector_predict = np.delete(
        vector_predict,
        np.isin(vector_predict, train_index).nonzero()[0])

    return vector_predict[:topK]
Example No. 15
def top_k_indices(tensor, k):
    """
    Finds the indices of the k largest entries along an axis.

    Args:
        tensor (ndarray): Tensor with a last dimension of at least k size.

        k (int): number of elements to return.

    """
    k = -k  # reverse to get top elements

    # partition first, then sort only the k survivors: average complexity
    # O(n + k log k) instead of O(n log n) for a full sort
    idxs = cp.argpartition(tensor, k)[k:]

    # note the minus sign: argsort returns increasing order, we want largest first
    return idxs[cp.argsort(-tensor[idxs])]
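A quick check, assuming CuPy is available:

import cupy as cp

t = cp.array([0.2, 0.8, 0.5, 0.9])
print(top_k_indices(t, 2))  # [3 1] -- the two largest, largest first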
Example No. 16
def top_k_indices(tensor, k, axis=-1):
    """
    Finds the indices of the k largest entries along an axis.

    Args:
        tensor (ndarray): Tensor with a last dimension of at least k size.

        k (int): number of elements to return.

        axis (int or None): Axis along which to sort. Default is -1,
        the last axis. If None, the array is flattened before sorting.

    Note: `axis` is accepted but not used; the implementation assumes a
    1-d input.
    """
    k = -k  # reverse to get top elements

    # partition first, then sort only the k survivors: average complexity
    # O(n + k log k) instead of O(n log n) for a full sort
    idxs = cp.argpartition(tensor, k)[k:]

    # note the minus sign: argsort returns increasing order, we want largest first
    return idxs[cp.argsort(-tensor[idxs])]
Example No. 17
    def get_similar_words(self, find_me, n=30):
        """
        Given an input vector for a word, return the most similar words.

        :param find_me: a vector found using get_vector_for_word()
        """

        # Avoid transferring between GPU and main memory more than
        # necessary, as that can be very slow.

        # Use cosine similarity
        # Could use sklearn, but best to use generic
        # numpy ops so as to be able to parallelize
        #from sklearn.metrics.pairwise import cosine_similarity
        #LCands = cosine_similarity(find_me.reshape(1, -1), self.LVectors).reshape(-1)

        a = find_me
        b = self.LVectors
        LCands = np.sum(a * b, axis=1)  # dot product for each row
        LCands = LCands / (linalg.norm(a) * linalg.norm(b, axis=1))
        LCands = LCands.reshape(-1)

        LLargestIdx = np.argpartition(LCands, -n)[-n:]
        LCands = LCands[LLargestIdx]

        if using_gpu:
            # `np` is assumed to be aliased to cupy here, which provides asnumpy()
            LLargestIdx = np.asnumpy(LLargestIdx)
            LCands = np.asnumpy(LCands)

        LRtn = []
        for idx, score in zip(LLargestIdx, LCands):
            # (word_index, score, word)
            LRtn.append(
                (int(idx), float(score), self.word_index_to_word(int(idx))))
        LRtn.sort(key=lambda i: i[1], reverse=True)
        return LRtn
Example No. 18
def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groupby=str,
    groups=None,
    reference='rest',
    n_genes=100,
    key_added=None,
    layer=None,
    **kwds,
):

    #### Wherever we see "adata.obs[groupby]", just replace it with the groups

    import time

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest' and reference not in set(labels.cat.categories)):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are fewer genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    if reference != 'rest':
        ireference = cp.where(groups_order == reference)[0][0]
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []
    rankings_gene_logfoldchanges = []
    rankings_gene_pvals = []
    rankings_gene_pvals_adj = []

    #     if 'log1p' in adata.uns_keys() and adata.uns['log1p']['base'] is not None:
    #         expm1_func = lambda x: np.expm1(x * np.log(adata.uns['log1p']['base']))
    #     else:
    #         expm1_func = np.expm1

    # Perform LogReg

    # if reference is not set, then the groups listed will be compared to the rest
    # if reference is set, then the groups listed will be compared only to the other groups listed
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    if len(groups) == 1:
        raise Exception(
            'Cannot perform logistic regression on a single cluster.')
    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    # Indexing with a series causes issues, possibly a segfault
    X = X[grouping_mask.values, :]
    y = labels.loc[grouping]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        rankings_gene_scores.append(
            scores[global_indices].get())  # shouldn't need to take this off device
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break

    groups_order_save = [str(g) for g in groups_order]
    if (len(groups) == 2):
        groups_order_save = [g for g in groups_order if g != reference]

    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()

    scores = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )

    names = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )

    print("Preparing output np.rec.fromarrays took (CPU): " +
          str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
Example No. 19
import cupy as cp
import pandas as pd
from tqdm import tqdm


def cga(fobj, obj, bounds, angle_in, mut=0.1, crossp=0.6, nk=50,
        popsize=1000, its=4000, pcj=0.5, pmj=1):
    global errores_mejor
    errores_mejor=cp.zeros(its)
    pop=initGauss(bounds,nk,popsize,angle_in)
    #pop=initanh(bounds,nk,popsize,angle_in)
    Pfitness = cp.asarray([fobj(ind,obj,angle_in,nk) for ind in pop])
    best_idx = cp.argmax(Pfitness)
    best = pop[best_idx]
    hall_of_fame=best
    termino=False
    elite_total=int(popsize/10)
    try:
        a = tqdm(range(its))
    except Exception:
        a = range(its)

    for i in a:
        if termino:
            errores_mejor[i]=Pfitness[best_idx]
            yield best,Pfitness[best_idx]
        else:
            fitnessT=cp.zeros(popsize)
            childs=cp.zeros((popsize,nk,4))
            for j in range(popsize):
                fitnessT[j]=fobj(pop[j],obj,angle_in,nk)
            max_fitness=cp.sum(fitnessT)
            elite_index=cp.argpartition(fitnessT, -1*elite_total)[-1*elite_total:]
            fitnessRel=fitnessT/max_fitness
            aceptados=0
            aceptados_array=cp.random.choice(popsize, popsize, p=fitnessRel)
            
            childs[0:elite_total]=pop[elite_index]
            


            # popsize/2 is a float in Python 3; range() needs an int
            for j in range(popsize//2 - elite_total):
                indexP1 = aceptados_array[j*2]
                indexP2 = aceptados_array[j*2+1]
                while indexP1 == indexP2:
                    indexP2 = (indexP2+1) % (popsize-1)
                # indexP1/indexP2 already come from aceptados_array; indexing
                # through it a second time picked the wrong parents
                P1 = pop[indexP1]
                P2 = pop[indexP2]
                if cp.random.rand()<crossp:
                    C1,C2=cross(P1,P2,pcj)
                else:
                    C1 = P1.copy()
                    C2 = P2.copy()
                if cp.random.rand()<mut:
                    C1=cp.array(mutation(C1,bounds,pmj))
                if cp.random.rand()<mut:
                    C2=cp.array(mutation(C2,bounds,pmj))
                childs[j*2+elite_total]=C1.copy()
                childs[j*2+1+elite_total]=C2.copy()

            pop=childs.copy()
            Pfitness = cp.asarray([fobj(ind,obj,angle_in,nk) for ind in pop])
            best_idx=cp.argmax(Pfitness)
            best=pop[best_idx]
            if i%10==0:
                df = pd.DataFrame(cp.asnumpy(best))
                df.to_csv("trayectorias/file_path{}.csv".format(i))
            if fobj(hall_of_fame,obj,angle_in,nk)<Pfitness[best_idx]:
                print("Found a better one with fitness {}".format(Pfitness[best_idx]))
                print("Reached a fitness of {} with mean {}".format(Pfitness[best_idx],cp.mean(Pfitness)))
                hall_of_fame=best.copy()

            if Pfitness[best_idx]>=2.0-0.001:
                print("Reached a fitness of {}".format(Pfitness[best_idx]))
                termino=True
            #print(i)
            errores_mejor[i]=Pfitness[best_idx]
            yield best, Pfitness[best_idx]
def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groups=None,
    reference='rest',
    n_genes=100,
    **kwds,
):
    """
    Rank genes for characterizing groups.

    Parameters
    ----------

    X : cupy.ndarray of shape (n_cells, n_genes)
        The cellxgene matrix to rank genes

    labels : cudf.Series of size (n_cells,)
        Observations groupings to consider

    var_names : cudf.Series of size (n_genes,)
        Names of genes in X

    groups : Iterable[str] (default: 'all')
        Subset of groups, e.g. ['g1', 'g2', 'g3'], to which comparison
        shall be restricted, or 'all' (default), for all groups.

    reference : str (default: 'rest')
        If 'rest', compare each group to the union of the rest of the groups.
        If a group identifier, compare with respect to this group.

    n_genes : int (default: 100)
        The number of genes that appear in the returned tables.
    """

    #### Wherever we see "adata.obs[groupby]", just replace it with the groups

    import time

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (reference != 'rest' and reference not in set(labels.cat.categories)):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.')

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # for clarity, rename variable
    n_genes_user = n_genes
    # make sure indices are not OoB in case there are fewer genes than n_genes
    if n_genes_user > X.shape[1]:
        n_genes_user = X.shape[1]
    # in the following, n_genes is simply another name for the total number of genes
    n_genes = X.shape[1]

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    if reference != 'rest':
        ireference = cp.where(groups_order == reference)[0][0]
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []

    # Perform LogReg

    # if reference is not set, then the groups listed will be compared to the rest
    # if reference is set, then the groups listed will be compared only to the other groups listed
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    if len(groups) == 1:
        raise Exception(
            'Cannot perform logistic regression on a single cluster.')
    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    # Indexing with a series causes issues, possibly a segfault
    X = X[grouping_mask.values, :]
    y = labels.loc[grouping]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        rankings_gene_scores.append(
            scores[global_indices].get())  # shouldn't need to take this off device
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break

    groups_order_save = [str(g) for g in groups_order]
    if (len(groups) == 2):
        groups_order_save = [g for g in groups_order if g != reference]

    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()

    scores = np.rec.fromarrays(
        [n for n in rankings_gene_scores],
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )

    names = np.rec.fromarrays(
        [n for n in rankings_gene_names],
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )

    print("Preparing output np.rec.fromarrays took (CPU): " +
          str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
Example No. 21
import cupy as cp


def max_atoms(D, yn, S):
    # Return the indices of the S dictionary atoms most correlated with yn
    All_Abs = cp.abs(D.T @ yn)
    I = cp.argpartition(All_Abs, -S)[-S:]
    return I
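A small usage sketch of this matching-pursuit-style atom selection (assuming CuPy is available; shapes are illustrative):

import cupy as cp

D = cp.random.rand(16, 50)    # dictionary: 16-dim signals, 50 atoms as columns
yn = cp.random.rand(16)       # signal to match
print(max_atoms(D, yn, S=5))  # indices of the 5 atoms with largest |D.T @ yn|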