Exemplo n.º 1
0
def run_pca(data, device, n_components=300, var_explained=0.85):
    """Run PCA, keeping only as many components as needed.

    A first PCA is fitted with up to ``n_components`` components. If those
    components reach the ``var_explained`` fraction of variance, the model is
    refitted with the smallest number of leading components that reaches it.

    :param data: Dataframe of cells X genes. Typically multiscale space diffusion components
    :param device: ``"gpu"`` to use ``cuml.PCA`` or ``"cpu"`` to use
        ``sklearn.decomposition.PCA`` (with the randomized SVD solver)
    :param n_components: Maximum number of principal components
    :param var_explained: Include components that explain this fraction of
        the variance. Note number of components =
        min(n_components, components explaining var_explained)
    :return: PCA projections of the data (a pandas DataFrame indexed like
        ``data``) and the explained variance ratio of the kept components
    :raises ValueError: if ``device`` is neither ``"gpu"`` nor ``"cpu"``
    """
    # Resolve the backend first so an unsupported device fails with a clear
    # error instead of an unbound-variable error further down.
    if device == "gpu":
        from cuml import PCA
        pca_kwargs = {}
    elif device == "cpu":
        from sklearn.decomposition import PCA
        pca_kwargs = {"svd_solver": "randomized"}
    else:
        raise ValueError(
            f"Unsupported device {device!r}; expected 'gpu' or 'cpu'"
        )

    # A truncated PCA cannot have more components than samples or features.
    init_components = min(n_components, *data.shape)
    pca = PCA(n_components=init_components, **pca_kwargs)
    pca.fit(data)

    # Positions at which the cumulative explained variance reaches the
    # target; the first one is a zero-based index, hence the "+ 1" below.
    reached = np.where(
        np.cumsum(pca.explained_variance_ratio_) >= var_explained
    )[0]
    if len(reached) > 0:
        n_components = int(reached[0]) + 1
    else:
        # Target not reachable with the fitted components: keep them all.
        n_components = init_components

    print(f'Running PCA with {n_components} components')
    if n_components != init_components:
        # Refit so the returned projections and variance ratios really
        # correspond to the reported number of components.
        pca = PCA(n_components=n_components, **pca_kwargs)
    pca_projections = pca.fit_transform(data)
    pca_projections = pd.DataFrame(pca_projections, index=data.index)
    return pca_projections, pca.explained_variance_ratio_
Exemplo n.º 2
0
def PCA_concat(df, n_components=2):
    """Append PCA scores of the float32 columns of ``df`` as new columns.

    A ``PCA`` model with ``n_components`` components is fitted on the columns
    of ``df`` whose dtype equals ``np.float32``. Those same columns are then
    transformed, the score columns are named ``PC0``, ``PC1``, ... and the
    result is concatenated column-wise to ``df`` with ``cudf.concat``.

    :param df: frame to project (presumably a cudf DataFrame, since the
        result is built with ``cudf.concat`` -- confirm against callers)
    :param n_components: number of principal components to compute
    :return: ``df`` with the score columns appended along ``axis=1``
    """
    model = PCA(n_components=n_components)
    # Only float32 columns take part in the decomposition.
    float32_columns = df.columns[df.dtypes == np.float32]
    model.fit(df[float32_columns])
    projected = model.transform(df[float32_columns])
    projected.columns = [f'PC{i}' for i in range(n_components)]
    return cudf.concat([df, projected], axis=1)
Exemplo n.º 3
0
def PCA_concat(df, components=100):
    """Append a 2-component PCA projection of the float32 columns to ``df``.

    A ``PCA`` model with two components is fitted on the columns of ``df``
    whose dtype equals ``np.float32``. Those same columns are then transformed
    and the scores are concatenated column-wise to ``df`` with
    ``cudf.concat``.

    NOTE(review): ``components`` is accepted but never used; the number of
    components is fixed at 2. Confirm whether it was meant to be passed on
    as ``n_components``.

    :param df: frame to project (presumably a cudf DataFrame, since the
        result is built with ``cudf.concat`` -- confirm against callers)
    :param components: currently ignored
    :return: ``df`` with the score columns appended along ``axis=1``
    """
    model = PCA(n_components=2)
    # Only float32 columns take part in the decomposition.
    float32_columns = df.columns[df.dtypes == np.float32]
    model.fit(df[float32_columns])
    projected = model.transform(df[float32_columns])
    return cudf.concat([df, projected], axis=1)