def rand_index(adata, title):
    """Score Leiden clustering stability with the adjusted Rand index.

    For every resolution in a fixed grid the full data set is clustered
    once.  Then, 20 times, a random 90% subsample of the cells is
    re-clustered at the same resolution and compared (adjusted Rand score)
    with the labels those same cells received in the full clustering.

    The scores of each resolution are pickled to
    ``RandInd_dictionaries/Dict_<title>_<res>.pckl`` as a one-entry dict
    ``{str(res): [score, ...]}``.

    Args:
        adata: Annotated data object clustered on its ``pca_harmony``
            representation.  It is copied first, so the caller's object is
            not modified.
        title: Tag embedded in the output file names.

    Returns:
        None.  Results are only written to disk.
    """
    out_dir = 'RandInd_dictionaries'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    resamp_perc = 0.9
    n_iterations = 20
    adata = adata.copy()
    indx_array = adata.obs.index.values
    n_cells = range(adata.shape[0])
    resamp_size = round(adata.shape[0] * resamp_perc)

    for res in [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.1]:
        print(res)
        # Reference clustering on all cells at this resolution.
        pg.neighbors(adata, rep="pca_harmony")
        pg.leiden(adata, rep="pca_harmony", resolution=res)

        rand_list = []
        for _ in range(n_iterations):
            # Draw positions without replacement, then map them to obs names.
            samp_pos = random.sample(n_cells, resamp_size)
            samp_data = adata[indx_array[samp_pos]]
            # Labels from the full clustering, captured before re-clustering
            # writes a new "leiden_labels" column on the subsample.
            true_class = samp_data.obs["leiden_labels"]

            pg.neighbors(samp_data, rep="pca_harmony")
            pg.leiden(samp_data, rep="pca_harmony", resolution=res)
            new_class = samp_data.obs["leiden_labels"]

            rand_list.append(adjusted_rand_score(true_class, new_class))

        rand_indx_dict = {str(res): rand_list}
        file_name = out_dir + "/Dict_" + title + "_" + str(res) + ".pckl"
        # Context manager guarantees the handle is closed even if dump fails.
        with open(file_name, "wb") as filehandler:
            pickle.dump(rand_indx_dict, filehandler)
Пример #2
0
def test_cell_lines():
    """Compare CPU and GPU ``harmonize`` runs with the reference result on cell lines.

    Embeddings are cached under ``./result``: each of the CPU and GPU
    embeddings is loaded from its ``.npy`` file when present, otherwise it is
    computed, timed and saved.  Both are checked against the reference
    embedding in ``./result/cell_lines_harmony_z.txt`` via ``check_metrics``.
    UMAP plots are (re)generated when fewer than four matching PDF files are
    found in ``./plots``.
    """
    print("Testing on Cell Lines...")

    cpu_path = "./result/cell_lines_cpu_z.npy"
    gpu_path = "./result/cell_lines_gpu_z.npy"
    h5ad_path = "./result/cell_lines_result.h5ad"

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    # The raw inputs are needed whenever an embedding or the AnnData result
    # has to be (re)computed.  Counting *_z files alone is not enough: other
    # cached embeddings can satisfy the count while the CPU/GPU ones are
    # missing, which would leave X and df_metadata undefined below.
    if (len(z_files) < 3 or not os.path.exists(h5ad_path)
            or not os.path.exists(cpu_path) or not os.path.exists(gpu_path)):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    if os.path.exists(cpu_path):
        Z_cpu = np.load(cpu_path)
        print("Precalculated CPU mode result is loaded.")
    else:
        start_cpu = time.time()
        Z_cpu = harmonize(X, df_metadata, 'dataset')
        end_cpu = time.time()

        print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
        np.save(cpu_path, Z_cpu)

    if os.path.exists(gpu_path):
        Z_gpu = np.load(gpu_path)
        print("Precalculated GPU mode result is loaded.")
    else:
        start_gpu = time.time()
        Z_gpu = harmonize(X, df_metadata, 'dataset', use_gpu = True)
        end_gpu = time.time()

        print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
        np.save(gpu_path, Z_gpu)

    # Reference embedding to compare against.
    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metrics(Z_cpu, Z_R, prefix = "cell_lines_cpu")
    check_metrics(Z_gpu, Z_R, prefix = "cell_lines_gpu")

    if os.path.exists(h5ad_path):
        # plot_umap treats None as "reuse the saved h5ad".
        adata = None
    else:
        n_obs = X.shape[0]
        # Placeholder count matrix; only obs and the PCA embedding are used.
        adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
        adata.obsm['X_pca'] = X

        pg.neighbors(adata, rep = 'pca')
        pg.umap(adata)

    # re.match needs the candidate string as its second argument; without it
    # the call raises TypeError on the first file.
    umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix = "cell_lines", batch_key = 'dataset')
def subcluster_1_preprocess(data, hvg_no, PCs_no, tit, in_place=False):
    """Preprocess a subcluster, cluster it with Leiden and draw its UMAP.

    Pipeline: QC metrics, highly-variable-feature selection, PCA, Harmony
    integration (using ``obs['sample']`` copied into ``obs['Channel']``),
    neighbours, UMAP and Leiden clustering at resolution 0.5, all on the
    ``pca_harmony`` representation.

    Args:
        data: Annotated data object.  Its ``uns`` mapping is always cleared
            on the passed-in object, even when ``in_place`` is False.
        hvg_no: Number of highly variable features to select.
        PCs_no: Number of principal components to compute.
        tit: Prefix of the plot title.
        in_place: When False (default) the pipeline runs on a copy of
            ``data``; when True it runs on ``data`` itself.

    Returns:
        None.  The only output is the UMAP plot.
    """
    # Cleared before any copy is taken, to eliminate problems with the size
    # of uns (specifically "fmat_highly_variable_features").
    data.uns.clear()

    adata = data if in_place else data.copy()

    # Preprocessing
    pg.qc_metrics(adata, min_genes=200, min_umis=400)
    pg.highly_variable_features(adata, consider_batch=False, n_top=hvg_no)
    pg.pca(adata, n_components=PCs_no)

    # Batch integration keyed on the sample column.
    adata.obs['Channel'] = adata.obs['sample']
    pg.run_harmony(adata)

    # Graph, embedding and clustering on the integrated representation.
    pg.neighbors(adata, rep="pca_harmony")
    pg.umap(adata, rep="pca_harmony")
    pg.leiden(adata, rep="pca_harmony", resolution=0.5)

    plot_title = tit + " (#HVG: " + str(hvg_no) + ", #PCs: " + str(PCs_no) + ")"
    sc.pl.umap(adata, size=15, title=plot_title, color="leiden_labels")
Пример #4
0
def plot_umap(adata, Z_torch, Z_py, Z_R, prefix, batch_key):
    """Compute UMAPs for three embeddings and render the scatter plots.

    When ``adata`` is given, the three embeddings are stored in ``obsm``,
    a UMAP is computed for each, and the object is written to
    ``./result/<prefix>_result``.  When ``adata`` is None the previously
    written result file is reused.  In both cases four PDFs are produced in
    ``./plots`` by invoking the ``pegasus plot scatter`` command line tool.

    Args:
        adata: Annotated data object, or None to reuse the saved result.
        Z_torch: Embedding stored as ``obsm['X_torch']``.
        Z_py: Embedding stored as ``obsm['X_py']``.
        Z_R: Embedding stored as ``obsm['X_harmony']``.
        prefix: File-name prefix for the result and plot files.
        batch_key: Attribute used to colour the scatter plots.

    Raises:
        SystemExit: With status 1 as soon as one plot command fails.
    """
    if adata is not None:
        embeddings = (('torch', Z_torch), ('py', Z_py), ('harmony', Z_R))

        # Attach every embedding before any graph/UMAP computation starts.
        for rep, Z in embeddings:
            adata.obsm['X_' + rep] = Z

        for rep, _ in embeddings:
            pg.neighbors(adata, rep = rep)
            pg.umap(adata, rep = rep, out_basis = 'umap_' + rep)

        pg.write_output(adata, "./result/{}_result".format(prefix))
    else:
        print("Use precalculated AnnData result.")

    command = ("pegasus plot scatter --basis {basis} --attributes {attr} --alpha 0.5 "
               "./result/{name}_result.h5ad ./plots/{name}.{tag}.umap.pdf")
    # (UMAP basis, tag used in the output file name)
    plots = (
        ('umap', 'before'),
        ('umap_torch', 'torch'),
        ('umap_py', 'py'),
        ('umap_harmony', 'harmony'),
    )
    for basis, tag in plots:
        # os.system returns a non-zero status when the command fails.
        if os.system(command.format(basis = basis, attr = batch_key, name = prefix, tag = tag)):
            sys.exit(1)
Пример #5
0
def plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix, batch_key):
    """Compute UMAPs for the CPU, GPU and reference embeddings and plot them.

    When ``adata`` is given, the three embeddings are stored in ``obsm``,
    a UMAP is computed for each, and the object is written to
    ``./result/<prefix>_result``.  When ``adata`` is None the previously
    written result file is reused.  In both cases four PDFs are produced in
    ``./plots`` by invoking the ``pegasus plot scatter`` command line tool.

    Args:
        adata: Annotated data object, or None to reuse the saved result.
        Z_cpu: Embedding stored as ``obsm['X_cpu']``.
        Z_gpu: Embedding stored as ``obsm['X_gpu']``.
        Z_R: Embedding stored as ``obsm['X_harmony']``.
        prefix: File-name prefix for the result and plot files.
        batch_key: Attribute used to colour the scatter plots.

    Raises:
        SystemExit: With status 1 as soon as one plot command fails.
    """
    if adata is not None:
        embeddings = (('cpu', Z_cpu), ('gpu', Z_gpu), ('harmony', Z_R))

        # Attach every embedding before any graph/UMAP computation starts.
        for rep, Z in embeddings:
            adata.obsm['X_' + rep] = Z

        for rep, _ in embeddings:
            pg.neighbors(adata, rep = rep)
            pg.umap(adata, rep = rep, out_basis = 'umap_' + rep)

        pg.write_output(adata, "./result/{}_result".format(prefix))
    else:
        print("Use precalculated AnnData result.")

    command = ("pegasus plot scatter --basis {basis} --attributes {attr} --alpha 0.5 "
               "./result/{prefix}_result.h5ad ./plots/{prefix}.{tag}.umap.pdf")
    # (UMAP basis, tag used in the output file name)
    plots = (
        ('umap', 'before'),
        ('umap_cpu', 'cpu'),
        ('umap_gpu', 'gpu'),
        ('umap_harmony', 'harmony'),
    )
    for basis, tag in plots:
        # os.system returns a non-zero status when the command fails.
        if os.system(command.format(basis = basis, attr = batch_key, prefix = prefix, tag = tag)):
            sys.exit(1)
Пример #6
0
def test_cell_lines():
    """Compare harmony-pytorch and harmonypy with the reference result on cell lines.

    Embeddings are cached under ``./result``: each one is loaded from its
    ``.npy`` file when present, otherwise it is computed, timed and saved.
    Both are compared with the reference embedding in
    ``./result/cell_lines_harmony_z.txt`` via ``check_metric`` (norms ``'r'``
    and ``'L2'``).  UMAP plots are (re)generated when fewer than four
    matching PDF files are found in ``./plots``, and kBET statistics are
    printed for the three embeddings when the result h5ad file exists.
    """
    print("Testing on cell lines dataset...")

    torch_path = "./result/cell_lines_torch_z.npy"
    py_path = "./result/cell_lines_py_z.npy"
    h5ad_path = "./result/cell_lines_result.h5ad"

    z_files = [f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f)]
    # The raw inputs are needed whenever an embedding or the AnnData result
    # has to be (re)computed.  Counting *_z files alone is not enough: other
    # cached embeddings can satisfy the count while the torch/py ones are
    # missing, which would leave X and df_metadata undefined below.
    if (len(z_files) < 3 or not os.path.exists(h5ad_path)
            or not os.path.exists(torch_path) or not os.path.exists(py_path)):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")

    if os.path.exists(torch_path):
        Z_torch = np.load(torch_path)
        print("Precalculated embedding by harmony-pytorch is loaded.")
    else:
        start_torch = time.time()
        Z_torch = harmonize(X, df_metadata, batch_key = 'dataset')
        end_torch = time.time()

        print("Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch))
        np.save(torch_path, Z_torch)

    if os.path.exists(py_path):
        Z_py = np.load(py_path)
        print("Precalculated embedding by harmonypy is loaded.")
    else:
        start_py = time.time()
        ho = run_harmony(X, df_metadata, ['dataset'])
        end_py = time.time()

        print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py))
        print(ho.objective_harmony)

        # Transposed so that it is saved in the same orientation as the
        # other cached embeddings.
        Z_py = np.transpose(ho.Z_corr)
        np.save(py_path, Z_py)

    # Reference embedding to compare against.
    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'r')
    check_metric(Z_torch, Z_py, Z_R, prefix = "cell_lines", norm = 'L2')

    if os.path.exists(h5ad_path):
        # plot_umap treats None as "reuse the saved h5ad".
        adata = None
    else:
        n_obs = X.shape[0]
        # Placeholder count matrix; only obs and the PCA embedding are used.
        adata = AnnData(X = csr_matrix((n_obs, 2)), obs = df_metadata)
        adata.obsm['X_pca'] = X

        pg.neighbors(adata, rep = 'pca')
        pg.umap(adata)

    umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
    if len(umap_list) < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix = "cell_lines", batch_key = "dataset")

    if os.path.exists(h5ad_path):
        adata = pg.read_input(h5ad_path, h5ad_mode = 'r')

        # (label printed in the report, representation name)
        kbet_targets = (
            ("Harmony", 'harmony'),
            ("harmonypy", 'py'),
            ("harmony-pytorch", 'torch'),
        )
        for label, rep in kbet_targets:
            stat, pvalue, ac_rate = pg.calc_kBET(adata, attr = 'dataset', rep = rep)
            print("kBET for {label}: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format(label = label, stat = stat, pval = pvalue, ac_rate = ac_rate))
Пример #7
0
def _expression_frame(data):
    """Return ``data.X`` as a sparse DataFrame indexed by obs names with var names as columns."""
    frame = pd.DataFrame.sparse.from_spmatrix(data.X)
    frame.index = data.obs.index.tolist()
    frame.columns = data.var.index.tolist()
    return frame


def main():
    """Aggregate samples, run QC, and cluster with and without Harmony correction.

    Steps, all output files being prefixed with ``args.output``:

    1. Aggregate the samples listed in ``args.input_csv`` with the
       ``pegasus aggregate_matrix`` command and load the resulting zarr file.
    2. Compute QC metrics, save the filter statistics and QC violin plots.
    3. Filter, identify robust genes, log-normalise, and pickle the
       normalised RNA matrix (and the antibody matrix when ``args.citeseq``).
    4. Baseline without batch correction: HVF, PCA, neighbours, Louvain,
       UMAP, scatter plot.
    5. Harmony batch correction: same pipeline on the Harmony representation,
       written to ``<output>_harmony.zarr``.
    6. Export the expression table as CSV, and the UMAP coordinates as CSV
       plus a scanpy scatter plot.

    Raises:
        subprocess.CalledProcessError: If the aggregation command fails.
    """
    import subprocess

    args = my_args()

    out = args.output
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim, and check=True stops
    # the run instead of silently continuing with a missing/stale zarr file.
    subprocess.run(
        ["pegasus", "aggregate_matrix", str(args.input_csv), str(out)],
        check=True)
    zarr_file = "%s.zarr.zip" % (out)

    data = pg.read_input(zarr_file)
    if args.citeseq:
        data.select_data("%s-rna" % (data.uns['genome']))
    pg.qc_metrics(data,
                  percent_mito=args.MT_percent,
                  mito_prefix=args.MT_prefix,
                  max_genes=args.max_genes)
    df_qc = pg.get_filter_stats(data)
    df_qc.to_csv("%s_qc_get_filter_stats.csv" % (out))

    # (plot_type passed to qcviolin, suffix of the saved file)
    for plot_type, suffix in (('gene', 'gene'),
                              ('count', 'UMI_count'),
                              ('mito', 'UMI_mito')):
        pg.qcviolin(data, plot_type=plot_type)
        plt.savefig("%s_qcviolin_%s.pdf" % (out, suffix), bbox_inches='tight')

    # filtering
    pg.filter_data(data)
    pg.identify_robust_genes(data, percent_cells=0.05)
    pg.log_norm(data)

    print(data.obs['Channel'].value_counts())
    # save log norm data, rna
    _expression_frame(data).to_pickle("%s.rna.log_norm.pkl" % (out))

    if args.citeseq:
        data.select_data("%s-citeseq" % (data.uns['genome']))
        _expression_frame(data).to_pickle("%s.antibody.log_norm.pkl" % (out))
        data.select_data("%s-rna" % (data.uns['genome']))

    # without batch correction
    data_baseline = data.copy()
    pg.highly_variable_features(data_baseline,
                                consider_batch=False,
                                n_top=4000)

    pg.hvfplot(data_baseline)
    plt.savefig("%s_hvfplot_noBC.pdf" % (out), bbox_inches='tight')

    pg.pca(data_baseline, n_components=200)
    pg.neighbors(data_baseline, K=200)
    pg.louvain(data_baseline, resolution=2)
    pg.umap(data_baseline, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_baseline,
               attrs=['louvain_labels', 'Channel'],
               basis='umap')
    plt.savefig("%s_without_BC.pdf" % (out), bbox_inches='tight')

    # with batch correction
    pg.highly_variable_features(data, consider_batch=True, n_top=4000)

    pg.hvfplot(data)
    # Distinct name: saving to "_hvfplot_noBC.pdf" again would overwrite the
    # baseline plot written above.
    plt.savefig("%s_hvfplot_BC.pdf" % (out), bbox_inches='tight')

    data_harmony = data.copy()
    pg.pca(data_harmony, n_components=200)
    harmony_key = pg.run_harmony(data_harmony)
    pg.neighbors(data_harmony, rep=harmony_key, K=200)
    pg.louvain(data_harmony, rep=harmony_key, resolution=2)
    pg.umap(data_harmony, rep=harmony_key, n_neighbors=10, min_dist=0.4)
    pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'], basis='umap')
    plt.savefig("%s_Harmony_BC.pdf" % (out), bbox_inches='tight')
    pg.write_output(data_harmony, "%s_harmony.zarr" % (out))

    frames = [_expression_frame(data_harmony)]
    if args.citeseq:
        # The antibody modality is only selected when it was requested, like
        # every other CITE-seq step in this function.
        data_harmony.select_data("%s-citeseq" % (data_harmony.uns['genome']))
        frames.append(_expression_frame(data_harmony))
        # Switch back to RNA before reading obs / UMAP below.
        # NOTE(review): presumably clustering labels and X_umap live on the
        # RNA modality only - confirm against the pegasusio data model.
        data_harmony.select_data("%s-rna" % (data_harmony.uns['genome']))
    df_all = pd.concat(frames, axis=1)
    df_all = df_all.sparse.to_dense()
    df_all = df_all.round(3)
    df_all.to_csv("%s.Harmony_correction.data.csv" % (out))

    ### original harmony UMAP data
    # Separate name so the output prefix `out` is not shadowed.
    umap_table = data_harmony.obs.copy()
    umap_table['UMAP1'] = data_harmony.obsm['X_umap'][:, 0]
    umap_table['UMAP2'] = data_harmony.obsm['X_umap'][:, 1]
    from anndata import AnnData
    ann = AnnData(X=umap_table[['UMAP1', 'UMAP2']],
                  obs=umap_table[['Channel', 'louvain_labels']])
    import scanpy as sc
    sc.pl.scatter(ann,
                  x="UMAP1",
                  y="UMAP2",
                  color='louvain_labels',
                  legend_loc='on data',
                  legend_fontsize=12,
                  legend_fontoutline=2,
                  frameon=False,
                  title='clustering of cells')
    plt.savefig("%s_Scapy_UMAP.png" % (out), bbox_inches='tight')
    umap_table.to_csv("%s_Harmony_UMAP.csv" % (out))
Пример #8
0
    q = q / q.sum()
    adata_subsamp += [
        adata_[np.random.choice(adata_.shape[0], size=args.numcells, p=q), :],
    ]
# create perturbed anndata: concatenate every entry of adata_subsamp into one
# object
adata_s = adata_subsamp[0].concatenate(adata_subsamp[1:])
# distinct values of the `day` obs column, and how many there are
days = adata_s.obs.day.unique()
days_tot = adata_s.obs.day.unique().shape[0]
# sampled perturbed proportions: for each entry of t_map, take the cells whose
# `day` equals it and sum the obs columns from position 3 onwards
# NOTE(review): presumably those obs columns hold per-cell type/state
# indicators - confirm against where the obs table is built
props_subsamp = [
    adata_s[adata_s.obs.day == i, :].obs.iloc[:, 3:].sum(0) for i in t_map
]
# normalise each summed vector so that it sums to 1
props_subsamp = [p / p.sum() for p in props_subsamp]
# compute PCA after all subsampling is done
pg.pca(adata_s, n_components=args.pcadim, features=None)
pg.neighbors(adata_s)
pg.diffmap(adata_s)
# re-store the existing 'X_fle' entry as a plain numpy array
# NOTE(review): no call visible in this fragment computes 'X_fle'; presumably
# it is carried over from the subsampled inputs - confirm
adata_s.obsm['X_fle'] = np.array(adata_s.obsm['X_fle'])
# compute cost scale: one get_C_mean value per consecutive pair
# (t_map[i], t_map[i + 1]) in mode "tr" ...
c_means = np.array([
    gwot.anndata_utils.get_C_mean(adata_s,
                                  t_map[i],
                                  t_next=t_map[i + 1],
                                  mode="tr")
    for i in range(0, len(t_map[:-1]))
])
# ... and one per entry of t_map in mode "self"
c_means_self = np.array(
    [gwot.anndata_utils.get_C_mean(adata_s, t, mode="self") for t in t_map])

# differences between consecutive entries of t_map
dt = np.array([t_map[i + 1] - t_map[i] for i in range(0, len(t_map) - 1)])