Example #1
import pandas as pd
import pytest
import scanpy as sc
import scanpy.external as sce


def test_scrublet_batched():
    """
    Test that Scrublet works when run on batched data.

    Check that it runs, detects some doublets, and records per-batch information.
    """
    pytest.importorskip("scrublet")

    adata = sc.datasets.pbmc3k()
    adata.obs['batch'] = 1350 * ['a'] + 1350 * ['b']
    split = [adata[adata.obs["batch"] == x].copy() for x in ("a", "b")]

    sce.pp.scrublet(adata, use_approx_neighbors=False, batch_key='batch')

    # The batched run should add the standard Scrublet columns to .obs
    assert "predicted_doublet" in adata.obs.columns
    assert "doublet_score" in adata.obs.columns

    assert adata.obs["predicted_doublet"].any(), "Expect some doublets to be identified"
    assert (
        'batches' in adata.uns['scrublet'].keys()
    ), "Expect .uns to contain batch info"

    # Batches are processed independently: per-batch runs merged back together
    # should match the batched run
    for s in split:
        sce.pp.scrublet(s, use_approx_neighbors=False)
    merged = sc.concat(split)

    pd.testing.assert_frame_equal(adata.obs[merged.obs.columns], merged.obs)
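
For reference, a quick way to inspect the output that the assertions above check (a small sketch; it assumes adata was just processed by sce.pp.scrublet with batch_key, as in the test):

# Per-cell doublet annotations written to .obs
print(adata.obs[["doublet_score", "predicted_doublet"]].head())
# Run metadata lives in .uns["scrublet"]; with batch_key set it includes "batches"
print(list(adata.uns["scrublet"].keys()))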
Example #2
import scanpy as sc


def normalizeMultiAd(multiAd, removeAmbiguous=True):
    """
    Normalize the second-generation (NGS count) features and the third-generation
    features separately, so that each cell ends up with a total of 3e4 reads.
    """
    # Split features on whether the name contains "_", and copy the views so
    # normalization does not modify the input object
    multiCountAd = multiAd[:, ~multiAd.var.index.str.contains("_")].copy()
    multiOtherAd = multiAd[:, multiAd.var.index.str.contains("_")].copy()
    sc.pp.normalize_total(multiCountAd, target_sum=1e4)
    sc.pp.normalize_total(multiOtherAd, target_sum=2e4)
    multiAd = sc.concat([multiCountAd, multiOtherAd], axis=1)
    if removeAmbiguous:
        multiAd = multiAd[:, ~(multiAd.var.index.str.contains("Ambiguous")
                               | multiAd.var.index.str.contains("_N_"))]
    return multiAd
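
A minimal usage sketch on toy data; the feature names below are made up. Plain gene columns go through the 1e4 normalization, underscore-tagged columns through the 2e4 one, and the "_N_" column is dropped by default:

import anndata as ad
import numpy as np
import pandas as pd

# Hypothetical 5-cell matrix: three gene columns plus three underscore-tagged columns
X = np.arange(1, 31, dtype=float).reshape(5, 6)
var = pd.DataFrame(index=["GeneA", "GeneB", "GeneC", "tx_1", "tx_2", "Gene_N_1"])
toyAd = ad.AnnData(X=X, var=var)

normedAd = normalizeMultiAd(toyAd)
print(normedAd.var_names.tolist())  # ['GeneA', 'GeneB', 'GeneC', 'tx_1', 'tx_2']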
Example #3
import logging

import scanpy as sc

logger = logging.getLogger(__name__)  # stand-in for the original module-level logger


def combineAdataUseScanorama(adataLs,
                             batchKey,
                             batchCateLs,
                             subSample=False,
                             subSampleCounts=0):
    """
    Integrate multiple AnnData objects with Scanorama.

    adataLs:
        [adata1, adata2]
    batchKey:
        name of the batch label column to add
    batchCateLs:
        name of each batch; must match the order of adataLs
    subSampleCounts:
        number of cells to downsample to
    return:
        the integrated adata
    """
    import scanorama

    adataLs = [x.copy() for x in adataLs]
    if subSample:
        sampleSize = min([x.shape[0] for x in adataLs])
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        for x in adataLs:
            sc.pp.subsample(x, n_obs=sampleSize)

    for adata in adataLs:
        sc.pp.normalize_total(adata, inplace=True)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata,
                                    flavor="seurat",
                                    n_top_genes=2000,
                                    inplace=True)

    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(adataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")
    combineAdata = sc.concat(combineScanoramaLs,
                             label=batchKey,
                             index_unique="-",
                             keys=batchCateLs)
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
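
A usage sketch; the file paths and batch names are placeholders, and scanorama must be installed:

import scanpy as sc

adata1 = sc.read_h5ad("sample1.h5ad")  # hypothetical preprocessed inputs
adata2 = sc.read_h5ad("sample2.h5ad")

combined = combineAdataUseScanorama(
    [adata1, adata2],
    batchKey="sample",
    batchCateLs=["sample1", "sample2"],
    subSample=True,          # downsample both objects to the smaller one
    subSampleCounts=5000,    # and cap each batch at 5000 cells
)
sc.pl.umap(combined, color="sample")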
Example #4
import logging

import scanpy as sc

logger = logging.getLogger(__name__)  # stand-in for the original module-level logger


def combineAdataUseScanoramaOld(
    adataLs, batchKey, batchCateLs, subSample=False, subSampleCounts=0
):
    """
    Integrate multiple AnnData objects with Scanorama (older workflow).

    adataLs:
        [adata1, adata2]
    batchKey:
        name of the batch label column to add
    batchCateLs:
        name of each batch; must match the order of adataLs
    subSampleCounts:
        number of cells to downsample to
    return:
        the integrated adata
    """
    import scanorama

    adataLs = [x.copy() for x in adataLs]
    if subSample:
        sampleSize = min([x.shape[0] for x in adataLs])
        if subSampleCounts:
            sampleSize = min(sampleSize, subSampleCounts)
        logger.info(f"sample size: {sampleSize}")
        for x in adataLs:
            sc.pp.subsample(x, n_obs=sampleSize)

    combineAdata = adataLs[0].concatenate(
        adataLs[1:], batch_key=batchKey, batch_categories=batchCateLs
    )

    sc.pp.normalize_per_cell(combineAdata, counts_per_cell_after=1e4)
    sc.pp.log1p(combineAdata)

    sc.pp.highly_variable_genes(
        combineAdata, min_mean=0.0125, max_mean=3, min_disp=1.5, batch_key=batchKey
    )
    sc.pl.highly_variable_genes(combineAdata)

    # Highly variable gene names, then one per-batch view restricted to them
    varGenes = combineAdata.var.index[combineAdata.var.highly_variable].tolist()

    combineAdataLs = [
        combineAdata[combineAdata.obs[batchKey] == oneBatchName, varGenes]
        for oneBatchName in combineAdata.obs[batchKey].unique()
    ]

    print(f"↓↓↓↓↓↓↓↓↓{batchCateLs}↓↓↓↓↓↓↓↓")
    combineScanoramaLs = scanorama.correct_scanpy(combineAdataLs, return_dimred=True)
    print(f"↑↑↑↑↑↑↑↑↑{batchCateLs}↑↑↑↑↑↑↑↑")
    combineAdata = sc.concat(combineScanoramaLs)
    sc.pp.neighbors(combineAdata, n_pcs=50, use_rep="X_scanorama")
    sc.tl.umap(combineAdata)
    return combineAdata
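
Compared with combineAdataUseScanorama above, this older variant concatenates first (via the legacy AnnData.concatenate), normalizes and log-transforms the combined object, selects highly variable genes with batch_key, and only then splits back into per-batch subsets for scanorama.correct_scanpy; the newer version normalizes and annotates highly variable genes per object before correction and lets sc.concat add the batch labels. It is called with the same arguments as in the sketch after Example #3.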