Пример #1
0
def read_10X(data_path, var_names='gene_symbols'):
    """Load a 10X-style matrix directory into an annotated data object.

    Reads ``matrix.mtx``, ``genes.tsv`` and ``barcodes.tsv`` from
    ``data_path``. The matrix is transposed after reading, so the result's
    observations are described by ``barcodes.tsv`` and its variables by
    ``genes.tsv``.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory containing the three files.
    var_names : {'gene_symbols', 'gene_ids'}
        Which column of ``genes.tsv`` becomes the variable index:
        ``'gene_symbols'`` uses the second column, ``'gene_ids'`` the first.

    Returns
    -------
    The object returned by ``read_mtx(...).T`` with ``var['gene_ids']``,
    ``var['gene_symbols']`` and ``obs['barcode']`` filled in and
    ``var_names`` / ``obs_names`` assigned.

    Raises
    ------
    ValueError
        If ``var_names`` is neither ``'gene_symbols'`` nor ``'gene_ids'``.
    """
    import os

    # Validate before any file is touched. An explicit raise (unlike
    # ``assert``) is still enforced when Python runs with -O.
    if var_names not in ('gene_symbols', 'gene_ids'):
        raise ValueError('var_names must be "gene_symbols" or "gene_ids"')

    # os.path.join accepts both str and os.PathLike directories and copes
    # with a trailing separator or an empty ``data_path``.
    adata = read_mtx(os.path.join(data_path, 'matrix.mtx')).T

    genes = pd.read_csv(
        os.path.join(data_path, 'genes.tsv'), header=None, sep='\t')
    adata.var['gene_ids'] = genes[0].values
    adata.var['gene_symbols'] = genes[1].values

    names = genes[1] if var_names == 'gene_symbols' else genes[0]

    # Duplicated names are disambiguated before being used as the index.
    if not names.is_unique:
        names = make_index_unique(pd.Index(names))
        print('var_names are not unique, "make_index_unique" has applied')

    adata.var_names = names

    cells = pd.read_csv(
        os.path.join(data_path, 'barcodes.tsv'), header=None, sep='\t')
    adata.obs['barcode'] = cells[0].values
    adata.obs_names = cells[0]
    return adata
Пример #2
0
def test_make_index_unique():
    """Check that duplicated labels are suffixed until the index is unique.

    The input already contains ``"val-1"``, so the second ``"val"`` cannot
    reuse that suffix; the call is also expected to emit a ``UserWarning``.
    """
    duplicated = pd.Index(["val", "val", "val-1", "val-1"])
    wanted = ["val", "val-2", "val-1", "val-1-1"]

    with pytest.warns(UserWarning):
        deduplicated = make_index_unique(duplicated)

    assert wanted == list(deduplicated)
    assert deduplicated.is_unique
Пример #3
0
def adata_processing_TF_link(adata,
                             nt_layers,
                             TF_list,
                             gene_filter_rate=0.1,
                             cell_filter_UMI=10000):
    """Preprocess adata and get ready for TF-target gene analysis.

    Steps, in order:

    1. Keep genes with a value > 0 in more than ``gene_filter_rate * n_obs``
       cells in *both* layers named by ``nt_layers`` (``n_obs`` is the cell
       count before any filtering).
    2. Keep cells whose row sum in ``nt_layers[1]`` exceeds
       ``cell_filter_UMI``.
    3. Recompute size factors with ``szFactor`` and normalize both layers
       with ``normalize_data``.
    4. Add a ``labeling_rate`` column to ``obs``, make
       ``var["gene_short_name"]`` unique, and extract the rows of ``var``
       whose short name is in ``TF_list``.

    Parameters
    ----------
    adata
        Annotated data object exposing ``layers``, ``obs``, ``var``,
        ``n_obs`` and ``n_vars`` (presumably an AnnData; confirm with
        callers).
    nt_layers
        Sequence whose first item names the "new" layer and whose second
        item names the "total" layer.
    TF_list
        Collection of gene short names to look up in
        ``var["gene_short_name"]``.
    gene_filter_rate
        Fraction of cells in which a gene must be detected to be kept.
    cell_filter_UMI
        Row-sum threshold on the total layer for keeping a cell.

    Returns
    -------
    tuple
        ``(tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T)``. The
        matrices are DataFrames transposed so that rows follow
        ``adata.var_names`` and columns follow ``adata.obs_names``.

    Notes
    -----
    NOTE(review): the size-factor call and the labeling rate read the
    layers literally named ``"total"`` / ``"new"`` rather than
    ``nt_layers``. Confirm whether that is intentional (e.g. raw vs.
    processed layers) before changing it.
    """

    n_obs, n_var = adata.n_obs, adata.n_vars
    new_key, total_key = nt_layers[0], nt_layers[1]

    # filter genes
    print(f"Original gene number: {n_var}")

    min_cells = gene_filter_rate * n_obs
    gene_filter_new = (adata.layers[new_key] > 0).sum(0) > min_cells
    gene_filter_tot = (adata.layers[total_key] > 0).sum(0) > min_cells
    # Sparse reductions return a 2-D matrix; flatten it to a 1-D mask.
    if issparse(adata.layers[new_key]):
        gene_filter_new = gene_filter_new.A1
    if issparse(adata.layers[total_key]):
        gene_filter_tot = gene_filter_tot.A1
    # Build the combined boolean mask once and reuse it.
    gene_filter = gene_filter_new & gene_filter_tot
    adata = adata[:, gene_filter]

    print(f"Gene number after filtering: {adata.n_vars}")

    # filter cells
    print(f"Original cell number: {n_obs}")

    cell_filter = adata.layers[total_key].sum(1) > cell_filter_UMI
    if issparse(adata.layers[total_key]):
        cell_filter = cell_filter.A1
    adata = adata[cell_filter, :]

    print(f"Cell number after filtering: {adata.n_obs}")

    # generate the expression matrix for downstream analysis
    # (captured before szFactor runs, as in the original statement order)
    new = adata.layers[new_key]
    total = adata.layers[total_key]

    # recalculate size factor
    from ..preprocessing import szFactor

    adata = szFactor(adata,
                     method="mean-geometric-mean-total",
                     round_exprs=True,
                     total_layers=["total"])
    # Convert to a NumPy array before adding the column axis: pandas 2.0
    # removed multi-dimensional indexing (``series[:, None]``) on Series.
    szfactors = adata.obs["Size_Factor"].to_numpy()[:, None]

    # normalize data (size factor correction, log transform and the scaling)
    if issparse(new):
        new = new.toarray()
    if issparse(total):
        total = total.toarray()
    new_mat = normalize_data(new, szfactors, pseudo_expr=0.1)
    tot_mat = normalize_data(total, szfactors, pseudo_expr=0.1)
    new_mat = pd.DataFrame(new_mat,
                           index=adata.obs_names,
                           columns=adata.var_names)
    tot_mat = pd.DataFrame(tot_mat,
                           index=adata.obs_names,
                           columns=adata.var_names)

    # compute the labeling reads rate in each cell
    obs = adata.obs
    var = adata.var
    var.loc[:, "gene_short_name"] = make_index_unique(
        var.loc[:, "gene_short_name"].astype("str"))
    obs.loc[:, "labeling_rate"] = adata.layers["new"].sum(
        1) / adata.layers["total"].sum(1)

    # extract the TF matrix
    # ``@TF_list`` is resolved by DataFrame.query from this function's locals.
    var_TF = var.query("gene_short_name in @TF_list")
    print(f"\nNumber of TFs found in the list: {var_TF.shape[0]}")

    # NOTE(review): assumes var has a "gene_id" column whose values match
    # the columns of tot_mat (i.e. adata.var_names) — confirm.
    TF_matrix = tot_mat.loc[:, var_TF.loc[:, "gene_id"]]
    TF_matrix.columns = var_TF.loc[:, "gene_short_name"]

    return (tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T)