def read_10X(data_path, var_names='gene_symbols'):
    """Load a 10X-style directory into an annotated data matrix.

    Three files are read from ``data_path``: ``matrix.mtx`` (transposed
    after loading), ``genes.tsv`` and ``barcodes.tsv``. Both TSV files are
    parsed as tab-separated tables without a header row.

    Parameters
    ----------
    data_path
        Directory (as a string) containing the three files.
    var_names
        Either ``'gene_symbols'`` (second column of ``genes.tsv``) or
        ``'gene_ids'`` (first column); selects what becomes ``var_names``.

    Returns
    -------
    The object produced by ``read_mtx(...).T`` with ``var['gene_ids']``,
    ``var['gene_symbols']``, ``obs['barcode']``, ``var_names`` and
    ``obs_names`` filled in.

    Raises
    ------
    AssertionError
        If ``var_names`` is neither of the two accepted values.
    """
    adata = read_mtx(data_path + '/matrix.mtx').T

    # Column 0 of genes.tsv is stored as the gene id, column 1 as the symbol.
    gene_table = pd.read_csv(data_path + '/genes.tsv', header=None, sep='\t')
    gene_ids = gene_table[0]
    gene_symbols = gene_table[1]
    adata.var['gene_ids'] = gene_ids.values
    adata.var['gene_symbols'] = gene_symbols.values

    assert var_names in ('gene_symbols', 'gene_ids'), \
        'var_names must be "gene_symbols" or "gene_ids"'

    chosen_names = gene_symbols if var_names == 'gene_symbols' else gene_ids
    if not chosen_names.is_unique:
        # Duplicate labels are disambiguated before being used as the index.
        chosen_names = make_index_unique(pd.Index(chosen_names))
        print('var_names are not unique, "make_index_unique" has applied')
    adata.var_names = chosen_names

    # The first column of barcodes.tsv labels the observations.
    barcodes = pd.read_csv(data_path + '/barcodes.tsv', header=None, sep='\t')[0]
    adata.obs['barcode'] = barcodes.values
    adata.obs_names = barcodes

    return adata
def test_make_index_unique():
    """Check that ``make_index_unique`` de-duplicates labels and warns.

    The input holds two duplicated labels, one of which ("val-1") already
    looks like a suffixed name; the call is expected to emit a
    ``UserWarning`` and return a fully unique index.
    """
    duplicated = pd.Index(["val", "val", "val-1", "val-1"])
    wanted = pd.Index(["val", "val-2", "val-1", "val-1-1"])

    with pytest.warns(UserWarning):
        deduplicated = make_index_unique(duplicated)

    assert list(wanted) == list(deduplicated)
    assert deduplicated.is_unique
def adata_processing_TF_link(adata, nt_layers, TF_list, gene_filter_rate=0.1, cell_filter_UMI=10000):
    """Preprocess adata and get ready for TF-target gene analysis.

    The function filters genes and cells, recomputes size factors,
    normalizes the two selected layers, computes a per-cell labeling rate
    and extracts the expression matrix of the requested TFs.

    Parameters
    ----------
    adata
        Annotated data object exposing ``layers``, ``obs``, ``var``,
        ``obs_names``, ``var_names``, ``n_obs`` and ``n_vars``.
    nt_layers
        Sequence of layer keys; ``nt_layers[0]`` is treated as the "new"
        layer and ``nt_layers[1]`` as the "total" layer.
    TF_list
        Gene short names to look up in ``var["gene_short_name"]``.
    gene_filter_rate
        A gene is kept when it is non-zero in more than this fraction of
        the original cells in *both* layers.
    cell_filter_UMI
        A cell is kept when its row sum in ``nt_layers[1]`` exceeds this.

    Returns
    -------
    tuple
        ``(tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T)`` where the
        matrices are pandas DataFrames of normalized values.
    """

    def _as_1d(values):
        # Axis reductions on scipy sparse matrices come back as 2-D
        # np.matrix objects (which expose .A1); reductions on dense arrays
        # are already 1-D and are passed through untouched.
        return getattr(values, "A1", values)

    new_key, total_key = nt_layers[0], nt_layers[1]
    n_obs, n_var = adata.n_obs, adata.n_vars

    # filter genes
    print(f"Original gene number: {n_var}")
    min_cells = gene_filter_rate * n_obs
    gene_filter_new = _as_1d((adata.layers[new_key] > 0).sum(0) > min_cells)
    gene_filter_tot = _as_1d((adata.layers[total_key] > 0).sum(0) > min_cells)
    # Element-wise product of boolean masks == logical AND.
    gene_filter = gene_filter_new * gene_filter_tot
    adata = adata[:, gene_filter]
    print(f"Gene number after filtering: {gene_filter.sum()}")

    # filter cells
    print(f"Original cell number: {n_obs}")
    cell_filter = _as_1d(adata.layers[total_key].sum(1) > cell_filter_UMI)
    adata = adata[cell_filter, :]
    print(f"Cell number after filtering: {adata.n_obs}")

    # generate the expression matrix for downstream analysis
    new = adata.layers[new_key]
    total = adata.layers[total_key]

    # recalculate size factor
    from ..preprocessing import szFactor

    # NOTE(review): the size factor (and the labeling rate below) use the
    # literal "total"/"new" layers rather than nt_layers — presumably the
    # raw counts are wanted here; confirm against callers.
    adata = szFactor(
        adata,
        method="mean-geometric-mean-total",
        round_exprs=True,
        total_layers=["total"],
    )
    # Convert to a NumPy array before adding the trailing axis: indexing a
    # pandas Series with [:, None] is no longer supported by pandas.
    szfactors = adata.obs["Size_Factor"].to_numpy()[:, None]

    # normalize data (size factor correction, log transform and the scaling)
    # .toarray() replaces the deprecated/removed sparse-matrix `.A` attribute.
    if issparse(new):
        new = new.toarray()
    if issparse(total):
        total = total.toarray()
    new_mat = normalize_data(new, szfactors, pseudo_expr=0.1)
    tot_mat = normalize_data(total, szfactors, pseudo_expr=0.1)
    new_mat = pd.DataFrame(new_mat, index=adata.obs_names, columns=adata.var_names)
    tot_mat = pd.DataFrame(tot_mat, index=adata.obs_names, columns=adata.var_names)

    # compute the labeling reads rate in each cell
    obs = adata.obs
    var = adata.var
    var.loc[:, "gene_short_name"] = make_index_unique(
        var.loc[:, "gene_short_name"].astype("str"))
    # Flatten both row sums so a 1-D vector (not an (n, 1) matrix) is stored
    # in the obs column when the layers are sparse.
    new_per_cell = _as_1d(adata.layers["new"].sum(1))
    total_per_cell = _as_1d(adata.layers["total"].sum(1))
    obs.loc[:, "labeling_rate"] = new_per_cell / total_per_cell

    # extract the TF matrix
    var_TF = var.query("gene_short_name in @TF_list")
    print(f"\nNumber of TFs found in the list: {var_TF.shape[0]}")
    # Columns of tot_mat are adata.var_names; this lookup assumes the
    # "gene_id" values of the selected TFs are present among them.
    TF_matrix = tot_mat.loc[:, var_TF.loc[:, "gene_id"]]
    TF_matrix.columns = var_TF.loc[:, "gene_short_name"]

    return (tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T)