def normalize(adata: scanpy.AnnData, filter_min_counts=None, size_factors=False, scale_input=False, logtrans_input=False):
    """Preprocess ``adata`` with scanpy and wrap the result in a dataset.

    The scanpy preprocessing functions are called without capturing a return
    value and ``adata.X`` is read afterwards, so ``adata`` itself is modified
    by this function.

    Parameters
    ----------
    adata
        Annotated expression matrix to preprocess.
    filter_min_counts
        When truthy, passed as ``min_counts`` to both
        ``scanpy.pp.filter_genes`` and ``scanpy.pp.filter_cells``.
    size_factors
        When truthy, run ``scanpy.pp.normalize_per_cell`` and register a
        ``"size_factor"`` cell attribute on the dataset.
    scale_input
        When truthy, run ``scanpy.pp.scale``.
    logtrans_input
        When truthy, run ``scanpy.pp.log1p`` (applied before scaling).

    Returns
    -------
    GeneExpressionDataset
        Dataset whose ``data`` is the processed ``adata.X`` and which carries
        one cell attribute per column of ``adata.obs``.
    """
    if filter_min_counts:
        # Same threshold for genes first, then cells.
        for apply_filter in (scanpy.pp.filter_genes, scanpy.pp.filter_cells):
            apply_filter(adata, min_counts=filter_min_counts)

    # Keep the (optionally filtered) values reachable through ``.raw`` before
    # any of the transformations below are applied.
    adata.raw = adata

    dataset = GeneExpressionDataset()
    dataset.from_data(adata.X, raw=adata.raw.X)

    if size_factors:
        scanpy.pp.normalize_per_cell(adata)
        # presumably ``nb_cell_counts`` holds per-cell totals -- verify against
        # GeneExpressionDataset; the factor is each value over the median.
        median_counts = np.median(dataset.nb_cell_counts)
        relative_depth = dataset.nb_cell_counts / median_counts
        # Stored as a single-column 2-D array.
        dataset.initialize_cell_attribute(
            "size_factor", relative_depth.reshape((-1, 1))
        )

    if logtrans_input:
        scanpy.pp.log1p(adata)

    if scale_input:
        scanpy.pp.scale(adata)

    # Replace the dataset's matrix with the processed values.
    dataset.data = adata.X

    # Columns are read by position so that the lookup does not depend on the
    # column labels themselves.
    for position, column_name in enumerate(adata.obs.columns):
        column_values = adata.obs.iloc[:, position].values
        dataset.initialize_cell_attribute(column_name, column_values)

    return dataset
def postprocess_mnnpy(adata, bdata):
    """
    postprocessing to generate a newly functional AnnData object

    After running mnnpy_mnncorrect we obtain an AnnData object bdata. Since
    mnn_correct automatically truncates all the genes contained in .raw to
    contain only the highly variable genes, this function creates a new
    AnnData object that contains .X from bdata but .raw from adata (which
    still contains all the genes, not only the highly variable ones).

    Before the new AnnData object is created, both matrices are sorted by
    cell barcode (the obs names) so that rows of .X and .raw line up.

    parameters
    ----------
    adata:
        the uncorrected AnnData object; its ``.raw`` supplies the full gene
        set. Its obs index is reassigned from ``adata.obs.CELL`` as a side
        effect of this call.
    bdata:
        the batch corrected AnnData object

    returns
    -------
    AnnData
        AnnData object with .X containing the corrected values and .raw
        containing all of the original values
    """
    # Corrected values, rows reordered by cell barcode.
    corrected_sorted = DataFrame(
        data=bdata.X,
        index=bdata.obs_names.tolist(),
        columns=bdata.var_names.tolist(),
    ).sort_index()

    # Assignment order is kept deliberately: ``obs`` before ``obs_names`` and
    # ``var_names`` before ``var``, since the later ones overwrite the index.
    new_adata = AnnData(corrected_sorted.values)
    new_adata.obs = bdata.obs.sort_index()
    new_adata.var_names = bdata.var_names
    new_adata.obs_names = bdata.obs_names.sort_values()
    new_adata.var = bdata.var

    # The raw matrix must be sorted the same way to match the corrected order;
    # a sparse matrix is densified first so it can be put in a DataFrame.
    raw_values = adata.raw.X
    if scipy.sparse.issparse(raw_values):
        raw_values = raw_values.todense()
    raw_sorted = DataFrame(
        data=raw_values,
        index=adata.obs_names.tolist(),
        columns=adata.raw.var_names.tolist(),
    ).sort_index()

    # Rebuild the raw object from the sorted values.
    raw = AnnData(raw_sorted.values)
    raw.var_names = adata.raw.var_names
    raw.obs_names = adata.obs_names.sort_values()
    raw.var = adata.raw.var

    # Attach the full-gene raw data to the corrected object.
    new_adata.raw = raw

    # Ensure that indices are preserved.
    # NOTE(review): this re-indexes the *input* ``adata`` (not ``new_adata``)
    # and requires an obs column named ``CELL`` -- confirm this is intended.
    adata.obs_names = adata.obs.CELL
    adata.obs.index = adata.obs.CELL

    return new_adata