def load_file(filepath):
    """Resolve *filepath* (including the 'default' and 'test' aliases) and
    load it into an AnnData object, dispatching on the file-name suffix.

    The file stem is stored in ``adata.uns['dataset']``.

    Raises
    ------
    IncorrectFileFormat
        If the file does not exist, cannot be parsed, or has an
        unrecognized suffix.
    """
    # Map the special aliases to concrete paths inside the repository.
    if filepath in ('default', 'datasets/user_uploaded/default'):
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')
    # Dataset name = file name without directory or extension.
    dataset = os.path.splitext(os.path.basename(filepath))[0]
    try:
        if filepath.endswith('h5ad'):
            adata = anndata.read_h5ad(filepath)
        elif filepath.endswith('csv'):
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        elif filepath.endswith('xlsx'):
            adata = anndata.read_excel(filepath)
        elif filepath.endswith('mtx'):
            adata = anndata.read_mtx(filepath)
        elif filepath.endswith(('txt', 'tab', 'data')):
            adata = anndata.read_text(filepath)
        elif filepath.endswith('h5'):
            adata = anndata.read_hdf(filepath)
        elif filepath.endswith('loom'):
            adata = anndata.read_loom(filepath)
        else:
            # Previously an unrecognized suffix fell through every branch and
            # crashed later with an unhandled NameError on the unbound
            # ``adata``; raising here converts it into IncorrectFileFormat
            # via the except clause below.
            raise ValueError('Unrecognized file extension: {}'.format(filepath))
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")
    adata.uns['dataset'] = dataset
    return adata
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    """Internal dispatcher: read *filename* into an AnnData object.

    The extension is either supplied via ``ext`` (must be one of
    ``avail_exts``) or derived from the file name.  h5/h5ad files are
    returned directly (optionally ``backed``); every other format is
    parsed and, when ``cache`` is true, mirrored to an ``.h5ad`` cache
    file for faster subsequent reads.
    """
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         + avail_exts)
    else:
        # Infer (and validate) the extension from the file name.
        ext = is_valid_filename(filename, return_ext=True)
    # May download the file from backup_url when it is missing locally.
    is_present = check_datafile_present_and_download(filename,
                                                    backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    # Cache path: flatten the relative path into one cache-dir file name.
    filename_cache = (settings.cachedir + filename.lstrip(
        './').replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unkown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speedup reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
def load_data_mtx(self, mtx_file, mtx_obs=None, mtx_feature=None,
                  meta_data_file=None, meta_data_handler=DEFAULT_METADATA,
                  gene_data_file=None, gene_name_column=None):
    """Load an MTX expression matrix, optional obs/feature name lists, and
    optional metadata tables, and wrap everything in an InferelatorData
    object.  All paths are resolved through ``self.input_path``."""
    raw = anndata.read_mtx(self.input_path(mtx_file))
    # Optional one-name-per-line files for sample and gene labels.
    sample_labels = None
    if mtx_obs is not None:
        sample_labels = self._load_list_from_file(self.input_path(mtx_obs))
    gene_labels = None
    if mtx_feature is not None:
        gene_labels = self._load_list_from_file(self.input_path(mtx_feature))
    obs_meta = self.load_metadata_tsv(meta_data_file, raw.obs_names,
                                      meta_data_handler=meta_data_handler)
    var_meta = self.load_gene_metadata_tsv(gene_data_file, gene_name_column)
    return InferelatorData(raw, meta_data=obs_meta, gene_data=var_meta,
                           sample_names=sample_labels,
                           gene_names=gene_labels)
def read_10X(data_path, var_names='gene_symbols'):
    """Read a 10X-style directory (matrix.mtx / genes.tsv / barcodes.tsv)
    into an AnnData object.

    ``var_names`` selects whether 'gene_symbols' or 'gene_ids' label the
    variables; duplicated names are made unique before assignment.
    """
    adata = read_mtx(data_path + '/matrix.mtx').T
    genes = pd.read_csv(data_path + '/genes.tsv', header=None, sep='\t')
    # Column 0 carries the ids, column 1 the symbols.
    adata.var['gene_ids'] = genes[0].values
    adata.var['gene_symbols'] = genes[1].values
    assert var_names == 'gene_symbols' or var_names == 'gene_ids', \
        'var_names must be "gene_symbols" or "gene_ids"'
    chosen = genes[1] if var_names == 'gene_symbols' else genes[0]
    if not chosen.is_unique:
        chosen = make_index_unique(pd.Index(chosen))
        print('var_names are not unique, "make_index_unique" has applied')
    adata.var_names = chosen
    cells = pd.read_csv(data_path + '/barcodes.tsv', header=None, sep='\t')
    adata.obs['barcode'] = cells[0].values
    adata.obs_names = cells[0]
    return adata
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None,
                 **keywords):
    """
    Read h5ad, loom, mtx, 10X h5, and csv formatted files

    Parameters
    ----------
    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame
    var: {str, pd.DataFrame}
        Path to var data file or a data frame
    obs_filter {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    var_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    Returns
    -------
    Annotated data matrix.
    """
    # Dispatch on the lower-cased file extension.
    _, ext = os.path.splitext(str(path).lower())
    if ext == '.txt':
        # Plain text table: rows become obs, columns become var.
        df = pd.read_csv(path, engine='python', header=0, sep=None,
                         index_col=0)
        adata = anndata.AnnData(X=df.values,
                                obs=pd.DataFrame(index=df.index),
                                var=pd.DataFrame(index=df.columns))
    elif ext == '.h5ad':
        adata = anndata.read(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    elif ext == '.mtx':
        adata = anndata.read_mtx(path)
    elif ext == '.zarr':
        adata = anndata.read_zarr(path)
    else:
        raise ValueError('Unknown file format: {}'.format(ext))

    def get_df(meta):
        # Accept either a DataFrame or a path (local or gs://) to a CSV
        # indexed by an 'id' column; gs:// files are downloaded to a
        # temporary path that is removed after parsing.
        if not isinstance(meta, pd.DataFrame):
            tmp_path = None
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            meta = pd.read_csv(meta, sep=None, index_col='id',
                               engine='python')
            if tmp_path is not None:
                os.remove(tmp_path)
        return meta

    # Join any number of obs/var metadata tables onto the AnnData object.
    if obs is not None:
        if not isinstance(obs, list) and not isinstance(obs, tuple):
            obs = [obs]
        for item in obs:
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        if not isinstance(var, list) and not isinstance(var, tuple):
            var = [var]
        for item in var:
            adata.var = adata.var.join(get_df(item))
    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
def main():
    """Run the script: assemble an AnnData object from an MTX matrix plus
    cell (obs) and variable (var) metadata tables, then write it to h5ad."""
    parser = build_parser()
    args = parser.parse_args()
    # Cell (obs) metadata: comma- or tab-delimited depending on extension.
    cell_df = pd.read_csv(
        args.cell_info,
        delimiter=","
        if utils.get_file_extension_no_gz(args.cell_info) == "csv" else "\t",
        index_col=args.cellindexcol,
        header=None if args.noheader else "infer",  # 'infer' is default
    )
    if "Barcodes" in cell_df.columns and args.cellindexcol is not None:
        cell_df.index = cell_df["Barcodes"]
    cell_df.index = cell_df.index.rename("barcode")
    cell_df.columns = cell_df.columns.map(str)
    logging.info(f"Read cell metadata from {args.cell_info} {cell_df.shape}")
    logging.info(f"Cell metadata cols: {cell_df.columns}")
    logging.info(cell_df)
    # Variable (var) metadata, same delimiter logic as above.
    var_df = pd.read_csv(
        args.var_info,
        delimiter=","
        if utils.get_file_extension_no_gz(args.var_info) == "csv" else "\t",
        index_col=args.varindexcol,
        header=None if args.noheader else "infer",  # 'infer' is default
    )
    if "Feature" in var_df.columns and args.varindexcol is not None:
        var_df.index = [ensure_sane_interval(s) for s in var_df["Feature"]]
    var_df.index = var_df.index.rename("ft")
    var_df.columns = var_df.columns.map(str)
    # var_df.index = var_df.index.map(str)
    logging.info(f"Read variable metadata from {args.var_info} {var_df.shape}")
    logging.info(f"Var metadata cols: {var_df.columns}")
    logging.info(var_df)
    # Transpose because bio considers rows to be features
    adata = ad.read_mtx(args.mat_file).T
    logging.info(f"Read matrix {args.mat_file} {adata.shape}")
    adata.obs = cell_df
    adata.var = var_df
    logging.info(f"Created AnnData object: {adata}")
    logging.info(f"Obs names: {adata.obs_names}")
    logging.info(f"Var names: {adata.var_names}")
    if args.reindexvar:
        # Optionally align vars to an externally supplied ordered list.
        assert args.varindexcol is not None, "Must provide var index col to reindex var"
        target_vars = utils.read_delimited_file(args.reindexvar)
        logging.info(
            f"Read {args.reindexvar} for {len(target_vars)} vars to reindex")
        adata = adata_utils.reindex_adata_vars(adata, target_vars)
    # Store X as sparse CSR before writing.
    adata.X = csr_matrix(adata.X)
    logging.info(f"Writing to {args.out_h5ad}")
    adata.write_h5ad(args.out_h5ad, compression=None)
def pretrainFolder(folder, species_list, data_type_list=None, out_dir=".",
                   initial_file="", n_mouse=21122, n_human=21183,
                   n_shared=15494, batch_size=100, pretrain_kwargs=None):
    """Pre-train on every ``*.mtx`` file found recursively under *folder*.

    Each matrix is read (transposed) into an AnnData object, annotated
    with its species and data type, and the whole list is handed to
    ``SaverXTrain`` together with any ``*nonmissing.txt`` indicator files.

    Parameters mirror the original call signature; ``pretrain_kwargs``
    now defaults to ``None`` instead of a shared mutable ``{}``.

    Returns
    -------
    The object produced by ``SaverXTrain`` (previously computed but
    silently discarded).
    """
    if pretrain_kwargs is None:  # avoid a shared mutable default argument
        pretrain_kwargs = {}
    mtx_files = [
        y for x in os.walk(folder)
        for y in glob(os.path.join(x[0], '*.mtx'))
    ]
    nonmissing_files = [
        y for x in os.walk(folder)
        for y in glob(os.path.join(x[0], '*nonmissing.txt'))
    ]
    if data_type_list is None:
        data_type_list = ['UMI'] * len(mtx_files)
    # A single species is broadcast to all matrices.
    if len(species_list) == 1:
        species_list = species_list * len(mtx_files)
    # NOTE(review): idx is shuffled but never used afterwards; kept (with
    # the seed) so the global NumPy RNG state matches earlier runs.
    idx = np.arange(len(mtx_files))
    np.random.seed(42)
    np.random.shuffle(idx)
    nonmissing_indicator_list = [np.loadtxt(f) for f in nonmissing_files]
    data_list = []
    for ff in mtx_files:
        print(ff)
        data_list.append(anndata.read_mtx(ff).transpose())
    print(species_list)
    print(data_type_list)
    for i in range(len(mtx_files)):
        data_list[i].uns['species'] = species_list[i]
        print(species_list[i])
        data_list[i].uns['data_type'] = data_type_list[i]
        print(data_type_list[i])
    result = SaverXTrain(data_list, n_human, n_mouse, n_shared,
                         out_dir=out_dir,
                         nonmissing_indicator_list=nonmissing_indicator_list,
                         initial_file=initial_file, batch_size=batch_size,
                         **pretrain_kwargs)
    # Return the training result instead of dropping it on the floor.
    return result
def combine_celltypes(data_dir):
    '''Merge the downloaded FACS-sorted data into one

    Each cell-type directory under PBMC_Zheng_FACS holds a 10X-style
    matrix.mtx / genes.tsv / barcodes.tsv triplet; all are concatenated
    (inner join on genes) with the source cell type recorded in
    obs['cell.type'], written to FACS_adata.h5ad, and returned.
    '''
    celltypes = [
        "b_cells", "cd14_monocytes", "cd34", "cd56_nk", "cd4_t_helper",
        "naive_t", "memory_t", "regulatory_t",  ## CD4+ T cells
        "cytotoxic_t", "naive_cytotoxic"
    ]  ## CD8+ T cells
    adata_list = []
    for celltype in celltypes:
        celltype_dir = data_dir + os.sep + 'PBMC_Zheng_FACS' + os.sep + celltype
        # Transposed so cells become obs (assumes genes x cells on disk,
        # the usual MTX layout -- TODO confirm).
        adata = anndata.read_mtx(celltype_dir + os.sep + 'matrix.mtx').T
        ## load genes
        genes = pd.read_csv(celltype_dir + os.sep + 'genes.tsv',
                            header=None, sep='\t')
        adata.var['gene_symbols'] = genes[1].values
        adata.var_names = adata.var['gene_symbols']
        adata.var_names_make_unique(join="-")
        ## load cells
        cells = pd.read_csv(celltype_dir + os.sep + 'barcodes.tsv',
                            header=None, sep='\t')
        adata.obs['barcode'] = cells[0].values
        adata.obs_names = cells[0]
        adata.obs_names_make_unique(join="-")
        ## append adata
        adata_list.append(adata)
    # Inner join keeps only genes present in every cell-type matrix.
    final_adata = anndata.AnnData.concatenate(
        *adata_list, join='inner', batch_key="cell.type",
        batch_categories=celltypes)  #inner
    final_adata.var.index.name = None
    final_adata.obs.index.name = None
    final_adata.write(data_dir + os.sep + 'PBMC_Zheng_FACS/FACS_adata.h5ad')
    return final_adata
def write_brain_adata(data_dir, region="FC"): '''Loading data from different brain region and make it an anndata, store it @region: FC/HC FC: 29463 genes * 194027 cells -> 71,445 cells HC: 27953 genes * 134430 cells -> 53,204 cells Note: the mtx and genes/barcodes come from RDS file using writeMM and write Because the file is too large, I generated h5ad and deleted original data data <- readRDS("Hippocampus.RDS") writeMM(data, "mouse_HC.mtx") write(rownames(data), "mouse_HC_genes.tsv") write(colnames(data), "mouse_HC_barcodes.tsv") ''' ## load data as anndata adata = anndata.read_mtx(data_dir+os.sep+'Mousebrain/mouse_'+region+'.mtx').T ## load cells and genes genes = pd.read_csv(data_dir+os.sep+'Mousebrain/mouse_'+region+'_genes.tsv', header=None, sep='\t') adata.var['gene_symbols'] = genes[0].values adata.var_names = adata.var['gene_symbols'] adata.var_names_make_unique(join="-") cells = pd.read_csv(data_dir+os.sep+'Mousebrain/mouse_'+region+'_barcodes.tsv', header=None, sep='\t') adata.obs['barcode'] = cells[0].values adata.obs_names = cells[0] adata.obs_names_make_unique(join="-") ## load metadata information df = pd.read_csv(data_dir+os.sep+"Mousebrain/Mousebrain_metadata.csv", index_col=0) df = df[df["mouse_celltypes"] != "unknown"] # remove unknown cell types common_barcodes = set(df["barcodes"]).intersection(set(adata.obs["barcode"])) # 53,204 cells adata = adata[list(common_barcodes)] adata.obs = adata.obs.merge(df, left_on="barcode", right_on="barcodes") adata.obs.index = adata.obs["barcode"] adata.obs.index.name = None adata.var.index.name=None adata.write(data_dir+os.sep+'Mousebrain/'+region+'_adata.h5ad')
def load_file(filepath):
    """Load *filepath* into an AnnData object, dispatching on its suffix.

    The aliases 'default' / 'datasets/user_uploaded/default' and 'test'
    are resolved to bundled datasets first.  The file stem is stored in
    ``adata.uns['dataset']`` and any ``cluster_names`` entry in
    ``adata.uns`` is normalized to a ``bidict`` keyed by ``int``.

    Raises
    ------
    IncorrectFileFormat
        If the file cannot be read.
    """
    # t_flag marks the bundled default CSV, which is stored transposed.
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')
    # Dataset name = file name without directory or extension.
    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]
    try:
        # NOTE(review): if no suffix matches, ``adata`` stays unbound and
        # the access below the try block raises an uncaught NameError.
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")
    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            # Re-key with int labels (presumably keys were serialized as
            # strings on disk -- verify against the writer).
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)
    adata.uns['dataset'] = dataset
    return adata
def load_PBMC_liger_data(data_dir):
    '''Loading PBMC liger 6k cells data

    Reads the PBMC_demuxlet matrix.mtx / genes.tsv / barcodes.tsv triplet
    and returns an AnnData with unique gene symbols and cell barcodes.
    '''
    base = data_dir + os.sep
    adata = anndata.read_mtx(base + 'PBMC_demuxlet/matrix.mtx').T
    gene_table = pd.read_csv(base + 'PBMC_demuxlet/genes.tsv',
                             header=None, sep='\t')
    adata.var['gene_symbols'] = gene_table[0].values
    adata.var_names = adata.var['gene_symbols']
    # Make sure the gene names are unique
    adata.var_names_make_unique(join="-")
    barcode_table = pd.read_csv(base + 'PBMC_demuxlet/barcodes.tsv',
                                header=None, sep='\t')
    adata.obs['barcode'] = barcode_table[0].values
    adata.obs_names = barcode_table[0]
    # Make sure the cell names are unique
    adata.obs_names_make_unique(join="-")
    return adata
### SPLIT SINGLE-CELL DATASET IN GENERATION AND VALIDATION SET ### import pickle import random import anndata import numpy as np import pandas as pd adata_raw = anndata.read_mtx( '/nfs/team205/vk7/sanger_projects/large_data/mouse_viseum_snrna/rawdata/all.mtx' ).T adata_snrna = anndata.read_h5ad( "/nfs/team283/ed6/processed_data/visium_st_beta/snRNA_s144600_preprocessed_20200109.h5ad" ) ## Cell type annotations labels = pd.read_csv( '/nfs/team283/ed6/processed_data/visium_st_beta/snRNA_annotation_20200229.csv', index_col=0) # ## Add cell type labels as columns in adata.obs # adata_snrna = adata_snrna_raw[labels.index,] # adata_snrna.obs = pd.concat([labels, adata_snrna_raw.obs], axis=1) # add cell names obs_id = pd.read_csv( '/nfs/team205/vk7/sanger_projects/large_data/mouse_viseum_snrna/rawdata/all_cells.txt' ) obs = obs_id['cell_id'].str.split(pat="_", expand=True)
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    """Internal dispatcher: read *filename* into an AnnData object.

    h5/h5ad files are returned directly (optionally ``backed``); other
    formats are parsed and, when ``cache`` is true, mirrored to an
    ``.h5ad`` cache file under ``settings.cachedir``.

    NOTE(review): several log/error f-strings contain the literal text
    "(unknown)" where the file name was presumably interpolated before
    this copy was scrubbed -- verify against the upstream source.
    """
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        # Infer (and validate) the extension from the file name.
        ext = is_valid_filename(filename, return_ext=True)
    # May download the file from backup_url when it is missing locally.
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file (unknown)')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file (unknown)')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    # Strip a residual compression suffix from the cache name.
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)
    if not is_present:
        raise FileNotFoundError(f'Did not find file (unknown).')
    logg.debug(f'reading (unknown)')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file',
            )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
def load_PBMC_batch2_data(data_dir, condition=None, ind=None):
    '''Loading PBMC batch2 data

    @condition: ctrl/stim
    @ind: 101 1015 1016 1039 107 1244 1256 1488
        Multiple individuals may be given joined with '_'.
    '''
    ## load genes
    genes = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSE96583_batch2.genes.tsv',
                        header=None, sep='\t')
    ## load control data
    ctrl_adata = anndata.read_mtx(data_dir + os.sep +
                                  'PBMC_demuxlet/GSM2560248_2.1.mtx').T
    ctrl_adata.var['gene_symbols'] = genes[1].values
    ctrl_adata.var_names = ctrl_adata.var['gene_symbols']
    # Make sure the gene names are unique
    ctrl_adata.var_names_make_unique(join="-")
    cells = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSM2560248_barcodes.tsv',
                        header=None, sep='\t')
    # Prefix barcodes so control/stim cells stay distinguishable.
    ctrl_adata.obs['barcode'] = 'ctrl' + cells[0].values
    ctrl_adata.obs_names = cells[0]
    # Make sure the cell names are unique
    ctrl_adata.obs_names_make_unique(join="-")
    ## load stim data
    stim_adata = anndata.read_mtx(data_dir + os.sep +
                                  'PBMC_demuxlet/GSM2560249_2.2.mtx').T
    stim_adata.var['gene_symbols'] = genes[1].values
    stim_adata.var_names = stim_adata.var['gene_symbols']
    # Make sure the gene names are unique
    stim_adata.var_names_make_unique(join="-")
    cells = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSM2560249_barcodes.tsv',
                        header=None, sep='\t')
    stim_adata.obs['barcode'] = 'stim' + cells[0].values
    stim_adata.obs_names = cells[0]
    # Make sure the cell names are unique
    stim_adata.obs_names_make_unique(join="-")
    ## combine control/stimulated data together
    adata = ctrl_adata.concatenate(stim_adata, batch_key="condition",
                                   batch_categories=['control', 'stimulated'])
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    ## load meta data information
    PBMC_batch2_df = load_PBMC_batch2_df(data_dir)
    # Keep only barcodes present in both the matrix and the metadata.
    common_barcodes = set(PBMC_batch2_df['barcode']).intersection(
        set(adata.obs['barcode']))
    adata = adata[list(common_barcodes)]
    adata.obs = adata.obs.merge(PBMC_batch2_df, left_on="barcode",
                                right_on="barcode")
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.var.index.name = None
    adata.obs.rename(columns={'cell': 'cell.type'}, inplace=True)
    if condition is not None:
        cond_cells = adata.obs[adata.obs["condition"] == condition].index
        adata = adata[cond_cells]
    if ind is not None:
        # ind may list several individuals separated by '_'.
        ind_cells = adata.obs[adata.obs["ind"].isin(ind.split('_'))].index
        adata = adata[ind_cells]
    return adata
def load_PBMC_batch1_data(data_dir, batch=None, ind=None):
    '''Loading PBMC batch1 S1 dataset from W1-3

    @batch: A/B/C
    @ind: 1043 1079 1154 1249 1493 1511 1598 1085
        Multiple individuals may be given joined with '_'.
    '''
    ## load batch1 genes (shared by the three matrices)
    genes = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSE96583_batch1.genes.tsv',
                        header=None, sep='\t')
    ## load matrix A data
    A_adata = anndata.read_mtx(
        data_dir + os.sep +
        'PBMC_demuxlet/GSM2560245_A.mat').T  # 3639 inviduals
    ## load cells
    A_adata.var['gene_symbols'] = genes[1].values
    A_adata.var_names = A_adata.var['gene_symbols']
    A_adata.var_names_make_unique(join="-")  # make unique
    A_cells = pd.read_csv(data_dir + os.sep +
                          'PBMC_demuxlet/GSM2560245_barcodes.tsv',
                          header=None, sep='\t')
    # Prefix barcodes with the well so batches stay distinguishable.
    A_adata.obs['barcode'] = 'A_' + A_cells[0].values
    A_adata.obs_names = A_cells[0]
    A_adata.obs_names_make_unique(join="-")  # make unique
    ## load matrix B data
    B_adata = anndata.read_mtx(
        data_dir + os.sep +
        'PBMC_demuxlet/GSM2560246_B.mat').T  # 4246 inviduals
    ## load cells
    B_adata.var['gene_symbols'] = genes[1].values
    B_adata.var_names = B_adata.var['gene_symbols']
    B_adata.var_names_make_unique(join="-")  # make unique
    B_cells = pd.read_csv(data_dir + os.sep +
                          'PBMC_demuxlet/GSM2560246_barcodes.tsv',
                          header=None, sep='\t')
    B_adata.obs['barcode'] = 'B_' + B_cells[0].values
    B_adata.obs_names = B_cells[0]
    B_adata.obs_names_make_unique(join="-")  # make unique
    ## load matrix C data
    C_adata = anndata.read_mtx(
        data_dir + os.sep +
        'PBMC_demuxlet/GSM2560247_C.mat').T  # 6145 inviduals
    ## load cells
    C_adata.var['gene_symbols'] = genes[1].values
    C_adata.var_names = C_adata.var['gene_symbols']
    C_adata.var_names_make_unique(join="-")  # make unique
    C_cells = pd.read_csv(data_dir + os.sep +
                          'PBMC_demuxlet/GSM2560247_barcodes.tsv',
                          header=None, sep='\t')
    C_adata.obs['barcode'] = 'C_' + C_cells[0].values
    C_adata.obs_names = C_cells[0]
    C_adata.obs_names_make_unique(join="-")  # make unique
    # combine data together
    adata = A_adata.concatenate(B_adata, C_adata, batch_key="batch",
                                batch_categories=['A', 'B', 'C'])
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    ## load meta data information
    PBMC_batch1_df = load_PBMC_batch1_df(data_dir)
    # Keep only barcodes present in both the matrices and the metadata.
    common_barcodes = set(PBMC_batch1_df['barcode']).intersection(
        set(adata.obs['barcode']))
    adata = adata[list(common_barcodes)]
    adata.obs = adata.obs.merge(PBMC_batch1_df, on=["barcode", "batch"],
                                how="left")
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.var.index.name = None
    if batch is not None:
        batch_cells = adata.obs[adata.obs['batch'] == batch].index
        adata = adata[batch_cells]
    if ind is not None:
        # ind may list several individuals separated by '_'.
        ind_list = [int(x) for x in ind.split('_')]
        ind_cells = adata.obs[adata.obs["ind"].isin(ind_list)].index
        adata = adata[ind_cells]
    return adata
def autoencode(n_inoutnodes_human, n_inoutnodes_mouse, shared_size,
               adata=None, mtx_file=None, pred_adata=None,
               pred_mtx_file=None, species=None, nonmissing_indicator=None,
               initial_file="", out_dir=".", write_output_to_tsv=False,
               save_data=False, verbose=True, verbose_sum=True,
               verbose_fit=1, batch_size=32, data_name='', net_kwargs={},
               training_kwargs={}):
    """Train the joint human/mouse autoencoder on *adata* (or *mtx_file*)
    and store the denoised output in ``obsm['X_dca']``.

    When *pred_adata*/*pred_mtx_file* is given, the trained network is
    applied to that data instead and the prediction AnnData is returned;
    otherwise the training AnnData is returned.  Returns None if neither
    adata nor mtx_file is supplied.

    NOTE(review): the mutable default arguments net_kwargs/training_kwargs
    are only unpacked here, but sharing a default dict across calls is
    fragile -- consider None defaults.
    """
    ###############
    if adata is None:
        if mtx_file is None:
            print('Either adata or mtx_file should be provided')
            return
        adata = anndata.read_mtx(mtx_file).transpose()
        if data_name == '':
            # Derive a name prefix from the mtx file name.
            data_name = re.sub(r'.*/', '', mtx_file)
            data_name = data_name.replace('.mtx', '') + '_'
    assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance'
    ## add other information into AnnData instances
    adata.X = csr_matrix(adata.X)
    if species is not None:
        adata.uns['species'] = species
    adata.uns['data_type'] = 'UMI'
    # set seed for reproducibility
    np.random.seed(42)
    tf.random.set_seed(42)
    # print(type(adata.X))
    adata = read_dataset(adata, transpose=False, test_split=False,
                         verbose=verbose, copy=False)
    if 'X_dca' in adata.obsm_keys():
        # Data was already denoised once: reuse X_dca and skip filtering
        # and size-factor computation.
        filter_min_counts = False
        size_factors = False
        adata.X = csr_matrix(adata.obsm['X_dca'])
        if pred_adata:
            pred_adata.X = csr_matrix(pred_adata.obsm['X_dca'])
    else:
        filter_min_counts = True
        size_factors = True
    adata = normalize(adata, filter_min_counts=filter_min_counts,
                      size_factors=size_factors, logtrans_input=True)
    # First shared_size columns are the cross-species shared genes.
    adata.uns['shared'] = adata.X.tocsc()[:, 0:shared_size].tocsr()
    # print(type(adata.X))
    if pred_adata or pred_mtx_file:
        if pred_adata is None:
            pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
        else:
            pred_adata.X = csr_matrix(pred_adata.X)
        pred_adata.uns['species'] = species
        pred_adata.uns['data_type'] = 'UMI'
        pred_adata = read_dataset(pred_adata, transpose=False,
                                  verbose=verbose, test_split=False)
        pred_adata = normalize(pred_adata, size_factors=size_factors,
                               logtrans_input=True)
        pred_adata.uns['shared'] = pred_adata.X.tocsc()[:, 0:shared_size].tocsr()
    if nonmissing_indicator is None:
        nonmissing_indicator = 1
    net = nj.JointAutoencoder(input_size_human=n_inoutnodes_human,
                              input_size_mouse=n_inoutnodes_mouse,
                              shared_size=shared_size, **net_kwargs)
    net.build()
    if (initial_file != ""):
        # Warm-start from previously saved weights.
        net.load_weights(initial_file)
        print("Weights loaded from %s!" % initial_file)
    model = tj.train_joint(adata[adata.obs.DCA_split == 'train'],
                           adata.uns['shared'], net, output_dir=out_dir,
                           batch_size=batch_size, save_weights=True,
                           verbose=verbose, verbose_sum=verbose_sum,
                           verbose_fit=verbose_fit,
                           nonmissing_indicator=nonmissing_indicator,
                           **training_kwargs)
    # Reload the best weights written during training.
    model.load_weights("%s/weights.hdf5" % out_dir)
    if pred_adata or pred_mtx_file:
        del adata
        res = net.predict(pred_adata, pred_adata.uns['shared'])
        del model, net
        gc.collect()
        pred_adata.obsm['X_dca'] = res['mean_norm']
        if write_output_to_tsv:
            print('Saving files ...')
            write_text_matrix(
                res['mean_norm'],
                os.path.join(out_dir, data_name + 'pred_mean_norm.tsv'))
        if save_data:
            with open(os.path.join(out_dir, data_name + 'pred_adata.pickle'),
                      'wb') as f:
                pickle.dump(pred_adata, f, protocol=4)
            f.close()  # redundant: the with-block already closed f
        return pred_adata
    res = net.predict(adata, adata.uns['shared'])
    del model, net
    gc.collect()
    adata.obsm['X_dca'] = res['mean_norm']
    if write_output_to_tsv:
        print('Saving files ...')
        write_text_matrix(res['mean_norm'],
                          os.path.join(out_dir, data_name + 'mean_norm.tsv'))
        # write_text_matrix(res['dispersion'],
        #     os.path.join(out_dir, data_name + 'dispersion.tsv'))
    if save_data:
        with open(os.path.join(out_dir, data_name + 'adata.pickle'),
                  'wb') as f:
            pickle.dump(adata, f, protocol=4)
        f.close()  # redundant: the with-block already closed f
    return adata
def load_shareseq_data(tissue: str, dirname: str, mode: str = "RNA") -> AnnData:
    """Load the SHAREseq data.

    ``tissue`` selects one of skin/brain/lung; ``mode`` selects the RNA
    counts (restricted to barcodes shared with ATAC) or the ATAC peak
    counts.  Returns an AnnData with a sparse CSR matrix.
    """
    assert os.path.isdir(dirname)
    # GEO file names per tissue: (barcodes, counts, peaks) for ATAC.
    atac_fname_dict = {
        "skin": [
            "GSM4156597_skin.late.anagen.barcodes.txt.gz",
            "GSM4156597_skin.late.anagen.counts.txt.gz",
            "GSM4156597_skin.late.anagen.peaks.bed.gz",
        ],
        "brain": [
            "GSM4156599_brain.barcodes.txt.gz",
            "GSM4156599_brain.counts.txt.gz",
            "GSM4156599_brain.peaks.bed.gz",
        ],
        "lung": [
            "GSM4156600_lung.barcodes.txt.gz",
            "GSM4156600_lung.counts.txt.gz",
            "GSM4156600_lung.peaks.bed.gz",
        ],
    }
    rna_fname_dict = {
        "skin": "GSM4156608_skin.late.anagen.rna.counts.txt.gz",
        "brain": "GSM4156610_brain.rna.counts.txt.gz",
        "lung": "GSM4156611_lung.rna.counts.txt.gz",
    }
    assert atac_fname_dict.keys() == rna_fname_dict.keys()
    assert tissue in atac_fname_dict.keys(), f"Unrecognized tissue: {tissue}"
    atac_barcodes_fname, atac_counts_fname, atac_peaks_fname = atac_fname_dict[
        tissue]
    assert "barcodes" in atac_barcodes_fname  # Check fnames are unpacked correctly
    assert "counts" in atac_counts_fname
    assert "peaks" in atac_peaks_fname
    atac_cell_barcodes = pd.read_csv(
        os.path.join(dirname, atac_barcodes_fname),
        delimiter="\t",
        index_col=0,
        header=None,
    )
    # Barcodes use commas where periods are expected; normalize.
    atac_cell_barcodes.index = [
        i.replace(",", ".") for i in atac_cell_barcodes.index
    ]
    # Load in RNA data
    if mode == "RNA":
        retval = ad.read_text(os.path.join(dirname, rna_fname_dict[tissue])).T
        # Ensure that we return a sparse matrix as the underlying datatype
        retval.X = scipy.sparse.csr_matrix(retval.X)
        # Fix formatting of obs names where commas were used for periods
        retval.obs.index = [i.replace(",", ".") for i in retval.obs.index]
        # Keep only barcodes shared with the ATAC modality.
        intersected_barcodes = [
            bc for bc in retval.obs_names
            if bc in set(atac_cell_barcodes.index)
        ]
        assert intersected_barcodes, f"No common barcodes between RNA/ATAC for {tissue}"
        logging.info(
            f"RNA {tissue} intersects {len(intersected_barcodes)}/{len(retval.obs_names)} barcodes with ATAC"
        )
        retval = retval[intersected_barcodes]
    elif mode == "ATAC":
        # Load in ATAC data
        # read_mtx automatically gives us a sparse matrix
        retval = ad.read_mtx(os.path.join(dirname, atac_counts_fname)).T
        # Attach metadata
        retval.obs = atac_cell_barcodes
        atac_peaks = pd.read_csv(
            os.path.join(dirname, atac_peaks_fname),
            delimiter="\t",
            header=None,
            names=["chrom", "start", "end"],
        )
        # Label peaks as "chrom:start-end".
        atac_peaks.index = [
            f"{c}:{s}-{e}" for _i, c, s, e in atac_peaks.itertuples()
        ]
        retval.var = atac_peaks
    else:
        raise ValueError("mode must be either RNA or ATAC")
    assert isinstance(retval.X, scipy.sparse.csr_matrix)
    return retval
def autoencode(
        adata=None,
        mtx_file=None,
        pred_adata=None,  ## cross-validation purpose
        pred_mtx_file=None,
        out_dir=".",
        write_output_to_tsv=False,
        save_data=False,
        verbose=True,
        verbose_sum=True,
        verbose_fit=1,
        batch_size=32,
        data_name="",
        nonmissing_indicator=None,
        net_kwargs={}):
    """Train an NB constant-dispersion autoencoder on *adata* (or
    *mtx_file*) and store the denoised output in ``obsm['X_dca']``.

    When *pred_adata*/*pred_mtx_file* is given, the trained network is
    applied to that data instead and the prediction AnnData is returned;
    otherwise the training AnnData (with dispersion in
    ``var['X_dca_dispersion']``) is returned.  Returns None if neither
    adata nor mtx_file is supplied.
    """
    ###############
    if adata is None:
        if mtx_file is None:
            print('Either adata or mtx_file should be provided')
            return
        adata = anndata.read_mtx(mtx_file).transpose()
        if data_name == "":
            # Derive a name prefix from the mtx file name.
            data_name = re.sub(r'.*/', '', mtx_file)
            data_name = data_name.replace('.mtx', '') + '_'
    assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance'
    adata.uns['data_type'] = 'UMI'
    # set seed for reproducibility
    np.random.seed(42)
    tf.random.set_seed(42)
    adata = read_dataset(adata, transpose=False, test_split=False,
                         verbose=verbose, copy=False)
    adata = normalize(adata, filter_min_counts=True, size_factors=True,
                      logtrans_input=True)
    if pred_adata or pred_mtx_file:
        if pred_adata is None:
            pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
        pred_adata.uns['data_type'] = 'UMI'
        pred_adata = read_dataset(pred_adata, transpose=False,
                                  test_split=False, verbose=verbose,
                                  copy=False)
        pred_adata = normalize(pred_adata, size_factors=True,
                               logtrans_input=True)
    net = NBConstantDispAutoencoder(input_size=adata.n_vars,
                                    nonmissing_indicator=nonmissing_indicator,
                                    **net_kwargs)
    net.build()
    loss = train(adata[adata.obs.DCA_split == 'train'], net,
                 output_dir=out_dir, batch_size=batch_size,
                 save_weights=True,
                 nonmissing_indicator=nonmissing_indicator,
                 verbose=verbose, verbose_sum=verbose_sum,
                 verbose_fit=verbose_fit)
    # Reload the best weights written during training.
    net.load_weights("%s/weights.hdf5" % out_dir)
    if pred_adata or pred_mtx_file:
        del adata
        res = net.predict(pred_adata)
        pred_adata.obsm['X_dca'] = res['mean_norm']
        del net, loss
        gc.collect()
        if write_output_to_tsv:
            print('Saving files ...')
            write_text_matrix(
                res['mean_norm'],
                os.path.join(out_dir, data_name + 'pred_mean_norm.tsv'))
        if save_data:
            with open(os.path.join(out_dir, data_name + 'pred_adata.pickle'),
                      'wb') as f:
                pickle.dump(pred_adata, f, protocol=4)
            f.close()  # redundant: the with-block already closed f
        return pred_adata
    res = net.predict(adata)
    adata.obsm['X_dca'] = res['mean_norm']
    adata.var['X_dca_dispersion'] = res['dispersion']
    if write_output_to_tsv:
        print('Saving files ...')
        write_text_matrix(res['mean_norm'],
                          os.path.join(out_dir, data_name + 'mean_norm.tsv'))
        write_text_matrix(res['dispersion'],
                          os.path.join(out_dir, data_name + 'dispersion.tsv'))
    if save_data:
        with open(os.path.join(out_dir, data_name + 'adata.pickle'),
                  'wb') as f:
            pickle.dump(adata, f, protocol=4)
        f.close()  # redundant: the with-block already closed f
    del net, loss
    gc.collect()
    return adata
nonmissing_indicator = 1, out_dir = '../data/10X_pbmc_filtered' batch_size = 261 write_output_to_tsv = False save_data = True verbose = True verbose_sum = True verbose_fit = 1 seed = 1 data_name = "" curve = np.loadtxt(curve_file_name) print(curve) adata = anndata.read_mtx(mtx_file).transpose() assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance' # set seed for reproducibility np.random.seed(seed) tf.set_random_seed(seed) adata = read_dataset(adata, transpose=False, test_split=False, verbose=verbose, copy=False) pred_adata = anndata.read_mtx(pred_mtx_file).transpose() pred_adata = read_dataset(pred_adata,
def load_mouseprotocol_adata(data_dir, exp=None, protocol=None, curate=False):
    '''Load Mouse cortex data from different protocols.

    After comparing count.umis.txt and count.reads.txt: although they have
    the same dimensions, some entries differ. Therefore:
    - plate-based cells are extracted from count.reads.txt
    - droplet-based cells are extracted from count.umis.txt
    and the two AnnData objects are concatenated.

    @exp: cortex1, cortex2
    @protocol: plate-based (Smart-seq2), droplet-based (DroNc-seq,
        sci-RNA-seq, 10x Chromium)
    @curate: whether to curate cell-type labels to a common vocabulary
    '''
    plate_protocols = ["Smart-seq2"]
    metadata_df = pd.read_csv(data_dir + os.sep +
                              "Mousecortex_protocols/metadata.txt",
                              header=0, sep="\t")
    metadata_df = metadata_df[metadata_df["CellType"] != "Unassigned"]  ## remove unassigned

    ## they used the same cells and genes indicator for both matrices
    cells = pd.read_csv(data_dir + os.sep +
                        "Mousecortex_protocols/cell.names.new.txt", header=None)
    genes = pd.read_csv(data_dir + os.sep +
                        "Mousecortex_protocols/genes.counts.txt", header=None)

    def _assemble(counts_file, protocol_metadata):
        """Read one counts matrix, attach gene/cell names, keep only cells
        present in `protocol_metadata`, and merge that metadata into .obs."""
        prot_adata = anndata.read_mtx(data_dir + os.sep + counts_file).T
        prot_adata.var['gene_symbols'] = [x.split('_')[1] for x in genes[0].values]
        prot_adata.var_names = prot_adata.var['gene_symbols']
        prot_adata.var_names_make_unique(join="-")  # make unique
        prot_adata.var_names.name = None
        prot_adata.obs['barcode'] = cells[0].values
        prot_adata.obs_names = prot_adata.obs['barcode']
        prot_adata.obs_names_make_unique(join="-")  ## make unique
        prot_adata.obs_names.name = None
        common_cells = set(protocol_metadata['NAME']).intersection(
            set(prot_adata.obs_names))
        common_cells = list(common_cells)
        prot_adata = prot_adata[common_cells]
        obs_df = prot_adata.obs.merge(protocol_metadata, how='left',
                                      left_index=True, right_on='NAME')
        obs_df.index = obs_df['barcode'].values
        prot_adata.obs = obs_df
        return prot_adata

    ## plate-based data (read counts)
    plate_metadata = metadata_df[metadata_df['Method'].isin(plate_protocols)]
    read_adata = _assemble("Mousecortex_protocols/count.reads.txt",
                           plate_metadata)

    ## umi-based (droplet) data
    droplet_metadata = metadata_df[~metadata_df['Method'].isin(plate_protocols)]
    umi_adata = _assemble("Mousecortex_protocols/count.umis.txt",
                          droplet_metadata)

    ## concatenate adata together
    adata = read_adata.concatenate(umi_adata, batch_key="protocol_type",
                                   batch_categories=['plate', 'droplet'])
    adata.obs.rename(columns={'CellType': 'cell.type'}, inplace=True)
    adata_obs = adata.obs
    adata_obs['Method'].replace(['10x Chromium'], '10x', inplace=True)
    adata.obs = adata_obs

    if exp is not None:
        exp_cells = adata.obs[adata.obs['Experiment'] == exp].index
        adata = adata[exp_cells]
    if protocol is not None:
        proc_cells = adata.obs[adata.obs['Method'] == protocol].index
        adata = adata[proc_cells]
    if curate:
        # Harmonize cell-type names with the vocabulary used elsewhere.
        adata_obs = adata.obs
        adata_obs["cell.type"].replace(['Astrocyte'], 'Astrocytes',
                                       inplace=True)
        adata_obs["cell.type"].replace(['Excitatory neuron'], 'Neuron',
                                       inplace=True)
        adata_obs["cell.type"].replace(['Inhibitory neuron'], 'Interneuron',
                                       inplace=True)
        adata_obs["cell.type"].replace(['Oligodendrocyte'], 'Oligodendrocytes',
                                       inplace=True)
        adata_obs["cell.type"].replace(['OPC'], 'Polydendrocytes',
                                       inplace=True)
        #adata_obs["cell.type"].replace(['Pericyte'], 'Mural', inplace=True) ## seems not the same cell types
        ## Endothelial and Microglia as it is
        adata.obs = adata_obs
    return adata
def autoencode(
        adata=None,
        curve_file_name=None,
        mtx_file=None,
        pred_adata=None,       # cross-validation purpose
        pred_mtx_file=None,
        out_dir=".",
        write_output_to_tsv=False,
        save_data=False,
        verbose=True,
        verbose_sum=True,
        verbose_fit=1,
        batch_size=32,
        seed=1,
        data_name="",
        nonmissing_indicator=None):
    """Train a decay-model autoencoder whose dropout probability follows a
    parametric decay curve loaded from `curve_file_name`.

    Exactly one of `adata` / `mtx_file` must be provided. The dropout
    probability `pi` is computed as
    ``PiAct(curve[1] * exp(curve[0] - exp(curve[2]) * x))`` and after
    prediction it is re-evaluated at the fitted mean. Uses the TF1-style
    API (`tf.set_random_seed`, `tf.Session`).

    Parameters mirror the NB-autoencoder variant, plus:
    curve_file_name : str
        Text file with the three decay-curve parameters (read via np.loadtxt).
    seed : int
        Seed for numpy and tensorflow RNGs.

    Returns
    -------
    (output_mean, output_dispersion, output_pi) — numpy arrays from the
    network prediction, or None when neither `adata` nor `mtx_file` is given.
    """
    print(out_dir)
    curve = np.loadtxt(curve_file_name)
    print(curve)

    if adata is None:
        if mtx_file is None:
            print('Either adata or mtx_file should be provided')
            return
        adata = anndata.read_mtx(mtx_file).transpose()
        if data_name == "":
            # Derive a file-name prefix from the mtx path, e.g. "x/y.mtx" -> "y_".
            data_name = re.sub(r'.*/', '', mtx_file)
            data_name = data_name.replace('.mtx', '') + '_'

    assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance'
    adata.uns['data_type'] = 'UMI'

    # set seed for reproducibility
    np.random.seed(seed)
    tf.set_random_seed(seed)

    adata = read_dataset(adata,
                         transpose=False,
                         test_split=False,
                         verbose=verbose,
                         copy=False)
    adata.raw = adata

    # Explicit None checks: truthiness of an AnnData object is ambiguous.
    predict_external = pred_adata is not None or pred_mtx_file is not None
    if predict_external:
        if pred_adata is None:
            pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
        pred_adata.uns['data_type'] = 'UMI'
        pred_adata = read_dataset(pred_adata,
                                  transpose=False,
                                  test_split=False,
                                  verbose=verbose,
                                  copy=False)

    # Initial dropout probabilities evaluated at the raw (dense) counts.
    tmpX = adata.X.A
    tmpX = tf.convert_to_tensor(tmpX, dtype=np.float32)
    curve = tf.cast(curve, tf.float32)
    pi = PiAct(curve[1] * K.exp(curve[0] - K.exp(curve[2]) * tmpX))
    net = DecayModelAutoencoder(curve=curve, pi=pi, input_size=adata.n_vars,
                                nonmissing_indicator=nonmissing_indicator)
    net.build()
    print("going into training..")

    loss = train(adata[adata.obs.DCA_split == 'train'],
                 net,
                 output_dir=out_dir,
                 batch_size=batch_size,
                 save_weights=True,
                 nonmissing_indicator=nonmissing_indicator,
                 verbose=verbose,
                 verbose_sum=verbose_sum,
                 verbose_fit=verbose_fit)
    # Reload the best weights saved during training.
    net.load_weights("%s/weights.hdf5" % out_dir)

    def _recompute_pi(mean_norm):
        """Re-evaluate the decay curve at the fitted mean (TF1 session run)."""
        mean_tensor = tf.convert_to_tensor(mean_norm)
        pi_tensor = PiAct(
            curve[1] * K.exp(curve[0] - K.exp(curve[2]) * mean_tensor))
        return tf.Session().run(pi_tensor)

    if predict_external:
        del adata  # free the training data before predicting
        res = net.predict(pred_adata)
        output_dispersion = res['dispersion']
        output_mean = res['mean_norm']
        output_pi = _recompute_pi(output_mean)
        del net, loss
        gc.collect()
        if write_output_to_tsv:
            print('Saving files ...')
            write_text_matrix(
                res['mean_norm'],
                os.path.join(out_dir, data_name + 'pred_mean_norm.tsv'))
        if save_data:
            # `with` closes the file; no explicit close() needed.
            with open(os.path.join(out_dir, data_name + 'pred_adata.pickle'),
                      'wb') as f:
                pickle.dump(pred_adata, f, protocol=4)
        return output_mean, output_dispersion, output_pi

    res = net.predict(adata)
    output_dispersion = res['dispersion']
    output_mean = res['mean_norm']
    # NOTE: res['pi'] is deliberately ignored — the original assigned it and
    # then immediately overwrote it with the value recomputed from the mean.
    output_pi = _recompute_pi(output_mean)
    if write_output_to_tsv:
        print('Saving files ...')
        write_text_matrix(res['mean_norm'],
                          os.path.join(out_dir, data_name + 'mean_norm.tsv'))
        write_text_matrix(res['dispersion'],
                          os.path.join(out_dir, data_name + 'dispersion.tsv'))
    if save_data:
        with open(os.path.join(out_dir, data_name + 'adata.pickle'), 'wb') as f:
            pickle.dump(adata, f, protocol=4)
    del net, loss
    gc.collect()
    return output_mean, output_dispersion, output_pi
def load_PBMC_protocols_data(data_dir, exp=None, protocol=None,
                             protocol_type=None, curate=False):
    '''Load PBMC data from different protocols.

    After comparing cells.read.new.txt and cells.umi.new.txt, the cell names
    remain as in cells.umi.new.txt; the umi and read count matrices are not
    the same. Therefore:
    - plate-based cells are extracted from counts.read.txt
    - droplet-based cells are extracted from counts.umi.txt
    and the two AnnData objects are concatenated.

    @exp: pbmc1 (froze) and pbmc2 (fresh); multiple values joined with '_'
    @protocol: plate-based (Smart-Seq2/CEL-Seq2), droplet-based (10X v2,
        10X v3, Drop-seq, Seq-Well, inDrops); multiple values joined with '_'
    @protocol_type: plate or droplet
    @curate: whether to curate cell-type labels to a common vocabulary
    '''
    plate_protocols = ["CEL-Seq2", "Smart-seq2"]
    metadata_df = pd.read_csv(data_dir + os.sep + "PBMC_protocols/metadata.txt",
                              header=0, sep="\t")
    if curate:
        # Harmonize cell-type names with the vocabulary used elsewhere.
        metadata_df["CellType"].replace(["B cell"], 'B cells', inplace=True)
        metadata_df["CellType"].replace(["CD14+ monocyte"], 'CD14+ Monocytes',
                                        inplace=True)
        metadata_df["CellType"].replace(["CD4+ T cell"], 'CD4 T cells',
                                        inplace=True)
        metadata_df["CellType"].replace(["Cytotoxic T cell"], 'CD8 T cells',
                                        inplace=True)
        metadata_df["CellType"].replace(["Natural killer cell"], 'NK cells',
                                        inplace=True)

    def _assemble(counts_file, cells_file, genes_file, protocol_metadata):
        """Read one counts matrix with its cell/gene name files, keep only
        cells present in `protocol_metadata`, and merge that metadata
        into .obs."""
        prot_adata = anndata.read_mtx(data_dir + os.sep + counts_file).T
        prot_cells = pd.read_csv(data_dir + os.sep + cells_file, header=None)
        prot_genes = pd.read_csv(data_dir + os.sep + genes_file, header=None)
        prot_adata.var['gene_symbols'] = [x.split('_')[1]
                                          for x in prot_genes[0].values]
        prot_adata.var_names = prot_adata.var['gene_symbols']
        prot_adata.var_names_make_unique(join="-")  # make unique
        prot_adata.var_names.name = None
        prot_adata.obs['barcode'] = prot_cells[0].values
        prot_adata.obs_names = prot_adata.obs['barcode']
        prot_adata.obs_names_make_unique(join="-")  ## make unique
        prot_adata.obs_names.name = None
        common_cells = set(protocol_metadata['NAME']).intersection(
            set(prot_adata.obs_names))
        common_cells = list(common_cells)
        prot_adata = prot_adata[common_cells]
        obs_df = prot_adata.obs.merge(protocol_metadata, how='left',
                                      left_index=True, right_on='NAME')
        obs_df.index = obs_df['barcode'].values
        prot_adata.obs = obs_df
        return prot_adata

    ## plate-based data (read counts)
    plate_metadata = metadata_df[metadata_df['Method'].isin(plate_protocols)]
    read_adata = _assemble("PBMC_protocols/counts.read.txt",
                           "PBMC_protocols/cells.read.new.txt",
                           "PBMC_protocols/genes.read.txt",
                           plate_metadata)  # 1052 cells

    ## umi-based (droplet) data
    droplet_metadata = metadata_df[~metadata_df['Method'].isin(plate_protocols)]
    umi_adata = _assemble("PBMC_protocols/counts.umi.txt",
                          "PBMC_protocols/cells.umi.new.txt",
                          "PBMC_protocols/genes.umi.txt",
                          droplet_metadata)  # 29969 cells

    ## concatenate adata together
    adata = read_adata.concatenate(umi_adata, batch_key="protocol_type",
                                   batch_categories=['plate', 'droplet'])
    adata.obs.rename(columns={'CellType': 'cell.type'}, inplace=True)
    adata_obs = adata.obs
    # Collapse the 10x variant labels to the short names used downstream.
    adata_obs['Method'].replace(['10x Chromium (v2)', '10x Chromium (v2) A',
                                 '10x Chromium (v2) B'], '10x-v2',
                                inplace=True)
    adata_obs['Method'].replace(['10x Chromium (v3)'], '10x-v3', inplace=True)
    adata.obs = adata_obs

    if exp is not None:
        exp_cells = adata.obs[adata.obs['Experiment'].isin(
            exp.split('_'))].index
        adata = adata[exp_cells]
    if protocol is not None:
        prot_cells = adata.obs[adata.obs['Method'].isin(
            protocol.split('_'))].index
        adata = adata[prot_cells]
    if protocol_type is not None:
        prot_type_cells = adata.obs[adata.obs['protocol_type'] ==
                                    protocol_type].index
        adata = adata[prot_type_cells]
    return adata