def save_to_dataset(path,
                    X,
                    X_col=None,
                    y=None,
                    y_col=None,
                    rowname=None,
                    print_log=True):
  r""" Save a dataset to a folder as memory-mapped arrays and pickled metadata.

  Arguments:
    path : output folder path
    X : (n_samples, n_genes) gene expression matrix
    X_col : (n_genes,) name of each gene
    y : (n_samples, n_proteins) protein marker level matrix
    y_col : (n_proteins,) name of each protein
    rowname : (n_samples,) name of cells (i.e. the samples)
    print_log : bool (default: True)
  """
  _check_data(X, X_col, y, y_col, rowname)
  assert os.path.isdir(path), "'%s' must be path to a folder" % path
  # save data
  if print_log:
    print("Saving data to %s ..." % ctext(path, 'cyan'))
  # a sparse matrix is pickled, a dense matrix is memory-mapped
  if sparse.issparse(X):
    with open(os.path.join(path, 'X'), 'wb') as f:
      pickle.dump(X, f)
  else:
    with MmapArrayWriter(path=os.path.join(path, 'X'),
                         dtype='float32',
                         shape=(0, X.shape[1]),
                         remove_exist=True) as out:
      out.write(X)
  # save the meta info (X features)
  if X_col is not None:
    with open(os.path.join(path, 'X_col'), 'wb') as f:
      pickle.dump(X_col, f)
  # saving the label data (can be continuous, discrete or binary)
  if y is not None and len(y.shape) > 0 and y.shape[1] != 0:
    if sparse.issparse(y):
      with open(os.path.join(path, 'y'), 'wb') as f:
        pickle.dump(y, f)
    else:
      with MmapArrayWriter(path=os.path.join(path, 'y'),
                           dtype='float32',
                           shape=(0, y.shape[1]),
                           remove_exist=True) as out:
        out.write(y)
    with open(os.path.join(path, 'y_col'), 'wb') as f:
      pickle.dump(y_col, f)
  # row names for both X and y
  if rowname is not None:
    with open(os.path.join(path, 'X_row'), 'wb') as f:
      pickle.dump(rowname, f)
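# A minimal usage sketch for `save_to_dataset` (the toy matrices below are
# hypothetical; note the output folder must already exist):
#   import numpy as np
#   os.makedirs('/tmp/my_dataset', exist_ok=True)
#   X = np.random.rand(100, 2000).astype('float32')  # 100 cells x 2000 genes
#   genes = np.array([f'gene{i}' for i in range(2000)])
#   cells = np.array([f'cell{i}' for i in range(100)])
#   save_to_dataset('/tmp/my_dataset', X=X, X_col=genes, rowname=cells)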
def test_write_single_time(self):
  fpath = _get_tempfile()
  array = np.arange(0, 100, dtype='float32').reshape(-1, 5)
  # write the whole array in one pass
  with MmapArrayWriter(path=fpath,
                       shape=array.shape,
                       dtype=array.dtype,
                       remove_exist=True) as f:
    f.write(array)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
  # re-open the existing file and append the same array
  with MmapArrayWriter(path=fpath, remove_exist=False) as f:
    f.write(array)
  x = MmapArray(fpath)
  self.assertTrue(np.all(np.concatenate([array, array], axis=0) == x))
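# The tests here rely on a `_get_tempfile` helper whose definition is not
# shown; a plausible sketch (an assumption, not the original implementation)
# is a fresh path inside a temporary directory:
def _get_tempfile():
  import os
  import tempfile
  return os.path.join(tempfile.mkdtemp(), 'array')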
def test_write_multiple_time(self):
  fpath = _get_tempfile()
  array = np.arange(0, 1000, dtype='float32').reshape(-1, 2, 5)
  # write the array in chunks of 8 samples
  with MmapArrayWriter(path=fpath,
                       shape=(0,) + array.shape[1:],
                       dtype=array.dtype,
                       remove_exist=True) as f:
    for i in range(0, array.shape[0], 8):
      f.write(array[i:i + 8])
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
  # overwrite rows [10, 10 + len(array1)) in-place via start_position
  array1 = np.arange(0, 100, dtype='float32').reshape(-1, 2, 5)
  array[10:10 + array1.shape[0]] = array1
  with MmapArrayWriter(path=fpath, remove_exist=False) as f:
    f.write(array1, start_position=10)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
def __init__(self, path='~/tensorflow_datasets/3dshapes.h5', cache_dir=None,
             seed=8):
  path = os.path.abspath(os.path.expanduser(path))
  assert os.path.exists(path), "Path to file %s must exist" % path
  self.path = path
  if cache_dir is None:
    cache_dir = os.path.dirname(path)
  if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)
  image_path = os.path.join(cache_dir, '3dshapes.images')
  label_path = os.path.join(cache_dir, '3dshapes.labels')
  # ====== read the dataset and cache it again ====== #
  if not os.path.exists(image_path) or not os.path.exists(label_path):
    import h5py
    with h5py.File(path, 'r') as dataset:
      images = dataset['images']
      labels = dataset['labels']
      with MmapArrayWriter(image_path,
                           shape=images.shape,
                           dtype=images.dtype,
                           remove_exist=True) as img, \
          MmapArrayWriter(label_path,
                          shape=labels.shape,
                          dtype=labels.dtype,
                          remove_exist=True) as lab:
        for start, end in tqdm(list(batching(8000, n=images.shape[0])),
                               desc="Caching data"):
          img.write(images[start:end])
          lab.write(labels[start:end])
  # ====== load the data ====== #
  self.images = MmapArray(image_path)
  self.factors = MmapArray(label_path)
  # ====== split the dataset ====== #
  rand = np.random.RandomState(seed=seed)
  n = len(self.images)
  ids = rand.permutation(n)
  # train:85% valid:5% test:10%
  self.train_indices = ids[:int(0.85 * n)]
  self.valid_indices = ids[int(0.85 * n):int(0.9 * n)]
  self.test_indices = ids[int(0.9 * n):]
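# Hypothetical usage (the enclosing dataset class is not shown; `Shapes3D`
# is an assumed name). Because `images` and `factors` are memory-mapped,
# mini-batches can be sliced lazily without loading the whole file:
#   ds = Shapes3D(path='~/tensorflow_datasets/3dshapes.h5')
#   batch_images = ds.images[ds.train_indices[:32]]
#   batch_factors = ds.factors[ds.train_indices[:32]]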
def test_read_multiprocessing(self):
  fpath = _get_tempfile()
  array = np.random.rand(1200, 25, 8)
  # first write the array
  with MmapArrayWriter(fpath, (None, 25, 8), array.dtype) as f:
    f.write(array)
  x = MmapArray(fpath)
  self.assertTrue(np.all(array == x))
  # use multiprocessing to randomly read slices of the array
  jobs = [(x,
           sorted(np.random.randint(0, array.shape[0], size=(2,),
                                    dtype='int32')))
          for i in range(25)]
  with Pool(2) as pool:
    for start, end, data in pool.map(_fn_read, jobs):
      data = zlib.decompress(data)
      data = np.frombuffer(data).reshape(-1, 25, 8)
      self.assertTrue(np.all(data == array[start:end]))
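# A plausible implementation of the `_fn_read` worker used above (an
# assumption -- the original is not shown). Each job carries the opened
# MmapArray and a (start, end) pair; the slice is zlib-compressed so it can
# be pickled back from the worker cheaply, matching the `zlib.decompress`
# and `np.frombuffer` calls in the test:
def _fn_read(job):
  x, (start, end) = job
  data = zlib.compress(np.ascontiguousarray(x[start:end]).tobytes())
  return int(start), int(end), data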
def read_centenarian(override=False, verbose=False):
  r""" Data used in:

    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells
    in supercentenarians" | bioRxiv [WWW Document], n.d.
    URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).
  """
  download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    # cell labels
    labels = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[2])),
        url=_URL[2],
    )
    data = []
    with gzip.open(labels, mode='rb') as f:
      for line in f:
        line = str(line, 'utf-8').strip().split('\t')
        assert line[1][:2] == line[2]
        data.append(line)
    labels = np.array(data)
    y_col = sorted(set(labels[:, 1]))
    y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                len(y_col)).astype('float32')
    y_col = np.array(y_col)
    # raw UMI counts
    raw = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[0])),
        url=_URL[0],
    )
    if verbose:
      print("Unzip and reading raw UMI ...")
    X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
    # log-normalized UMI counts
    norm = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[1])),
        url=_URL[1],
    )
    if verbose:
      print("Unzip and reading log-norm UMI ...")
    X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
    # sanity checks: cells and genes must match across the three files
    assert np.all(cell_id1 == cell_id2) and \
        np.all(labels[:, 0] == cell_id1) and \
        np.all(gene_id1 == gene_id2)
    assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
        X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
    # save raw counts plus labels, then the log-norm matrix separately
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X=X_raw,
                    X_col=gene_id1,
                    y=y,
                    y_col=y_col,
                    rowname=cell_id1,
                    print_log=verbose)
    with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                         shape=(0, X_norm.shape[1]),
                         dtype='float32',
                         remove_exist=True) as f:
      for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
        f.write(X_norm[s:e])
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
import timeit

import h5py
import numpy as np

from bigarray import MmapArrayWriter

mmap_path = '/tmp/tmp.mmap'
hdf5_path = '/tmp/tmp.hdf5'
numpy_path = '/tmp/tmp.array'
N = 50000
X = np.random.rand(N, 25, 128).astype('float64')
print("Array size: %.2f (MB)\n" %
      (np.prod(X.shape) * X.dtype.itemsize / 1024 / 1024))
# ====== test created dataset ====== #
start = timeit.default_timer()
hdf5 = h5py.File(hdf5_path, 'w')
print('Create HDF5 in:', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap = MmapArrayWriter(mmap_path,
                       dtype='float64',
                       shape=(0,) + X.shape[1:],
                       remove_exist=True)
print('Create Memmap in:', timeit.default_timer() - start, 's')
# ====== writing ====== #
print()
start = timeit.default_timer()
with open(numpy_path, 'wb') as f:
  np.save(f, X)
print('Numpy save in:', timeit.default_timer() - start, 's')

start = timeit.default_timer()
hdf5['X'] = X
print('Writing data to HDF5 :', timeit.default_timer() - start, 's')
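# The matching Memmap write and cleanup (a sketch following the same timing
# pattern as the HDF5 step above):
start = timeit.default_timer()
mmap.write(X)
print('Writing data to Memmap:', timeit.default_timer() - start, 's')
mmap.flush()
mmap.close()
hdf5.flush()
hdf5.close()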
def _fn_write(job):
  # each job writes one chunk into the shared memory-mapped file,
  # starting at the row offset `idx * chunk_size`
  idx, array, path, shape = job
  with MmapArrayWriter(path=path, shape=shape, dtype='float64') as f:
    f.write(array, start_position=idx * array.shape[0])
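# A hypothetical driver for `_fn_write` (a sketch only: it assumes
# `start_position` indexes rows, that creating a writer with the full shape
# pre-allocates the file, and that workers may re-open it for in-place
# writes -- consistent with the tests above but not verified here):
def write_in_parallel(path, array, n_chunks=4):
  chunk = array.shape[0] // n_chunks
  # pre-allocate the output file once before the workers open it
  with MmapArrayWriter(path=path, shape=array.shape, dtype='float64',
                       remove_exist=True):
    pass
  jobs = [(i, array[i * chunk:(i + 1) * chunk], path, array.shape)
          for i in range(n_chunks)]
  with Pool(2) as pool:
    pool.map(_fn_write, jobs)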
def _extract_zero_and_first_stats(X, sad, indices, gmm, z_path, f_path,
                                  name_path):
  n_samples = X.shape[0]
  # if indices is None, every row is a single sample (utterance, image, ...)
  if indices is None:
    if os.path.exists(z_path):
      os.remove(z_path)
    if os.path.exists(f_path):
      os.remove(f_path)
    Z = MmapArrayWriter(path=z_path,
                        dtype='float32',
                        shape=(n_samples, gmm.nmix),
                        remove_exist=True)
    F = MmapArrayWriter(path=f_path,
                        dtype='float32',
                        shape=(n_samples, gmm.feat_dim * gmm.nmix),
                        remove_exist=True)
    jobs, _ = _split_jobs(n_samples,
                          ncpu=mpi.cpu_count(),
                          device='cpu',
                          gpu_factor=1)

    def map_transform(start_end):
      start, end = start_end
      for i in range(start, end):
        # removed by SAD
        if sad is not None and not bool(sad[i]):
          yield None, None, None
        else:
          z, f = gmm.transform(X[i][np.newaxis, :],
                               zero=True,
                               first=True,
                               device='cpu')
          yield i, z, f

    prog = Progbar(target=n_samples,
                   print_report=True,
                   print_summary=False,
                   name="Extracting zero and first order statistics")
    for i, z, f in mpi.MPI(jobs, map_transform, ncpu=None, batch=1):
      if i is not None:  # `i is None` means the sample was removed by SAD
        Z[i] = z
        F[i] = f
      prog.add(1)
    Z.flush()
    F.flush()
    Z.close()
    F.close()
  # otherwise, use the transform_to_disk function directly
  else:
    gmm.transform_to_disk(X,
                          indices=indices,
                          sad=sad,
                          pathZ=z_path,
                          pathF=f_path,
                          name_path=name_path,
                          dtype='float32',
                          device=None,
                          ncpu=None,
                          override=True)
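# Background: for a GMM with C = gmm.nmix components over D = gmm.feat_dim
# dimensional features, the zero-order statistic of a sample is the vector
# of summed component posteriors (length C) and the first-order statistic
# is the posterior-weighted sum of its frames (length C * D) -- hence the
# (n_samples, nmix) and (n_samples, feat_dim * nmix) shapes allocated above.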
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
  r""" Predefined procedure for downloading and preprocessing a 10x dataset
  into a `SingleCellOMIC` (i.e. scanpy.AnnData) object.

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
  """
  ### prepare the URL
  name = str(name).lower().strip()
  spec = 'filtered' if filtered_cells else 'raw'
  flatten_datasets = [(exp, version, dsname)
                      for exp, i in all_datasets.items()
                      for version, j in i.items()
                      for dsname in j]
  found = []
  for exp, version, dsname in flatten_datasets:
    if name == dsname:
      found.append((exp, version, dsname))
  if not found:
    raise ValueError(f"Cannot find data with name {name}, "
                     f"all available datasets are: {flatten_datasets}")
  if len(found) > 1:
    raise RuntimeError(f"Found multiple datasets {found} with name='{name}'")
  exp, version, name = found[0]
  dataset_name = name + '_' + spec
  url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
  ### prepare the output path
  filename = os.path.basename(url)
  # download path
  download_path = os.path.join(DOWNLOAD_DIR, exp, version)
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  # preprocessing path
  preprocessed_path = os.path.join(DATA_DIR,
                                   f'10x_{exp}_{name}_{spec}_preprocessed')
  if override and os.path.exists(preprocessed_path):
    if verbose:
      print("Overriding path: %s" % preprocessed_path)
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if len(os.listdir(preprocessed_path)) == 0:
    if verbose:
      print("Dataset10X:")
      print(" Meta       :", found)
      print(" File       :", filename)
      print(" URL        :", url)
      print(" Download   :", download_path)
      print(" Preprocess :", preprocessed_path)
    ### download the tar file
    path = download_file(url=url,
                         filename=os.path.join(download_path, filename),
                         override=False,
                         md5=_MD5.get(f"{exp}*{version}*{name}*{spec}", None))
    if not tarfile.is_tarfile(path):
      raise RuntimeError("Expecting tarfile but received: %s" % path)
    contents = {}
    with tarfile.open(path, mode="r:gz") as f:
      all_files = [(path, info.name, info.size, verbose)
                   for info in f
                   if info.isfile()]
    for name, data in MPI(jobs=all_files, func=_read_tarinfo, batch=1,
                          ncpu=4):
      contents[name] = data
    # cell barcodes
    barcodes = contents['barcodes']
    ### cell-atac
    if exp == 'cell-atac':
      n_top_genes = 20000  # this is an ad-hoc value
      X = contents['matrix'].T.todense()
      peaks = contents['peaks']
      X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(np.float32)
      X_col_name = np.array([':'.join(i) for i in peaks])
      save_data = [(OMIC.atac.name, X)]
      save_metadata = dict(main_omic=OMIC.atac.name,
                           barcodes=barcodes,
                           chromatin_var=X_col_name)
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.atac,
                           name=name)
    ### cell-exp and cell-vdj
    elif exp in ('cell-exp', 'cell-vdj'):
      n_top_genes = 2000
      # feature (Id, Name, Type(antibody or gene-expression))
      X_col = contents['features'] if 'features' in contents else \
          contents['genes']
      # data matrix
      X = contents['matrix'].T
      if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
        X = X.tocsr()
      X = X.astype('float32')
      assert X.shape[0] == barcodes.shape[0] and \
          X.shape[1] == X_col.shape[0]
      # both antibody and gene features are provided
      prot_ids = []
      pmhc_ids = []
      gene_ids = []
      if X_col.shape[1] == 3:
        for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
          if feat_type == 'Antibody Capture':
            if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
              pmhc_ids.append(idx)
            else:
              prot_ids.append(idx)
          elif feat_type == 'Gene Expression':
            gene_ids.append(idx)
          else:
            raise ValueError(
                f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}")
      elif X_col.shape[1] == 2:
        gene_ids = slice(None, None)
      else:
        raise ValueError(f"No support for features matrix\n{X_col}")
      # Antibody ID, Antibody Name
      y = X[:, prot_ids]
      y_col = X_col[prot_ids][:, 0]  # the id
      y_col_name = X_col[prot_ids][:, 1]  # the name
      # pMHC peptide
      if len(pmhc_ids) > 0:
        z = X[:, pmhc_ids]
        z_col = X_col[pmhc_ids][:, 0]  # the id
        z_col_name = X_col[pmhc_ids][:, 1]  # the name
      # Gene ID, Gene Name
      X = X[:, gene_ids].todense()
      X_col_name = X_col[gene_ids][:, 1]  # the name
      X_col = X_col[gene_ids][:, 0]  # the id
      assert np.min(X) >= 0 and np.max(X) < 65000, \
          f"Only support uint16 data type, given data with max={np.max(X)}"
      # data and metadata
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.transcriptomic,
                           name=name)
      save_data = [(OMIC.transcriptomic.name, X),
                   (OMIC.proteomic.name, y)]
      save_metadata = {
          'main_omic': OMIC.transcriptomic.name,
          'barcodes': barcodes,
          f"{OMIC.transcriptomic.name}_var": X_col_name,
          f"{OMIC.proteomic.name}_var": y_col_name
      }
      if len(pmhc_ids) > 0:
        save_data.append((OMIC.pmhc.name, z))
        save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
    ### others
    else:
      raise NotImplementedError(f"No support for experiment: {exp}")
    ### save data and metadata
    for name, data in save_data:
      outpath = os.path.join(preprocessed_path, name)
      n_samples, n_features = data.shape
      if n_samples == 0 or n_features == 0:
        continue
      with MmapArrayWriter(outpath,
                           shape=(0, n_features),
                           dtype=np.uint16,
                           remove_exist=True) as f:
        if verbose:
          prog = tqdm(desc=f"Saving {outpath}",
                      total=n_samples,
                      unit='samples')
        for s, e in batching(batch_size=5120, n=n_samples):
          x = data[s:e]
          if hasattr(x, 'todense'):
            x = x.todense()
          f.write(x)
          if verbose:
            prog.update(e - s)
        if verbose:
          prog.clear()
          prog.close()
    # save metadata
    outpath = os.path.join(preprocessed_path, 'metadata')
    with open(outpath, 'wb') as f:
      pickle.dump(save_metadata, f)
    if verbose:
      print(f"Saved metadata to path {outpath}")
    ### filter genes, following 10x and using the Cell Ranger recipe,
    # this is copied from Scanpy
    n_genes = sco.shape[1]
    sc.pp.filter_genes(sco, min_counts=1)
    # normalize with total UMI count per cell
    sc.pp.normalize_total(sco, key_added='n_counts_all')
    filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                  flavor='cell_ranger',
                                                  n_top_genes=n_top_genes,
                                                  log=False)
    gene_subset = filter_result.gene_subset
    # always keep the known marker genes
    indices = sco.get_var_indices()
    markers = (MARKER_GENES
               if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
    for name in markers:
      idx = indices.get(name, None)
      if idx is not None:
        gene_subset[idx] = True
    # filter genes
    sco._inplace_subset_var(gene_subset)
    if verbose:
      print(f"Filtered genes from {n_genes} to {sco.shape[1]} "
            f"most variable genes.")
    with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
      pickle.dump(sco.var_names.values, f)
  # ******************** load and return the dataset ******************** #
  omics = [
      name for name in os.listdir(preprocessed_path)
      if name not in ('metadata', 'top_genes') and '_' not in name
  ]
  with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
    metadata = pickle.load(f)
  with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
    top_genes = pickle.load(f)
  data = {
      name: MmapArray(os.path.join(preprocessed_path,
                                   name)).astype(np.float32)
      for name in omics
  }
  main_omic = metadata['main_omic']
  X = data[main_omic]
  var_names = metadata[f'{main_omic}_var']
  if filtered_genes:
    var_ids = {j: i for i, j in enumerate(var_names)}
    ids = [var_ids[i] for i in top_genes]
    X = X[:, ids]
    var_names = var_names[ids]
  sco = SingleCellOMIC(
      X,
      cell_id=metadata['barcodes'],
      gene_id=var_names,
      omic=main_omic,
      name=f"{dataset_name}{'' if filtered_genes else 'all'}")
  for o in omics:
    if o != main_omic:
      sco.add_omic(omic=o,
                   X=data[o],
                   var_names=np.asarray(metadata[f'{o}_var']))
  return sco
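# Hypothetical usage of `read_dataset10x` (the dataset name below is an
# assumption -- any name present in `all_datasets` works):
#   sco = read_dataset10x('pbmc_10k_protein_v3',
#                         filtered_cells=True,
#                         filtered_genes=True,
#                         verbose=True)
#   print(sco)  # a SingleCellOMIC, e.g. with transcriptomic + proteomic data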
import timeit

import h5py
import numpy as np

from bigarray import MmapArray, MmapArrayWriter

N = 500000
X = np.random.rand(N, 128).astype('float64')
# ====== test created dataset ====== #
start = timeit.default_timer()
hdf5 = h5py.File('tmp.hdf5', 'w')
print('Create HDF5 in:', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap = MmapArrayWriter('tmp.mmap', dtype='float64', shape=(None, 128))
print('Create Memmap in:', timeit.default_timer() - start, 's')
# ====== writing ====== #
print()
start = timeit.default_timer()
hdf5['X'] = X
print('Writing data to HDF5 :', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap.write(X)
print('Writing data to Memmap:', timeit.default_timer() - start, 's')

hdf5.flush()
hdf5.close()
mmap.flush()
mmap.close()
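# Reading back for completeness (a sketch): `MmapArray` opens the file as a
# numpy memmap, so the load is near-instant and no full copy is made.
start = timeit.default_timer()
x = MmapArray('tmp.mmap')
print('Load Memmap in:', timeit.default_timer() - start, 's')
print(x.shape, np.all(x == X))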