Example #1
def save_to_dataset(path,
                    X,
                    X_col=None,
                    y=None,
                    y_col=None,
                    rowname=None,
                    print_log=True):
  r"""
    path : output folder path
    X : (n_samples, n_genes) gene expression matrix
    X_col : (n_genes,) name of each gene
    y : (n_samples, n_proteins) protein marker level matrix
    y_col : (n_proteins,) name of each protein
    rowname : (n_samples,) name of cells (i.e. the sample)
    print_log : bool (default: True)
  """
  _check_data(X, X_col, y, y_col, rowname)
  assert os.path.isdir(path), "'%s' must be path to a folder" % path
  # save data
  if print_log:
    print("Saving data to %s ..." % ctext(path, 'cyan'))
  # saving sparse matrix
  if sparse.issparse(X):
    with open(os.path.join(path, 'X'), 'wb') as f:
      pickle.dump(X, f)
  else:
    with MmapArrayWriter(path=os.path.join(path, 'X'),
                         dtype='float32',
                         shape=(0, X.shape[1]),
                         remove_exist=True) as out:
      out.write(X)
  # save the meta info (X features)
  if X_col is not None:
    with open(os.path.join(path, 'X_col'), 'wb') as f:
      pickle.dump(X_col, f)
  # saving the label data (can be continuous, discrete, or binary)
  if y is not None and len(y.shape) > 1 and y.shape[1] != 0:
    if sparse.issparse(y):
      with open(os.path.join(path, 'y'), 'wb') as f:
        pickle.dump(y, f)
    else:
      with MmapArrayWriter(path=os.path.join(path, 'y'),
                           dtype='float32',
                           shape=(0, y.shape[1]),
                           remove_exist=True) as out:
        out.write(y)
    with open(os.path.join(path, 'y_col'), 'wb') as f:
      pickle.dump(y_col, f)
  # row name for both X and y
  if rowname is not None:
    with open(os.path.join(path, 'X_row'), 'wb') as f:
      pickle.dump(rowname, f)
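For completeness, a minimal loader sketch for the on-disk layout written above; load_from_dataset is a hypothetical helper, not part of the source:

import os
import pickle

from bigarray import MmapArray


def load_from_dataset(path):
  # X was written by MmapArrayWriter when dense, but pickled when sparse,
  # so fall back to pickle if the memmap header cannot be parsed;
  # 'y' (if present) can be re-opened the same way
  fpath = os.path.join(path, 'X')
  try:
    X = MmapArray(fpath)
  except Exception:
    with open(fpath, 'rb') as f:
      X = pickle.load(f)
  # the metadata files are always pickled by save_to_dataset
  meta = {}
  for name in ('X_col', 'y_col', 'X_row'):
    mpath = os.path.join(path, name)
    if os.path.exists(mpath):
      with open(mpath, 'rb') as f:
        meta[name] = pickle.load(f)
  return X, meta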
Example #2
    def test_write_single_time(self):
        fpath = _get_tempfile()
        array = np.arange(0, 100, dtype='float32').reshape(-1, 5)

        with MmapArrayWriter(path=fpath,
                             shape=array.shape,
                             dtype=array.dtype,
                             remove_exist=True) as f:
            f.write(array)
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))

        with MmapArrayWriter(path=fpath, remove_exist=False) as f:
            f.write(array)
        x = MmapArray(fpath)
        self.assertTrue(np.all(np.concatenate([array, array], axis=0) == x))
Example #3
    def test_write_multiple_time(self):
        fpath = _get_tempfile()
        array = np.arange(0, 1000, dtype='float32').reshape(-1, 2, 5)

        with MmapArrayWriter(path=fpath,
                             shape=(0, ) + array.shape[1:],
                             dtype=array.dtype,
                             remove_exist=True) as f:
            for i in range(0, array.shape[0], 8):
                f.write(array[i:i + 8])
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))

        array1 = np.arange(0, 100, dtype='float32').reshape(-1, 2, 5)
        array[10:10 + array1.shape[0]] = array1
        with MmapArrayWriter(path=fpath, remove_exist=False) as f:
            f.write(array1, start_position=10)
        x = MmapArray(fpath)
        self.assertTrue(np.all(array == x))
Example #4
 def __init__(self,
              path='~/tensorflow_datasets/3dshapes.h5',
              cache_dir=None,
              seed=8):
     path = os.path.abspath(os.path.expanduser(path))
      assert os.path.exists(path), "Path to file %s must exist" % path
     self.path = path
     if cache_dir is None:
         cache_dir = os.path.dirname(path)
     if not os.path.exists(cache_dir):
         os.mkdir(cache_dir)
     image_path = os.path.join(cache_dir, '3dshapes.images')
     label_path = os.path.join(cache_dir, '3dshapes.labels')
     # ====== read the dataset and cache it again ====== #
     if not os.path.exists(image_path) or not os.path.exists(label_path):
         import h5py
         with h5py.File(path, 'r') as dataset:
             images = dataset['images']
             labels = dataset['labels']
             with MmapArrayWriter(image_path,
                                  shape=images.shape,
                                  dtype=images.dtype,
                                  remove_exist=True) as img, \
               MmapArrayWriter(label_path,
                               shape=labels.shape,
                               dtype=labels.dtype,
                               remove_exist=True) as lab:
                 for start, end in tqdm(list(
                         batching(8000, n=images.shape[0])),
                                        desc="Caching data"):
                     img.write(images[start:end])
                     lab.write(labels[start:end])
     # ====== load the data ====== #
     self.images = MmapArray(image_path)
     self.factors = MmapArray(label_path)
     # ====== split the dataset ====== #
     rand = np.random.RandomState(seed=seed)
     n = len(self.images)
     ids = rand.permutation(n)
     # train:85% valid:5% test:10%
     self.train_indices = ids[:int(0.85 * n)]
     self.valid_indices = ids[int(0.85 * n):int(0.9 * n)]
     self.test_indices = ids[int(0.9 * n):]
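As a usage sketch (assuming only the attributes created above; not part of the source), minibatches can be drawn from the memory-mapped arrays without loading them fully into RAM:

def iter_train_batches(ds, batch_size=64):
    # fancy-indexing a memmap copies only the selected rows into memory;
    # sorting the indices keeps disk reads roughly sequential
    for start in range(0, len(ds.train_indices), batch_size):
        idx = np.sort(ds.train_indices[start:start + batch_size])
        yield ds.images[idx], ds.factors[idx]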
Example #5
 def test_read_multiprocessing(self):
     fpath = _get_tempfile()
     array = np.random.rand(1200, 25, 8)
     # first write the array
     with MmapArrayWriter(fpath, (None, 25, 8), array.dtype) as f:
         f.write(array)
     x = MmapArray(fpath)
     self.assertTrue(np.all(array == x))
     # use multiprocessing to randomly read the array
     jobs = [(x,
              sorted(
                  np.random.randint(0,
                                    array.shape[0],
                                    size=(2, ),
                                    dtype='int32'))) for i in range(25)]
     with Pool(2) as pool:
         for start, end, data in pool.map(_fn_read, jobs):
             data = zlib.decompress(data)
             data = np.frombuffer(data).reshape(-1, 25, 8)
             self.assertTrue(np.all(data == array[start:end]))
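The worker _fn_read is not shown in this excerpt; below is a plausible sketch consistent with how the test decompresses its result (an assumption about the helper, with numpy and zlib imported as in the test module):

def _fn_read(job):
    # read a slice of the shared memory-mapped array in a worker process
    # and return it zlib-compressed, as the test above expects
    x, (start, end) = job
    data = np.ascontiguousarray(x[start:end])
    return start, end, zlib.compress(data.tobytes())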
Example #6
def read_centenarian(override=False, verbose=False):
    r""" Data used in:

    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells in
    supercentenarians" | bioRxiv [WWW Document], n.d.
      URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).

  """
    download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        labels = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[2])),
            url=_URL[2],
        )
        data = []
        with gzip.open(labels, mode='rb') as f:
            for line in f:
                line = str(line, 'utf-8').strip().split('\t')
                assert line[1][:2] == line[2]
                data.append(line)
        labels = np.array(data)
        y_col = sorted(set(labels[:, 1]))
        y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                    len(y_col)).astype('float32')
        y_col = np.array(y_col)
        #
        raw = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[0])),
            url=_URL[0],
        )
        if verbose:
            print("Unzip and reading raw UMI ...")
        X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
        #
        norm = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[1])),
            url=_URL[1],
        )
        if verbose:
            print("Unzip and reading log-norm UMI ...")
        X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
        #
        assert np.all(cell_id1 == cell_id2) and np.all(labels[:, 0] == cell_id1) and \
          np.all(gene_id1 == gene_id2)
        assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
          X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
        #
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X=X_raw,
                        X_col=gene_id1,
                        y=y,
                        y_col=y_col,
                        rowname=cell_id1,
                        print_log=verbose)
        with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                             shape=(0, X_norm.shape[1]),
                             dtype='float32',
                             remove_exist=True) as f:
            for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
                f.write(X_norm[s:e])
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Example #7
import timeit

import h5py
import numpy as np

from bigarray import MmapArrayWriter

mmap_path = '/tmp/tmp.mmap'
numpy_path = '/tmp/tmp.array'
hdf5_path = '/tmp/tmp.hdf5'  # used below but undefined in the excerpt; assumed

N = 50000
X = np.random.rand(N, 25, 128).astype('float64')
print("Array size: %.2f (MB)\n" %
      (np.prod(X.shape) * X.dtype.itemsize / 1024 / 1024))

# ====== test created dataset ====== #
start = timeit.default_timer()
hdf5 = h5py.File(hdf5_path, 'w')
print('Create HDF5   in:', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap = MmapArrayWriter(mmap_path,
                       dtype='float64',
                       shape=(0, ) + X.shape[1:],
                       remove_exist=True)
print('Create Memmap in:', timeit.default_timer() - start, 's')

# ====== writing ====== #
print()

start = timeit.default_timer()
with open(numpy_path, 'wb') as f:
    np.save(f, X)
print('Numpy save in:', timeit.default_timer() - start, 's')

start = timeit.default_timer()
hdf5['X'] = X
print('Writing data to HDF5  :', timeit.default_timer() - start, 's')
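The excerpt stops before the memmap write; mirroring Example #11, the benchmark presumably continues along these lines (an assumption, not part of the original snippet):

start = timeit.default_timer()
mmap.write(X)
print('Writing data to Memmap:', timeit.default_timer() - start, 's')

hdf5.flush()
hdf5.close()
mmap.flush()
mmap.close()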
Example #8
def _fn_write(job):
    idx, array, path, shape = job
    with MmapArrayWriter(path=path, shape=shape, dtype='float64') as f:
        f.write(array, start_position=idx * array.shape[0])
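A hypothetical driver for _fn_write: pre-allocate the target array, then let several processes fill non-overlapping row ranges (paths and sizes here are illustrative only):

from multiprocessing import Pool

import numpy as np

from bigarray import MmapArrayWriter

if __name__ == '__main__':
    path = '/tmp/tmp_parallel.mmap'
    shape = (400, 8)
    chunks = [np.random.rand(100, 8) for _ in range(4)]
    # pre-allocate the full region so positioned writes land inside it
    with MmapArrayWriter(path, shape=(0, 8), dtype='float64',
                         remove_exist=True) as f:
        f.write(np.zeros(shape))
    jobs = [(i, c, path, shape) for i, c in enumerate(chunks)]
    with Pool(2) as pool:
        pool.map(_fn_write, jobs)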
Example #9
def _extract_zero_and_first_stats(X, sad, indices, gmm, z_path, f_path,
                                  name_path):
    n_samples = X.shape[0]
    # indices is None, every row is single sample (utterance or image ...)
    if indices is None:
        if os.path.exists(z_path):
            os.remove(z_path)
        if os.path.exists(f_path):
            os.remove(f_path)
        Z = MmapArrayWriter(path=z_path,
                            dtype='float32',
                            shape=(n_samples, gmm.nmix),
                            remove_exist=True)
        F = MmapArrayWriter(path=f_path,
                            dtype='float32',
                            shape=(n_samples, gmm.feat_dim * gmm.nmix),
                            remove_exist=True)
        jobs, _ = _split_jobs(n_samples,
                              ncpu=mpi.cpu_count(),
                              device='cpu',
                              gpu_factor=1)

        def map_transform(start_end):
            start, end = start_end
            for i in range(start, end):
                # removed by SAD
                if sad is not None and not bool(sad[i]):
                    yield None, None, None
                else:
                    z, f = gmm.transform(X[i][np.newaxis, :],
                                         zero=True,
                                         first=True,
                                         device='cpu')
                    yield i, z, f

        prog = Progbar(target=n_samples,
                       print_report=True,
                       print_summary=False,
                       name="Extracting zero and first order statistics")
        for i, z, f in mpi.MPI(jobs, map_transform, ncpu=None, batch=1):
            if i is not None:  # i None means removed by SAD
                Z[i] = z
                F[i] = f
            prog.add(1)
        Z.flush()
        F.flush()
        Z.close()
        F.close()
    # use directly the transform_to_disk function
    else:
        gmm.transform_to_disk(X,
                              indices=indices,
                              sad=sad,
                              pathZ=z_path,
                              pathF=f_path,
                              name_path=name_path,
                              dtype='float32',
                              device=None,
                              ncpu=None,
                              override=True)
Example #10
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for download and preprocessing 10x dataset into
  `SingleCellOMIC` i.e. scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

  """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = []
    for exp, version, dsname in flatten_datasets:
        if name == dsname:
            found.append((exp, version, dsname))
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        for name, data in MPI(jobs=all_files,
                              func=_read_tarinfo,
                              batch=1,
                              ncpu=4):
            contents[name] = data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is an ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
                np.float32)
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody ID, Antibody Name
            y = X[:, prot_ids]
            y_col = X_col[prot_ids][:, 0]  # the id
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col = X_col[pmhc_ids][:, 0]  # the id
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene ID, Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            X_col = X_col[gene_ids][:, 0]  # the id
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for name, data in save_data:
            outpath = os.path.join(preprocessed_path, name)
            n_samples, n_features = data.shape
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    prog = tqdm(f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        for name in markers:
            idx = indices.get(name, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(f"Filtering {n_genes} genes down to {sco.shape[1]} "
                  f"highly variable genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    omics = [
        name for name in os.listdir(preprocessed_path)
        if name not in ('metadata', 'top_genes') and '_' not in name
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        name: MmapArray(os.path.join(preprocessed_path,
                                     name)).astype(np.float32)
        for name in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
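A usage sketch; the name must be one of the datasets enumerated in all_datasets, and 'pbmc_1k' below is purely illustrative:

sco = read_dataset10x('pbmc_1k',
                      filtered_cells=True,
                      filtered_genes=True,
                      verbose=True)
print(sco)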
Example #11
import timeit

import h5py
import numpy as np

from bigarray import MmapArray, MmapArrayWriter

N = 500000
X = np.random.rand(N, 128).astype('float64')

# ====== test created dataset ====== #
start = timeit.default_timer()
hdf5 = h5py.File('tmp.hdf5', 'w')
print('Create HDF5   in:', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap = MmapArrayWriter('tmp.mmap', dtype='float64', shape=(None, 128))
print('Create Memmap in:', timeit.default_timer() - start, 's')

# ====== writing ====== #
print()
start = timeit.default_timer()
hdf5['X'] = X
print('Writing data to HDF5  :', timeit.default_timer() - start, 's')

start = timeit.default_timer()
mmap.write(X)
print('Writing data to Memmap:', timeit.default_timer() - start, 's')

hdf5.flush()
hdf5.close()
mmap.flush()
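A short follow-up sketch (not in the original snippet): close the writer, then re-open both stores and verify the round-trips.

mmap.close()

x = MmapArray('tmp.mmap')
assert np.all(x == X)
with h5py.File('tmp.hdf5', 'r') as f:
    assert np.all(f['X'][:] == X)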