Example #1
def read_Hemato(override=False, verbose=False):
    preprocessed_path = select_path(os.path.join(DATA_DIR,
                                                 'HEMATO_preprocessed'),
                                    create_new=True)

    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ====== copy the dataset from scVI ====== #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        try:
            from scvi.dataset import HematoDataset
        except ImportError:
            raise RuntimeError("Require `scVI` package for HEMATO dataset")

        gene_dataset = HematoDataset(
            save_path=os.path.join(DOWNLOAD_DIR, 'HEMATO/'))

        X = gene_dataset._X
        gene_names = np.array(gene_dataset.gene_names)
        assert len(gene_names) == X.shape[1]

        y = gene_dataset.meta.values[:, 1:]
        label_names = np.array(gene_dataset.cell_types_levels)
        assert len(label_names) == y.shape[1]

        cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])

        _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                           cell_names, verbose)

        # create binary labels for testing
        label_names = np.array(["Erythroblasts", "Granulocytes"])
        min_y = np.min(gene_dataset.labels)
        max_y = np.max(gene_dataset.labels)
        y_val = 2 * (gene_dataset.labels - min_y) / (max_y - min_y) - 1
        y_bin = np.argmax(
            np.hstack((
                gene_dataset.meta.iloc[:, 1].values[:, None],  # Er
                gene_dataset.meta.iloc[:, 2].values[:, None])),  # Gr
            axis=-1)
        with open(os.path.join(preprocessed_path, 'labels_name'), 'wb') as f:
            pickle.dump(label_names, f)
        with open(os.path.join(preprocessed_path, 'labels_bin'), 'wb') as f:
            pickle.dump(y_bin, f)
        with open(os.path.join(preprocessed_path, 'labels_val'), 'wb') as f:
            pickle.dump(y_val, f)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
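
The block above rescales the integer labels into [-1, 1] and derives a binary Erythroblast/Granulocyte target by taking the argmax over two meta columns; a minimal self-contained sketch of those two transforms, with toy arrays in place of gene_dataset.labels and the meta columns:

import numpy as np

labels = np.array([0, 1, 2, 3, 4])        # toy stand-in for gene_dataset.labels
er = np.array([0.9, 0.2, 0.7, 0.1, 0.5])  # toy 'Er' meta column
gr = np.array([0.1, 0.8, 0.3, 0.9, 0.5])  # toy 'Gr' meta column

# rescale integer labels to the range [-1, 1]
min_y, max_y = labels.min(), labels.max()
y_val = 2 * (labels - min_y) / (max_y - min_y) - 1

# binary class index: 0 -> Erythroblasts, 1 -> Granulocytes
y_bin = np.argmax(np.hstack((er[:, None], gr[:, None])), axis=-1)
print(y_val, y_bin, sep='\n')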
Example #2
def read_PBMC_crossdataset_remove_protein(subset,
                                          return_ecc,
                                          filtered_genes=False,
                                          override=False,
                                          verbose=False,
                                          remove_protein=['CD4', 'CD8']):
    remove_protein = sorted(
        [i.lower() for i in as_tuple(remove_protein, t=string_types)])
    preprocessed_path = os.path.join(
        DATA_DIR, 'PBMCcross_%s_%s_no%s_preprocessed' %
        ('ecc' if return_ecc else '8k', subset +
         ('' if filtered_genes else 'full'), ''.join(
             [i.lower() for i in remove_protein])))
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)

    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        ds = read_PBMC_crossdataset_ecc_8k(subset,
                                           return_ecc,
                                           filtered_genes,
                                           override=override,
                                           verbose=verbose)
        X = ds['X'][:]
        X_row = ds['X_row']
        X_col = ds['X_col']
        y = ds['y']
        y_col = ds['y_col']

        remove_ids = [
            i for i, j in enumerate(y_col)
            if standardize_protein_name(j).lower() in remove_protein
        ]
        remain_ids = [i for i in range(len(y_col)) if i not in remove_ids]
        y_col = y_col[remain_ids]
        y = y[:, remain_ids]

        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** return ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
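
A small sketch of the protein-removal step above, assuming plain lower-casing in place of standardize_protein_name and toy y / y_col arrays:

import numpy as np

y_col = np.array(['CD3', 'CD4', 'CD8', 'CD19'])
y = np.arange(8, dtype='float32').reshape(2, 4)  # 2 cells x 4 proteins
remove_protein = sorted(p.lower() for p in ('CD4', 'CD8'))

remove_ids = [i for i, name in enumerate(y_col) if name.lower() in remove_protein]
remain_ids = [i for i in range(len(y_col)) if i not in remove_ids]
y_col = y_col[remain_ids]
y = y[:, remain_ids]
print(y_col)  # ['CD3' 'CD19']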
Example #3
def _read_scvi_dataset(name, clazz_name, override, verbose):
    preprocessed_path = select_path(os.path.join(DATA_DIR,
                                                 '%s_preprocessed' % name),
                                    create_new=True)
    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ====== copy the dataset from scVI ====== #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        try:
            import scvi.dataset as scvi_dataset
        except ImportError:
            raise RuntimeError("Require `scVI` package for PBMC dataset")
        clazz = getattr(scvi_dataset, clazz_name)
        gene_dataset = clazz(save_path=DOWNLOAD_DIR)

        X = gene_dataset._X
        if hasattr(X, 'todense'):
            X = np.array(X.todense())

        gene_names = np.array(gene_dataset.gene_names)
        # convert gene identifier to gene symbol (i.e. name)
        if hasattr(gene_dataset, 'de_metadata'):
            from sisua.data.utils import get_gene_id2name
            meta = gene_dataset.de_metadata
            converter = {i: j for i, j in zip(meta.ENSG, meta.GS)}
            pbmc8kconverter = get_gene_id2name()
            gene_names = np.array([
                pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
                for i in gene_names
            ])
        assert len(gene_names) == X.shape[1]

        label_names = np.array(gene_dataset.cell_types)
        y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
        assert len(label_names) == y.shape[1]

        cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
        _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                           cell_names, verbose)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
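
Gene identifiers are mapped to gene symbols by first consulting the PBMC8k converter and falling back to the dataset's own metadata table; a toy sketch of that two-dictionary lookup (both mappings below are hypothetical):

import numpy as np

pbmc8kconverter = {'ENSG000001': 'CD3D'}                  # primary mapping
converter = {'ENSG000001': 'CD3D', 'ENSG000002': 'CD8A'}  # metadata fallback
gene_names = np.array(['ENSG000001', 'ENSG000002'])

gene_names = np.array([
    pbmc8kconverter[g] if g in pbmc8kconverter else converter[g]
    for g in gene_names
])
print(gene_names)  # ['CD3D' 'CD8A']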
Example #4
def load_parameters(clazz):
   # ====== all path ====== #
   name = clazz.__name__ + '.zip'
   path = os.path.join(base64.decodebytes(Model.ORIGIN).decode(), name)
   param_path = get_datasetpath(name=clazz.__name__, override=False)
   zip_path = os.path.join(Model.BASE_DIR, name)
   # ====== get params files ====== #
   if not os.path.exists(param_path) or \
   len(os.listdir(param_path)) == 0:
     get_file(name, origin=path, outdir=Model.BASE_DIR)
     zf = ZipFile(zip_path, mode='r', compression=ZIP_DEFLATED)
     zf.extractall(path=Model.BASE_DIR)
     zf.close()
     # check that the archive was properly unzipped
     if not os.path.exists(param_path) or \
     len(os.listdir(param_path)) == 0:
       raise RuntimeError("Zip file at path:%s is not proper unzipped, "
           "cannot find downloaded parameters at path: %s" %
           (zip_path, param_path))
     else:
       os.remove(zip_path)
   # ====== create and return the params dataset ====== #
   ds = Dataset(param_path, read_only=True)
   return ds
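
A condensed, stdlib-only sketch of the unzip-and-verify step above; get_file and get_datasetpath are project helpers and are left out, and the paths here are placeholders:

import os
from zipfile import ZipFile, ZIP_DEFLATED

def unzip_and_verify(zip_path, out_dir):
    # extract the archive, then confirm something actually landed in out_dir
    with ZipFile(zip_path, mode='r', compression=ZIP_DEFLATED) as zf:
        zf.extractall(path=out_dir)
    if not os.path.exists(out_dir) or len(os.listdir(out_dir)) == 0:
        raise RuntimeError("Zip file at %s was not properly unzipped into %s"
                           % (zip_path, out_dir))
    os.remove(zip_path)

# usage (hypothetical paths):
# unzip_and_verify('/tmp/MyModel.zip', '/tmp/MyModel')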
Example #5
def validating_dataset(path):
  if isinstance(path, Dataset):
    ds = path
  elif isinstance(path, string_types):
    ds = Dataset(path, read_only=True)
  else:
    raise ValueError("`path` must be a Dataset or a string path, "
                     "given type: %s" % str(type(path)))

  assert 'X' in ds, \
  '`X` (n_samples, n_genes) must be stored at path: %s' % ds.path
  assert 'X_col' in ds, \
  '`X_col` (n_genes,) must be stored at path: %s' % ds.path
  assert 'X_row' in ds, \
  '`X_row` (n_samples,) must be stored at path: %s' % ds.path

  if 'y' in ds:
    assert 'y' in ds, \
    '`y` (n_samples, n_protein) must be stored at path: %s' % ds.path
    assert 'y_col' in ds, \
    '`y_col` (n_protein,) must be stored at path: %s' % ds.path
    y, y_col = ds['y'], ds['y_col']
  else:
    y, y_col = None, None

  X, X_col, rowname = ds['X'], ds['X_col'], ds['X_row']
  _check_data(X, X_col, y, y_col, rowname)
Example #6
    def run(self):
        njobs = len(self.jobs)
        dataset = Dataset(self.path)
        if self.n_cache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.n_cache)
        # ====== indices ====== #
        databases = defaultdictkey(
            lambda key: MmapDict(path=os.path.join(dataset.path, key),
                                 cache_size=10000,
                                 read_only=False))
        last_start = defaultdict(int)
        # ====== statistic ====== #
        # load old statistics
        stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
        for key in dataset.keys():
            if 'sum1' == key[-4:]:
                stats[key[:-4]][0] = dataset[key][:]
            elif 'sum2' == key[-4:]:
                stats[key[:-4]][1] = dataset[key][:]
        # all data are cached for periodically flushed
        cache = defaultdict(list)
        n_processed = [0]  # store the value as reference

        # ====== helper ====== #
        def flush_feature(feat_name, X_cached):
            if len(X_cached) > 0:
                X_cached = np.concatenate(X_cached, 0)
                # flush data
                if feat_name in dataset:
                    dataset[feat_name].append(X_cached)
                else:
                    dataset[(feat_name, 'memmap')] = X_cached

        # ====== repeated for each result returned ====== #
        def post_processing(result):
            # search for file name
            if self.identifier not in result:
                raise RuntimeError(
                    "Cannot find identifier '%s' in returned dictionary" %
                    self.identifier)
            file_name = result[self.identifier]
            # invalid file_name
            if not is_string(file_name):
                raise RuntimeError(
                    "Cannot find file name in returned features "
                    "list, the file name can be specified in key: 'name', 'path' "
                    "and the type of the value must be string. All available "
                    "keys are: %s" % str(result.keys()))
            # store all new indices
            # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
            all_indices = {}
            # processing
            for feat_name, X in result.items():
                # some invalid feat_name
                if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
                    raise RuntimeError(
                        "Returned features' name cannot be one "
                        "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
                    )
                # ignore some feat_name
                if feat_name in ('name',):
                    continue
                # if numpy ndarray, save to MmapData
                if isinstance(X, np.ndarray) or \
                'sum1' == feat_name[-4:] or \
                'sum2' == feat_name[-4:]:
                    # save statistics instead
                    if 'sum1' == feat_name[-4:]:
                        stats[feat_name[:-4]][0] += X
                    elif 'sum2' == feat_name[-4:]:
                        stats[feat_name[:-4]][1] += X
                    # save features array
                    else:
                        all_indices[feat_name] = X.shape[0]
                        # cache data, only if we have more than 0 sample
                        if X.shape[0] > 0:
                            cache[feat_name].append(X)
                # else all other kind of data save to MmapDict
                else:
                    databases[feat_name][file_name] = X
                # remove data
                del X
            # ====== update indices ====== #
            if len(all_indices) > 0:
                for feat_name, n in all_indices.items():
                    ids_name = 'indices_%s' % feat_name
                    databases[ids_name][file_name] = (last_start[ids_name],
                                                      last_start[ids_name] + n)
                    last_start[ids_name] += n
            # ====== flush cache ====== #
            n_processed[0] += 1
            if n_processed[0] % cache_limit == 0:  # 12 + 8
                for feat_name, X_cached in cache.items():
                    flush_feature(feat_name, X_cached)
                cache.clear()
            # ====== update progress ====== #
            return file_name

        # ====== mapping function ====== #
        def _map_func(dat):
            try:
                ret = self.extractor.transform(dat)
            except Exception as e:  # Non-handled exception
                ret = '\n========\n'
                ret += 'Time  : `%s`\n' % str(
                    get_formatted_datetime(only_number=False))
                ret += 'Error : `%s`\n' % str(e)
                ret += 'Input : `%s`\n' % str(dat)
                import traceback
                etype, value, tb = sys.exc_info()
                for line in traceback.TracebackException(
                        type(value), value, tb, limit=None).format(chain=True):
                    ret += line
            return ret

        # ====== processing ====== #
        mpi = MPI(jobs=self.jobs,
                  func=_map_func,
                  ncpu=self.n_cpu,
                  batch=1,
                  hwm=self.n_cpu * 3,
                  backend='python')
        # initialize
        prog = Progbar(target=njobs,
                       name=self.path,
                       interval=0.12,
                       print_report=True,
                       print_summary=True)
        start_time = time.time()
        last_time = time.time()
        last_count = 0
        with open(self._log_path, 'w') as flog:
            # writing the log head
            flog.write('============================\n')
            flog.write('Start Time : %s\n' %
                       get_formatted_datetime(only_number=False))
            flog.write('Outpath    : %s\n' % self.path)
            flog.write('Extractor  : %s\n' % '->'.join(
                [s[-1].__class__.__name__ for s in self.extractor.steps]))
            flog.write('#Jobs      : %d\n' % njobs)
            flog.write('#CPU       : %d\n' % self.n_cpu)
            flog.write('#Cache     : %d\n' % cache_limit)
            flog.write('============================\n')
            flog.flush()
            # start processing the file list
            for count, result in enumerate(mpi):
                # Non-handled exception
                if isinstance(result, string_types):
                    flog.write(result)
                    flog.flush()
                    self._error_log.append(result)
                    if self.stop_on_failure:
                        raise RuntimeError(result)
                # some error might happened
                elif isinstance(result, ExtractorSignal):
                    flog.write(str(result))
                    flog.flush()
                    if result.action == 'error':
                        prog.add_notification(str(result))
                        raise RuntimeError(
                            "ExtractorSignal requests terminating processor!")
                    elif result.action == 'warn':
                        prog.add_notification(str(result))
                    elif result.action == 'ignore':
                        self._error_log.append(result)
                    else:
                        raise RuntimeError(
                            "Unknown action from ExtractorSignal: %s" %
                            result.action)
                    prog['File'] = '%-48s' % result.message[:48]
                # otherwise, no error happened, do post-processing
                else:
                    name = post_processing(result)
                    prog['File'] = '%-48s' % str(name)[:48]
                # update progress
                prog.add(1)
                # manually write to external log file
                if (count + 1) % max(1, int(0.01 * njobs)) == 0:
                    curr_time = time.time()
                    elap = curr_time - start_time
                    avg_speed = (count + 1) / elap
                    cur_speed = (count + 1 - last_count) / (curr_time -
                                                            last_time)
                    avg_est = (njobs - count - 1) / avg_speed
                    cur_est = (njobs - count - 1) / cur_speed
                    flog.write(
                        '[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                        '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                        '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                        (get_formatted_datetime(only_number=False), count + 1,
                         njobs - count - 1, elap, avg_speed, avg_est,
                         cur_speed, cur_est))
                    flog.flush()
                    last_time = curr_time
                    last_count = count + 1
        # ====== end, flush the last time ====== #
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
        cache = None
        dataset.flush()
        prog.add_notification("Flushed all data to disk")
        # ====== saving indices ====== #
        for name, db in databases.items():
            db.flush(save_all=True)
            db_size = len(db)
            db.close()
            prog.add_notification(
                'Flush MmapDict "%s" to disk, size: %s' %
                (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, name):
            N = dataset[name.split('_')[0]].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - np.power(mean, 2))
            if np.any(np.isnan(mean)):
                wprint('Mean contains NaN, name: %s' % name)
            if np.any(np.isnan(std)):
                wprint('Std contains NaN, name: %s' % name)
            dataset[name + 'sum1'] = sum1
            dataset[name + 'sum2'] = sum2
            dataset[name + 'mean'] = mean
            dataset[name + 'std'] = std

        # save all stats
        if len(stats) > 0:
            for feat_name, (sum1, sum2) in stats.items():
                save_mean_std(sum1, sum2, feat_name)
                prog.add_notification(
                    'Saved statistics of: %s, shape: %s' %
                    (ctext(feat_name.split('_')[0],
                           'yellow'), ctext(str(sum1.shape), 'yellow')))
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== saving the extractor ====== #
        # not good idea to save the extractor all the time
        # pipeline_path = os.path.join(dataset.path, 'pipeline')
        # with open(pipeline_path, 'wb') as f:
        #   cPickle.dump(self.extractor, f, protocol=2)
        # prog.add_notification("Saved Extractor pipeline at: %s" %
        #                       ctext(pipeline_path, 'yellow'))
        # ====== saving the configuration ====== #
        config_path = os.path.join(dataset.path, 'config')
        config = MmapDict(config_path)
        config['__configuration_time__'] = time.time()
        config['__processor__'] = self.path
        for i in dir(self):
            if _default_module.match(i) is not None:
                continue
            j = getattr(self, i)
            if isinstance(j, (Number, string_types, bool)):
                config[i] = j
        config.flush(save_all=True)
        self.config = {i: j for i, j in config}
        config.close()
        prog.add_notification("Saved configuration at: %s" %
                              ctext(config_path, 'yellow'))
        # ====== final notification ====== #
        prog.add_notification("Closed all dataset.")
        prog.add_notification("Dataset at path: %s" %
                              ctext(dataset.path, 'yellow'))
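
The sum1/sum2 entries accumulated during processing are converted into mean and standard deviation in save_mean_std via the identity std = sqrt(E[x^2] - E[x]^2); a minimal numpy sketch of that bookkeeping:

import numpy as np

X = np.random.randn(1000, 8)
sum1 = np.zeros(8)
sum2 = np.zeros(8)
# accumulate first and second moments batch by batch
for start in range(0, len(X), 128):
    batch = X[start:start + 128]
    sum1 += batch.sum(axis=0)
    sum2 += np.power(batch, 2).sum(axis=0)
N = X.shape[0]
mean = sum1 / N
std = np.sqrt(sum2 / N - np.power(mean, 2))
assert np.allclose(mean, X.mean(axis=0)) and np.allclose(std, X.std(axis=0))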
Example #7
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" %
                         str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [
            name for name in as_tuple(feat_name, t=str) if name in dataset
        ]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # init PCA
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        X = dataset[name]
        # reuse an existing PCA model if found
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None,
                               whiten=False,
                               copy=True,
                               batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name,
              func=map_pca,
              ncpu=None,
              batch=1,
              hwm=12082518,
              backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True,
                   print_report=True,
                   name='PCA')
    for n in mpi:
        if is_string(n):
            remain_features.remove(n)
            finished_features.append(n)
        else:
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
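
MiniBatchPCA is imported from odin.ml; the same partial_fit pattern used in map_pca can be sketched with scikit-learn's IncrementalPCA (assuming scikit-learn is available; this is not the project's own class):

import numpy as np
from sklearn.decomposition import IncrementalPCA

X = np.random.randn(2048, 20)
pca = IncrementalPCA(n_components=10)
# fit one mini-batch at a time, as map_pca does with MiniBatchPCA
for start in range(0, len(X), 256):
    pca.partial_fit(X[start:start + 256])
X_reduced = pca.transform(X)
print(X_reduced.shape)  # (2048, 10)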
Example #8
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        raise ValueError("`ds_or_processor` must be a FeatureProcessor, a string "
                         "path, or a Dataset. No support for given input "
                         "type: %s" % str(type(ds_or_processor)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    if not os.path.exists(path):
        os.mkdir(path)
    elif override:
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
        os.mkdir(path)
    else:
        raise ValueError("`path`=%s exists, cannot override." % path)
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (including those referenced in external_indices)
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    ])
    # ====== checking indices ====== #
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    }
    for ids_name in (k for k in all_keys if 'indices' in k):
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            assert prev[2] == now[1], "Discontinuous indices"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # final length match length of Data
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
                all_features.append(feat_name)
        else:
            for feat_name in all_keys:
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
                    all_features.append(feat_name)
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
            else:
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
            all_stats[k[:-4].split('_')[0]].append(k)
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       interval=0.1,
                       print_report=True,
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # not all value closed to zeros
            if np.all(np.isclose(dat, 0.)):
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                       False)
                fail_test = True
            prog['Name'] = file_name
            prog.add(1)
        if not fail_test:
            logger("Check data incredibility for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                    break
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
                    break
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproducibility
    all_samples = np.random.choice(list(ds['indices'].keys()),
                                   size=nb_samples,
                                   replace=False)
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            try:
                _special_cases(X=feat,
                               feat_name=feat_name,
                               file_name=file_name,
                               ds=ds,
                               path=path)
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure save at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    stdio(path=prev_stdio)
    if should_close_ds:
        ds.close()
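
The indices check above verifies that, once sorted by start position, the stored (start, end) segments are non-empty and contiguous, and that the final end matches the length of the data; a self-contained sketch with a toy indices mapping:

indices = {'fileA': (0, 100), 'fileB': (100, 250), 'fileC': (250, 260)}
ids = sorted([(name, start, end) for name, (start, end) in indices.items()],
             key=lambda x: x[1])
for prev, now in zip(ids, ids[1:]):
    assert prev[2] == now[1], "Indices are not contiguous"
    assert prev[2] - prev[1] > 0 and now[2] - now[1] > 0, "Zero-length segment"
total_length = ids[-1][-1]  # should equal len(ds[feat_name])
print(total_length)  # 260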
Example #9
def read_centenarian(override=False, verbose=False):
    r""" Data used in:

    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells in
    supercentenarians" | bioRxiv [WWW Document], n.d.
      URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).

  """
    download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        labels = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[2])),
            url=_URL[2],
        )
        data = []
        with gzip.open(labels, mode='rb') as f:
            for line in f:
                line = str(line, 'utf-8').strip().split('\t')
                assert line[1][:2] == line[2]
                data.append(line)
        labels = np.array(data)
        y_col = sorted(set(labels[:, 1]))
        y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                    len(y_col)).astype('float32')
        y_col = np.array(y_col)
        #
        raw = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[0])),
            url=_URL[0],
        )
        if verbose:
            print("Unzip and reading raw UMI ...")
        X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
        #
        norm = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[1])),
            url=_URL[1],
        )
        if verbose:
            print("Unzip and reading log-norm UMI ...")
        X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
        #
        assert np.all(cell_id1 == cell_id2) and np.all(labels[:, 0] == cell_id1) and \
          np.all(gene_id1 == gene_id2)
        assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
          X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
        #
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X=X_raw,
                        X_col=gene_id1,
                        y=y,
                        y_col=y_col,
                        rowname=cell_id1,
                        print_log=verbose)
        with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                             shape=(0, X_norm.shape[1]),
                             dtype='float32',
                             remove_exist=True) as f:
            for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
                f.write(X_norm[s:e])
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
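
The label table is reduced to a one-hot matrix over the sorted set of cell types; a small numpy sketch of that encoding, using toy parsed rows in place of the downloaded file (the real one_hot helper comes from the surrounding project):

import numpy as np

# toy stand-in for the parsed label table: (cell_id, cell_type, two-letter prefix)
labels = np.array([['c1', 'CD4.A', 'CD'],
                   ['c2', 'TC.B', 'TC'],
                   ['c3', 'CD4.A', 'CD']])
y_col = sorted(set(labels[:, 1]))                       # label vocabulary
idx = np.array([y_col.index(i) for i in labels[:, 1]])
y = np.zeros((len(idx), len(y_col)), dtype='float32')
y[np.arange(len(idx)), idx] = 1.0                       # one-hot rows
y_col = np.array(y_col)
print(y_col)  # ['CD4.A' 'TC.B']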
Example #10
def read_full_FACS(override=False, verbose=False):
    """ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE75478
  This is the full FACS data of 2 individuals with 7 protein markers
  """
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_full")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    # ====== download the data ====== #
    file_url = [
        ('GSE75478_transcriptomics_facs_indeces_filtered_I1.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI1%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_facs_indeces_filtered_I2.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI2%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_raw_filtered_I1.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI1%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_raw_filtered_I2.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI2%2Ecsv%2Egz'
         ),
    ]
    for name, url in file_url:
        filename = os.path.join(download_path, name)
        if not os.path.exists(filename):
            if verbose:
                print("Downloading file '{filename}' ...")
            urlretrieve(url=url, filename=filename)
    # ====== extract the data ====== #
    preprocessed_path = _FACS_PREPROCESSED % 7
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        data_map = {}
        for name, _ in file_url:
            zip_path = os.path.join(download_path, name)
            with gzip.open(zip_path, 'rb') as f:
                data_map[name.split('.')[0]] = np.array(
                    [str(line, 'utf-8').strip().split(',') for line in f]).T

        i1 = data_map['GSE75478_transcriptomics_raw_filtered_I1']
        f1 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I1']

        i2 = data_map['GSE75478_transcriptomics_raw_filtered_I2']
        f2 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I2']
        # Matching duplicated row in `i` and `f`
        row_name = set(i1[1:, 0]) & set(f1[1:, 0])
        i1 = i1[[True] + [True if i in row_name else False
                          for i in i1[1:, 0]], :]
        f1 = f1[[True] + [True if i in row_name else False
                          for i in f1[1:, 0]], :]
        assert np.all(i1[:, 0] == f1[:, 0])

        row_name = set(i2[1:, 0]) & set(f2[1:, 0])
        i2 = i2[[True] + [True if i in row_name else False
                          for i in i2[1:, 0]], :]
        f2 = f2[[True] + [True if i in row_name else False
                          for i in f2[1:, 0]], :]
        assert np.all(i2[:, 0] == f2[:, 0])

        # Matching the genes and protein among individuals
        gene_name = set(i1[0][1:]) & set(i2[0][1:])
        i1 = i1[:, [True] +
                [True if i in gene_name else False for i in i1[0][1:]]]
        i2 = i2[:, [True] +
                [True if i in gene_name else False for i in i2[0][1:]]]
        assert np.all(i1[0] == i2[0])
        gene = np.concatenate((i1, i2[1:]), axis=0)

        prot_name = set(
            [i for i in set(f1[0][1:]) & set(f2[0][1:]) if '_cd' in i])
        prot_name = sorted(prot_name)
        f1 = f1[:, [0] + [f1[0].tolist().index(i) for i in prot_name]]
        f2 = f2[:, [0] + [f2[0].tolist().index(i) for i in prot_name]]
        assert np.all(f1[0] == f2[0])
        prot = np.concatenate((f1, f2[1:]), axis=0)

        # ====== save data to disk ====== #
        X = gene[1:, 1:].astype('float32')
        X_row = gene[1:, 0]
        X_col = gene[0, 1:]
        X_col = np.array([i.replace('"', '') for i in X_col])

        y = prot[1:, 1:].astype('float32')
        y_row = prot[1:, 0]
        y_col = np.array(
            [i.replace('"', '').split('_')[-1].upper() for i in prot[0, 1:]])

        assert np.all(X_row == y_row)
        X_row = np.array([i.replace('"', '') for i in X_row])

        # ====== the protein marker can be smaller than zero ====== #
        min_values = np.min(y, axis=0, keepdims=True)
        min_values = np.where(min_values > 0, 0, min_values)
        y = y + np.abs(min_values)
        # ====== filter zero columns ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
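
Row matching between the transcriptomics and FACS tables keeps the header plus the rows whose cell id appears in both, then asserts that the ids line up; a toy sketch of that boolean-mask trick:

import numpy as np

# toy tables: first row is the header, first column holds cell ids
i1 = np.array([['id', 'g1', 'g2'],
               ['c1', '1', '2'],
               ['c2', '3', '4'],
               ['c3', '5', '6']])
f1 = np.array([['id', 'p1'],
               ['c1', '7'],
               ['c3', '8']])
shared = set(i1[1:, 0]) & set(f1[1:, 0])  # {'c1', 'c3'}
i1 = i1[[True] + [r in shared for r in i1[1:, 0]], :]
f1 = f1[[True] + [r in shared for r in f1[1:, 0]], :]
assert np.all(i1[:, 0] == f1[:, 0])  # same cells, same order
print(i1[:, 0])  # ['id' 'c1' 'c3']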
Example #11
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    # prepare the path
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMC8k_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        # ====== pbmc 8k ====== #
        if subset == 'full':
            ly = read_PBMC8k('ly',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            my = read_PBMC8k('my',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # load data
            data = np.load(path)
            X = data['X']
            X_row = data['X_row']
            X_col = data['X_col'].tolist()
            y = data['y']
            y_col = data['y_col'].tolist()
            # merge all genes from my and ly subset
            all_genes = set(ly['X_col'].tolist() + my['X_col'].tolist())
            all_genes = sorted([X_col.index(i) for i in all_genes])
            # same for protein
            all_proteins = set(ly['y_col'].tolist() + my['y_col'].tolist())
            all_proteins = sorted([y_col.index(i) for i in all_proteins])
            #
            X = X[:, all_genes]
            y = y[:, all_proteins]
            X_col = np.array(X_col)[all_genes]
            y_col = np.array(y_col)[all_proteins]
            cell_types = np.array(
                ['ly' if i in ly['X_row'] else 'my' for i in X_row])
        # ====== pbmc ly and my ====== #
        else:
            url = str(
                base64.decodebytes(_URL_LYMPHOID if subset ==
                                   'ly' else _URL_MYELOID), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # extract the data
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_filt']
                X_col = data['X_filt_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array([subset] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
Example #12
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        if verbose:
            print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
        shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
        os.mkdir(_CITEseq_CBMC_PREPROCESSED)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
        # ====== post-processing ====== #
        X = np.array(data_dict['X'].astype('float32'))
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        sco._inplace_subset_var(result.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(sco.var_names.values), f)
        del sco
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
            top_genes = pickle.load(f)
        sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
Example #13
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  download_path = os.path.join(
      DOWNLOAD_DIR,
      "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  preprocessed_path = (_5000_PBMC_PREPROCESSED
                       if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    download_files = {}
    for url, md5 in zip(
        [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
        [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
      url = str(base64.decodebytes(url), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      download_files[base_name] = (path, md5)
    # ====== extract the data ====== #
    n = set()
    for name, (path, md5) in sorted(download_files.items()):
      if verbose:
        print(f"Extracting {name} ...")
      binary_data = decrypt_aes(path, password=_PASSWORD)
      md5_ = md5_checksum(binary_data)
      assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
      with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
        for name in f.namelist():
          data = str(f.read(name), 'utf8')
          for line in data.split('\n'):
            if len(line) == 0:
              continue
            line = line.strip().split(',')
            n.add(len(line))
            if 'Protein' in name:
              y.append(line)
            else:
              X.append(line)
    # ====== post-processing ====== #
    assert len(n) == 1, \
    "Number of samples inconsistent between raw count and protein count"
    if verbose:
      print("Processing gene count ...")
    X = np.array(X).T
    X_row, X_col = X[1:, 0], X[0, 1:]
    X = X[1:, 1:].astype('float32')
    # ====== filter mouse genes ====== #
    human_cols = [True if "HUMAN_" in i else False for i in X_col]
    if verbose:
      print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
    X = X[:, human_cols]
    X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)

    # ====== protein ====== #
    if verbose:
      print("Processing protein count ...")
    y = np.array(y).T
    y_row, y_col = y[1:, 0], y[0, 1:]
    y = y[1:, 1:].astype('float32')
    assert np.all(X_row == y_row), \
    "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
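
decrypt_aes, md5_checksum and download_file above are project helpers; the checksum idea alone can be sketched with the standard library, here streaming a file from disk (the path and expected digest are placeholders):

import hashlib

def md5_of_file(path, chunk_size=8192):
    # stream the file so large downloads never need to fit in memory at once
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# usage (hypothetical values):
# assert md5_of_file('/tmp/pbmc_citeseq.zip') == expected_md5, "MD5 checksum mismatch"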
Example #14
  def run(self):
    njobs = len(self.jobs)
    dataset = Dataset(self.path)
    if self.n_cache <= 1:
      cache_limit = max(2, int(0.12 * njobs))
    else:
      cache_limit = int(self.n_cache)
    # ====== indices ====== #
    databases = defaultdictkey(lambda key:
        MmapDict(path=os.path.join(dataset.path, key), cache_size=10000,
                 read_only=False))
    last_start = defaultdict(int)
    # ====== statistic ====== #
    # load old statistics
    stats = defaultdict(lambda: [0, 0]) # name -> (sum1, sum2)
    for key in dataset.keys():
      if 'sum1' == key[-4:]:
        stats[key[:-4]][0] = dataset[key][:]
      elif 'sum2' == key[-4:]:
        stats[key[:-4]][1] = dataset[key][:]
    # all data are cached for periodically flushed
    cache = defaultdict(list)
    n_processed = [0] # store the value as reference

    # ====== helper ====== #
    def flush_feature(feat_name, X_cached):
      if len(X_cached) > 0:
        X_cached = np.concatenate(X_cached, 0)
        # flush data
        if feat_name in dataset:
          dataset[feat_name].append(X_cached)
        else:
          dataset[(feat_name, 'memmap')] = X_cached

    # ====== repeated for each result returned ====== #
    def post_processing(result):
      # search for file name
      if self.identifier not in result:
        raise RuntimeError(
            "Cannot find identifier '%s' in returned dictionary" % self.identifier)
      file_name = result[self.identifier]
      # invalid file_name
      if not is_string(file_name):
        raise RuntimeError("Cannot find file name in returned features "
            "list, the file name can be specified in key: 'name', 'path' "
            "and the type of the value must be string. All available "
            "keys are: %s" % str(result.keys()))
      # store all new indices
      # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
      all_indices = {}
      # processing
      for feat_name, X in result.items():
        # some invalid feat_name
        if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
          raise RuntimeError("Returned features' name cannot be one "
                             "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
        # ignore some feat_name
        if feat_name in ('name',):
          continue
        # if numpy ndarray, save to MmapData
        if isinstance(X, np.ndarray) or \
        'sum1' == feat_name[-4:] or \
        'sum2' == feat_name[-4:]:
          # save statistics instead
          if 'sum1' == feat_name[-4:]:
            stats[feat_name[:-4]][0] += X
          elif 'sum2' == feat_name[-4:]:
            stats[feat_name[:-4]][1] += X
          # save features array
          else:
            all_indices[feat_name] = X.shape[0]
            # cache data, only if we have more than 0 sample
            if X.shape[0] > 0:
              cache[feat_name].append(X)
        # else all other kind of data save to MmapDict
        else:
          databases[feat_name][file_name] = X
        # remove data
        del X
      # ====== update indices ====== #
      if len(all_indices) > 0:
        for feat_name, n in all_indices.items():
          ids_name = 'indices_%s' % feat_name
          databases[ids_name][file_name] = (last_start[ids_name],
                                            last_start[ids_name] + n)
          last_start[ids_name] += n
      # ====== flush cache ====== #
      n_processed[0] += 1
      if n_processed[0] % cache_limit == 0: # periodically flush cached features
        for feat_name, X_cached in cache.items():
          flush_feature(feat_name, X_cached)
        cache.clear()
      # ====== update progress ====== #
      return file_name

    # ====== mapping function ====== #
    def _map_func(dat):
      try:
        ret = self.extractor.transform(dat)
      except Exception as e: # Non-handled exception
        ret = '\n========\n'
        ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
        ret += 'Error : `%s`\n' % str(e)
        ret += 'Input : `%s`\n' % str(dat)
        import traceback
        etype, value, tb = sys.exc_info()
        for line in traceback.TracebackException(
                type(value), value, tb, limit=None).format(chain=True):
          ret += line
      return ret
    # ====== processing ====== #
    mpi = MPI(jobs=self.jobs,
              func=_map_func,
              ncpu=self.n_cpu,
              batch=1,
              hwm=self.n_cpu * 3,
              backend='python')
    # initialize
    prog = Progbar(target=njobs, name=self.path,
                   interval=0.12, print_report=True, print_summary=True)
    start_time = time.time()
    last_time = time.time()
    last_count = 0
    with open(self._log_path, 'w') as flog:
      # writing the log head
      flog.write('============================\n')
      flog.write('Start Time : %s\n' % get_formatted_datetime(only_number=False))
      flog.write('Outpath    : %s\n' % self.path)
      flog.write('Extractor  : %s\n' % '->'.join([s[-1].__class__.__name__
                                                  for s in self.extractor.steps]))
      flog.write('#Jobs      : %d\n' % njobs)
      flog.write('#CPU       : %d\n' % self.n_cpu)
      flog.write('#Cache     : %d\n' % cache_limit)
      flog.write('============================\n')
      flog.flush()
      # start processing the file list
      for count, result in enumerate(mpi):
        # Non-handled exception
        if isinstance(result, string_types):
          flog.write(result)
          flog.flush()
          self._error_log.append(result)
          if self.stop_on_failure:
            raise RuntimeError(result)
        # some error might have happened
        elif isinstance(result, ExtractorSignal):
          flog.write(str(result)); flog.flush()
          if result.action == 'error':
            prog.add_notification(str(result))
            raise RuntimeError("ExtractorSignal requests terminating processor!")
          elif result.action == 'warn':
            prog.add_notification(str(result))
          elif result.action == 'ignore':
            self._error_log.append(result)
          else:
            raise RuntimeError("Unknown action from ExtractorSignal: %s" % result.action)
          prog['File'] = '%-48s' % result.message[:48]
        # otherwise, no error happened, do post-processing
        else:
          name = post_processing(result)
          prog['File'] = '%-48s' % str(name)[:48]
        # update progress
        prog.add(1)
        # manually write to external log file
        if (count + 1) % max(1, int(0.01 * njobs)) == 0:
          curr_time = time.time()
          elap = curr_time - start_time
          avg_speed = (count + 1) / elap
          cur_speed = (count + 1 - last_count) / (curr_time - last_time)
          avg_est = (njobs - count - 1) / avg_speed
          cur_est = (njobs - count - 1) / cur_speed
          flog.write('[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                     '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                     '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                     (get_formatted_datetime(only_number=False),
                      count + 1, njobs - count - 1, elap,
                      avg_speed, avg_est,
                      cur_speed, cur_est))
          flog.flush()
          last_time = curr_time
          last_count = count + 1
    # ====== end, flush the last time ====== #
    for feat_name, X_cached in cache.items():
      flush_feature(feat_name, X_cached)
    cache.clear()
    cache = None
    dataset.flush()
    prog.add_notification("Flushed all data to disk")
    # ====== saving indices ====== #
    for name, db in databases.items():
      db.flush(save_all=True)
      db_size = len(db)
      db.close()
      prog.add_notification('Flush MmapDict "%s" to disk, size: %s' %
                            (ctext(name, 'yellow'),
                             ctext(str(db_size), 'yellow')))

    # ====== save mean and std ====== #
    def save_mean_std(sum1, sum2, name):
      # running statistics: mean = sum1 / N, std = sqrt(sum2 / N - mean**2)
      N = dataset[name.split('_')[0]].shape[0]
      mean = sum1 / N
      std = np.sqrt(sum2 / N - np.power(mean, 2))
      if np.any(np.isnan(mean)):
        wprint('Mean contains NaN, name: %s' % name)
      if np.any(np.isnan(std)):
        wprint('Std contains NaN, name: %s' % name)
      dataset[name + 'sum1'] = sum1
      dataset[name + 'sum2'] = sum2
      dataset[name + 'mean'] = mean
      dataset[name + 'std'] = std
    # save all stats
    if len(stats) > 0:
      for feat_name, (sum1, sum2) in stats.items():
        save_mean_std(sum1, sum2, feat_name)
        prog.add_notification('Saved statistics of: %s, shape: %s' %
                              (ctext(feat_name.split('_')[0], 'yellow'),
                               ctext(str(sum1.shape), 'yellow')))
    # ====== dataset flush() ====== #
    dataset.flush()
    dataset.close()
    # ====== saving the extractor ====== #
    # not a good idea to save the extractor every time
    # pipeline_path = os.path.join(dataset.path, 'pipeline')
    # with open(pipeline_path, 'wb') as f:
    #   cPickle.dump(self.extractor, f, protocol=2)
    # prog.add_notification("Saved Extractor pipeline at: %s" %
    #                       ctext(pipeline_path, 'yellow'))
    # ====== saving the configuration ====== #
    config_path = os.path.join(dataset.path, 'config')
    config = MmapDict(config_path)
    config['__configuration_time__'] = time.time()
    config['__processor__'] = self.path
    for i in dir(self):
      if _default_module.match(i) is not None:
        continue
      j = getattr(self, i)
      if isinstance(j, (Number, string_types, bool)):
        config[i] = j
    config.flush(save_all=True)
    self.config = {i: j
                   for i, j in config.items()}
    config.close()
    prog.add_notification("Saved configuration at: %s" %
                          ctext(config_path, 'yellow'))
    # ====== final notification ====== #
    prog.add_notification("Closed all dataset.")
    prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
Example No. 15
0
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
  """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
  # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
  # add reading data from indices also
  # ====== check input dataset ====== #
  own_dataset = True
  if is_string(dataset) and os.path.isdir(dataset):
    dataset = Dataset(dataset, read_only=True)
  elif isinstance(dataset, Dataset):
    own_dataset = False
  elif isinstance(dataset, FeatureProcessor):
    dataset = Dataset(dataset.path, read_only=True)
  else:
    raise ValueError("Cannot acquire Dataset from input: %s" %
                     str(dataset))
  # ====== extract all feat_name ====== #
  if is_string(feat_name) and feat_name == 'auto':
    feat_name = []
    for k in dataset.keys():
      X = dataset[k]
      if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name.append(k)
  else:
    feat_name = [name
                 for name in as_tuple(feat_name, t=str)
                 if name in dataset]
  # ====== load PCA ====== #
  from odin.ml import MiniBatchPCA
  # init PCA
  nb_samples = 0
  for feat in feat_name:
    nb_samples += dataset[feat].shape[0]
  # ====== prepare MPI PCA ====== #
  add_notification("Selected features for PCA: " +
      ctext(', '.join(feat_name), 'yellow'))

  def map_pca(name):
    X = dataset[name]
    # reuse an existing PCA model if found
    if 'pca_' + name in dataset and not override:
      pca = dataset['pca_' + name]
    # create new PCA
    else:
      pca = MiniBatchPCA(n_components=None, whiten=False,
                         copy=True, batch_size=None)
    # No shuffling makes iteration much faster
    for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0):
      pca.partial_fit(x)
      yield x.shape[0]
    # save PCA model
    with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
      cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # finished, yield the feature name to mark completion
    yield name
  mpi = MPI(jobs=feat_name, func=map_pca,
            ncpu=None, batch=1, hwm=12082518,
            backend='python')
  # ====== running the MPI ====== #
  remain_features = list(feat_name)
  finished_features = []
  prog = Progbar(target=nb_samples, print_summary=True, print_report=True,
                 name='PCA')
  for n in mpi:
    if is_string(n):
      remain_features.remove(n)
      finished_features.append(n)
    else:
      prog['Remain'] = ', '.join(remain_features)
      prog['Finished'] = ', '.join(finished_features)
      prog.add(n)
  # ====== return ====== #
  if own_dataset:
    dataset.close()
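
A hedged usage sketch for calculate_pca: the dataset path and the 'mspec' feature name are placeholders, and the saved model is loaded back with the standard pickle module (the function above writes it with cPickle, which uses a pickle-compatible format).

import os
import pickle

ds_path = '/tmp/processed_dataset'  # placeholder path to a processed Dataset
calculate_pca(ds_path, feat_name='auto', batch_size=1024)
# each fitted model is stored next to the data under the key 'pca_<feature>'
with open(os.path.join(ds_path, 'pca_mspec'), 'rb') as f:
  pca = pickle.load(f)
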
Example No. 16
0
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('   *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" % str(type(ds)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Discontinuous indices"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # the final index must match the length of the Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
          "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0 # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
        "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # check each numpy.ndarray feature array one-by-one
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data file
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # No NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all values close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-close-to-zero values", file_name + ':' + feat_name,
               False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Check data incredibility for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-close-to-zero values", feat_name + ':' + stat_name,
                 False)
          fail_test = True
      if not fail_test:
        logger("Check statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(
            ds[feat_name][start:(start + nb_samples)],
            n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-close-to-zero values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Check PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed) # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples,
                                 replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path, '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistics
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_names in all_stats.items():
    X = {name: ds[name][:]
         for name in stat_names
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure save at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
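
A hedged usage sketch for validate_features; both paths are placeholders. The report folder receives log.txt, one PDF per sampled file, and stats.pdf, as produced by the function above.

validate_features('/tmp/processed_dataset',              # placeholder Dataset path
                  path='/tmp/processed_dataset_report',  # placeholder output folder
                  nb_samples=8,
                  override=True)
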
Example No. 17
0
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)
    download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMCecc_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== full ====== #
        if subset == 'full':
            raise NotImplementedError
        # ====== ly and my ====== #
        else:
            url = str(
                base64.decodebytes(
                    _URL_LYMPHOID if subset == 'ly' else _URL_MYELOID),
                'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # ====== extract the data ====== #
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_var']
                X_col = data['X_var_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array(['ly'] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
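
A hedged usage sketch for read_PBMCeec; it relies only on the call signature defined above, and the attribute access on the returned SingleCellOMIC (sco.X) assumes an AnnData-style interface.

sco = read_PBMCeec(subset='ly', filtered_genes=True, verbose=True)
print(sco)          # transcriptomic matrix plus 'proteomic' and 'progenitor' omics
print(sco.X.shape)  # assumed AnnData-style attribute: cells x filtered genes
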
Example No. 18
0
def read_FACS(n_protein, override=False, verbose=False):
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)

    n_protein = int(n_protein)
    assert n_protein in (2, 5)

    preprocessed_path = _FACS_PREPROCESSED % n_protein
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8') % n_protein
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        urlretrieve(url=url, filename=zip_path)
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
            if verbose:
                print('%-12s' % base_name, ':', data.shape)
        # ====== post-processing ====== #
        X = data_dict['X'].astype('float32')
        X = np.array(X)
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]

        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]

        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"

        # ====== filter zero columns ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)

        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
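
A hedged usage sketch for read_FACS; the key names mirror what save_to_dataset() stores above ('X', 'X_row', 'X_col', 'y', 'y_col').

ds = read_FACS(n_protein=5, verbose=True)
X, genes = ds['X'][:], ds['X_col']     # gene count matrix and gene names
y, proteins = ds['y'][:], ds['y_col']  # protein count matrix and protein names
cells = ds['X_row']                    # cell identifiers
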