Example #1
def read_Hemato(override=False, verbose=False):
    preprocessed_path = select_path(os.path.join(DATA_DIR,
                                                 'HEMATO_preprocessed'),
                                    create_new=True)

    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ====== copy the dataset from scVI ====== #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        try:
            from scvi.dataset import HematoDataset
        except ImportError:
            raise RuntimeError("Require `scVI` package for HEMATO dataset")

        gene_dataset = HematoDataset(
            save_path=os.path.join(DOWNLOAD_DIR, 'HEMATO/'))

        X = gene_dataset._X
        gene_names = np.array(gene_dataset.gene_names)
        assert len(gene_names) == X.shape[1]

        y = gene_dataset.meta.values[:, 1:]
        label_names = np.array(gene_dataset.cell_types_levels)
        assert len(label_names) == y.shape[1]

        cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])

        _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                           cell_names, verbose)

        # create binary classes for testing
        label_names = np.array(["Erythroblasts", "Granulocytes"])
        min_y = np.min(gene_dataset.labels)
        max_y = np.max(gene_dataset.labels)
        y_val = 2 * (gene_dataset.labels - min_y) / (max_y - min_y) - 1
        y_bin = np.argmax(
            np.hstack((
                gene_dataset.meta.iloc[:, 1].values[:, None],  # Er
                gene_dataset.meta.iloc[:, 2].values[:, None])),  # Gr
            axis=-1)
        with open(os.path.join(preprocessed_path, 'labels_name'), 'wb') as f:
            pickle.dump(label_names, f)
        with open(os.path.join(preprocessed_path, 'labels_bin'), 'wb') as f:
            pickle.dump(y_bin, f)
        with open(os.path.join(preprocessed_path, 'labels_val'), 'wb') as f:
            pickle.dump(y_val, f)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
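A minimal usage sketch for the loader above, assuming it is importable from the surrounding data-loader module (the import path below is a guess, not confirmed by the source) and that the `scvi` package is installed:

# hedged usage sketch; the import path is an assumption
from sisua.data.data_loader import read_Hemato  # hypothetical module path

ds = read_Hemato(override=False, verbose=True)
X = ds['X'][:]            # gene-count matrix written by _save_data_to_path
print(X.shape)
ds.close()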
Example #2
def read_PBMC_crossdataset_remove_protein(subset,
                                          return_ecc,
                                          filtered_genes=False,
                                          override=False,
                                          verbose=False,
                                          remove_protein=['CD4', 'CD8']):
    remove_protein = sorted(
        [i.lower() for i in as_tuple(remove_protein, t=string_types)])
    preprocessed_path = os.path.join(
        DATA_DIR, 'PBMCcross_%s_%s_no%s_preprocessed' %
        ('ecc' if return_ecc else '8k', subset +
         ('' if filtered_genes else 'full'), ''.join(
             [i.lower() for i in remove_protein])))
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)

    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        ds = read_PBMC_crossdataset_ecc_8k(subset,
                                           return_ecc,
                                           filtered_genes,
                                           override=override,
                                           verbose=verbose)
        X = ds['X'][:]
        X_row = ds['X_row']
        X_col = ds['X_col']
        y = ds['y']
        y_col = ds['y_col']

        remove_ids = [
            i for i, j in enumerate(y_col)
            if standardize_protein_name(j).lower() in remove_protein
        ]
        remain_ids = [i for i in range(len(y_col)) if i not in remove_ids]
        y_col = y_col[remain_ids]
        y = y[:, remain_ids]

        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** return ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
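The protein-removal step above is plain column indexing; a self-contained numpy sketch of the same idea on toy data (names and values here are made up):

import numpy as np

y_col = np.array(['CD3', 'CD4', 'CD8', 'CD19'])     # toy protein names
y = np.arange(8, dtype='float32').reshape(2, 4)     # toy (n_cells, n_proteins) matrix
remove_protein = {'cd4', 'cd8'}
remain_ids = [i for i, name in enumerate(y_col)
              if name.lower() not in remove_protein]
y_col, y = y_col[remain_ids], y[:, remain_ids]
print(y_col)  # ['CD3' 'CD19']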
Example #3
def _read_scvi_dataset(name, clazz_name, override, verbose):
    preprocessed_path = select_path(os.path.join(DATA_DIR,
                                                 '%s_preprocessed' % name),
                                    create_new=True)
    if override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ====== copy the dataset from scVI ====== #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        try:
            import scvi.dataset as scvi_dataset
        except ImportError:
            raise RuntimeError("Require `scVI` package for PBMC dataset")
        clazz = getattr(scvi_dataset, clazz_name)
        gene_dataset = clazz(save_path=DOWNLOAD_DIR)

        X = gene_dataset._X
        if hasattr(X, 'todense'):
            X = np.array(X.todense())

        gene_names = np.array(gene_dataset.gene_names)
        # convert gene identifier to gene symbol (i.e. name)
        if hasattr(gene_dataset, 'de_metadata'):
            from sisua.data.utils import get_gene_id2name
            meta = gene_dataset.de_metadata
            converter = {i: j for i, j in zip(meta.ENSG, meta.GS)}
            pbmc8kconverter = get_gene_id2name()
            gene_names = np.array([
                pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
                for i in gene_names
            ])
        assert len(gene_names) == X.shape[1]

        label_names = np.array(gene_dataset.cell_types)
        y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
        assert len(label_names) == y.shape[1]

        cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
        _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                           cell_names, verbose)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
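The `one_hot` call above turns the integer cell-type labels into an (n_cells, n_types) indicator matrix; an equivalent in plain numpy, on toy labels for illustration only:

import numpy as np

labels = np.array([0, 2, 1, 2])                    # toy integer cell-type labels
nb_classes = 3
y = np.eye(nb_classes, dtype='float32')[labels]    # shape (4, 3), one 1.0 per row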
Example #4
  def load_parameters(clazz):
    # ====== all path ====== #
    name = clazz.__name__ + '.zip'
    path = os.path.join(base64.decodebytes(Model.ORIGIN).decode(), name)
    param_path = get_datasetpath(name=clazz.__name__, override=False)
    zip_path = os.path.join(Model.BASE_DIR, name)
    # ====== get params files ====== #
    if not os.path.exists(param_path) or \
        len(os.listdir(param_path)) == 0:
      get_file(name, origin=path, outdir=Model.BASE_DIR)
      zf = ZipFile(zip_path, mode='r', compression=ZIP_DEFLATED)
      zf.extractall(path=Model.BASE_DIR)
      zf.close()
      # check if properly unzipped
      if not os.path.exists(param_path) or \
          len(os.listdir(param_path)) == 0:
        raise RuntimeError("Zip file at path:%s was not properly unzipped, "
                           "cannot find downloaded parameters at path: %s" %
                           (zip_path, param_path))
      else:
        os.remove(zip_path)
    # ====== create and return the params dataset ====== #
    ds = Dataset(param_path, read_only=True)
    return ds
Example #5
def validating_dataset(path):
  if isinstance(path, Dataset):
    ds = path
  elif isinstance(path, string_types):
    ds = Dataset(path, read_only=True)
  else:
    raise ValueError("`path` must be a Dataset instance or a string path, "
                     "given type: %s" % type(path))

  assert 'X' in ds, \
  '`X` (n_samples, n_genes) must be stored at path: %s' % ds.path
  assert 'X_col' in ds, \
  '`X_col` (n_genes,) must be stored at path: %s' % ds.path
  assert 'X_row' in ds, \
  '`X_row` (n_samples,) must be stored at path: %s' % ds.path

  if 'y' in ds:
    assert 'y' in ds, \
    '`y` (n_samples, n_protein) must be stored at path: %s' % ds.path
    assert 'y_col' in ds, \
    '`y_col` (n_protein,) must be stored at path: %s' % ds.path
    y, y_col = ds['y'], ds['y_col']
  else:
    y, y_col = None, None

  X, X_col, rowname = ds['X'], ds['X_col'], ds['X_row']
  _check_data(X, X_col, y, y_col, rowname)
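A usage sketch, assuming a preprocessed dataset folder produced by one of the loaders above (the path is a placeholder):

# hedged usage sketch; '/tmp/HEMATO_preprocessed' is a placeholder path
validating_dataset('/tmp/HEMATO_preprocessed')
# raises AssertionError if 'X', 'X_col' or 'X_row' is missing from the dataset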
Example #6
    def run(self):
        njobs = len(self.jobs)
        dataset = Dataset(self.path)
        if self.n_cache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.n_cache)
        # ====== indices ====== #
        databases = defaultdictkey(
            lambda key: MmapDict(path=os.path.join(dataset.path, key),
                                 cache_size=10000,
                                 read_only=False))
        last_start = defaultdict(int)
        # ====== statistic ====== #
        # load old statistics
        stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
        for key in dataset.keys():
            if 'sum1' == key[-4:]:
                stats[key[:-4]][0] = dataset[key][:]
            elif 'sum2' == key[-4:]:
                stats[key[:-4]][1] = dataset[key][:]
        # all data are cached and flushed to disk periodically
        cache = defaultdict(list)
        n_processed = [0]  # store the value as reference

        # ====== helper ====== #
        def flush_feature(feat_name, X_cached):
            if len(X_cached) > 0:
                X_cached = np.concatenate(X_cached, 0)
                # flush data
                if feat_name in dataset:
                    dataset[feat_name].append(X_cached)
                else:
                    dataset[(feat_name, 'memmap')] = X_cached

        # ====== repeated for each result returned ====== #
        def post_processing(result):
            # search for file name
            if self.identifier not in result:
                raise RuntimeError(
                    "Cannot find identifier '%s' in returned dictionary" %
                    self.identifier)
            file_name = result[self.identifier]
            # invalid file_name
            if not is_string(file_name):
                raise RuntimeError(
                    "Cannot find file name in returned features "
                    "list, the file name can be specified in key: 'name', 'path' "
                    "and the type of the value must be string. All available "
                    "keys are: %s" % str(result.keys()))
            # store all new indices
            # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
            all_indices = {}
            # processing
            for feat_name, X in result.items():
                # some invalid feat_name
                if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
                    raise RuntimeError(
                        "Returned features' name cannot be one "
                        "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
                    )
                # ignore some feat_name
                if feat_name in ('name',):
                    continue
                # if numpy ndarray, save to MmapData
                if isinstance(X, np.ndarray) or \
                'sum1' == feat_name[-4:] or \
                'sum2' == feat_name[-4:]:
                    # save statistics instead
                    if 'sum1' == feat_name[-4:]:
                        stats[feat_name[:-4]][0] += X
                    elif 'sum2' == feat_name[-4:]:
                        stats[feat_name[:-4]][1] += X
                    # save features array
                    else:
                        all_indices[feat_name] = X.shape[0]
                        # cache data, only if we have more than 0 sample
                        if X.shape[0] > 0:
                            cache[feat_name].append(X)
                # else all other kind of data save to MmapDict
                else:
                    databases[feat_name][file_name] = X
                # remove data
                del X
            # ====== update indices ====== #
            if len(all_indices) > 0:
                for feat_name, n in all_indices.items():
                    ids_name = 'indices_%s' % feat_name
                    databases[ids_name][file_name] = (last_start[ids_name],
                                                      last_start[ids_name] + n)
                    last_start[ids_name] += n
            # ====== flush cache ====== #
            n_processed[0] += 1
            if n_processed[0] % cache_limit == 0:  # 12 + 8
                for feat_name, X_cached in cache.items():
                    flush_feature(feat_name, X_cached)
                cache.clear()
            # ====== update progress ====== #
            return file_name

        # ====== mapping function ====== #
        def _map_func(dat):
            try:
                ret = self.extractor.transform(dat)
            except Exception as e:  # Non-handled exception
                ret = '\n========\n'
                ret += 'Time  : `%s`\n' % str(
                    get_formatted_datetime(only_number=False))
                ret += 'Error : `%s`\n' % str(e)
                ret += 'Input : `%s`\n' % str(dat)
                import traceback
                etype, value, tb = sys.exc_info()
                for line in traceback.TracebackException(
                        type(value), value, tb, limit=None).format(chain=True):
                    ret += line
            return ret

        # ====== processing ====== #
        mpi = MPI(jobs=self.jobs,
                  func=_map_func,
                  ncpu=self.n_cpu,
                  batch=1,
                  hwm=self.n_cpu * 3,
                  backend='python')
        # initialize
        prog = Progbar(target=njobs,
                       name=self.path,
                       interval=0.12,
                       print_report=True,
                       print_summary=True)
        start_time = time.time()
        last_time = time.time()
        last_count = 0
        with open(self._log_path, 'w') as flog:
            # writing the log head
            flog.write('============================\n')
            flog.write('Start Time : %s\n' %
                       get_formatted_datetime(only_number=False))
            flog.write('Outpath    : %s\n' % self.path)
            flog.write('Extractor  : %s\n' % '->'.join(
                [s[-1].__class__.__name__ for s in self.extractor.steps]))
            flog.write('#Jobs      : %d\n' % njobs)
            flog.write('#CPU       : %d\n' % self.n_cpu)
            flog.write('#Cache     : %d\n' % cache_limit)
            flog.write('============================\n')
            flog.flush()
            # start processing the file list
            for count, result in enumerate(mpi):
                # Non-handled exception
                if isinstance(result, string_types):
                    flog.write(result)
                    flog.flush()
                    self._error_log.append(result)
                    if self.stop_on_failure:
                        raise RuntimeError(result)
                # some errors might have happened
                elif isinstance(result, ExtractorSignal):
                    flog.write(str(result))
                    flog.flush()
                    if result.action == 'error':
                        prog.add_notification(str(result))
                        raise RuntimeError(
                            "ExtractorSignal requests terminating processor!")
                    elif result.action == 'warn':
                        prog.add_notification(str(result))
                    elif result.action == 'ignore':
                        self._error_log.append(result)
                    else:
                        raise RuntimeError(
                            "Unknown action from ExtractorSignal: %s" %
                            result.action)
                    prog['File'] = '%-48s' % result.message[:48]
                # otherwise, no error happened, do post-processing
                else:
                    name = post_processing(result)
                    prog['File'] = '%-48s' % str(name)[:48]
                # update progress
                prog.add(1)
                # manually write to external log file
                if (count + 1) % max(1, int(0.01 * njobs)) == 0:
                    curr_time = time.time()
                    elap = curr_time - start_time
                    avg_speed = (count + 1) / elap
                    cur_speed = (count + 1 - last_count) / (curr_time -
                                                            last_time)
                    avg_est = (njobs - count - 1) / avg_speed
                    cur_est = (njobs - count - 1) / cur_speed
                    flog.write(
                        '[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                        '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                        '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                        (get_formatted_datetime(only_number=False), count + 1,
                         njobs - count - 1, elap, avg_speed, avg_est,
                         cur_speed, cur_est))
                    flog.flush()
                    last_time = curr_time
                    last_count = count + 1
        # ====== end, flush the last time ====== #
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
        cache = None
        dataset.flush()
        prog.add_notification("Flushed all data to disk")
        # ====== saving indices ====== #
        for name, db in databases.items():
            db.flush(save_all=True)
            db_size = len(db)
            db.close()
            prog.add_notification(
                'Flush MmapDict "%s" to disk, size: %s' %
                (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, name):
            N = dataset[name.split('_')[0]].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - np.power(mean, 2))
            if np.any(np.isnan(mean)):
                wprint('Mean contains NaN, name: %s' % name)
            if np.any(np.isnan(std)):
                wprint('Std contains NaN, name: %s' % name)
            dataset[name + 'sum1'] = sum1
            dataset[name + 'sum2'] = sum2
            dataset[name + 'mean'] = mean
            dataset[name + 'std'] = std

        # save all stats
        if len(stats) > 0:
            for feat_name, (sum1, sum2) in stats.items():
                save_mean_std(sum1, sum2, feat_name)
                prog.add_notification(
                    'Saved statistics of: %s, shape: %s' %
                    (ctext(feat_name.split('_')[0],
                           'yellow'), ctext(str(sum1.shape), 'yellow')))
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== saving the extractor ====== #
        # not good idea to save the extractor all the time
        # pipeline_path = os.path.join(dataset.path, 'pipeline')
        # with open(pipeline_path, 'wb') as f:
        #   cPickle.dump(self.extractor, f, protocol=2)
        # prog.add_notification("Saved Extractor pipeline at: %s" %
        #                       ctext(pipeline_path, 'yellow'))
        # ====== saving the configuration ====== #
        config_path = os.path.join(dataset.path, 'config')
        config = MmapDict(config_path)
        config['__configuration_time__'] = time.time()
        config['__processor__'] = self.path
        for i in dir(self):
            if _default_module.match(i) is not None:
                continue
            j = getattr(self, i)
            if isinstance(j, (Number, string_types, bool)):
                config[i] = j
        config.flush(save_all=True)
        self.config = {i: j for i, j in config}
        config.close()
        prog.add_notification("Saved configuration at: %s" %
                              ctext(config_path, 'yellow'))
        # ====== final notification ====== #
        prog.add_notification("Closed all dataset.")
        prog.add_notification("Dataset at path: %s" %
                              ctext(dataset.path, 'yellow'))
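`save_mean_std` recovers mean and standard deviation from the accumulated first and second moments (sum1 = Σx, sum2 = Σx²); a quick numpy check of that identity:

import numpy as np

X = np.random.randn(1000, 8)
sum1, sum2, N = X.sum(axis=0), (X ** 2).sum(axis=0), X.shape[0]
mean = sum1 / N
std = np.sqrt(sum2 / N - mean ** 2)
assert np.allclose(mean, X.mean(axis=0))
assert np.allclose(std, X.std(axis=0))   # population std, ddof=0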
Example #7
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" %
                         str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [
            name for name in as_tuple(feat_name, t=str) if name in dataset
        ]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # init PCA
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        X = dataset[name]
        # found an existing pca model
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None,
                               whiten=False,
                               copy=True,
                               batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name,
              func=map_pca,
              ncpu=None,
              batch=1,
              hwm=12082518,
              backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True,
                   print_report=True,
                   name='PCA')
    for n in mpi:
        if is_string(n):
            remain_features.remove(n)
            finished_features.append(n)
        else:
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
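A usage sketch, assuming a processed dataset folder containing a 2-D feature named 'mspec' (both the path and the feature name are placeholders); the fitted model is pickled next to the data:

# hedged usage sketch; path and feature name are placeholders,
# and calculate_pca is assumed importable from its defining module
calculate_pca('/path/to/processed_dataset', feat_name='mspec', batch_size=5218)

import os, pickle
with open(os.path.join('/path/to/processed_dataset', 'pca_mspec'), 'rb') as f:
    pca = pickle.load(f)   # the MiniBatchPCA instance fitted above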
Example #8
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" % str(type(ds)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    if not os.path.exists(path):
        os.mkdir(path)
    elif override:
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
        os.mkdir(path)
    else:
        raise ValueError("`path`=%s exists, cannot override." % path)
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (included the features in external_indices
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    ])
    # ====== checking indices ====== #
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    }
    for ids_name in (k for k in all_keys if 'indices' in k):
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            assert prev[2] == now[1], "Indices are not contiguous"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # final length match length of Data
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
                all_features.append(feat_name)
        else:
            for feat_name in all_keys:
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
                    all_features.append(feat_name)
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
            else:
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
            all_stats[k[:-4].split('_')[0]].append(k)
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       interval=0.1,
                       print_report=True,
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # not all value closed to zeros
            if np.all(np.isclose(dat, 0.)):
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                       False)
                fail_test = True
            prog['Name'] = file_name
            prog.add(1)
        if not fail_test:
            logger("Check data incredibility for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                    break
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
                    break
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproducibility
    all_samples = np.random.choice(list(ds['indices'].keys()),
                                   size=nb_samples,
                                   replace=False)
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            try:
                _special_cases(X=feat,
                               feat_name=feat_name,
                               file_name=file_name,
                               ds=ds,
                               path=path)
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure save at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    stdio(path=prev_stdio)
    if should_close_ds:
        ds.close()
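A usage sketch; both paths below are placeholders, and `validate_features` is assumed importable from its defining module. The report folder ends up holding `log.txt`, one figure per sampled file, and `stats.pdf`:

# hedged usage sketch; paths are placeholders
validate_features('/path/to/processed_dataset',
                  path='/tmp/feature_report',
                  nb_samples=8,
                  override=True)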
Example #9
def read_centenarian(override=False, verbose=False):
    r""" Data used in:

    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells in
    supercentenarians" | bioRxiv [WWW Document], n.d.
      URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).

  """
    download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        labels = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[2])),
            url=_URL[2],
        )
        data = []
        with gzip.open(labels, mode='rb') as f:
            for line in f:
                line = str(line, 'utf-8').strip().split('\t')
                assert line[1][:2] == line[2]
                data.append(line)
        labels = np.array(data)
        y_col = sorted(set(labels[:, 1]))
        y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                    len(y_col)).astype('float32')
        y_col = np.array(y_col)
        #
        raw = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[0])),
            url=_URL[0],
        )
        if verbose:
            print("Unzip and reading raw UMI ...")
        X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
        #
        norm = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[1])),
            url=_URL[1],
        )
        if verbose:
            print("Unzip and reading log-norm UMI ...")
        X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
        #
        assert np.all(cell_id1 == cell_id2) and np.all(labels[:, 0] == cell_id1) and \
          np.all(gene_id1 == gene_id2)
        assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
          X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
        #
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X=X_raw,
                        X_col=gene_id1,
                        y=y,
                        y_col=y_col,
                        rowname=cell_id1,
                        print_log=verbose)
        with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                             shape=(0, X_norm.shape[1]),
                             dtype='float32',
                             remove_exist=True) as f:
            for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
                f.write(X_norm[s:e])
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Example #10
def read_full_FACS(override=False, verbose=False):
    """ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE75478
  This is the full FACS data of 2 individuals with 7 protein markers
  """
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_full")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    # ====== download the data ====== #
    file_url = [
        ('GSE75478_transcriptomics_facs_indeces_filtered_I1.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI1%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_facs_indeces_filtered_I2.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI2%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_raw_filtered_I1.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI1%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_raw_filtered_I2.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI2%2Ecsv%2Egz'
         ),
    ]
    for name, url in file_url:
        filename = os.path.join(download_path, name)
        if not os.path.exists(filename):
            if verbose:
                print("Downloading file '{filename}' ...")
            urlretrieve(url=url, filename=filename)
    # ====== extract the data ====== #
    preprocessed_path = _FACS_PREPROCESSED % 7
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        data_map = {}
        for name, _ in file_url:
            zip_path = os.path.join(download_path, name)
            with gzip.open(zip_path, 'rb') as f:
                data_map[name.split('.')[0]] = np.array(
                    [str(line, 'utf-8').strip().split(',') for line in f]).T

        i1 = data_map['GSE75478_transcriptomics_raw_filtered_I1']
        f1 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I1']

        i2 = data_map['GSE75478_transcriptomics_raw_filtered_I2']
        f2 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I2']
        # Matching duplicated row in `i` and `f`
        row_name = set(i1[1:, 0]) & set(f1[1:, 0])
        i1 = i1[[True] + [True if i in row_name else False
                          for i in i1[1:, 0]], :]
        f1 = f1[[True] + [True if i in row_name else False
                          for i in f1[1:, 0]], :]
        assert np.all(i1[:, 0] == f1[:, 0])

        row_name = set(i2[1:, 0]) & set(f2[1:, 0])
        i2 = i2[[True] + [True if i in row_name else False
                          for i in i2[1:, 0]], :]
        f2 = f2[[True] + [True if i in row_name else False
                          for i in f2[1:, 0]], :]
        assert np.all(i2[:, 0] == f2[:, 0])

        # Matching the genes and protein among individuals
        gene_name = set(i1[0][1:]) & set(i2[0][1:])
        i1 = i1[:, [True] +
                [True if i in gene_name else False for i in i1[0][1:]]]
        i2 = i2[:, [True] +
                [True if i in gene_name else False for i in i2[0][1:]]]
        assert np.all(i1[0] == i2[0])
        gene = np.concatenate((i1, i2[1:]), axis=0)

        prot_name = set(
            [i for i in set(f1[0][1:]) & set(f2[0][1:]) if '_cd' in i])
        prot_name = sorted(prot_name)
        f1 = f1[:, [0] + [f1[0].tolist().index(i) for i in prot_name]]
        f2 = f2[:, [0] + [f2[0].tolist().index(i) for i in prot_name]]
        assert np.all(f1[0] == f2[0])
        prot = np.concatenate((f1, f2[1:]), axis=0)

        # ====== save data to disk ====== #
        X = gene[1:, 1:].astype('float32')
        X_row = gene[1:, 0]
        X_col = gene[0, 1:]
        X_col = np.array([i.replace('"', '') for i in X_col])

        y = prot[1:, 1:].astype('float32')
        y_row = prot[1:, 0]
        y_col = np.array(
            [i.replace('"', '').split('_')[-1].upper() for i in prot[0, 1:]])

        assert np.all(X_row == y_row)
        X_row = np.array([i.replace('"', '') for i in X_row])

        # ====== the protein marker can be smaller than zero ====== #
        min_values = np.min(y, axis=0, keepdims=True)
        min_values = np.where(min_values > 0, 0, min_values)
        y = y + np.abs(min_values)
        # ====== filter zero columns ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
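The shift applied to the FACS protein markers above just translates every column so its minimum is non-negative; a toy check:

import numpy as np

y = np.array([[-2., 1.],
              [0., 3.]])                           # toy protein measurements
min_values = np.min(y, axis=0, keepdims=True)      # per-marker minimum
min_values = np.where(min_values > 0, 0, min_values)
y = y + np.abs(min_values)                         # columns with negative values are shifted up
assert np.all(y >= 0)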
Example #11
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    # prepare the path
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMC8k_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        # ====== pbmc 8k ====== #
        if subset == 'full':
            ly = read_PBMC8k('ly',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            my = read_PBMC8k('my',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # load data
            data = np.load(path)
            X = data['X']
            X_row = data['X_row']
            X_col = data['X_col'].tolist()
            y = data['y']
            y_col = data['y_col'].tolist()
            # merge all genes from my and ly subset
            all_genes = set(ly['X_col'].tolist() + my['X_col'].tolist())
            all_genes = sorted([X_col.index(i) for i in all_genes])
            # same for protein
            all_proteins = set(ly['y_col'].tolist() + my['y_col'].tolist())
            all_proteins = sorted([y_col.index(i) for i in all_proteins])
            #
            X = X[:, all_genes]
            y = y[:, all_proteins]
            X_col = np.array(X_col)[all_genes]
            y_col = np.array(y_col)[all_proteins]
            cell_types = np.array(
                ['ly' if i in ly['X_row'] else 'my' for i in X_row])
        # ====== pbmc ly and my ====== #
        else:
            url = str(
                base64.decodebytes(_URL_LYMPHOID if subset ==
                                   'ly' else _URL_MYELOID), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # extract the data
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_filt']
                X_col = data['X_filt_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array([subset] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
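A usage sketch for the loader above (imports omitted, as in the other examples here); `return_arrays=True` yields the raw `Dataset`, otherwise a `SingleCellOMIC` with transcriptomic, proteomic and progenitor OMICs is returned:

# hedged usage sketch; read_PBMC8k is assumed importable from its defining module
sco = read_PBMC8k(subset='ly', filtered_genes=True, verbose=False)
print(sco)                                         # SingleCellOMIC object
ds = read_PBMC8k(subset='ly', return_arrays=True)  # plain Dataset with 'X', 'y', ...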
Example #12
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        if verbose:
            print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
        shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
        os.mkdir(_CITEseq_CBMC_PREPROCESSED)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
        # ====== post-processing ====== #
        X = np.array(data_dict['X'].astype('float32'))
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        sco._inplace_subset_var(result.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(sco.var_names.values), f)
        del sco
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
            top_genes = pickle.load(f)
        sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
Example #13
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  download_path = os.path.join(
      DOWNLOAD_DIR,
      "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  preprocessed_path = (_5000_PBMC_PREPROCESSED
                       if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    download_files = {}
    for url, md5 in zip(
        [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
        [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
      url = str(base64.decodebytes(url), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      download_files[base_name] = (path, md5)
    # ====== extract the data ====== #
    n = set()
    for name, (path, md5) in sorted(download_files.items()):
      if verbose:
        print(f"Extracting {name} ...")
      binary_data = decrypt_aes(path, password=_PASSWORD)
      md5_ = md5_checksum(binary_data)
      assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
      with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
        for name in f.namelist():
          data = str(f.read(name), 'utf8')
          for line in data.split('\n'):
            if len(line) == 0:
              continue
            line = line.strip().split(',')
            n.add(len(line))
            if 'Protein' in name:
              y.append(line)
            else:
              X.append(line)
    # ====== post-processing ====== #
    assert len(n) == 1, \
    "Number of samples inconsistent between raw count and protein count"
    if verbose:
      print("Processing gene count ...")
    X = np.array(X).T
    X_row, X_col = X[1:, 0], X[0, 1:]
    X = X[1:, 1:].astype('float32')
    # ====== filter mouse genes ====== #
    human_cols = [True if "HUMAN_" in i else False for i in X_col]
    if verbose:
      print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
    X = X[:, human_cols]
    X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)

    # ====== protein ====== #
    if verbose:
      print("Processing protein count ...")
    y = np.array(y).T
    y_row, y_col = y[1:, 0], y[0, 1:]
    y = y[1:, 1:].astype('float32')
    assert np.all(X_row == y_row), \
    "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
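A usage sketch; `filtered_genes=True` downloads the 5000-gene subset, `False` the full CITE-seq gene count matrix:

# hedged usage sketch; read_CITEseq_PBMC is assumed importable from its defining module
sco = read_CITEseq_PBMC(filtered_genes=True)
print(sco)   # SingleCellOMIC with 'transcriptomic' and 'proteomic' OMICs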
Example #14
  def run(self):
    njobs = len(self.jobs)
    dataset = Dataset(self.path)
    if self.n_cache <= 1:
      cache_limit = max(2, int(0.12 * njobs))
    else:
      cache_limit = int(self.n_cache)
    # ====== indices ====== #
    databases = defaultdictkey(lambda key:
        MmapDict(path=os.path.join(dataset.path, key), cache_size=10000,
                 read_only=False))
    last_start = defaultdict(int)
    # ====== statistic ====== #
    # load old statistics
    stats = defaultdict(lambda: [0, 0]) # name -> (sum1, sum2)
    for key in dataset.keys():
      if 'sum1' == key[-4:]:
        stats[key[:-4]][0] = dataset[key][:]
      elif 'sum2' == key[-4:]:
        stats[key[:-4]][1] = dataset[key][:]
    # all data are cached and flushed to disk periodically
    cache = defaultdict(list)
    n_processed = [0] # store the value as reference

    # ====== helper ====== #
    def flush_feature(feat_name, X_cached):
      if len(X_cached) > 0:
        X_cached = np.concatenate(X_cached, 0)
        # flush data
        if feat_name in dataset:
          dataset[feat_name].append(X_cached)
        else:
          dataset[(feat_name, 'memmap')] = X_cached

    # ====== repeated for each result returned ====== #
    def post_processing(result):
      # search for file name
      if self.identifier not in result:
        raise RuntimeError(
            "Cannot find identifier '%s' in returned dictionary" % self.identifier)
      file_name = result[self.identifier]
      # invalid file_name
      if not is_string(file_name):
        raise RuntimeError("Cannot find file name in returned features "
            "list, the file name can be specified in key: 'name', 'path' "
            "and the type of the value must be string. All available "
            "keys are: %s" % str(result.keys()))
      # store all new indices
      # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
      all_indices = {}
      # processing
      for feat_name, X in result.items():
        # some invalid feat_name
        if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
          raise RuntimeError("Returned features' name cannot be one "
                             "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
        # ignore some feat_name
        if feat_name in ('name',):
          continue
        # if numpy ndarray, save to MmapData
        if isinstance(X, np.ndarray) or \
        'sum1' == feat_name[-4:] or \
        'sum2' == feat_name[-4:]:
          # save statistics instead
          if 'sum1' == feat_name[-4:]:
            stats[feat_name[:-4]][0] += X
          elif 'sum2' == feat_name[-4:]:
            stats[feat_name[:-4]][1] += X
          # save features array
          else:
            all_indices[feat_name] = X.shape[0]
            # cache data, only if we have more than 0 sample
            if X.shape[0] > 0:
              cache[feat_name].append(X)
        # else all other kind of data save to MmapDict
        else:
          databases[feat_name][file_name] = X
        # remove data
        del X
      # ====== update indices ====== #
      if len(all_indices) > 0:
        for feat_name, n in all_indices.items():
          ids_name = 'indices_%s' % feat_name
          databases[ids_name][file_name] = (last_start[ids_name],
                                            last_start[ids_name] + n)
          last_start[ids_name] += n
      # ====== flush cache ====== #
      n_processed[0] += 1
      if n_processed[0] % cache_limit == 0:  # periodically flush cached features to disk
        for feat_name, X_cached in cache.items():
          flush_feature(feat_name, X_cached)
        cache.clear()
      # ====== update progress ====== #
      return file_name

    # ====== mapping function ====== #
    def _map_func(dat):
      try:
        ret = self.extractor.transform(dat)
      except Exception as e: # Non-handled exception
        ret = '\n========\n'
        ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
        ret += 'Error : `%s`\n' % str(e)
        ret += 'Input : `%s`\n' % str(dat)
        import traceback
        etype, value, tb = sys.exc_info()
        for line in traceback.TracebackException(
                type(value), value, tb, limit=None).format(chain=True):
          ret += line
      return ret
    # ====== processing ====== #
    mpi = MPI(jobs=self.jobs,
              func=_map_func,
              ncpu=self.n_cpu,
              batch=1,
              hwm=self.n_cpu * 3,
              backend='python')
    # initialize
    prog = Progbar(target=njobs, name=self.path,
                   interval=0.12, print_report=True, print_summary=True)
    start_time = time.time()
    last_time = time.time()
    last_count = 0
    with open(self._log_path, 'w') as flog:
      # writing the log head
      flog.write('============================\n')
      flog.write('Start Time : %s\n' % get_formatted_datetime(only_number=False))
      flog.write('Outpath    : %s\n' % self.path)
      flog.write('Extractor  : %s\n' % '->'.join([s[-1].__class__.__name__
                                                  for s in self.extractor.steps]))
      flog.write('#Jobs      : %d\n' % njobs)
      flog.write('#CPU       : %d\n' % self.n_cpu)
      flog.write('#Cache     : %d\n' % cache_limit)
      flog.write('============================\n')
      flog.flush()
      # start processing the file list
      for count, result in enumerate(mpi):
        # Non-handled exception
        if isinstance(result, string_types):
          flog.write(result)
          flog.flush()
          self._error_log.append(result)
          if self.stop_on_failure:
            raise RuntimeError(result)
        # some error might happened
        elif isinstance(result, ExtractorSignal):
          flog.write(str(result)); flog.flush()
          if result.action == 'error':
            prog.add_notification(str(result))
            raise RuntimeError("ExtractorSignal requests terminating processor!")
          elif result.action == 'warn':
            prog.add_notification(str(result))
          elif result.action == 'ignore':
            self._error_log.append(result)
          else:
            raise RuntimeError("Unknown action from ExtractorSignal: %s" % result.action)
          prog['File'] = '%-48s' % result.message[:48]
        # otherwise, no error happened, do post-processing
        else:
          name = post_processing(result)
          prog['File'] = '%-48s' % str(name)[:48]
        # update progress
        prog.add(1)
        # manually write to external log file
        if (count + 1) % max(1, int(0.01 * njobs)) == 0:
          curr_time = time.time()
          elap = curr_time - start_time
          avg_speed = (count + 1) / elap
          cur_speed = (count + 1 - last_count) / (curr_time - last_time)
          avg_est = (njobs - count - 1) / avg_speed
          cur_est = (njobs - count - 1) / cur_speed
          flog.write('[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                     '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                     '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                     (get_formatted_datetime(only_number=False),
                      count + 1, njobs - count - 1, elap,
                      avg_speed, avg_est,
                      cur_speed, cur_est))
          flog.flush()
          last_time = curr_time
          last_count = count + 1
    # ====== end, flush the last time ====== #
    for feat_name, X_cached in cache.items():
      flush_feature(feat_name, X_cached)
    cache.clear()
    cache = None
    dataset.flush()
    prog.add_notification("Flushed all data to disk")
    # ====== saving indices ====== #
    for name, db in databases.items():
      db.flush(save_all=True)
      db_size = len(db)
      db.close()
      prog.add_notification('Flush MmapDict "%s" to disk, size: %s' %
                            (ctext(name, 'yellow'),
                             ctext(str(db_size), 'yellow')))

    # ====== save mean and std ====== #
    def save_mean_std(sum1, sum2, name):
      N = dataset[name.split('_')[0]].shape[0]
      mean = sum1 / N
      std = np.sqrt(sum2 / N - np.power(mean, 2))
      if np.any(np.isnan(mean)):
        wprint('Mean contains NaN, name: %s' % name)
      if np.any(np.isnan(std)):
        wprint('Std contains NaN, name: %s' % name)
      dataset[name + 'sum1'] = sum1
      dataset[name + 'sum2'] = sum2
      dataset[name + 'mean'] = mean
      dataset[name + 'std'] = std
    # save all stats
    if len(stats) > 0:
      for feat_name, (sum1, sum2) in stats.items():
        save_mean_std(sum1, sum2, feat_name)
        prog.add_notification('Saved statistics of: %s, shape: %s' %
                              (ctext(feat_name.split('_')[0], 'yellow'),
                               ctext(str(sum1.shape), 'yellow')))
    # ====== dataset flush() ====== #
    dataset.flush()
    dataset.close()
    # ====== saving the extractor ====== #
    # not good idea to save the extractor all the time
    # pipeline_path = os.path.join(dataset.path, 'pipeline')
    # with open(pipeline_path, 'wb') as f:
    #   cPickle.dump(self.extractor, f, protocol=2)
    # prog.add_notification("Saved Extractor pipeline at: %s" %
    #                       ctext(pipeline_path, 'yellow'))
    # ====== saving the configuration ====== #
    config_path = os.path.join(dataset.path, 'config')
    config = MmapDict(config_path)
    config['__configuration_time__'] = time.time()
    config['__processor__'] = self.path
    for i in dir(self):
      if _default_module.match(i) is not None:
        continue
      j = getattr(self, i)
      if isinstance(j, (Number, string_types, bool)):
        config[i] = j
    config.flush(save_all=True)
    self.config = {i: j
                   for i, j in config}
    config.close()
    prog.add_notification("Saved configuration at: %s" %
                          ctext(config_path, 'yellow'))
    # ====== final notification ====== #
    prog.add_notification("Closed all dataset.")
    prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
Example #15
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
  """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
  # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
  # add reading data from indices also
  # ====== check input dataset ====== #
  own_dataset = True
  if is_string(dataset) and os.path.isdir(dataset):
    dataset = Dataset(dataset, read_only=True)
  elif isinstance(dataset, Dataset):
    own_dataset = False
  elif isinstance(dataset, FeatureProcessor):
    dataset = Dataset(dataset.path, read_only=True)
  else:
    raise ValueError("Cannot acquire Dataset from input: %s" %
                     str(dataset))
  # ====== extract all feat_name ====== #
  if is_string(feat_name) and feat_name == 'auto':
    feat_name = []
    for k in dataset.keys():
      X = dataset[k]
      if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name.append(k)
  else:
    feat_name = [name
                 for name in as_tuple(feat_name, t=str)
                 if name in dataset]
  # ====== load PCA ====== #
  from odin.ml import MiniBatchPCA
  # init PCA
  nb_samples = 0
  for feat in feat_name:
    nb_samples += dataset[feat].shape[0]
  # ====== prepare MPI PCA ====== #
  add_notification("Selected features for PCA: " +
      ctext(', '.join(feat_name), 'yellow'))

  def map_pca(name):
    X = dataset[name]
    # reuse an existing PCA model unless override is requested
    if 'pca_' + name in dataset and not override:
      pca = dataset['pca_' + name]
    # create new PCA
    else:
      pca = MiniBatchPCA(n_components=None, whiten=False,
                         copy=True, batch_size=None)
    # No shuffling make iter much faster
    for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0):
      pca.partial_fit(x)
      yield x.shape[0]
    # save PCA model
    with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
      cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # finish return feature name
    yield name
  mpi = MPI(jobs=feat_name, func=map_pca,
            ncpu=None, batch=1, hwm=12082518,
            backend='python')
  # ====== running the MPI ====== #
  remain_features = list(feat_name)
  finished_features = []
  prog = Progbar(target=nb_samples, print_summary=True, print_report=True,
                 name='PCA')
  for n in mpi:
    if is_string(n):
      remain_features.remove(n)
      finished_features.append(n)
    else:
      prog['Remain'] = ', '.join(remain_features)
      prog['Finished'] = ', '.join(finished_features)
      prog.add(n)
  # ====== return ====== #
  if own_dataset:
    dataset.close()
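For reference, MiniBatchPCA is fitted here through repeated partial_fit calls over fixed-size batches, the same incremental protocol exposed by scikit-learn's IncrementalPCA. A minimal self-contained sketch of that batched fitting loop (the random data and shapes are hypothetical; this is not the odin Dataset iterator):

import numpy as np
from sklearn.decomposition import IncrementalPCA

# stand-in for one 2-D feature array stored in the dataset
X = np.random.randn(20000, 40).astype('float32')
batch_size = 5218  # same default batch size as calculate_pca above

ipca = IncrementalPCA(n_components=None)
for start in range(0, X.shape[0], batch_size):
  ipca.partial_fit(X[start:start + batch_size])  # no shuffling, as in map_pca

X_proj = ipca.transform(X[:8])  # project a few rows onto the fitted components
print(X_proj.shape)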
Example #16
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('   *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" % str(type(ds)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (included the features in external_indices
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Indices must be contiguous"
      assert prev[2] - prev[1] > 0, "Zero-length segment in indices"
      assert now[2] - now[1] > 0, "Zero-length segment in indices"
    # final length match length of Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
          "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0 # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
        "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data file
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # No NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all value closed to zeros
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name,
               False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Check data incredibility for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name,
                 False)
          fail_test = True
      if not fail_test:
        logger("Check statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(
            ds[feat_name][start:(start + nb_samples)],
            n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Check PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed)  # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples,
                                 replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path, '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistic
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:]
         for name in stat_name
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure save at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
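A minimal usage sketch of validate_features, assuming the function above and its odin dependencies are importable and that the dataset was previously generated by FeatureProcessor (so it contains the required 'config' MmapDict); both paths below are hypothetical:

# hypothetical paths: a processed dataset folder and a report output folder
processed_ds = '/tmp/acoustic_features'
report_dir = '/tmp/acoustic_features_report'

validate_features(ds_or_processor=processed_ds,
                  path=report_dir,
                  nb_samples=8,     # number of random files to plot
                  override=True)    # wipe any previous report at `path`
# log.txt, per-sample PDFs and stats.pdf are written into `report_dir`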
Example #17
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)
    download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMCecc_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== full ====== #
        if subset == 'full':
            raise NotImplementedError
        # ====== ly and my ====== #
        else:
            url = str(
                base64.decodebytes(_URL_LYMPHOID if subset ==
                                   'ly' else _URL_MYELOID), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # ====== extract the data ====== #
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_var']
                X_col = data['X_var_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array(['ly'] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
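A minimal usage sketch of read_PBMCeec; it assumes the DOWNLOAD_DIR/DATA_DIR locations and the encoded download URLs referenced above are available in the environment:

sco = read_PBMCeec(subset='ly', filtered_genes=True, verbose=True)
print(sco)  # SingleCellOMIC with 'transcriptomic' as the main omic,
            # plus 'proteomic' and 'progenitor' attached via add_omic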
Example #18
def read_FACS(n_protein, override=False, verbose=False):
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)

    n_protein = int(n_protein)
    assert n_protein in (2, 5)

    preprocessed_path = _FACS_PREPROCESSED % n_protein
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8') % n_protein
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        urlretrieve(url=url, filename=zip_path)
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
            if verbose:
                print('%-12s' % base_name, ':', data.shape)
        # ====== post-processing ====== #
        X = data_dict['X'].astype('float32')
        X = np.array(X)
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]

        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]

        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"

        # ====== filter zero columns ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)

        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
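A minimal usage sketch of read_FACS; it assumes DOWNLOAD_DIR/DATA_DIR are configured and that the encrypted archive referenced by _URL/_PASSWORD can be downloaded:

ds = read_FACS(n_protein=5, verbose=True)  # n_protein must be 2 or 5
X = ds['X'][:]  # cell-by-gene count matrix
y = ds['y'][:]  # cell-by-protein (ADT) count matrix
print(X.shape, y.shape, len(ds['X_col']), len(ds['y_col']))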