def read_Hemato(override=False, verbose=False):
  preprocessed_path = select_path(os.path.join(DATA_DIR, 'HEMATO_preprocessed'),
                                  create_new=True)
  if override:
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ====== copy the dataset from scVI ====== #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    try:
      from scvi.dataset import HematoDataset
    except ImportError:
      raise RuntimeError("Require `scVI` package for HEMATO dataset")
    gene_dataset = HematoDataset(
        save_path=os.path.join(DOWNLOAD_DIR, 'HEMATO/'))
    X = gene_dataset._X
    gene_names = np.array(gene_dataset.gene_names)
    assert len(gene_names) == X.shape[1]
    y = gene_dataset.meta.values[:, 1:]
    label_names = np.array(gene_dataset.cell_types_levels)
    assert len(label_names) == y.shape[1]
    cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
    _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                       cell_names, verbose)
    # create binary classes for testing
    label_names = np.array(["Erythroblasts", "Granulocytes"])
    min_y = np.min(gene_dataset.labels)
    max_y = np.max(gene_dataset.labels)
    y_val = 2 * (gene_dataset.labels - min_y) / (max_y - min_y) - 1
    y_bin = np.argmax(
        np.hstack((
            gene_dataset.meta.iloc[:, 1].values[:, None],   # Er
            gene_dataset.meta.iloc[:, 2].values[:, None])),  # Gr
        axis=-1)
    with open(os.path.join(preprocessed_path, 'labels_name'), 'wb') as f:
      pickle.dump(label_names, f)
    with open(os.path.join(preprocessed_path, 'labels_bin'), 'wb') as f:
      pickle.dump(y_bin, f)
    with open(os.path.join(preprocessed_path, 'labels_val'), 'wb') as f:
      pickle.dump(y_val, f)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
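# ---------------------------------------------------------------------------
# Minimal usage sketch for `read_Hemato` (added for illustration, not in the
# original module). It relies only on behaviors visible above: the function
# returns an odin `Dataset` and the preprocessing step stores an array under
# the key 'X'.
# ---------------------------------------------------------------------------
def _demo_read_hemato():
  ds = read_Hemato(override=False, verbose=True)
  print(sorted(ds.keys()))  # all stored arrays/objects
  print(ds['X'].shape)      # (n_cells, n_genes)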
def read_PBMC_crossdataset_remove_protein(subset,
                                          return_ecc,
                                          filtered_genes=False,
                                          override=False,
                                          verbose=False,
                                          remove_protein=['CD4', 'CD8']):
  remove_protein = sorted(
      [i.lower() for i in as_tuple(remove_protein, t=string_types)])
  preprocessed_path = os.path.join(
      DATA_DIR,
      'PBMCcross_%s_%s_no%s_preprocessed' %
      ('ecc' if return_ecc else '8k',
       subset + ('' if filtered_genes else 'full'),
       ''.join([i.lower() for i in remove_protein])))
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    ds = read_PBMC_crossdataset_ecc_8k(subset,
                                       return_ecc,
                                       filtered_genes,
                                       override=override,
                                       verbose=verbose)
    X = ds['X'][:]
    X_row = ds['X_row']
    X_col = ds['X_col']
    y = ds['y']
    y_col = ds['y_col']
    remove_ids = [
        i for i, j in enumerate(y_col)
        if standardize_protein_name(j).lower() in remove_protein
    ]
    remain_ids = [i for i in range(len(y_col)) if i not in remove_ids]
    y_col = y_col[remain_ids]
    y = y[:, remain_ids]
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ******************** return ******************** #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
def _read_scvi_dataset(name, clazz_name, override, verbose):
  preprocessed_path = select_path(os.path.join(DATA_DIR,
                                               '%s_preprocessed' % name),
                                  create_new=True)
  if override:
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ====== copy the dataset from scVI ====== #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    try:
      import scvi.dataset as scvi_dataset
    except ImportError:
      raise RuntimeError("Require `scVI` package for PBMC dataset")
    clazz = getattr(scvi_dataset, clazz_name)
    gene_dataset = clazz(save_path=DOWNLOAD_DIR)
    X = gene_dataset._X
    if hasattr(X, 'todense'):
      X = np.array(X.todense())
    gene_names = np.array(gene_dataset.gene_names)
    # convert gene identifier to gene symbol (i.e. name)
    if hasattr(gene_dataset, 'de_metadata'):
      from sisua.data.utils import get_gene_id2name
      meta = gene_dataset.de_metadata
      converter = {i: j for i, j in zip(meta.ENSG, meta.GS)}
      pbmc8kconverter = get_gene_id2name()
      gene_names = np.array([
          pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
          for i in gene_names
      ])
    assert len(gene_names) == X.shape[1]
    label_names = np.array(gene_dataset.cell_types)
    y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
    assert len(label_names) == y.shape[1]
    cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
    _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                       cell_names, verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
def load_parameters(clazz):
  # ====== all path ====== #
  name = clazz.__name__ + '.zip'
  path = os.path.join(base64.decodebytes(Model.ORIGIN).decode(), name)
  param_path = get_datasetpath(name=clazz.__name__, override=False)
  zip_path = os.path.join(Model.BASE_DIR, name)
  # ====== get params files ====== #
  if not os.path.exists(param_path) or \
      len(os.listdir(param_path)) == 0:
    get_file(name, origin=path, outdir=Model.BASE_DIR)
    zf = ZipFile(zip_path, mode='r', compression=ZIP_DEFLATED)
    zf.extractall(path=Model.BASE_DIR)
    zf.close()
    # check if properly unzipped
    if not os.path.exists(param_path) or \
        len(os.listdir(param_path)) == 0:
      raise RuntimeError("Zip file at path:%s was not properly unzipped, "
                         "cannot find downloaded parameters at path: %s" %
                         (zip_path, param_path))
    else:
      os.remove(zip_path)
  # ====== create and return the params dataset ====== #
  ds = Dataset(param_path, read_only=True)
  return ds
def validating_dataset(path):
  if isinstance(path, Dataset):
    ds = path
  elif isinstance(path, string_types):
    ds = Dataset(path, read_only=True)
  assert 'X' in ds, \
      '`X` (n_samples, n_genes) must be stored at path: %s' % ds.path
  assert 'X_col' in ds, \
      '`X_col` (n_genes,) must be stored at path: %s' % ds.path
  assert 'X_row' in ds, \
      '`X_row` (n_samples,) must be stored at path: %s' % ds.path
  if 'y' in ds:
    assert 'y' in ds, \
        '`y` (n_samples, n_protein) must be stored at path: %s' % ds.path
    assert 'y_col' in ds, \
        '`y_col` (n_protein,) must be stored at path: %s' % ds.path
    y, y_col = ds['y'], ds['y_col']
  else:
    y, y_col = None, None
  X, X_col, rowname = ds['X'], ds['X_col'], ds['X_row']
  _check_data(X, X_col, y, y_col, rowname)
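# ---------------------------------------------------------------------------
# Illustration of the layout `validating_dataset` expects (added, not part of
# the original code). The on-disk format is handled by `save_to_dataset`;
# only the key names and shapes checked above are assumed here, and the
# output folder is a hypothetical temporary path.
# ---------------------------------------------------------------------------
def _demo_expected_layout():
  X = np.random.poisson(1.0, size=(100, 50)).astype('float32')  # (n_samples, n_genes)
  X_col = np.array(['Gene%d' % i for i in range(50)])            # (n_genes,)
  X_row = np.array(['Cell%d' % i for i in range(100)])           # (n_samples,)
  y = np.random.rand(100, 10).astype('float32')                  # (n_samples, n_protein)
  y_col = np.array(['Protein%d' % i for i in range(10)])         # (n_protein,)
  save_to_dataset('/tmp/demo_dataset', X, X_col, y, y_col,
                  rowname=X_row, print_log=True)
  validating_dataset('/tmp/demo_dataset')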
def run(self):
  njobs = len(self.jobs)
  dataset = Dataset(self.path)
  if self.n_cache <= 1:
    cache_limit = max(2, int(0.12 * njobs))
  else:
    cache_limit = int(self.n_cache)
  # ====== indices ====== #
  databases = defaultdictkey(
      lambda key: MmapDict(path=os.path.join(dataset.path, key),
                           cache_size=10000,
                           read_only=False))
  last_start = defaultdict(int)
  # ====== statistic ====== #
  # load old statistics
  stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
  for key in dataset.keys():
    if 'sum1' == key[-4:]:
      stats[key[:-4]][0] = dataset[key][:]
    elif 'sum2' == key[-4:]:
      stats[key[:-4]][1] = dataset[key][:]
  # all data are cached then periodically flushed
  cache = defaultdict(list)
  n_processed = [0]  # store the value as reference

  # ====== helper ====== #
  def flush_feature(feat_name, X_cached):
    if len(X_cached) > 0:
      X_cached = np.concatenate(X_cached, 0)
      # flush data
      if feat_name in dataset:
        dataset[feat_name].append(X_cached)
      else:
        dataset[(feat_name, 'memmap')] = X_cached

  # ====== repeated for each result returned ====== #
  def post_processing(result):
    # search for file name
    if self.identifier not in result:
      raise RuntimeError(
          "Cannot find identifier '%s' in returned dictionary" %
          self.identifier)
    file_name = result[self.identifier]
    # invalid file_name
    if not is_string(file_name):
      raise RuntimeError(
          "Cannot find file name in returned features "
          "list, the file name can be specified in key: 'name', 'path' "
          "and the type of the value must be string. All available "
          "keys are: %s" % str(result.keys()))
    # store all new indices
    # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
    all_indices = {}
    # processing
    for feat_name, X in result.items():
      # some invalid feat_name
      if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
        raise RuntimeError(
            "Returned features' name cannot be one "
            "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
      # ignore some feat_name
      if feat_name in ('name',):
        continue
      # if numpy ndarray, save to MmapData
      if isinstance(X, np.ndarray) or \
          'sum1' == feat_name[-4:] or \
          'sum2' == feat_name[-4:]:
        # save statistics instead
        if 'sum1' == feat_name[-4:]:
          stats[feat_name[:-4]][0] += X
        elif 'sum2' == feat_name[-4:]:
          stats[feat_name[:-4]][1] += X
        # save features array
        else:
          all_indices[feat_name] = X.shape[0]
          # cache data, only if we have more than 0 sample
          if X.shape[0] > 0:
            cache[feat_name].append(X)
      # else all other kind of data save to MmapDict
      else:
        databases[feat_name][file_name] = X
      # remove data
      del X
    # ====== update indices ====== #
    if len(all_indices) > 0:
      for feat_name, n in all_indices.items():
        ids_name = 'indices_%s' % feat_name
        databases[ids_name][file_name] = (last_start[ids_name],
                                          last_start[ids_name] + n)
        last_start[ids_name] += n
    # ====== flush cache ====== #
    n_processed[0] += 1
    if n_processed[0] % cache_limit == 0:  # 12 + 8
      for feat_name, X_cached in cache.items():
        flush_feature(feat_name, X_cached)
      cache.clear()
    # ====== update progress ====== #
    return file_name

  # ====== mapping function ====== #
  def _map_func(dat):
    try:
      ret = self.extractor.transform(dat)
    except Exception as e:  # Non-handled exception
      ret = '\n========\n'
      ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
      ret += 'Error : `%s`\n' % str(e)
      ret += 'Input : `%s`\n' % str(dat)
      import traceback
      etype, value, tb = sys.exc_info()
      for line in traceback.TracebackException(
          type(value), value, tb, limit=None).format(chain=True):
        ret += line
    return ret

  # ====== processing ====== #
  mpi = MPI(jobs=self.jobs,
            func=_map_func,
            ncpu=self.n_cpu,
            batch=1,
            hwm=self.n_cpu * 3,
            backend='python')
  # initialize
  prog = Progbar(target=njobs,
                 name=self.path,
                 interval=0.12,
                 print_report=True,
                 print_summary=True)
  start_time = time.time()
  last_time = time.time()
  last_count = 0
  with open(self._log_path, 'w') as flog:
    # writing the log head
    flog.write('============================\n')
    flog.write('Start Time : %s\n' % get_formatted_datetime(only_number=False))
    flog.write('Outpath    : %s\n' % self.path)
    flog.write('Extractor  : %s\n' % '->'.join(
        [s[-1].__class__.__name__ for s in self.extractor.steps]))
    flog.write('#Jobs      : %d\n' % njobs)
    flog.write('#CPU       : %d\n' % self.n_cpu)
    flog.write('#Cache     : %d\n' % cache_limit)
    flog.write('============================\n')
    flog.flush()
    # start processing the file list
    for count, result in enumerate(mpi):
      # Non-handled exception
      if isinstance(result, string_types):
        flog.write(result)
        flog.flush()
        self._error_log.append(result)
        if self.stop_on_failure:
          raise RuntimeError(result)
      # some error might have happened
      elif isinstance(result, ExtractorSignal):
        flog.write(str(result))
        flog.flush()
        if result.action == 'error':
          prog.add_notification(str(result))
          raise RuntimeError("ExtractorSignal requests terminating processor!")
        elif result.action == 'warn':
          prog.add_notification(str(result))
        elif result.action == 'ignore':
          self._error_log.append(result)
        else:
          raise RuntimeError("Unknown action from ExtractorSignal: %s" %
                             result.action)
        prog['File'] = '%-48s' % result.message[:48]
      # otherwise, no error happened, do post-processing
      else:
        name = post_processing(result)
        prog['File'] = '%-48s' % str(name)[:48]
      # update progress
      prog.add(1)
      # manually write to external log file
      if (count + 1) % max(1, int(0.01 * njobs)) == 0:
        curr_time = time.time()
        elap = curr_time - start_time
        avg_speed = (count + 1) / elap
        cur_speed = (count + 1 - last_count) / (curr_time - last_time)
        avg_est = (njobs - count - 1) / avg_speed
        cur_est = (njobs - count - 1) / cur_speed
        flog.write(
            '[%s] Processed: %d(files) Remain: %d(files) Elap.: %.2f(secs)\n'
            '   Avg.Spd: %.2f(obj/sec) Avg.Est.: %.2f(secs)\n'
            '   Cur.Spd: %.2f(obj/sec) Cur.Est.: %.2f(secs)\n' %
            (get_formatted_datetime(only_number=False), count + 1,
             njobs - count - 1, elap, avg_speed, avg_est, cur_speed, cur_est))
        flog.flush()
        last_time = curr_time
        last_count = count + 1
  # ====== end, flush the last time ====== #
  for feat_name, X_cached in cache.items():
    flush_feature(feat_name, X_cached)
  cache.clear()
  cache = None
  dataset.flush()
  prog.add_notification("Flushed all data to disk")
  # ====== saving indices ====== #
  for name, db in databases.items():
    db.flush(save_all=True)
    db_size = len(db)
    db.close()
    prog.add_notification(
        'Flush MmapDict "%s" to disk, size: %s' %
        (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

  # ====== save mean and std ====== #
  def save_mean_std(sum1, sum2, name):
    N = dataset[name.split('_')[0]].shape[0]
    mean = sum1 / N
    std = np.sqrt(sum2 / N - np.power(mean, 2))
    if np.any(np.isnan(mean)):
      wprint('Mean contains NaN, name: %s' % name)
    if np.any(np.isnan(std)):
      wprint('Std contains NaN, name: %s' % name)
    dataset[name + 'sum1'] = sum1
    dataset[name + 'sum2'] = sum2
    dataset[name + 'mean'] = mean
    dataset[name + 'std'] = std

  # save all stats
  if len(stats) > 0:
    for feat_name, (sum1, sum2) in stats.items():
      save_mean_std(sum1, sum2, feat_name)
      prog.add_notification(
          'Saved statistics of: %s, shape: %s' %
          (ctext(feat_name.split('_')[0], 'yellow'),
           ctext(str(sum1.shape), 'yellow')))
  # ====== dataset flush() ====== #
  dataset.flush()
  dataset.close()
  # ====== saving the extractor ====== #
  # not a good idea to save the extractor all the time
  # pipeline_path = os.path.join(dataset.path, 'pipeline')
  # with open(pipeline_path, 'wb') as f:
  #   cPickle.dump(self.extractor, f, protocol=2)
  # prog.add_notification("Saved Extractor pipeline at: %s" %
  #                       ctext(pipeline_path, 'yellow'))
  # ====== saving the configuration ====== #
  config_path = os.path.join(dataset.path, 'config')
  config = MmapDict(config_path)
  config['__configuration_time__'] = time.time()
  config['__processor__'] = self.path
  for i in dir(self):
    if _default_module.match(i) is not None:
      continue
    j = getattr(self, i)
    if isinstance(j, (Number, string_types, bool)):
      config[i] = j
  config.flush(save_all=True)
  self.config = {i: j for i, j in config}
  config.close()
  prog.add_notification("Saved configuration at: %s" %
                        ctext(config_path, 'yellow'))
  # ====== final notification ====== #
  prog.add_notification("Closed all dataset.")
  prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
  """ Using parallel MiniBatchPCA to do PCA for multiple features at once. """
  # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
  # add reading data from indices also
  # ====== check input dataset ====== #
  own_dataset = True
  if is_string(dataset) and os.path.isdir(dataset):
    dataset = Dataset(dataset, read_only=True)
  elif isinstance(dataset, Dataset):
    own_dataset = False
  elif isinstance(dataset, FeatureProcessor):
    dataset = Dataset(dataset.path, read_only=True)
  else:
    raise ValueError("Cannot acquire Dataset from input: %s" % str(dataset))
  # ====== extract all feat_name ====== #
  if is_string(feat_name) and feat_name == 'auto':
    feat_name = []
    for k in dataset.keys():
      X = dataset[k]
      if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name.append(k)
  else:
    feat_name = [name for name in as_tuple(feat_name, t=str)
                 if name in dataset]
  # ====== load PCA ====== #
  from odin.ml import MiniBatchPCA
  # init PCA
  nb_samples = 0
  for feat in feat_name:
    nb_samples += dataset[feat].shape[0]
  # ====== prepare MPI PCA ====== #
  add_notification("Selected features for PCA: " +
                   ctext(', '.join(feat_name), 'yellow'))

  def map_pca(name):
    X = dataset[name]
    # found an existing pca model
    if 'pca_' + name in dataset and not override:
      pca = dataset['pca_' + name]
    # create new PCA
    else:
      pca = MiniBatchPCA(n_components=None,
                         whiten=False,
                         copy=True,
                         batch_size=None)
    # No shuffling makes iteration much faster
    for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0):
      pca.partial_fit(x)
      yield x.shape[0]
    # save PCA model
    with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
      cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # finish, return feature name
    yield name

  mpi = MPI(jobs=feat_name,
            func=map_pca,
            ncpu=None,
            batch=1,
            hwm=12082518,
            backend='python')
  # ====== running the MPI ====== #
  remain_features = list(feat_name)
  finished_features = []
  prog = Progbar(target=nb_samples,
                 print_summary=True,
                 print_report=True,
                 name='PCA')
  for n in mpi:
    if is_string(n):
      remain_features.remove(n)
      finished_features.append(n)
    else:
      prog['Remain'] = ', '.join(remain_features)
      prog['Finished'] = ', '.join(finished_features)
      prog.add(n)
  # ====== return ====== #
  if own_dataset:
    dataset.close()
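# ---------------------------------------------------------------------------
# Usage sketch for `calculate_pca` (added, not in the original module): fit
# MiniBatchPCA over every 2-D feature found in a processed dataset folder.
# The dataset path is hypothetical.
# ---------------------------------------------------------------------------
def _demo_calculate_pca():
  calculate_pca('/path/to/processed_dataset',
                feat_name='auto',
                batch_size=5218,
                override=False)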
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext(' *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))

  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" %
                     str(type(ds_or_processor)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Discontinuous indices"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # final length must match length of the Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
            'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
            'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
            isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
              "Length of indices and actual data mismatch, " + \
              ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0  # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
            "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices),
                   interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data files
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # no NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all values close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name, False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Checked data integrity for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name, False)
          fail_test = True
      if not fail_test:
        logger("Checked statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                          n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Checked PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed)  # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples,
                                 replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat,
                       feat_name=feat_name,
                       file_name=file_name,
                       ds=ds,
                       path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path, '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistics
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure saved at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
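# ---------------------------------------------------------------------------
# Usage sketch for `validate_features` (added, not in the original module):
# check a processed dataset and write figures plus a log to a report folder.
# Both paths are hypothetical.
# ---------------------------------------------------------------------------
def _demo_validate_features():
  validate_features('/path/to/processed_dataset',
                    path='/tmp/validation_report',
                    nb_samples=25,
                    override=True)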
def read_centenarian(override=False, verbose=False):
  r""" Data used in:
    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells
    in supercentenarians" | bioRxiv [WWW Document], n.d.
    URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).
  """
  download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    labels = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[2])),
        url=_URL[2],
    )
    data = []
    with gzip.open(labels, mode='rb') as f:
      for line in f:
        line = str(line, 'utf-8').strip().split('\t')
        assert line[1][:2] == line[2]
        data.append(line)
    labels = np.array(data)
    y_col = sorted(set(labels[:, 1]))
    y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                len(y_col)).astype('float32')
    y_col = np.array(y_col)
    # raw UMI counts
    raw = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[0])),
        url=_URL[0],
    )
    if verbose:
      print("Unzip and reading raw UMI ...")
    X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
    # log-normalized UMI counts
    norm = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[1])),
        url=_URL[1],
    )
    if verbose:
      print("Unzip and reading log-norm UMI ...")
    X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
    # check matching cell and gene identifiers
    assert np.all(cell_id1 == cell_id2) and \
        np.all(labels[:, 0] == cell_id1) and \
        np.all(gene_id1 == gene_id2)
    assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
        X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
    # save to disk
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X=X_raw,
                    X_col=gene_id1,
                    y=y,
                    y_col=y_col,
                    rowname=cell_id1,
                    print_log=verbose)
    with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                         shape=(0, X_norm.shape[1]),
                         dtype='float32',
                         remove_exist=True) as f:
      for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
        f.write(X_norm[s:e])
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
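# ---------------------------------------------------------------------------
# Usage sketch for `read_centenarian` (added, not in the original module).
# It assumes the log-normalized matrix written above is exposed by the
# returned `Dataset` under the key 'X_log'.
# ---------------------------------------------------------------------------
def _demo_read_centenarian():
  ds = read_centenarian(verbose=True)
  print(ds['X'].shape)      # raw UMI counts
  print(ds['X_log'].shape)  # log-normalized UMI counts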
def read_full_FACS(override=False, verbose=False):
  """ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE75478

  This is the full FACS data of 2 individuals with 7 protein markers
  """
  download_path = os.path.join(DOWNLOAD_DIR, "FACS_full")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  # ====== download the data ====== #
  file_url = [
      ('GSE75478_transcriptomics_facs_indeces_filtered_I1.csv.gz',
       'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI1%2Ecsv%2Egz'),
      ('GSE75478_transcriptomics_facs_indeces_filtered_I2.csv.gz',
       'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI2%2Ecsv%2Egz'),
      ('GSE75478_transcriptomics_raw_filtered_I1.csv.gz',
       'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI1%2Ecsv%2Egz'),
      ('GSE75478_transcriptomics_raw_filtered_I2.csv.gz',
       'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI2%2Ecsv%2Egz'),
  ]
  for name, url in file_url:
    filename = os.path.join(download_path, name)
    if not os.path.exists(filename):
      if verbose:
        print(f"Downloading file '{filename}' ...")
      urlretrieve(url=url, filename=filename)
  # ====== extract the data ====== #
  preprocessed_path = _FACS_PREPROCESSED % 7
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  elif override:
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    data_map = {}
    for name, _ in file_url:
      zip_path = os.path.join(download_path, name)
      with gzip.open(zip_path, 'rb') as f:
        data_map[name.split('.')[0]] = np.array(
            [str(line, 'utf-8').strip().split(',') for line in f]).T
    i1 = data_map['GSE75478_transcriptomics_raw_filtered_I1']
    f1 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I1']
    i2 = data_map['GSE75478_transcriptomics_raw_filtered_I2']
    f2 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I2']
    # matching duplicated rows in `i` and `f`
    row_name = set(i1[1:, 0]) & set(f1[1:, 0])
    i1 = i1[[True] + [True if i in row_name else False for i in i1[1:, 0]], :]
    f1 = f1[[True] + [True if i in row_name else False for i in f1[1:, 0]], :]
    assert np.all(i1[:, 0] == f1[:, 0])
    row_name = set(i2[1:, 0]) & set(f2[1:, 0])
    i2 = i2[[True] + [True if i in row_name else False for i in i2[1:, 0]], :]
    f2 = f2[[True] + [True if i in row_name else False for i in f2[1:, 0]], :]
    assert np.all(i2[:, 0] == f2[:, 0])
    # matching the genes and proteins among individuals
    gene_name = set(i1[0][1:]) & set(i2[0][1:])
    i1 = i1[:, [True] + [True if i in gene_name else False for i in i1[0][1:]]]
    i2 = i2[:, [True] + [True if i in gene_name else False for i in i2[0][1:]]]
    assert np.all(i1[0] == i2[0])
    gene = np.concatenate((i1, i2[1:]), axis=0)
    prot_name = set(
        [i for i in set(f1[0][1:]) & set(f2[0][1:]) if '_cd' in i])
    prot_name = sorted(prot_name)
    f1 = f1[:, [0] + [f1[0].tolist().index(i) for i in prot_name]]
    f2 = f2[:, [0] + [f2[0].tolist().index(i) for i in prot_name]]
    assert np.all(f1[0] == f2[0])
    prot = np.concatenate((f1, f2[1:]), axis=0)
    # ====== save data to disk ====== #
    X = gene[1:, 1:].astype('float32')
    X_row = gene[1:, 0]
    X_col = gene[0, 1:]
    X_col = np.array([i.replace('"', '') for i in X_col])
    y = prot[1:, 1:].astype('float32')
    y_row = prot[1:, 0]
    y_col = np.array(
        [i.replace('"', '').split('_')[-1].upper() for i in prot[0, 1:]])
    assert np.all(X_row == y_row)
    X_row = np.array([i.replace('"', '') for i in X_row])
    # ====== the protein markers can be smaller than zero ====== #
    min_values = np.min(y, axis=0, keepdims=True)
    min_values = np.where(min_values > 0, 0, min_values)
    y = y + np.abs(min_values)
    # ====== filter zero columns ====== #
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)
    save_to_dataset(path=preprocessed_path,
                    X=X,
                    X_col=X_col,
                    y=y,
                    y_col=y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ******************** read preprocessed data ******************** #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
  subset = str(subset).strip().lower()
  if subset not in ('ly', 'my', 'full'):
    raise ValueError(
        "subset can only be 'ly'-lymphoid, 'my'-myeloid or 'full'")
  # prepare the paths
  download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = os.path.join(
      DATA_DIR,
      f"PBMC8k_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed")
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if len(os.listdir(preprocessed_path)) == 0:
    # ====== pbmc 8k ====== #
    if subset == 'full':
      ly = read_PBMC8k('ly', filtered_genes=filtered_genes, return_arrays=True)
      my = read_PBMC8k('my', filtered_genes=filtered_genes, return_arrays=True)
      url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      # load data
      data = np.load(path)
      X = data['X']
      X_row = data['X_row']
      X_col = data['X_col'].tolist()
      y = data['y']
      y_col = data['y_col'].tolist()
      # merge all genes from the 'my' and 'ly' subsets
      all_genes = set(ly['X_col'].tolist() + my['X_col'].tolist())
      all_genes = sorted([X_col.index(i) for i in all_genes])
      # same for the proteins
      all_proteins = set(ly['y_col'].tolist() + my['y_col'].tolist())
      all_proteins = sorted([y_col.index(i) for i in all_proteins])
      #
      X = X[:, all_genes]
      y = y[:, all_proteins]
      X_col = np.array(X_col)[all_genes]
      y_col = np.array(y_col)[all_proteins]
      cell_types = np.array(['ly' if i in ly['X_row'] else 'my' for i in X_row])
    # ====== pbmc ly and my ====== #
    else:
      url = str(
          base64.decodebytes(_URL_LYMPHOID if subset == 'ly' else _URL_MYELOID),
          'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      # extract the data
      data = np.load(path)
      X_row = data['X_row']
      y = data['y']
      y_col = data['y_col']
      if filtered_genes:
        X = data['X_filt']
        X_col = data['X_filt_col']
      else:
        X = data['X_full']
        X_col = data['X_full_col']
      cell_types = np.array([subset] * X.shape[0])
    # ====== save everything ====== #
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)
    assert X.shape == (len(X_row), len(X_col))
    assert len(X) == len(y)
    assert y.shape[1] == len(y_col)
    with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
      pickle.dump(cell_types, f)
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ******************** read preprocessed data ******************** #
  ds = Dataset(preprocessed_path, read_only=True)
  if return_arrays:
    return ds
  sco = SingleCellOMIC(X=ds['X'],
                       cell_id=ds['X_row'],
                       gene_id=ds['X_col'],
                       omic='transcriptomic',
                       name=f"8k{subset}{'' if filtered_genes else 'all'}")
  sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
  progenitor = ds['cell_types']
  sco.add_omic(
      'progenitor',
      X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                 dtype=np.float32),
      var_names=np.array(['myeloid', 'lymphoid']),
  )
  return sco
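# ---------------------------------------------------------------------------
# Usage sketch for `read_PBMC8k` (added, not in the original module): the
# reader returns a `SingleCellOMIC` (AnnData-like) object with transcriptomic
# counts in `.X` and the proteomic/progenitor omics attached via `add_omic`.
# ---------------------------------------------------------------------------
def _demo_read_pbmc8k():
  sco = read_PBMC8k(subset='full', filtered_genes=True)
  print(sco)
  print(sco.X.shape)        # (n_cells, n_genes)
  print(sco.var_names[:5])  # gene identifiers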
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
  download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = _CITEseq_CBMC_PREPROCESSED
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  elif override:
    if verbose:
      print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
    shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
    os.mkdir(_CITEseq_CBMC_PREPROCESSED)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    url = str(base64.decodebytes(_URL), 'utf-8')
    base_name = os.path.basename(url)
    zip_path = os.path.join(download_path, base_name)
    download_file(filename=zip_path,
                  url=url,
                  override=False,
                  md5=r"beb76d01a67707c61c21bfb188e1b69f")
    # ====== extract the data ====== #
    data_dict = {}
    for name, data in crypto.unzip_aes(zip_path,
                                       password=_PASSWORD,
                                       verbose=False):
      base_name = os.path.splitext(name)[0]
      if '.npz' in name:
        data = sp.sparse.load_npz(BytesIO(data)).todense()
      elif '.csv' in name:
        data = np.loadtxt(StringIO(str(data, 'utf-8')),
                          dtype=str,
                          delimiter=',')
      else:
        raise RuntimeError("Unknown format: %s" % name)
      data_dict[base_name] = data
    # ====== post-processing ====== #
    X = np.array(data_dict['X'].astype('float32'))
    X_row, X_col = data_dict['X_row'], data_dict['X_col']
    X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
    assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
    y = data_dict['y'].astype('float32')
    y_row, y_col = data_dict['y_row'], data_dict['y_col']
    assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
    assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
    sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
    sc.pp.filter_cells(sco, min_genes=200)
    sc.pp.filter_genes(sco, min_cells=3)
    sc.pp.normalize_total(sco, target_sum=1e4)
    result = sc.pp.filter_genes_dispersion(sco.X,
                                           min_mean=0.0125,
                                           max_mean=3,
                                           min_disp=0.5,
                                           log=False,
                                           n_top_genes=2000)
    sco._inplace_subset_var(result.gene_subset)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
      pickle.dump(set(sco.var_names.values), f)
    del sco
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  sco = SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
  if filtered_genes:
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
      top_genes = pickle.load(f)
    sco._inplace_subset_var([i in top_genes for i in sco.var_names])
  return sco
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  download_path = os.path.join(
      DOWNLOAD_DIR,
      "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  preprocessed_path = (_5000_PBMC_PREPROCESSED
                       if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    download_files = {}
    for url, md5 in zip(
        [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
        [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
      url = str(base64.decodebytes(url), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      download_files[base_name] = (path, md5)
    # ====== extract the data ====== #
    n = set()
    for name, (path, md5) in sorted(download_files.items()):
      if verbose:
        print(f"Extracting {name} ...")
      binary_data = decrypt_aes(path, password=_PASSWORD)
      md5_ = md5_checksum(binary_data)
      assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
      with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
        for name in f.namelist():
          data = str(f.read(name), 'utf8')
          for line in data.split('\n'):
            if len(line) == 0:
              continue
            line = line.strip().split(',')
            n.add(len(line))
            if 'Protein' in name:
              y.append(line)
            else:
              X.append(line)
    # ====== post-processing ====== #
    assert len(n) == 1, \
        "Number of samples inconsistent between raw count and protein count"
    if verbose:
      print("Processing gene count ...")
    X = np.array(X).T
    X_row, X_col = X[1:, 0], X[0, 1:]
    X = X[1:, 1:].astype('float32')
    # ====== filter mouse genes ====== #
    human_cols = [True if "HUMAN_" in i else False for i in X_col]
    if verbose:
      print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
    X = X[:, human_cols]
    X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)
    # ====== protein ====== #
    if verbose:
      print("Processing protein count ...")
    y = np.array(y).T
    y_row, y_col = y[1:, 0], y[0, 1:]
    y = y[1:, 1:].astype('float32')
    assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
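# ---------------------------------------------------------------------------
# Usage sketch for `read_CITEseq_PBMC` (added, not in the original module):
# load the CITE-seq PBMC data with the filtered 5000-gene subset; protein
# counts are attached as the 'proteomic' omic above.
# ---------------------------------------------------------------------------
def _demo_read_citeseq_pbmc():
  sco = read_CITEseq_PBMC(filtered_genes=True, verbose=True)
  print(sco.X.shape)  # (n_cells, n_genes)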
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
  subset = str(subset).strip().lower()
  if subset not in ('ly', 'my', 'full'):
    raise ValueError(
        "subset can only be 'ly'-lymphoid, 'my'-myeloid or 'full'")
  if subset in ('my', 'full'):
    raise NotImplementedError("No support for subset: %s - PBMCecc" % subset)
  download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = os.path.join(
      DATA_DIR,
      f"PBMCecc_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed")
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Overriding preprocessed data at path {preprocessed_path}")
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    # ====== full ====== #
    if subset == 'full':
      raise NotImplementedError
    # ====== ly and my ====== #
    else:
      url = str(
          base64.decodebytes(_URL_LYMPHOID if subset == 'ly' else _URL_MYELOID),
          'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      # ====== extract the data ====== #
      data = np.load(path)
      X_row = data['X_row']
      y = data['y']
      y_col = data['y_col']
      if filtered_genes:
        X = data['X_var']
        X_col = data['X_var_col']
      else:
        X = data['X_full']
        X_col = data['X_full_col']
      cell_types = np.array(['ly'] * X.shape[0])
    # ====== save everything ====== #
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)
    assert X.shape == (len(X_row), len(X_col))
    assert len(X) == len(y)
    assert y.shape[1] == len(y_col)
    with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
      pickle.dump(cell_types, f)
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ******************** read preprocessed data ******************** #
  ds = Dataset(preprocessed_path, read_only=True)
  sco = SingleCellOMIC(X=ds['X'],
                       cell_id=ds['X_row'],
                       gene_id=ds['X_col'],
                       omic='transcriptomic',
                       name=f"ecc{subset}{'' if filtered_genes else 'all'}")
  sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
  progenitor = ds['cell_types']
  sco.add_omic(
      'progenitor',
      X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                 dtype=np.float32),
      var_names=np.array(['myeloid', 'lymphoid']),
  )
  return sco
def read_FACS(n_protein, override=False, verbose=False):
  download_path = os.path.join(DOWNLOAD_DIR, "FACS_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  n_protein = int(n_protein)
  assert n_protein in (2, 5)
  preprocessed_path = _FACS_PREPROCESSED % n_protein
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  elif override:
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    url = str(base64.decodebytes(_URL), 'utf-8') % n_protein
    base_name = os.path.basename(url)
    zip_path = os.path.join(download_path, base_name)
    urlretrieve(url=url, filename=zip_path)
    # ====== extract the data ====== #
    data_dict = {}
    for name, data in crypto.unzip_aes(zip_path,
                                       password=_PASSWORD,
                                       verbose=False):
      base_name = os.path.splitext(name)[0]
      if '.npz' in name:
        data = sp.sparse.load_npz(BytesIO(data)).todense()
      elif '.csv' in name:
        data = np.loadtxt(StringIO(str(data, 'utf-8')),
                          dtype=str,
                          delimiter=',')
      else:
        raise RuntimeError("Unknown format: %s" % name)
      data_dict[base_name] = data
      if verbose:
        print('%-12s' % base_name, ':', data.shape)
    # ====== post-processing ====== #
    X = data_dict['X'].astype('float32')
    X = np.array(X)
    X_row, X_col = data_dict['X_row'], data_dict['X_col']
    assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
    y = data_dict['y'].astype('float32')
    y_row, y_col = data_dict['y_row'], data_dict['y_col']
    assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
    assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
    # ====== filter zero columns ====== #
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)
    save_to_dataset(path=preprocessed_path,
                    X=X,
                    X_col=X_col,
                    y=y,
                    y_col=y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ******************** read preprocessed data ******************** #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
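# ---------------------------------------------------------------------------
# Usage sketch for `read_FACS` (added, not in the original module):
# `n_protein` must be 2 or 5 (asserted above); the 7-protein variant is
# produced by `read_full_FACS`.
# ---------------------------------------------------------------------------
def _demo_read_facs():
  ds = read_FACS(n_protein=5, verbose=True)
  print(ds['X'].shape, ds['y'].shape)  # gene counts, protein markers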