def __init__(self,
             algorithm='count',
             vocab_size: int = 2000,
             min_frequency: int = 2,
             max_frequency: float = 0.95,
             max_length: int = 500,
             cache_path: str = "~/nlp_data/newsgroup20",
             **kwargs):
  # NOTE: the keyword was originally misspelled 'categorices'; corrected here
  categories = kwargs.pop('categories', None)
  super().__init__(algorithm=algorithm,
                   vocab_size=vocab_size,
                   min_frequency=min_frequency,
                   max_frequency=max_frequency,
                   max_length=max_length,
                   cache_path=cache_path,
                   **kwargs)
  kw = dict(shuffle=True,
            random_state=1,
            categories=categories,
            remove=('headers', 'footers', 'quotes'))
  data = fetch_20newsgroups(subset='train', return_X_y=False, **kw)
  X_train, y_train = data.data, data.target
  labels_name = data.target_names
  self.X_test, y_test = fetch_20newsgroups(subset='test', return_X_y=True, **kw)
  self.X_train, self.X_valid, y_train, y_valid = train_test_split(
      X_train, y_train, test_size=0.2, shuffle=True, random_state=0)
  self._labels = np.array(labels_name)
  self.y_train = one_hot(y_train, len(self._labels))
  self.y_valid = one_hot(y_valid, len(self._labels))
  self.y_test = one_hot(y_test, len(self._labels))
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=None,
                   partition='train',
                   inc_labels=False,
                   seed=1) -> tf.data.Dataset:
  r""" Arguments:
    partition : {'train', 'valid', 'test'}
    inc_labels : a Boolean or Scalar. If True, return both image and label,
      otherwise, only image is returned.
      If a scalar is provided, it indicates the percentage of labelled data
      in the mask.

  Return :
    tensorflow.data.Dataset :
      image - `(tf.float32, (None, 32, 32, 3))`
      label - `(tf.float32, (None, 10))`
      mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
    where, `mask=1` means labelled data, and `mask=0` for unlabelled data
  """
  X, y = get_partition(partition,
                       train=(self.X_train, self.y_train),
                       valid=(self.X_valid, self.y_valid),
                       test=(self.X_test, self.y_test))
  inc_labels = float(inc_labels)
  gen = tf.random.experimental.Generator.from_seed(seed=seed)
  assert X.shape[0] == y.shape[0]
  # stored as flat (N, 3072) vectors: reshape to NCHW, then transpose to NHWC
  X = np.reshape(X, (-1, 3, 32, 32))
  X = np.transpose(X, (0, 2, 3, 1))
  y = one_hot(y, self.n_labels)

  def _process(*data):
    image = tf.cast(data[0], tf.float32)
    image = self.normalize_255(image)
    if inc_labels:
      label = tf.cast(data[1], tf.float32)
      if 0. < inc_labels < 1.:  # semi-supervised mask
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=(image, label), mask=mask)
      return image, label
    return image

  ds = tf.data.Dataset.from_tensor_slices(X)
  if inc_labels > 0.:
    ds = tf.data.Dataset.zip((ds, tf.data.Dataset.from_tensor_slices(y)))
  ds = ds.map(_process, parallel)
  if cache is not None:
    ds = ds.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    ds = ds.shuffle(int(shuffle))
  ds = ds.batch(batch_size, drop_remainder)
  if prefetch is not None:
    ds = ds.prefetch(prefetch)
  return ds
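# A minimal usage sketch (illustrative only): the enclosing class name and
# constructor are assumptions, but the batch structure follows directly from
# `_process` above.
#
#   ds = CIFAR10()  # hypothetical dataset class exposing create_dataset()
#   train = ds.create_dataset(partition='train', inc_labels=0.5)
#   for batch in train.take(1):
#     (image, label), mask = batch['inputs'], batch['mask']
#     # image: (64, 32, 32, 3) float32, label: (64, 10), mask: (64, 1) bool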
def sample_prior(self, n: int = 1, seed: int = 1, **kwargs) -> tf.Tensor:
  classes = self.classes
  # tile the labels: sample index modulo the number of classes, then one-hot
  y = np.concatenate([one_hot(np.mod(np.arange(n), i), i) for i in classes],
                     -1)
  z = super(M2VAE, self).sample_prior(n=n, seed=seed, **kwargs)
  z.qy_x = y
  return z
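# Worked example of the label tiling above: with n=4 samples and a single
# factor of 3 classes, np.mod(np.arange(4), 3) gives [0, 1, 2, 0], so the
# one-hot prior labels cycle through every class:
#   one_hot([0, 1, 2, 0], 3) -> [[1, 0, 0],
#                                [0, 1, 0],
#                                [0, 0, 1],
#                                [1, 0, 0]]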
def _load_scale_dataset(path, dsname):
  url = str(
      base64.decodebytes(
          b'aHR0cHM6Ly9haS1kYXRhc2V0cy5zMy5hbWF6b25hd3MuY29tL3NjYWxlX2RhdGFzZXRzLnppcA==\n'
      ), 'utf-8')
  md5 = r"5fc7c52108220e30a04f033e355716c0"
  path = os.path.abspath(os.path.expanduser(path))
  if not os.path.exists(path):
    os.makedirs(path)
  filename = os.path.basename(url)
  filepath = os.path.join(path, filename)
  # download
  if not os.path.exists(filepath):
    print(f"Downloading {url} ...")
    urlretrieve(url, filename=filepath)
  # extract
  zip_path = os.path.join(path, 'scale_datasets')
  if not os.path.exists(zip_path):
    with zipfile.ZipFile(filepath, "r") as f:
      f.extractall(path)
  # load
  cell = np.load(os.path.join(zip_path, f"{dsname}_cell"))
  labels = np.load(os.path.join(zip_path, f"{dsname}_labels"))
  peak = np.load(os.path.join(zip_path, f"{dsname}_peak"))
  x = sparse.load_npz(os.path.join(zip_path, f"{dsname}_x"))
  ids = {key: i for i, key in enumerate(sorted(set(labels)))}
  labels = one_hot(np.array([ids[i] for i in labels]), len(ids))
  return x, labels, peak, np.array(list(ids.keys()))
def _preprocess_xy(x, y, nb_classes):
  if x.ndim > 2:
    x = np.reshape(x, newshape=(x.shape[0], -1))
  if y is not None:
    if y.ndim == 1 and nb_classes > 2:
      y = one_hot(y, nb_classes=nb_classes)
    return x, y
  return x
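# Illustrative call (shapes assumed for the example): flatten 8x8 inputs and
# one-hot encode a 3-class target.
#   x, y = _preprocess_xy(np.zeros((10, 8, 8)), np.arange(10) % 3, nb_classes=3)
#   # x.shape == (10, 64), y.shape == (10, 3)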
def process(self, name, X, *args):
  outputs = []
  for transcription in args:
    if isinstance(transcription, str):
      # whitespace-separated integer indices -> one-hot matrix
      transcription = [i for i in transcription.split(' ') if len(i) > 0]
      transcription = [int(i) for i in transcription]
      transcription = one_hot(transcription, n_classes=self._n_classes)
    outputs.append(transcription)
  return (name, X) + tuple(outputs)
def __init__(self, path: str = "~/tensorflow_datasets/melanoma_atac"):
  path = os.path.abspath(os.path.expanduser(path))
  if not os.path.exists(path):
    os.makedirs(path)
  ### download data
  data = {}
  for url in _URL:
    fname = os.path.basename(url)
    fpath = os.path.join(path, fname)
    if not os.path.exists(fpath):
      print(f"Downloading file: {fname} ...")
      urlretrieve(url, filename=fpath)
    data[fname.split(".")[0]] = fpath
  ### load data
  try:
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
    robjects.r['options'](warn=-1)
    robjects.r("library(Matrix)")
    pandas2ri.activate()
  except ImportError:
    raise ImportError("Require package 'rpy2' for reading Rdata file.")
  loaded_data = {}
  for k, v in data.items():
    robjects.r['load'](v)
    x = robjects.r[k]
    if k == "counts_mel":
      with localconverter(robjects.default_converter + pandas2ri.converter):
        # dgCMatrix
        x = sparse.csr_matrix((x.slots["x"], x.slots["i"], x.slots["p"]),
                              shape=tuple(robjects.r("dim")(x))[::-1],
                              dtype=np.float32)
    else:
      x = robjects.conversion.rpy2py(x)
    loaded_data[k] = x
  ### post-processing
  x = loaded_data['counts_mel']
  labels = []
  for i, j in zip(loaded_data["cellData_mel"]['cellLine'],
                  loaded_data["cellData_mel"]['LineType']):
    labels.append(i + '_' + j.split("-")[0])
  labels = np.array(labels)
  labels_name = {name: i for i, name in enumerate(sorted(set(labels)))}
  labels = one_hot(np.array([labels_name[i] for i in labels]),
                   len(labels_name))
  ### assign the data
  self.x = x
  self.y = labels
  self.xvar = np.array([f"Region{i + 1}" for i in range(x.shape[1])])
  self.yvar = np.array(list(labels_name.keys()))
def process(self, name, X):
  data_idx = axis_normalize(axis=self.data_idx,
                            ndim=len(X),
                            return_tuple=True)
  X_new = []
  for idx, x in enumerate(X):
    # transform into one-hot encoded labels
    if idx in data_idx:
      x = np.array(x, dtype='int32')
      x = one_hot(x, nb_classes=self.nb_classes)
    X_new.append(x)
  return name, X_new
def read_scale_dataset(dsname="leukemia",
                       filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
  r""" Datasets provided by (Xiong et al. 2019), four datasets are supported:
    - 'breast_tumor'
    - 'forebrain'
    - 'leukemia'
    - 'insilico'

  Reference:
    Xiong, L. et al. SCALE method for single-cell ATAC-seq analysis
    via latent feature extraction. Nat Commun 10, 4576 (2019).
  """
  datasets = {'breast_tumor', 'forebrain', 'leukemia', 'insilico'}
  assert dsname in datasets, \
    f"Cannot find dataset with name {dsname}, available datasets are: {datasets}"
  download_path = os.path.join(DOWNLOAD_DIR, "scale_dataset")
  preprocessed_path = os.path.join(DATA_DIR, "scale_preprocessed")
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### Download data
  url = str(base64.decodebytes(_URL), 'utf-8')
  path = os.path.join(download_path, os.path.basename(url))
  download_file(url, path, override=False, md5=_MD5)
  ### extract the data
  if len(os.listdir(preprocessed_path)) == 0:
    with zipfile.ZipFile(path, "r") as f:
      for info in f.filelist:
        name = os.path.basename(info.filename)
        if len(name) == 0:  # skip directory entries
          continue
        with open(os.path.join(preprocessed_path, name), 'wb') as fout:
          fout.write(f.read(info))
  ### load the data
  cell = np.load(os.path.join(preprocessed_path, f"{dsname}_cell"))
  labels = np.load(os.path.join(preprocessed_path, f"{dsname}_labels"))
  peak = np.load(os.path.join(preprocessed_path, f"{dsname}_peak"))
  x = sparse.load_npz(os.path.join(preprocessed_path, f"{dsname}_x"))
  sco = SingleCellOMIC(X=x,
                       cell_id=cell,
                       gene_id=peak,
                       omic=OMIC.atac,
                       name=dsname)
  ids = {key: i for i, key in enumerate(sorted(set(labels)))}
  sco.add_omic(OMIC.celltype,
               X=one_hot(np.array([ids[i] for i in labels]), len(ids)),
               var_names=list(ids.keys()))
  return sco
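# A minimal usage sketch (illustrative only; the printed summary depends on
# the downloaded archive):
#   sco = read_scale_dataset(dsname="forebrain")
#   print(sco)  # SingleCellOMIC with ATAC counts and one-hot cell types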
def _read_scvi_dataset(name, clazz_name, override, verbose):
  preprocessed_path = select_path(os.path.join(DATA_DIR,
                                               '%s_preprocessed' % name),
                                  create_new=True)
  if override:
    shutil.rmtree(preprocessed_path)
    os.mkdir(preprocessed_path)
  # ====== copy the dataset from scVI ====== #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    try:
      import scvi.dataset as scvi_dataset
    except ImportError:
      raise RuntimeError("Require `scVI` package for PBMC dataset")
    clazz = getattr(scvi_dataset, clazz_name)
    gene_dataset = clazz(save_path=DOWNLOAD_DIR)
    X = gene_dataset._X
    if hasattr(X, 'todense'):
      X = np.array(X.todense())
    gene_names = np.array(gene_dataset.gene_names)
    # convert gene identifier to gene symbol (i.e. name)
    if hasattr(gene_dataset, 'de_metadata'):
      from sisua.data.utils import get_gene_id2name
      meta = gene_dataset.de_metadata
      converter = {i: j for i, j in zip(meta.ENSG, meta.GS)}
      pbmc8kconverter = get_gene_id2name()
      gene_names = np.array([
          pbmc8kconverter[i] if i in pbmc8kconverter else converter[i]
          for i in gene_names
      ])
    assert len(gene_names) == X.shape[1]
    label_names = np.array(gene_dataset.cell_types)
    y = one_hot(gene_dataset.labels.ravel(), nb_classes=len(label_names))
    assert len(label_names) == y.shape[1]
    cell_names = np.array(['Cell#%d' % i for i in range(X.shape[0])])
    _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                       cell_names, verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
def get_data():
  """ Generator yielding mini-batches of size 128. """
  batch = []
  batch_trans = []
  for name, start, end in indices:
    start = int(start)
    end = int(end)
    data = ds['mspec'][start:end]
    # per-utterance then global normalization
    data = (data - data.mean(0)) / data.std(0)
    data = (data - mean) / std
    # stack each 21-frame window into a single feature vector
    data = np.vstack([
        data[i:i + 21].reshape(1, -1)
        for i in range(0, data.shape[0], 21)
        if i + 21 < data.shape[0]
    ])
    trans = transcription[name]
    trans = np.array([int(i) for i in trans.split(' ') if len(i) > 0])
    # label of the center frame (index 11) of each 21-frame window
    trans = np.vstack([
        trans[i + 11].reshape(1, -1)
        for i in range(0, trans.shape[0], 21)
        if i + 21 < trans.shape[0]
    ])
    batch.append(data)
    batch_trans.append(trans)
    if len(batch) == cache:
      batch = np.vstack(batch)
      trans = one_hot(np.vstack(batch_trans).ravel(), 10)
      idx = np.random.permutation(batch.shape[0])
      batch = batch[idx]
      trans = trans[idx]
      i = 0
      while i < batch.shape[0]:
        start = i
        end = i + 128
        yield batch[start:end], trans[start:end]
        i = end
      batch = []
      batch_trans = []
def read_melanoma_cisTopicData(filtered_genes=True, override=False,
                               verbose=True):
  r""" melanoma ATAC data from (Bravo González-Blas, et al. 2019)

  References:
    Bravo González-Blas, C. et al. cisTopic: cis-regulatory topic modeling
    on single-cell ATAC-seq data. Nat Methods 16, 397–400 (2019).
    Verfaillie, A. et al. Decoding the regulatory landscape of melanoma
    reveals TEADS as regulators of the invasive cell state.
    Nat Commun 6, (2015).
  """
  download_dir = os.path.join(DOWNLOAD_DIR, 'cistopic')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'cistopic_preprocessed')
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### downloading the data
  data = {}
  for url in _URL:
    fname = os.path.basename(url)
    fpath = os.path.join(download_dir, fname)
    if not os.path.exists(fpath):
      if verbose:
        print(f"Downloading file: {fname} ...")
      urlretrieve(url, filename=fpath)
    data[fname.split(".")[0]] = fpath
  ### preprocess data
  if len(os.listdir(preprocessed_path)) == 0:
    try:
      import rpy2.robjects as robjects
      from rpy2.robjects import pandas2ri
      from rpy2.robjects.conversion import localconverter
      robjects.r['options'](warn=-1)
      robjects.r("library(Matrix)")
      pandas2ri.activate()
    except ImportError:
      raise ImportError("Require package 'rpy2' for reading Rdata file.")
    for k, v in data.items():
      robjects.r['load'](v)
      x = robjects.r[k]
      outpath = os.path.join(preprocessed_path, k)
      if k == "counts_mel":
        with localconverter(robjects.default_converter + pandas2ri.converter):
          # dgCMatrix
          x = sparse.csr_matrix((x.slots["x"], x.slots["i"], x.slots["p"]),
                                shape=tuple(robjects.r("dim")(x))[::-1],
                                dtype=np.float32)
      else:
        x = robjects.conversion.rpy2py(x)
      with open(outpath, "wb") as f:
        pickle.dump(x, f)
      if verbose:
        print(f"Loaded file: {k} - {type(x)} - {x.shape}")
    pandas2ri.deactivate()
  ### load data
  data = {}
  for name in os.listdir(preprocessed_path):
    with open(os.path.join(preprocessed_path, name), 'rb') as f:
      data[name] = pickle.load(f)
  ### sco
  # print(data["dm3_CtxRegions"])
  x = data['counts_mel']
  sco = SingleCellOMIC(X=x,
                       cell_id=data["cellData_mel"].index,
                       gene_id=[f"Region{i + 1}" for i in range(x.shape[1])],
                       omic=OMIC.atac)
  # celltype
  labels = []
  for i, j in zip(data["cellData_mel"]['cellLine'],
                  data["cellData_mel"]['LineType']):
    labels.append(i + '_' + j.split("-")[0])
  labels = np.array(labels)
  labels_name = {name: i for i, name in enumerate(sorted(set(labels)))}
  labels = np.array([labels_name[i] for i in labels])
  sco.add_omic(OMIC.celltype, one_hot(labels, len(labels_name)),
               list(labels_name.keys()))
  return sco
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=None,
                   partition='train',
                   inc_labels=False,
                   seed=1) -> tf.data.Dataset:
  r""" Arguments:
    partition : {'train', 'valid', 'test'}
    inc_labels : a Boolean or Scalar. If True, return both document and label,
      otherwise, only document is returned.
      If a scalar is provided, it indicates the percentage of labelled data
      in the mask.

  Return :
    tensorflow.data.Dataset :
      document - `(tf.float32, (None, vocabulary_size))`
      label    - `(tf.float32, (None, n_labels))`
      mask     - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
    where, `mask=1` means labelled data, and `mask=0` for unlabelled data
  """
  inc_labels = float(inc_labels)
  gen = tf.random.experimental.Generator.from_seed(seed=seed)
  x = self.transform(partition)
  y = get_partition(partition,
                    train=self.train_labels,
                    valid=self.valid_labels,
                    test=self.test_labels)
  # remove empty docs
  indices = np.array(np.sum(x, axis=-1) > 0).ravel()
  x = x[indices]
  if len(y) > 0:
    y = y[indices]
  # convert to one-hot
  if inc_labels > 0 and len(y) > 0 and y.ndim == 1:
    y = one_hot(y, self.n_labels)

  def _process(*data):
    data = tuple([
        tf.cast(
            tf.sparse.to_dense(i) if isinstance(i, tf.SparseTensor) else i,
            tf.float32) for i in data
    ])
    if inc_labels:
      if 0. < inc_labels < 1.:  # semi-supervised mask
        mask = gen.uniform(shape=(1,)) < inc_labels
        return dict(inputs=tuple(data), mask=mask)
      return data
    return data[0]

  # prepare the sparse matrices
  if isinstance(x, spmatrix):
    x = tf.SparseTensor(indices=sorted(zip(*x.nonzero())),
                        values=x.data,
                        dense_shape=x.shape)
  ds = tf.data.Dataset.from_tensor_slices(x)
  if inc_labels > 0:
    if isinstance(y, spmatrix):
      y = tf.SparseTensor(indices=sorted(zip(*y.nonzero())),
                          values=y.data,
                          dense_shape=y.shape)
    y = tf.data.Dataset.from_tensor_slices(y)
    ds = tf.data.Dataset.zip((ds, y))
  # configure the dataset
  ds = ds.map(_process, parallel)
  if cache is not None:
    ds = ds.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    ds = ds.shuffle(int(shuffle))
  ds = ds.batch(batch_size, drop_remainder)
  if prefetch is not None:
    ds = ds.prefetch(prefetch)
  return ds
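# Note on the sparse conversion above: `tf.SparseTensor` requires indices in
# row-major order, hence `sorted(zip(*x.nonzero()))`. The same conversion on
# a toy matrix (a sketch, illustrative only):
#   from scipy import sparse as sp
#   m = sp.random(4, 6, density=0.3, format='csr')
#   st = tf.SparseTensor(indices=sorted(zip(*m.nonzero())),
#                        values=m.data, dense_shape=m.shape)
#   dense = tf.sparse.to_dense(st)  # (4, 6) dense tensor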
def read_centenarian(override=False, verbose=False):
  r""" Data used in:
    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells
    in supercentenarians" | bioRxiv [WWW Document], n.d.
    URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).
  """
  download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    labels = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[2])),
        url=_URL[2],
    )
    data = []
    with gzip.open(labels, mode='rb') as f:
      for line in f:
        line = str(line, 'utf-8').strip().split('\t')
        assert line[1][:2] == line[2]
        data.append(line)
    labels = np.array(data)
    y_col = sorted(set(labels[:, 1]))
    y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                len(y_col)).astype('float32')
    y_col = np.array(y_col)
    # raw UMI counts
    raw = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[0])),
        url=_URL[0],
    )
    if verbose:
      print("Unzip and reading raw UMI ...")
    X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
    # log-normalized UMI counts
    norm = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[1])),
        url=_URL[1],
    )
    if verbose:
      print("Unzip and reading log-norm UMI ...")
    X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
    # sanity checks: same cells and genes across both matrices
    assert np.all(cell_id1 == cell_id2) and \
        np.all(labels[:, 0] == cell_id1) and \
        np.all(gene_id1 == gene_id2)
    assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
        X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
    # save to disk
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X=X_raw,
                    X_col=gene_id1,
                    y=y,
                    y_col=y_col,
                    rowname=cell_id1,
                    print_log=verbose)
    with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                         shape=(0, X_norm.shape[1]),
                         dtype='float32',
                         remove_exist=True) as f:
      for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
        f.write(X_norm[s:e])
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds
def streamline_classifier(Z_train,
                          y_train,
                          Z_test,
                          y_test,
                          labels_name,
                          mode='ovr',
                          title='',
                          plot_train_results=False,
                          show_plot=True,
                          return_figure=False):
  r""" Arguments:
    fig : Figure or tuple (`float`, `float`), optional (default=`None`)
      width, height in inches

  Returns:
    (results_train, results_test), (fig_train, fig_test)
    results is a dictionary of scores
    {
      F1micro=f1_micro * 100,
      F1macro=f1_macro * 100,
      F1weight=f1_weight * 100,
      F1_[classname]=...
    }
  """
  mode = mode.strip().lower()
  assert mode in ('ovr', 'ovo'), \
    "Only support ovr - one vs rest, ovo - one vs one; mode for streamline classifier"
  labels_name = [standardize_protein_name(i) for i in labels_name]
  results_train = {}
  results_test = {}
  labels_name = np.array(labels_name)
  with catch_warnings_ignore(FutureWarning):
    with catch_warnings_ignore(RuntimeWarning):
      n_classes = len(labels_name)
      # ====== preprocessing ====== #
      if y_train.ndim == 1 or y_train.shape[1] == 1:
        y_train = one_hot(y_train.ravel(), nb_classes=n_classes)
      if y_test.ndim == 1 or y_test.shape[1] == 1:
        y_test = one_hot(y_test.ravel(), nb_classes=n_classes)
      is_binary_classes = sorted(np.unique(
          y_train.astype('float32'))) == [0., 1.]
      # ====== not binary classes ====== #
      if not is_binary_classes:
        gmm = ProbabilisticEmbedding()
        gmm.fit(np.concatenate((y_train, y_test), axis=0))
        y_train = gmm.predict(y_train)
        y_test = gmm.predict(y_test)
      # kernel : 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
      if mode == 'ovr':
        classifier = OneVsRestClassifier(SVC(
            kernel='linear', random_state=UNIVERSAL_RANDOM_SEED),
                                         n_jobs=n_classes)
        classifier.fit(X=Z_train, y=y_train)
      else:
        # 'ovo' would use SVC(kernel='linear', decision_function_shape='ovo'),
        # but it is not supported yet (the original code raised before the
        # dead classifier-fitting lines, which are removed here).
        raise NotImplementedError("mode='ovo' is not supported")
      # ====== return ====== #
      from sklearn.exceptions import UndefinedMetricWarning
      with catch_warnings_ignore(UndefinedMetricWarning):
        results_train = plot_evaluate_classifier(
            y_pred=classifier.predict(Z_train),
            y_true=y_train,
            labels=labels_name,
            title='[train]' + title,
            show_plot=show_plot and plot_train_results,
            return_figure=True)
        results_test = plot_evaluate_classifier(
            y_pred=classifier.predict(Z_test),
            y_true=y_test,
            labels=labels_name,
            title='[test]' + title,
            show_plot=show_plot,
            return_figure=True)
      if show_plot:
        if plot_train_results:
          results_train, fig_train = results_train[0], results_train[1]
        else:
          fig_train = None
        results_test, fig_test = results_test[0], results_test[1]
      results_train = OrderedDict(
          sorted(results_train.items(), key=lambda x: x[0]))
      results_test = OrderedDict(
          sorted(results_test.items(), key=lambda x: x[0]))
  results = (results_train, results_test)
  if show_plot and return_figure:
    return results, (fig_train, fig_test)
  return results
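# A minimal usage sketch (synthetic data, illustrative only):
#   Z = np.random.randn(100, 8)        # latent codes
#   y = np.random.randint(0, 3, 100)   # integer labels, one-hot'ed inside
#   train_scores, test_scores = streamline_classifier(
#       Z[:80], y[:80], Z[80:], y[80:],
#       labels_name=['cd4', 'cd8', 'cd45'], show_plot=False)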
                   order='word',
                   engine='odin')
tk.fit(texts, vocabulary=None)
# pickle requires a binary file handle ('wb', not 'w')
cPickle.dump(tk, open(tokenizer_path, 'wb'),
             protocol=cPickle.HIGHEST_PROTOCOL)
print('========== Summary ==========')
for i, j in tk.summary.items():
  print(i, ':', j)
# ===========================================================================
# Build dataset
# ===========================================================================
X = tk.transform(texts,
                 mode='seq',
                 maxlen=MAX_SEQ_LEN,
                 end_document=None,
                 token_not_found='ignore')
y = [labels_set.index(i) for i in labels]
y = one_hot(np.array(y, dtype='int32'), nb_classes=nb_labels)
n = X.shape[0]
np.random.seed(1208)
idx = np.random.permutation(n)
X = X[idx]
y = y[idx]
X_train = X[:int(0.8 * n)]
y_train = y[:int(0.8 * n)]
X_valid = X[int(0.8 * n):]
y_valid = y[int(0.8 * n):]
print('X:', X.shape, 'y:', y.shape)
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_valid:', X_valid.shape, 'y_valid:', y_valid.shape)
MODEL_PATH = utils.get_modelpath(name='cifar10_%s' % MODEL_NAME,
                                 override=True)
LOG_PATH = utils.get_logpath(name='cifar10_%s.log' % MODEL_NAME,
                             override=True)
stdio(LOG_PATH)
# ===========================================================================
# Some handmade constants
# ===========================================================================
NB_EPOCH = 10
LEARNING_RATE = 0.001
# ===========================================================================
# Load dataset
# ===========================================================================
ds = F.CIFAR10.get_dataset()
nb_labels = 10
print(ds)
X_train = ds['X_train'][:].astype('float32') / 255.
y_train = one_hot(ds['y_train'][:], nb_classes=nb_labels)
X_test = ds['X_test'][:].astype('float32') / 255.
y_test = one_hot(ds['y_test'][:], nb_classes=nb_labels)
# ===========================================================================
# Create network
# ===========================================================================
inputs = [
    K.placeholder(shape=(None,) + X_train.shape[1:], name='X', dtype='float32'),
    K.placeholder(shape=(None, nb_labels), name='y', dtype='float32')
]
print("Inputs:", inputs)
model = N.Lambda.search(MODEL_NAME, prefix='models_cifar')
outputs = model(*inputs)
# ====== create losses ====== #
ce = tf.losses.softmax_cross_entropy(inputs[-1], outputs['logit'])
acc = K.metrics.categorical_accuracy(outputs['prob'], inputs[-1])
cm = K.metrics.confusion_matrix(y_pred=outputs['prob'], y_true=inputs[-1],
def plot_evaluate_classifier(y_pred,
                             y_true,
                             labels,
                             title,
                             show_plot=True,
                             return_figure=False):
  r""" Arguments:
    fig : Figure or tuple (`float`, `float`), optional (default=`None`)
      width, height in inches

  Returns:
    Return a dictionary of scores
    {
      F1micro=f1_micro * 100,
      F1macro=f1_macro * 100,
      F1weight=f1_weight * 100,
      F1_[classname]=...
    }
  """
  from matplotlib import pyplot as plt
  fontsize = 12
  num_classes = len(labels)
  nrow = int(np.ceil(num_classes / 5))
  ncol = int(np.ceil(num_classes / nrow))
  if y_pred.ndim == 1:
    y_pred = one_hot(y_pred, nb_classes=num_classes)
  if y_true.ndim == 1:
    y_true = one_hot(y_true, nb_classes=num_classes)
  if show_plot:
    fig = plot_figure(nrow=4 * nrow + 3, ncol=4 * ncol)
  f1_classes = []
  for i, (name, pred, true) in enumerate(zip(labels, y_pred.T, y_true.T)):
    f1_classes.append(f1_score(true, pred))
    if show_plot:
      plot_confusion_matrix(confusion_matrix(y_true=true, y_pred=pred),
                            labels=[0, 1],
                            fontsize=fontsize,
                            ax=(nrow, ncol, i + 1),
                            title=name + '\n')
  f1_micro = f1_score(y_true=y_true.ravel(), y_pred=y_pred.ravel())
  f1_macro = np.mean(f1_classes)
  f1_weight = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
  if show_plot:
    plt.suptitle('%s\nF1-micro:%.2f  F1-macro:%.2f  F1-weight:%.2f' %
                 (title, f1_micro * 100, f1_macro * 100, f1_weight * 100),
                 fontsize=fontsize + 6)
    plt.tight_layout(rect=[0, 0.04, 1, 0.96])
  results = dict(
      F1micro=f1_micro * 100,
      F1macro=f1_macro * 100,
      F1weight=f1_weight * 100,
  )
  for name, f1 in zip(labels, f1_classes):
    results['F1_' + name] = f1 * 100
  if show_plot and return_figure:
    return results, fig
  return results
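# A minimal usage sketch (synthetic predictions, illustrative only):
#   scores = plot_evaluate_classifier(
#       y_pred=np.random.randint(0, 3, 50),
#       y_true=np.random.randint(0, 3, 50),
#       labels=['A', 'B', 'C'], title='demo', show_plot=False)
#   # scores == {'F1micro': ..., 'F1macro': ..., 'F1weight': ..., 'F1_A': ...}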
def create_dataset(self,
                   batch_size=64,
                   image_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=tf.data.experimental.AUTOTUNE,
                   partition='train',
                   inc_labels=True,
                   seed=1) -> tf.data.Dataset:
  r""" Arguments:
    partition : {'train', 'valid', 'test', 'unlabelled'}
    inc_labels : a Boolean or Scalar. If True, return both image and label,
      otherwise, only image is returned.
      If a scalar is provided, it indicates the percentage of labelled data
      in the mask.

  Return :
    tensorflow.data.Dataset :
      image - `(tf.float32, (None, 64, 64, 3))`
      label - `(tf.float32, (None, 10))`
      mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
    where, `mask=1` means labelled data, and `mask=0` for unlabelled data
  """
  if isinstance(image_size, Number) and image_size == 96:
    image_size = None  # original STL-10 resolution, no resize needed
  ### select partition
  images_path, labels_path = _partition(
      partition,
      train=(self.bin_files['train_X'], self.bin_files['train_y']),
      test=(self.bin_files['test_X'], self.bin_files['test_y']),
      unlabeled=(self.bin_files['unlabeled_X'], None),
  )
  X = np.reshape(np.fromfile(images_path, dtype=np.uint8),
                 (-1,) + STL10.IMAGE_SHAPE)  # fixed typo: SLT10 -> STL10
  if labels_path is None:  # unlabelled data
    inc_labels = False
  inc_labels = float(inc_labels)
  gen = tf.random.experimental.Generator.from_seed(seed=seed)
  if inc_labels:
    y = np.fromfile(labels_path, dtype=np.uint8) - 1
    y = one_hot(y, len(self.class_names)).astype(np.float32)

  ### read and resize the data
  def resize(img):
    img = tf.clip_by_value(tf.cast(img, tf.float32) / 255., 1e-6, 1. - 1e-6)
    img = tf.transpose(img, perm=(1, 2, 0))
    if image_size is not None:
      img = tf.image.resize(img, (image_size, image_size),
                            preserve_aspect_ratio=False,
                            antialias=False)
    return img

  def masking(image, label):
    mask = gen.uniform(shape=(1,)) < inc_labels
    return dict(inputs=(image, label), mask=mask)

  ### processing
  images = tf.data.Dataset.from_tensor_slices(X).map(resize, parallel)
  if inc_labels:
    labels = tf.data.Dataset.from_tensor_slices(y)
    images = tf.data.Dataset.zip((images, labels))
    if 0. < inc_labels < 1.:  # semi-supervised mask
      images = images.map(masking)
  # cache data
  if cache is not None:
    images = images.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    images = images.shuffle(int(shuffle))
  images = images.batch(batch_size, drop_remainder)
  if prefetch is not None:
    images = images.prefetch(prefetch)
  return images
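# A minimal usage sketch (the enclosing class is assumed to be the STL-10
# dataset wrapper; constructor arguments are illustrative):
#   ds = STL10()
#   test = ds.create_dataset(partition='test', image_size=64, inc_labels=1.0)
#   for image, label in test.take(1):
#     pass  # image: (64, 64, 64, 3) in [0, 1], label: (64, 10)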
def read_human_embryos(filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
  r""" Transcriptional map of human embryo development, including the
  sequenced transcriptomes of 1529 individual cells from 88 human
  preimplantation embryos. These data show that cells undergo an intermediate
  state of co-expression of lineage-specific genes, followed by a concurrent
  establishment of the trophectoderm, epiblast, and primitive endoderm
  lineages, which coincide with blastocyst formation.

  References:
    Petropoulos S, Edsgärd D, Reinius B, et al. Single-Cell RNA-Seq Reveals
    Lineage and X Chromosome Dynamics in Human Preimplantation Embryos.
    Cell. 2016 Sep

  Note:
    Gene expression levels (RefSeq annotations) were estimated in terms of
    reads per kilobase exon model and per million mapped reads (RPKM) using
    rpkmforgenes.
    Genes were filtered, keeping 15633/26178 genes that
      * were expressed in at least 5 out of 1919 sequenced cells
        (RPKM >= 10), and
      * for which cells with expression came from at least two different
        embryos.
    Cells were quality-filtered based on 4 criteria, keeping 1529/1919 cells.
      * First, Spearman correlations, using the RPKM expression levels of all
        genes, for every possible pair of cells were calculated and a
        histogram of the maximum correlation obtained for each cell,
        corresponding to the most similar cell, was used to identify 305
        outlier cells with a maximum pair-wise correlation below 0.63.
      * Second, a histogram of the number of expressed genes per cell was
        used to identify 330 outlier cells with less than 5000 expressed
        genes.
      * Third, a histogram of the total transcriptional expression output
        from the sex chromosomes (RPKM sum) was used to identify 33 cells
        with indeterminable sex, or a called sex that was inconsistent with
        other cells of that embryo.
      * Fourth, 13 outlier cells were identified using PCA and t-SNE
        dimensionality reduction.
  """
  download_dir = os.path.join(DOWNLOAD_DIR, 'human_embryos')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'human_embryos_preprocessed')
  # guard rmtree: only remove the directory if it actually exists
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Override preprocessed data at {preprocessed_path}")
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### download data
  files = []
  for url, md5 in zip(_URLs, _MD5s):
    path = download_file(url=url,
                         filename=os.path.join(download_dir,
                                               os.path.basename(url)),
                         override=False,
                         md5=md5)
    files.append(path)
  ### preprocessing
  if len(os.listdir(preprocessed_path)) == 0:
    data_map = {}
    for f in files:
      zipname = os.path.basename(f)
      with zipfile.ZipFile(f, mode="r") as f:
        for dat_file in f.filelist:
          filename = dat_file.filename
          dat = str(f.read(filename), 'utf-8')
          x = []
          for line in dat.split('\n'):
            if len(line) == 0:
              continue
            line = line.split('\t')
            x.append(line)
          x = np.asarray(x).T
          row_name = x[1:, 0]
          col_name = x[0, 1:]
          x = x[1:, 1:].astype(np.float32)
          x = sparse.coo_matrix(x)
          data_map[filename] = (x, row_name, col_name)
          print(f"Read: {zipname} - {filename}")
          print(f" * Matrix: {x.shape}")
          print(f" * Row   : {row_name.shape}-{row_name[:3]}")
          print(f" * Col   : {col_name.shape}-{col_name[:3]}")
    # save loaded data to disk
    for name, (x, row, col) in data_map.items():
      with open(os.path.join(preprocessed_path, f"{name}:x"), "wb") as f:
        sparse.save_npz(f, x)
      with open(os.path.join(preprocessed_path, f"{name}:row"), "wb") as f:
        np.save(f, row)
      with open(os.path.join(preprocessed_path, f"{name}:col"), "wb") as f:
        np.save(f, col)
    del data_map
  ### read the data
  # counts.txt      (1529, 26178)
  # ercc.counts.txt (1529, 92)
  # rpkm.txt        (1529, 26178)
  # ercc.rpkm.txt   (1529, 92)
  data = {}
  genes_path = os.path.join(preprocessed_path, "filtered_genes")
  for path in os.listdir(preprocessed_path):
    if path == os.path.basename(genes_path):
      continue
    name, ftype = os.path.basename(path).split(':')
    with open(os.path.join(preprocessed_path, path), 'rb') as f:
      if ftype == 'x':
        x = sparse.load_npz(f).tocsr()
      else:
        x = np.load(f)
    data[f"{name}_{ftype}"] = x
  rpkm = data['rpkm.txt_x']
  counts = data['counts.txt_x']
  genes = data['counts.txt_col']
  cells = data['counts.txt_row']
  ### filter genes
  if not os.path.exists(genes_path):
    # filter genes by rpkm
    ids = np.asarray(np.sum(rpkm, axis=0) >= 10).ravel()
    rpkm = rpkm[:, ids]
    counts = counts[:, ids]
    genes = genes[ids]
    # filter genes by min 5 cells
    ids = np.asarray(np.sum(counts > 0, axis=0) >= 5).ravel()
    rpkm = rpkm[:, ids]
    counts = counts[:, ids]
    genes = genes[ids]
    # filter highly variable genes
    sco = SingleCellOMIC(X=counts, cell_id=cells, gene_id=genes)
    sco.normalize(omic=OMIC.transcriptomic, log1p=True)
    sco.filter_highly_variable_genes(n_top_genes=2000)
    filtered = sco.var_names.to_numpy()
    with open(genes_path, 'wb') as f:
      pickle.dump([genes, filtered], f)
    del sco
  else:
    with open(genes_path, 'rb') as f:
      ids, filtered = pickle.load(f)
    ids = set(ids)
    ids = np.asarray([i in ids for i in genes])
    rpkm = rpkm[:, ids]
    counts = counts[:, ids]
    genes = genes[ids]
  # last filtering
  if filtered_genes:
    filtered = set(filtered)
    ids = np.asarray([i in filtered for i in genes])
    rpkm = rpkm[:, ids]
    counts = counts[:, ids]
    genes = genes[ids]
  ### create the SingleCellOMIC
  sco = SingleCellOMIC(X=counts,
                       cell_id=cells,
                       gene_id=genes,
                       omic=OMIC.transcriptomic,
                       name="HumanEmbryos")
  sco.add_omic(omic=OMIC.rpkm, X=rpkm, var_names=genes)
  labels = ['.'.join(i.split('.')[:-2]) for i in sco.obs_names]
  labels = ['E7' if i == 'E7.4' else i for i in labels]
  labels_name = {j: i for i, j in enumerate(sorted(set(labels)))}
  labels = np.array([labels_name[i] for i in labels])
  sco.add_omic(omic=OMIC.celltype,
               X=one_hot(labels, len(labels_name)),
               var_names=list(labels_name.keys()))
  sco.add_omic(omic=OMIC.ercc,
               X=data['ercc.counts.txt_x'],
               var_names=data['ercc.counts.txt_col'])
  return sco
def __init__(self, path='~/tensorflow_datasets/mnist'):
  path = os.path.abspath(os.path.expanduser(path))
  save_path = os.path.join(path, 'mnist.npz')
  if not os.path.exists(path):
    os.makedirs(path)
  assert os.path.isdir(path)
  ## check for an existing processed file
  all_data = None
  if os.path.exists(save_path):
    if not os.path.isfile(save_path):
      raise ValueError("path to %s must be a file" % save_path)
    if md5_checksum(save_path) != MNIST.MD5:
      print("MD5 mismatch, removing file at:", save_path)
      os.remove(save_path)
    else:
      all_data = np.load(save_path)
  ## download and extract
  if all_data is None:
    from tqdm import tqdm

    def dl_progress(count, block_size, total_size):
      kB = block_size * count / 1024.
      prog.update(kB - prog.n)

    # big-endian uint32 reader for the IDX file format
    read32 = lambda b: np.frombuffer(
        b, dtype=np.dtype(np.uint32).newbyteorder('>'))[0]
    all_data = {}
    for name, url in MNIST.URL.items():
      basename = os.path.basename(url)
      zip_path = os.path.join(path, basename)
      prog = tqdm(desc="Downloading %s" % basename, unit='kB')
      urlretrieve(url, zip_path, dl_progress)
      prog.clear()
      prog.close()
      with gzip.open(zip_path, "rb") as f:
        magic = read32(f.read(4))
        if magic not in (2051, 2049):
          raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                           (magic, zip_path))
        n = read32(f.read(4))
        if 'X_' in name:  # images
          rows = read32(f.read(4))
          cols = read32(f.read(4))
          buf = f.read(rows * cols * n)
          data = np.frombuffer(buf, dtype=np.uint8)
          data = data.reshape(n, rows, cols, 1)
        else:  # labels
          buf = f.read(n)
          data = np.frombuffer(buf, dtype=np.uint8)
          data = one_hot(data, 10)
        all_data[name] = data
    np.savez_compressed(save_path, **all_data)
  ## split into train, valid, test
  rand = np.random.RandomState(seed=1)
  ids = rand.permutation(all_data['X_train'].shape[0])
  X_train = all_data['X_train'][ids]
  y_train = all_data['y_train'][ids]
  X_valid = X_train[:5000]
  y_valid = y_train[:5000]
  X_train = X_train[5000:]
  y_train = y_train[5000:]
  X_test = all_data['X_test']
  y_test = all_data['y_test']
  to_ds = lambda images, labels: tf.data.Dataset.zip(
      (tf.data.Dataset.from_tensor_slices(images),
       tf.data.Dataset.from_tensor_slices(labels)))
  self.train = to_ds(X_train, y_train)
  self.valid = to_ds(X_valid, y_valid)
  self.test = to_ds(X_test, y_test)
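# A minimal usage sketch (illustrative): the class stores three zipped
# `tf.data.Dataset`s of (image, one-hot label) pairs.
#   mnist = MNIST()  # assuming the enclosing class is named MNIST
#   for image, label in mnist.train.batch(32).take(1):
#     pass  # image: (32, 28, 28, 1) uint8, label: (32, 10)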
def evaluate(y_true,
             y_pred_proba=None,
             y_pred_log_proba=None,
             labels=None,
             title='',
             path=None,
             xlims=None,
             ylims=None,
             print_log=True):
  from odin.backend import to_llr
  from odin.backend.metrics import (det_curve, compute_EER, roc_curve,
                                    compute_Cavg, compute_Cnorm,
                                    compute_minDCF)

  def format_score(s):
    return ctext('%.4f' % s if is_number(s) else s, 'yellow')

  nb_classes = None
  # ====== check y_pred ====== #
  if y_pred_proba is None and y_pred_log_proba is None:
    raise ValueError("At least one of `y_pred_proba` or `y_pred_log_proba` "
                     "must not be None")
  y_pred_llr = to_llr(y_pred_proba) if y_pred_log_proba is None \
      else to_llr(y_pred_log_proba)
  nb_classes = y_pred_llr.shape[1]
  y_pred = np.argmax(y_pred_llr, axis=-1)
  # ====== check y_true ====== #
  if isinstance(y_true, Data):
    y_true = y_true.array
  if isinstance(y_true, (tuple, list)):
    y_true = np.array(y_true)
  if y_true.ndim == 2:  # convert one-hot to labels
    y_true = np.argmax(y_true, axis=-1)
  # ====== check labels ====== #
  if labels is None:
    labels = [str(i) for i in range(nb_classes)]
  # ====== scoring ====== #
  if y_pred_proba is None:
    ll = 'unknown'
  else:
    ll = log_loss(y_true=y_true, y_pred=y_pred_proba)
  acc = accuracy_score(y_true=y_true, y_pred=y_pred)
  cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
  # C_norm
  cnorm, cnorm_arr = compute_Cnorm(y_true=y_true,
                                   y_score=y_pred_llr,
                                   Ptrue=[0.1, 0.5],
                                   probability_input=False)
  if y_pred_log_proba is not None:
    cnorm_, cnorm_arr_ = compute_Cnorm(y_true=y_true,
                                       y_score=y_pred_log_proba,
                                       Ptrue=[0.1, 0.5],
                                       probability_input=False)
    if np.mean(cnorm) > np.mean(cnorm_):  # smaller is better
      cnorm, cnorm_arr = cnorm_, cnorm_arr_
  # DET
  Pfa, Pmiss = det_curve(y_true=y_true, y_score=y_pred_llr)
  eer = compute_EER(Pfa=Pfa, Pmiss=Pmiss)
  minDCF = compute_minDCF(Pfa, Pmiss)[0]
  # PRINT LOG
  if print_log:
    print(ctext("--------", 'red'), ctext(title, 'cyan'))
    print("Log loss :", format_score(ll))
    print("Accuracy :", format_score(acc))
    print("C_norm   :", format_score(np.mean(cnorm)))
    print("EER      :", format_score(eer))
    print("minDCF   :", format_score(minDCF))
    print(print_confusion(arr=cm, labels=labels))
  # ====== save report to PDF files if necessary ====== #
  if path is not None:
    if y_pred_proba is None:
      y_pred_proba = y_pred_llr
    from matplotlib import pyplot as plt
    plt.figure(figsize=(nb_classes, nb_classes + 1))
    plot_confusion_matrix(cm, labels)
    # Cavg
    plt.figure(figsize=(nb_classes + 1, 3))
    plot_Cnorm(cnorm=cnorm_arr, labels=labels, Ptrue=[0.1, 0.5], fontsize=14)
    # binary classification
    if nb_classes == 2 and \
        (y_pred_proba.ndim == 1 or
         (y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 1)):
      fpr, tpr = roc_curve(y_true=y_true, y_score=y_pred_proba.ravel())
      # det curve
      plt.figure()
      plot_detection_curve(Pfa, Pmiss, curve='det', xlims=xlims, ylims=ylims,
                           linewidth=1.2)
      # roc curve
      plt.figure()
      plot_detection_curve(fpr, tpr, curve='roc')
    # multiclasses
    else:
      y_true = one_hot(y_true, nb_classes=nb_classes)
      fpr_micro, tpr_micro, _ = roc_curve(y_true=y_true.ravel(),
                                          y_score=y_pred_proba.ravel())
      Pfa_micro, Pmiss_micro = Pfa, Pmiss
      fpr, tpr = [], []
      Pfa, Pmiss = [], []
      for i, yi in enumerate(y_true.T):
        curve = roc_curve(y_true=yi, y_score=y_pred_proba[:, i])
        fpr.append(curve[0])
        tpr.append(curve[1])
        curve = det_curve(y_true=yi, y_score=y_pred_llr[:, i])
        Pfa.append(curve[0])
        Pmiss.append(curve[1])
      plt.figure()
      plot_detection_curve(fpr_micro, tpr_micro, curve='roc',
                           linewidth=1.2, title="ROC Micro")
      plt.figure()
      plot_detection_curve(fpr, tpr, curve='roc', labels=labels,
                           linewidth=1.0, title="ROC for each class")
      plt.figure()
      plot_detection_curve(Pfa_micro, Pmiss_micro, curve='det',
                           xlims=xlims, ylims=ylims, linewidth=1.2,
                           title="DET Micro")
      plt.figure()
      plot_detection_curve(Pfa, Pmiss, curve='det', xlims=xlims, ylims=ylims,
                           labels=labels, linewidth=1.0,
                           title="DET for each class")
    plot_save(path)
def read_mouse_ATLAS(filtered_genes=True,
                     override=False,
                     verbose=True) -> SingleCellOMIC:
  r""" sci-ATAC-seq, to profile genome-wide chromatin accessibility in
  ∼100,000 single cells from 13 adult mouse tissues:

    - The regulatory landscape of adult mouse tissues mapped by single-cell
      chromatin assay
    - Characterization of 85 distinct chromatin patterns across 13 different
      tissues
    - Annotation of key regulators and regulatory sequences in diverse
      mammalian cell types
    - Dataset allows resolution of cell types underlying common human traits
      and diseases

  References:
    Cusanovich, D. A. et al. A Single-Cell Atlas of In Vivo Mammalian
    Chromatin Accessibility. Cell 174, 1309-1324.e18 (2018).
    Link https://atlas.gs.washington.edu/mouse-atac/
  """
  download_path = os.path.join(DOWNLOAD_DIR, "mouse_atac")
  preprocessed_path = os.path.join(DATA_DIR, "mouse_atac_preprocessed")
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### Download data
  files = {}
  for name, (url, md5) in _URLs.items():
    filepath = os.path.join(download_path, os.path.basename(url))
    files[name] = download_file(url, filepath, override=False, md5=md5)
  ### save counts matrix
  path = os.path.join(preprocessed_path, 'counts')
  if not os.path.exists(path):
    print("Reading counts matrix ...")
    counts = mmread(files['counts'])
    counts: sparse.coo_matrix
    counts = counts.astype(np.uint8)  # fixed typo: np.unit8 -> np.uint8
    with open(path, 'wb') as f:
      sparse.save_npz(f, counts, compressed=False)
  ### save metadata
  path = os.path.join(preprocessed_path, 'metadata')
  if not os.path.exists(path):
    with open(files['cellids'], 'r') as f:
      cell = np.array([i for i in f.read().split('\n') if len(i) > 0])
    with open(files['peakids'], 'r') as f:
      peak = np.array([i for i in f.read().split('\n') if len(i) > 0])
    metadata = pd.read_csv(files['metadata'], sep="\t")
    assert metadata.shape[0] == len(cell)
    tissue = metadata['tissue'].to_numpy()
    celltype = metadata['cell_label'].to_numpy()
    with open(path, 'wb') as f:
      np.savez(f, cell=cell, peak=peak, tissue=tissue, celltype=celltype)
  ### Read all data and create SCO
  counts = sparse.csr_matrix(
      sparse.load_npz(os.path.join(preprocessed_path, 'counts')))
  metadata = np.load(os.path.join(preprocessed_path, 'metadata'),
                     allow_pickle=True)
  cell = metadata['cell']
  peak = metadata['peak']
  tissue = metadata['tissue']
  celltype = metadata['celltype']
  # need to transpose here, counts matrix is [peaks, cells]
  sco = SingleCellOMIC(X=counts.T,
                       cell_id=cell,
                       gene_id=peak,
                       omic=OMIC.atac,
                       name="mouse_atlas")
  # add celltype
  labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
  sco.add_omic(OMIC.celltype,
               X=one_hot(np.array([labels[i] for i in celltype]), len(labels)),
               var_names=list(labels.keys()))
  # add tissue type
  labels = {name: i for i, name in enumerate(sorted(set(tissue)))}
  sco.add_omic(OMIC.tissue,
               X=one_hot(np.array([labels[i] for i in tissue]), len(labels)),
               var_names=list(labels.keys()))
  return sco
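# A minimal usage sketch (illustrative only; exact shapes depend on the
# downloaded release):
#   sco = read_mouse_ATLAS()
#   print(sco.shape)  # ~(100k cells, n_peaks); cells are rows after the .T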
def to_array(x):
  """ pytorch tensor to numpy array """
  if hasattr(x, 'todense'):
    return np.array(x.todense())
  if hasattr(x, 'cpu'):
    return x.data.cpu().numpy()
  return x


# Load dataset
cortex = CortexDataset(save_path=SAVE_DATA_PATH)
X = cortex.X
labels = cortex.cell_types
n_labels = len(labels)
Y = one_hot(cortex.labels.ravel(), n_labels)
# ===========================================================================
# scVI
# ===========================================================================
scvi = VAE(n_input=cortex.nb_genes,
           n_batch=0,
           n_labels=0,
           n_hidden=n_hidden,
           n_latent=n_latent,
           n_layers=n_layer,
           dispersion=dispersion,
           dropout_rate=dropout_rate,
           log_variational=log_variational)
trainer = UnsupervisedTrainer(model=scvi,
                              gene_dataset=cortex,
def create_dataset(self,
                   batch_size=64,
                   drop_remainder=False,
                   shuffle=1000,
                   prefetch=tf.data.experimental.AUTOTUNE,
                   cache='',
                   parallel=tf.data.experimental.AUTOTUNE,
                   partition='train',
                   inc_labels=True,
                   seed=1) -> tf.data.Dataset:
  r""" Arguments:
    partition : {'train', 'train_labelled', 'valid', 'test', 'unlabelled'}
      - 'train' : combination of both train and unlabelled data
      - 'train_labelled' : only the train data
    inc_labels : a Boolean or Scalar. If True, return both image and label,
      otherwise, only image is returned.
      If a scalar is provided, it indicates the percentage of labelled data
      in the mask.

  Return :
    tensorflow.data.Dataset :
      image - `(tf.float32, (None, 64, 64, 3))`
      label - `(tf.float32, (None, 10))`
      mask  - `(tf.bool, (None, 1))` if 0. < inc_labels < 1.
    where, `mask=1` means labelled data, and `mask=0` for unlabelled data
  """
  image_size = self.image_size
  if isinstance(image_size, Number) and image_size == 96:
    image_size = None  # original STL-10 resolution, no resize needed
  ### select partition
  images_path, labels_path = get_partition(
      partition,
      train=((self.bin_files['train_X'], self.bin_files['unlabeled_X']),
             self.bin_files['train_y']),
      train_labelled=(self.bin_files['train_X'], self.bin_files['train_y']),
      test=(self.bin_files['test_X'], self.bin_files['test_y']),
      unlabeled=(self.bin_files['unlabeled_X'], None),
      unlabelled=(self.bin_files['unlabeled_X'], None),
  )
  X = [
      np.reshape(np.fromfile(path, dtype=np.uint8), (-1,) + STL10.IMAGE_SHAPE)
      for path in tf.nest.flatten(images_path)
  ]
  is_unlabelled = (labels_path is None)
  inc_labels = float(inc_labels)
  gen = tf.random.experimental.Generator.from_seed(seed=seed)
  # load the labels; initialize y so `must_masking` below is well-defined
  # even when inc_labels == 0
  y = []
  if inc_labels:
    if is_unlabelled:
      y = [np.zeros(shape=(X[0].shape[0], self.n_labels), dtype=np.float32)]
    else:
      y = np.fromfile(labels_path, dtype=np.uint8) - 1
      y = [one_hot(y, self.n_labels).astype(np.float32)]
      if len(X) == 2:  # combination of both train and unlabelled set
        y.append(
            np.zeros(shape=(X[1].shape[0], self.n_labels), dtype=np.float32))
    assert len(y) == len(X)

  ### read and resize the data
  def resize(img):
    img = tf.cast(img, tf.float32)
    img = self.normalize_255(img)
    img = tf.transpose(img, perm=(2, 1, 0))
    if image_size is not None:
      img = tf.image.resize(img, (image_size, image_size),
                            preserve_aspect_ratio=True,
                            antialias=False)
    return img

  def masking(image, label):
    # labelled only if it passes the rate AND the label row is non-zero
    mask = tf.logical_and(
        gen.uniform(shape=(1,)) < inc_labels,
        tf.reduce_sum(label) > 0.)
    return dict(inputs=(image, label), mask=mask)

  ### processing
  datasets = None
  # unlabelled parts carry all-zero placeholder labels and must be masked
  must_masking = any(np.all(i == 0.) for i in y)
  for x_i, y_i in zip(X, y if inc_labels else X):
    images = tf.data.Dataset.from_tensor_slices(x_i).map(resize, parallel)
    if inc_labels:
      labels = tf.data.Dataset.from_tensor_slices(y_i)
      images = tf.data.Dataset.zip((images, labels))
      if 0. < inc_labels < 1. or must_masking:  # semi-supervised mask
        images = images.map(masking)
    datasets = images if datasets is None else datasets.concatenate(images)
  # cache data
  if cache is not None:
    datasets = datasets.cache(str(cache))
  # shuffle must be called after cache
  if shuffle is not None and shuffle > 0:
    datasets = datasets.shuffle(int(shuffle) * len(X))
  datasets = datasets.batch(batch_size, drop_remainder)
  if prefetch is not None:
    datasets = datasets.prefetch(prefetch)
  # return
  return datasets
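# Semantics of the mask above, as a worked example: for partition='train'
# with inc_labels=0.3, a sample is marked labelled only if the uniform draw
# falls below 0.3 AND its label row is non-zero; every sample from the
# unlabelled split has an all-zero placeholder label row, so its
# reduce_sum == 0 and its mask is always False.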
def _celltypes(y):
  labels = sorted(np.unique(y))
  index = {name: i for i, name in enumerate(labels)}
  y = one_hot(np.array([index[i] for i in y], dtype=np.int32),
              nb_classes=len(labels))
  return y, [i.replace("_Like", '').lower() for i in labels]
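# Worked example (illustrative): given y = ['B_Like', 'A', 'B_Like'],
# the sorted unique labels are ['A', 'B_Like'], so
#   _celltypes(y) -> (one_hot([1, 0, 1], 2), ['a', 'b'])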