def plot_latents_binary_scatter(self, test=True, pca=False):
  start_time = time.time()
  data_type = 'test' if test else 'train'
  if len(self) <= 4:
    nrow = 1
    ncol = len(self)
  else:
    nrow = 2
    ncol = int(np.ceil(len(self) / 2))
  fig = plt.figure(figsize=(min(20, 5 * ncol) + 2, nrow * 5))
  for idx, pos in enumerate(self):
    ax = plt.subplot(nrow, ncol, idx + 1)
    with catch_warnings_ignore(Warning):
      pos.plot_latents_binary_scatter(test=test,
                                      ax=ax,
                                      legend=True if idx == 0 else False,
                                      pca=pca)
  with catch_warnings_ignore(Warning):
    plt.tight_layout()
  self.add_figure('latents_scatter_%s' % data_type, fig)
  return self._log('plot_latents_binary_scatter[%s] %s(s)' %
                   (data_type, ctext(time.time() - start_time, 'lightyellow')))
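
# `catch_warnings_ignore` (from odin.utils) is used throughout this module to
# silence expected warnings around noisy library calls. A minimal sketch of
# the assumed behavior (the real implementation may differ):
import warnings
from contextlib import contextmanager

@contextmanager
def catch_warnings_ignore_sketch(*categories):
  """Ignore the given warning categories within the context."""
  with warnings.catch_warnings():
    for category in categories:
      warnings.filterwarnings('ignore', category=category)
    yield

# with catch_warnings_ignore_sketch(RuntimeWarning):
#   np.log(0.)  # would normally emit "divide by zero encountered in log"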
def get_dataset(dataset_name, override=False, verbose=True) -> SingleCellOMIC:
  r""" Check `get_dataset_meta` for more information

  List of all available datasets:
    ['call', 'callall', 'mpal', 'mpalall', 'mpalatac', '100yo', '8klyall',
     '8kmyall', '8kly', '8kmy', '8k', '8kall', 'ecclyall', 'eccly',
     'eccmyall', 'eccmy', 'ecc', 'eccall', '8kx', '8kxall', 'eccx',
     'eccxall', 'vdj1x', 'vdj1xall', 'vdj4x', 'vdj4xall', 'mpalx',
     'mpalxall', 'callx', 'callxall', 'pbmcciteseq', 'cbmcciteseq',
     'pbmc5000', 'facs7', 'facs5', 'facs2', 'pbmcscvi', 'cortex', 'retina',
     'hemato', 'vdj1', 'vdj1all', 'vdj2', 'vdj2all', 'vdj3', 'vdj3all',
     'vdj4', 'vdj4all', 'vdjhs3', 'vdjhs3all', 'vdjhs4', 'vdjhs4all',
     'neuron10k', 'neuron10kall', 'heart10k', 'heart10kall', 'memoryt',
     'memorytall', 'naivet', 'naivetall', 'regulatoryt', 'regulatorytall',
     'cd4t', 'cd4tall', '5k', '5kall', '18k', '18kall', '4k', '4kall',
     '10k', '10kall']

  Return:
    mRNA data : `SingleCellOMIC`
    label data: `SingleCellOMIC`. If label data is not available, then None

  Example:
    gene, prot = get_dataset("cortex")
    X_train, X_test = gene.split(0.8, seed=1234)
    y_train, y_test = prot.split(0.8, seed=1234)
    X_train.assert_matching_cells(y_train)
    X_test.assert_matching_cells(y_test)
  """
  data_meta = get_dataset_meta()
  # ====== special case: get all dataset ====== #
  dataset_name = str(dataset_name).lower().strip()
  if dataset_name not in data_meta:
    raise RuntimeError('Cannot find dataset with name: "%s", all datasets '
                       'include: %s' %
                       (dataset_name, ", ".join(list(data_meta.keys()))))
  with catch_warnings_ignore(FutureWarning):
    ds = data_meta[dataset_name](override=override, verbose=verbose)
  # ******************** create SCO ******************** #
  if isinstance(ds, SingleCellOMIC):
    return ds
  # ******************** return ******************** #
  validating_dataset(ds)
  with catch_warnings_ignore(FutureWarning):
    sc = SingleCellOMIC(X=ds['X'],
                        cell_id=ds['X_row'],
                        gene_id=ds['X_col'],
                        name=dataset_name)
    if 'y' in ds:
      y = ds['y']
      if is_binary_dtype(y):
        sc.add_omic(OMIC.celltype, y, ds['y_col'])
      else:
        sc.add_omic(OMIC.proteomic, y, ds['y_col'])
    return sc
def _adjust(fig, title, pad=0.02):
  w, h = fig.get_figwidth(), fig.get_figheight()
  fig.set_size_inches(w=w, h=h + 5)
  if title is not None:
    fig.suptitle(title)
  with catch_warnings_ignore(UserWarning):
    fig.tight_layout(rect=[0.0, pad, 1.0, 1.0 - pad])
def validate_features_dataset(output_dataset_path, ds_validation_path):
  ds = F.Dataset(output_dataset_path, read_only=True)
  print(ds)

  features = {}
  for key, val in ds.items():
    if 'indices_' in key:
      name = key.split('_')[-1]
      features[name] = (val, ds[name])

  all_indices = [val[0] for val in features.values()]
  # ====== sampling 250 files ====== #
  all_files = sampling_iter(it=all_indices[0].keys(),
                            k=250,
                            seed=Config.SUPER_SEED)
  all_files = [f for f in all_files if all(f in ids for ids in all_indices)]
  print("#Samples:", ctext(len(all_files), 'cyan'))

  # ====== ignore the 20-figures warning ====== #
  with catch_warnings_ignore(RuntimeWarning):
    for file_name in all_files:
      X = {}
      for feat_name, (ids, data) in features.items():
        start, end = ids[file_name]
        X[feat_name] = data[start:end][:].astype('float32')
      V.plot_multiple_features(features=X,
                               fig_width=20,
                               title='[%s]%s' %
                               (ds['dsname'][file_name], file_name))
    V.plot_save(ds_validation_path, dpi=12)
def test_normalization(self):
  ds = get_dataset('8kmy')
  # ignore overflow warning
  with catch_warnings_ignore(RuntimeWarning):
    ds1 = ds.expm1(omic=OMIC.transcriptomic, inplace=False)
    ds2 = ds.expm1(omic=OMIC.proteomic, inplace=False)
    self.assertTrue(np.all(np.expm1(ds.X) == ds1.X))
    self.assertTrue(
        np.all(
            np.expm1(ds.numpy(OMIC.proteomic)) == ds2.numpy(OMIC.proteomic)))
    ds1 = ds.normalize(OMIC.transcriptomic,
                       inplace=False,
                       log1p=True,
                       scale=False,
                       total=False)
    ds2 = ds.normalize(OMIC.proteomic,
                       inplace=False,
                       log1p=True,
                       scale=False,
                       total=False)
    self.assertTrue(np.all(ds1.numpy(OMIC.transcriptomic) == np.log1p(ds.X)))
    self.assertTrue(
        np.all(ds1.numpy(OMIC.proteomic) == ds.numpy(OMIC.proteomic)))
    self.assertTrue(
        np.all(
            ds2.numpy(OMIC.proteomic) == np.log1p(ds.numpy(OMIC.proteomic))))
    self.assertTrue(
        np.all(
            ds2.numpy(OMIC.transcriptomic) == ds.numpy(OMIC.transcriptomic)))
def _fit_mapping(x: np.ndarray, y: np.ndarray, n_bins: int):
  from odin.utils import catch_warnings_ignore
  from sklearn.linear_model import LogisticRegression
  from sklearn.model_selection import GridSearchCV
  from sklearn.preprocessing import KBinsDiscretizer
  from odin.ml.gmm_classifier import GMMclassifier
  assert x.ndim == 1 and y.ndim == 1
  x = x[:, np.newaxis]
  # already discrete labels, and the number of bins is enough
  if np.all(y == y.astype(np.int32)) and len(np.unique(y)) <= n_bins:
    n_bins = len(np.unique(y))
    model = GMMclassifier(strategy='all',
                          n_components=2,
                          covariance_type='full',
                          n_init=5,
                          random_state=1)
    model.fit(x, y)
  else:
    y = KBinsDiscretizer(n_bins=int(n_bins),
                         encode='ordinal',
                         strategy='uniform').fit_transform(y[:, np.newaxis])
    y = y.ravel().astype(np.int64)
    with catch_warnings_ignore(UserWarning):
      model = GridSearchCV(estimator=LogisticRegression(max_iter=500,
                                                        solver='liblinear',
                                                        random_state=1),
                           cv=2,
                           param_grid=dict(C=np.linspace(0.5, 5, num=5)))
      model.fit(x, y)
  return model, n_bins
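
# Usage sketch for `_fit_mapping` (hypothetical arrays): map a single latent
# dimension onto a factor, discretizing continuous factors into <= 5 bins:
# z_dim = latents[:, 0]       # shape (n_samples,)
# factor = proteins[:, 0]     # shape (n_samples,)
# model, n_bins = _fit_mapping(z_dim, factor, n_bins=5)
# predicted_bins = model.predict(z_dim[:, np.newaxis])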
def test_visualization_celltype(self):
  sco = get_dataset('cortex')
  for X, var_names, rank_genes, clustering, dendrogram in itertools.product(
      ('cell', 'tran'),
      (None, 10),
      (0, 3),
      ('kmeans', 'louvain', None),
      (True, False)):
    if X == 'cell' and rank_genes > 0:
      continue
    # check louvain available
    if clustering == 'louvain':
      try:
        import louvain
      except ImportError:
        continue
    # plotting
    with catch_warnings_ignore(ignore_warnings):
      sco.plot_heatmap(X=X,
                       groupby=OMIC.celltype,
                       var_names=var_names,
                       clustering=clustering,
                       rank_genes=rank_genes)
      sco.plot_dotplot(X=X,
                       groupby=OMIC.celltype,
                       var_names=var_names,
                       clustering=clustering,
                       rank_genes=rank_genes)
      sco.plot_stacked_violins(X=X,
                               groupby=OMIC.celltype,
                               var_names=var_names,
                               clustering=clustering,
                               rank_genes=rank_genes)
  sco.save_figures('/tmp/tmp2.pdf')
def _analyze(ds_name, model_path, outpath, y_true, all_proteins, verbose):
  from sisua.analysis import Posterior
  with open(model_path, 'rb') as f:
    infer = pickle.load(f)
  ds_infer = infer.configs['dataset']
  ds = [j for i, j in all_datasets if i == ds_name][0]
  # path is a folder
  path = os.path.join(
      outpath, 'data%s_model%s' % (ds_name.replace('_', '').upper(),
                                   ds_infer.replace('_', '').upper()))
  path = os.path.join(path, infer.short_id)
  if not os.path.exists(path):
    # create intermediate directories as well (`path` is nested below outpath)
    os.makedirs(path)
  # log start
  if verbose:
    print("\nData:%s - Model:%s" %
          (ctext(ds_name, 'yellow'), ctext(ds_infer, 'yellow')))
    print(" Outpath:", ctext(path, 'cyan'))
  # create a mixed Posterior
  pos = Posterior(infer, ds=ds)
  # a lot of figures will be created, so a RuntimeWarning about the
  # maximum number of open figures will appear
  with catch_warnings_ignore(RuntimeWarning):
    # analysis
    pos.new_figure().plot_latents_binary_scatter(
        size=4).plot_latents_distance_heatmap().plot_correlation_marker_pairs()
    # protein series
    if infer.is_semi_supervised:
      y_pred = {
          i: j
          for i, j in zip(dict(all_datasets)[ds_infer]['y_col'],
                          infer.predict_y(ds['X']).T)
          if i in all_proteins
      }
      y_pred = np.hstack([y_pred[i][:, np.newaxis] for i in all_proteins])
      pos.plot_protein_predicted_series(y_true_new=y_true,
                                        y_pred_new=y_pred,
                                        labels_new=all_proteins)
      for prot_name in all_proteins:
        pos.plot_protein_scatter(protein_name=prot_name,
                                 y_true_new=y_true,
                                 y_pred_new=y_pred,
                                 labels_new=all_proteins)
    # save plot and show log
    pos.save_plots(path, dpi=80)
def clustering_scores(latent, labels, n_labels, prediction_algorithm='both'):
  """ Clustering scores:

    * silhouette_score (higher is better, best is 1, worst is -1)
    * adjusted_rand_score (higher is better)
    * normalized_mutual_info_score (higher is better)
    * unsupervised_clustering_accuracy (higher is better)

  Note: remember the order of the returned values.

  Parameters
  ----------
  labels : categorical labels (i.e. single classes or one-hot encoded)
  prediction_algorithm : {'knn', 'gmm', 'both'}
  """
  # simple normalization to 0-1, then pick the argmax
  if labels.ndim == 2:
    min_val = np.min(labels, axis=0, keepdims=True)
    max_val = np.max(labels, axis=0, keepdims=True)
    labels = (labels - min_val) / (max_val - min_val)
    labels = np.argmax(labels, axis=-1)

  if prediction_algorithm == 'knn':
    km = KMeans(n_labels, n_init=200, random_state=5218)
    labels_pred = km.fit_predict(latent)
  elif prediction_algorithm == 'gmm':
    gmm = GaussianMixture(n_labels, random_state=5218)
    gmm.fit(latent)
    labels_pred = gmm.predict(latent)
  elif prediction_algorithm == 'both':
    score1 = clustering_scores(latent,
                               labels,
                               n_labels=n_labels,
                               prediction_algorithm='knn')
    score2 = clustering_scores(latent,
                               labels,
                               n_labels=n_labels,
                               prediction_algorithm='gmm')
    return {k: (v + score2[k]) / 2 for k, v in score1.items()}
  else:
    raise ValueError("No support for prediction_algorithm: '%s'" %
                     prediction_algorithm)
  #
  with catch_warnings_ignore(FutureWarning):
    asw_score = silhouette_score(latent, labels)
    ari_score = adjusted_rand_score(labels, labels_pred)
    nmi_score = normalized_mutual_info_score(labels, labels_pred)
    uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
  return dict(ASW=asw_score, ARI=ari_score, NMI=nmi_score, UCA=uca_score)
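
# Usage sketch (hypothetical arrays): average KMeans- and GMM-based scores for
# a latent matrix `Z` of shape (n_cells, n_dims) and one-hot `y_onehot`:
# scores = clustering_scores(Z, y_onehot, n_labels=y_onehot.shape[1],
#                            prediction_algorithm='both')
# print(scores)  # {'ASW': ..., 'ARI': ..., 'NMI': ..., 'UCA': ...}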
def multi_label_adj_Rindex(label_bin_true, label_pred):
  assert label_bin_true.ndim == 2
  assert label_bin_true.shape[1] == len(np.unique(label_pred))
  n_classes = label_bin_true.shape[1]
  with catch_warnings_ignore(Warning):
    scores = []
    for y in label_bin_true.T:
      y = y.astype('int32')
      s = max(
          adjusted_rand_score(labels_true=y,
                              labels_pred=(label_pred == i).astype('int32'))
          for i in range(n_classes))
      scores.append(s)
  return scores
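
# Usage sketch: `label_bin_true` is a binary matrix (n_samples, n_classes) of
# possibly overlapping labels; `label_pred` is an integer cluster assignment.
# Each class is matched against its best-scoring cluster (synthetic example):
# y_bin = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])
# y_clust = np.array([0, 0, 1, 1])
# print(multi_label_adj_Rindex(y_bin, y_clust))  # close to [1.0, 1.0]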
def test_metrics(self):
  sco = get_dataset('8kmy')
  with catch_warnings_ignore(ConvergenceWarning):
    sco.rank_vars_groups(clustering='kmeans')
    sco.calculate_quality_metrics()
    with sco._swap_omic('prot'):
      sco.rank_vars_groups(clustering='kmeans')
      sco.calculate_quality_metrics()
    if _SCVI:
      sco = get_dataset('cortex')
      sco.rank_vars_groups(clustering='kmeans')
      sco.calculate_quality_metrics()
      with sco._swap_omic('cell'):
        sco.rank_vars_groups(clustering='kmeans')
        sco.calculate_quality_metrics()
def _fit(x, y, n_bins):
  from sklearn.preprocessing import KBinsDiscretizer
  from sklearn.model_selection import GridSearchCV
  from sklearn.linear_model import LogisticRegression
  from odin.utils import catch_warnings_ignore
  x = x[:, np.newaxis]
  y = KBinsDiscretizer(n_bins=int(n_bins), encode='ordinal').fit_transform(
      y[:, np.newaxis]).ravel().astype(np.int64)
  with catch_warnings_ignore(UserWarning):
    lr = GridSearchCV(estimator=LogisticRegression(max_iter=500,
                                                   solver='liblinear',
                                                   random_state=1234),
                      cv=2,
                      param_grid=dict(C=np.linspace(0.5, 5, num=5)))
    lr.fit(x, y)
  return lr
def _report(y_p, y_t, pad=''):
  with catch_warnings_ignore(Warning):
    z_ = np.concatenate(y_p, axis=0)
    z = np.concatenate(y_t, axis=0)
    print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
    print(pad, "#Samples:", ctext(len(z), 'cyan'))
    print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
    print(pad, "Accuracy:",
          accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

    z_ = np.concatenate([np.mean(i, axis=0, keepdims=True) for i in y_p],
                        axis=0)
    z = np.array([i[0] for i in y_t])
    print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
    print(pad, "#Samples:", ctext(len(z), 'cyan'))
    print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
    print(pad, "Accuracy:",
          accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))
def robust_run(method_name, log_text, fn, *args, **kwargs):
  r""" Run an evaluation function and catch exceptions without interrupting
  the execution """
  assert callable(fn)
  with catch_warnings_ignore(UserWarning):
    try:
      fn(*args, **kwargs)
    except Exception as e:
      text = StringIO()
      traceback.print_exception(*sys.exc_info(),
                                limit=None,
                                file=text,
                                chain=True)
      text.seek(0)
      text = text.read().strip()
      text += f"\n{e}"
      SE.write_error(traceback=text, method_name=method_name, config=log_text)
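
# Usage sketch: wrap one evaluation step so a single failure is logged through
# `SE.write_error` instead of stopping the whole sweep (`scoring`, `posterior`,
# `outpath`, `train_ds` and `test_ds` are assumed from the surrounding script):
# robust_run("evaluate_scoring",
#            f"model:{model} train:{train_ds} test:{test_ds}",
#            scoring, posterior, outpath, train_ds, test_ds)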
def prepare(self):
  with catch_warnings_ignore(RuntimeWarning):
    sco = get_dataset('cortex')
    om1, om2 = sco.omics
    train, test = sco.split(train_percent=0.8, seed=1)
    n_gene = sco.numpy(om1).shape[1]
    n_prot = sco.numpy(om2).shape[1]
    rvs = [
        RandomVariable(n_gene, 'zinbd', om1.name),
        RandomVariable(n_prot, 'onehot', om2.name)
    ]
    all_models = [DeepCountAutoencoder, SCALE, SCVI, VariationalAutoEncoder]
    all_configs = [
        NetworkConfig(),
        NetworkConfig(pyramid=True),
        NetworkConfig(use_conv=True),
        NetworkConfig(pyramid=True, use_conv=True)
    ]
    return train, test, rvs, all_models, all_configs
def _clustering_scores(y, X=None, z=None, algo='kmeans', random_state=1):
  n_factors = len(np.unique(y))
  if z is None:
    if algo == 'kmeans':
      model = KMeans(n_factors, n_init=200, random_state=random_state)
    elif algo == 'gmm':
      model = GaussianMixture(n_factors, random_state=random_state)
    elif algo in ('both', 'avg', 'avr', 'average', 'mean'):
      score1 = _clustering_scores(X=X,
                                  y=y,
                                  z=z,
                                  algo='kmeans',
                                  random_state=random_state)
      score2 = _clustering_scores(X=X,
                                  y=y,
                                  z=z,
                                  algo='gmm',
                                  random_state=random_state)
      return {k: (v + score2[k]) / 2 for k, v in score1.items()}
    else:
      raise ValueError("No support for prediction_algorithm: '%s'" % algo)
    # the scores
    y_pred = model.fit_predict(X)
  else:
    z = z.ravel()
    assert z.shape[0] == y.shape[0], \
        f"predictions must have shape: {y.shape}, but given: {z.shape}"
    y_pred = z
  with catch_warnings_ignore(FutureWarning):
    return dict(
        ASW=silhouette_score(
            X if X is not None else np.expand_dims(z, axis=-1), y),
        ARI=adjusted_rand_score(y, y_pred),
        NMI=normalized_mutual_info_score(y, y_pred),
        UCA=_unsupervised_clustering_accuracy(y, y_pred)[0],
        HOS=homogeneity_score(y, y_pred),
        COS=_cluster_completeness_score(y, y_pred),
    )
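
# Usage sketch: scores can come from clustering the latent matrix `X`, or from
# precomputed integer predictions `z` (hypothetical arrays below):
# scores_km = _clustering_scores(y=y, X=Z, algo='both')  # cluster Z itself
# scores_z = _clustering_scores(y=y, z=y_pred)           # reuse predictions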
# ===========================================================================
# ====== basic path ====== #
output_dataset_path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)

processor_log_path = os.path.join(EXP_DIR, 'processor_%s.log' % FEATURE_RECIPE)
if os.path.exists(processor_log_path):
  os.remove(processor_log_path)
print("Log path:", ctext(processor_log_path, 'cyan'))

ds_validation_path = os.path.join(EXP_DIR, 'validate_%s.pdf' % FEATURE_RECIPE)
if os.path.exists(ds_validation_path):
  os.remove(ds_validation_path)
print("Validation path:", ctext(ds_validation_path, 'cyan'))

# ====== running the processing ====== #
with catch_warnings_ignore(Warning):
  processor = pp.FeatureProcessor(jobs=ALL_FILES,
                                  path=output_dataset_path,
                                  extractor=recipe,
                                  n_cache=320,
                                  ncpu=NCPU,
                                  override=True,
                                  identifier='name',
                                  log_path=processor_log_path,
                                  stop_on_failure=False)
  processor.run()
# ===========================================================================
# Make some visualization
# ===========================================================================
validate_features_dataset(output_dataset_path, ds_validation_path)
def test_save_load_2(self):
  r""" Load and train the model """
  print("*** Test loading model ***")
  from matplotlib import pyplot as plt
  from odin import visual as vs
  import seaborn as sns
  sns.set()
  #
  train, test, rvs, models, configs = self.prepare()
  for (MODEL, network, is_semi, path, log_path, pca_path, stat_path,
       hist_path) in model_iteration(models, configs):
    with open(log_path, 'rb') as f:
      log = pickle.load(f)
    model = load(path)
    with catch_warnings_ignore(UserWarning):
      # test statistics
      plt.figure(figsize=(12, 5))
      text_train, p_train, zmean_train, zvar_train = predict2info(model, train)
      text_test, p_test, zmean_test, zvar_test = predict2info(model, test)
      # check latent mean and variance
      zmean_train1, zmean_test1 = log['zmean']
      zvar_train1, zvar_test1 = log['zvar']
      self.assertTrue(np.allclose(zmean_train, zmean_train1))
      self.assertTrue(np.allclose(zmean_test, zmean_test1))
      self.assertTrue(np.allclose(zvar_train, zvar_train1))
      self.assertTrue(np.allclose(zvar_test, zvar_test1))
      # plotting
      plt.subplot(1, 2, 1)
      plt.plot(tf.math.log(text_train), label='Loaded')
      plt.plot(tf.math.log(log['predict_train']), label='Saved')
      plt.title("Train")
      plt.legend()
      plt.subplot(1, 2, 2)
      plt.plot(tf.math.log(text_test), label='Loaded')
      plt.plot(tf.math.log(log['predict_test']), label='Saved')
      plt.title("Test")
      plt.legend()
      plt.tight_layout()
      vs.plot_save(stat_path, dpi=120, clear_all=True, log=True)
      # test pca
      pca = extract_pca(p_train, p_test)
      plt.figure(figsize=(8, 3 * len(pca)))
      for i, (dist, old, new) in enumerate(zip(p_train, log['pca'], pca)):
        assert old.shape == new.shape
        plt.subplot(len(pca), 2, i * 2 + 1)
        plt.scatter(old[:, 0], old[:, 1], s=4)
        if i == 0:
          plt.title('Saved')
        plt.ylabel(dist.name)
        #
        plt.subplot(len(pca), 2, i * 2 + 2)
        plt.scatter(new[:, 0], new[:, 1], s=4)
        if i == 0:
          plt.title('Loaded')
      plt.tight_layout()
      vs.plot_save(pca_path, dpi=120, clear_all=True, log=True)
      # train the loaded model for a few more epochs
      model.fit(train, epochs=2, verbose=False)
      model.plot_learning_curves()
      model.save_figures(hist_path)
def _initialize(self):
  scm = self.scm
  sco = self.sco_corrupted
  outputs, latents = scm.predict(
      sco.create_dataset(self.scm.output_layers[0].name,
                         batch_size=self.batch_size,
                         shuffle=0,
                         drop_remainder=False),
      sample_shape=self.sample_shape,
      verbose=self.verbose,
  )
  # infer output OMICs
  dim2omic = defaultdict(list)
  for om in self.input_omics:
    dim2omic[self.sco_original.get_dim(om)].append(om)
  for o in tf.nest.flatten(outputs):
    assert isinstance(o, tfd.Distribution), \
        f"SingleCellModel must output Distribution but return {o}"
    name = o.name
    try:
      om = OMIC.parse(name)
    except Exception:
      om = None
    if om is None:
      oms = dim2omic[o.event_shape[0]]
      if len(oms) > 1:
        raise RuntimeError(f"Cannot infer OMIC type for output {o}")
      om = oms[0]
    self.output_omics.append(om.name)
  # variables' description
  self._n_latents = len(tf.nest.flatten(latents))
  self._n_outputs = len(tf.nest.flatten(outputs))
  ## default inputs
  for om in self.input_omics:
    self.omics_data[(om, 'corrupted')] = sco.get_omic(om)
  # latent is the same for all
  self.omics_data[(OMIC.latent.name, 'corrupted')] = tf.nest.flatten(latents)
  # infer if the distribution is imputed
  for l, o in zip(scm.output_layers, tf.nest.flatten(outputs)):
    self.omics_data[(l.name, 'reconstructed')] = o
    is_independent = 0
    if isinstance(o, tfd.Independent):
      is_independent = o.reinterpreted_batch_ndims
      o = o.distribution
    if isinstance(o, tfd.ZeroInflated):
      o = o.count_distribution
    if is_independent > 0:
      o = tfd.Independent(o, reinterpreted_batch_ndims=is_independent)
    self.omics_data[(l.name, 'imputed')] = o
  ### create the SingleCellOMIC dataset for analysis
  sco = self.sco_original.copy()
  for om in self.input_omics:
    if (om, 'imputed') in self.omics_data:
      data_type = 'imputed'
    elif (om, 'reconstructed') in self.omics_data:
      data_type = 'reconstructed'
    else:
      continue
    data = self.omics_data[(om, data_type)]
    om_new = OMIC.parse(f'i{om}')
    # prepare the new data
    if isinstance(data, tfd.Distribution):
      data = data.mean().numpy()
    if data.ndim == 3:
      data = np.mean(data, axis=0)
    # find the variable's names
    if om in self.scm.metadata:
      var_names = self.scm.metadata[om]
    else:
      var_names = np.array([f'{om}{i}' for i in range(data.shape[1])])
    sco.add_omic(omic=om_new, X=data, var_names=var_names)
  # add the latents
  Zs = self.omics_data[('latent', 'corrupted')]
  if len(Zs) > 1:
    means = [z.mean() for z in Zs]
    Zs = self.reduce_latents(means)
  else:
    Zs = Zs[0].mean()
  with catch_warnings_ignore(UserWarning, RuntimeWarning):
    sco.add_omic(omic=OMIC.latent,
                 X=Zs.numpy(),
                 var_names=np.array([f'Z{i}' for i in range(Zs.shape[1])]))
  # store the extracted SingleCellOMIC dataset
  self._dataset = sco
def get_criticizer(self,
                   factor_omic='proteomic',
                   latent_indices=None,
                   n_bins=5,
                   strategy='quantile') -> Criticizer:
  r""" Create a probabilistic criticizer for evaluating the latent codes of
  variational models.

  Arguments:
    factor_omic : instance of OMIC.
      which OMIC type is used as factors (or labels).
    n_bins : int (default=5)
      The number of bins to produce discretized factors.
    strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}, (default='quantile')
      Strategy used to define the widths of the bins.
        'uniform' - All bins in each feature have identical widths.
        'quantile' - All bins in each feature have the same number of points.
        'kmeans' - Values in each bin have the same nearest center of a 1D
          k-means cluster.
  """
  sco = self.dataset
  assert factor_omic in sco.omics, \
      f"factor_omic='{factor_omic}' not found, available are: {sco.omics}"
  factor_omic = OMIC.parse(factor_omic)
  if latent_indices is None:
    key = f"{factor_omic.name}"
  else:
    name = '_'.join(f'{i:d}' for i in latent_indices)
    key = f"{factor_omic.name}{name}"
  # create the Criticizer
  if key not in self._criticizers:
    # check the factors are valid
    factors = sco.numpy(factor_omic)
    factor_names = sco.get_var_names(factor_omic)
    kw = dict(n_bins=int(n_bins), strategy=None)
    # binary classes
    if np.all(np.sum(factors, axis=1) == 1):
      factors = np.argmax(factors, axis=1)[:, np.newaxis]
      factor_names = np.asarray([factor_omic.name])
    # continuous or discrete cases
    elif factor_omic in (OMIC.proteomic, OMIC.iproteomic, OMIC.pmhc,
                         OMIC.ipmhc):
      kw['strategy'] = strategy
    # categorical factors
    elif factor_omic in (OMIC.progenitor, OMIC.iprogenitor, OMIC.celltype,
                         OMIC.icelltype):
      pass
    # unknown factor
    else:
      warnings.warn(f"No support for discretization of OMIC: {factor_omic}",
                    RuntimeWarning)
      return
    # only keep factors with > 1 classes
    ids = [len(np.unique(i)) > 1 for i in factors.T]
    if not any(ids):  # no valid factor found
      warnings.warn(f"Not a valid factor: {factor_omic.name}", RuntimeWarning)
      return
    factors = factors[:, ids]
    factor_names = factor_names[ids]
    # create the criticizer
    crt = Criticizer(vae=self.scm,
                     latent_indices=latent_indices,
                     random_state=self.rand.randint(1e8))
    crt.factor_omic: OMIC = factor_omic
    with catch_warnings_ignore(UserWarning):
      latents = self.omics_data[('latent', 'corrupted')]
      crt.sample_batch(latents=latents,
                       factors=factors,
                       factor_names=factor_names,
                       **kw)
    self._criticizers[key] = crt
  return self._criticizers[key]
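
# Usage sketch (hypothetical `posterior` object owning this method): build a
# criticizer over quantile-discretized protein factors; the instance is cached
# by key so repeated calls with the same arguments reuse it:
# crt = posterior.get_criticizer(factor_omic='proteomic',
#                                n_bins=5, strategy='quantile')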
def plot_diagnosis(self, X, labels=None, n_bins=200):
  X, labels, n_classes = self._check_input(X, labels)
  nrow = n_classes
  ncol = 1
  fig = plot_figure(nrow=nrow * 2, ncol=8)
  # add 1 for threshold color, add 1 for PDF color
  colors = sns.color_palette(n_colors=self.n_components_per_class + 2)
  for i, (name, (order, gmm)) in enumerate(zip(labels, self._models)):
    start = ncol * i
    means_ = gmm.means_.ravel()[order]
    precision_ = gmm.precisions_.ravel()[order]
    x = self.normalize(X[:, i], test_mode=False)
    # ====== scores ====== #
    score_llk = gmm.score(x[:, np.newaxis])
    score_bic = gmm.bic(x[:, np.newaxis])
    score_aic = gmm.aic(x[:, np.newaxis])
    # ====== the histogram ====== #
    ax = plt.subplot(nrow, ncol, start + 1)
    count, bins = _draw_hist(x,
                             ax=ax,
                             title="[%s] LLK:%.2f BIC:%.2f AIC:%.2f" %
                             (name, score_llk, score_bic, score_aic),
                             n_bins=n_bins,
                             show_yticks=True)
    # ====== draw GMM PDF ====== #
    y_ = np.exp(gmm.score_samples(bins[:, np.newaxis]))
    y_ = (y_ - np.min(y_)) / (np.max(y_) - np.min(y_)) * np.max(count)
    ax.plot(bins, y_, color='red', linestyle='-', linewidth=1.5, alpha=0.6)
    # ====== draw the threshold ====== #
    ci = stats.norm.interval(
        np.abs(self.ci_threshold),
        loc=gmm.means_[order[self.positive_component]],
        scale=np.sqrt(1 / gmm.precisions_[order[self.positive_component]]))
    threshold = ci[0] if self.ci_threshold < 0 else ci[1]
    ids = np.where(bins >= threshold, True, False)
    ax.fill_between(bins[ids],
                    y1=0,
                    y2=np.max(count),
                    facecolor=colors[-2],
                    alpha=0.3)
    ax.text(np.min(bins[ids]), np.min(count), "%.2f" % threshold)
    # ====== plot GMM probability ====== #
    x_ = np.linspace(np.min(bins), np.max(bins), 1200)
    y_ = gmm.predict_proba(x_[:, np.newaxis]) * np.max(count)
    for c, j in zip(colors, y_.T):
      plt.plot(x_, j, color=c, linestyle='--', linewidth=1.8, alpha=0.6)
    # ====== draw each Gaussian bell ====== #
    ax = ax.twinx()
    _x = np.linspace(start=np.min(x), stop=np.max(x), num=800)
    for c, m, p in zip(colors, means_, precision_):
      with catch_warnings_ignore(Warning):
        # `mlab.normpdf` was removed from matplotlib; use scipy instead
        j = stats.norm.pdf(_x, m, np.sqrt(1 / p))
      ax.plot(_x, j, color=c, linestyle='-', linewidth=1)
      ax.scatter(_x[np.argmax(j)],
                 np.max(j),
                 s=66,
                 alpha=0.8,
                 linewidth=0,
                 color=c)
    ax.yaxis.set_ticklabels([])
  fig.tight_layout()
  self.add_figure('diagnosis', fig)
  return self
def train_and_evaluate(model_name, train_ds):
  if model_name == 'dca':
    from sisua.inference import InferenceDCA as Inference
  elif model_name == 'scvae':
    from sisua.inference import InferenceSCVAE as Inference
  elif model_name == 'sisua':
    from sisua.inference import InferenceSISUA as Inference
  elif model_name == 'scvi':
    from sisua.inference import InferenceSCVI as Inference
  else:
    raise NotImplementedError
  from sisua.analysis import Posterior

  outpath = os.path.join(FIGURE_PATH,
                         '%s_train%s' % (model_name, train_ds.upper()))
  if not os.path.exists(outpath):
    os.mkdir(outpath)

  print("\n======== Running experiment ========")
  print("Model :", ctext(model_name, 'cyan'))
  print("Inference :", ctext(Inference, 'cyan'))
  print("Train data:", ctext(train_ds, 'cyan'))
  print("Out path :", ctext(outpath, 'cyan'))

  ds, gene, prot = all_datasets[train_ds]
  n_prots = prot.feat_dim
  org_prot = [standardize_protein_name(i) for i in prot.col_name]
  # ====== Main model training ====== #
  if model_name == 'sisua':
    model = Inference(gene_dim=n_genes, prot_dim=n_prots)
  else:
    model = Inference(gene_dim=n_genes)
  model.fit(X=gene.X_train,
            y=prot.X_train if model.is_semi_supervised else None,
            corruption_rate=corruption_rate,
            corruption_dist=corruption_dist,
            n_epoch=n_epoch,
            batch_size=batch_size,
            detail_logging=False)
  # ====== start evaluation ====== #
  for name, (ds, gene, prot) in all_datasets.items():
    y_true = {
        i: j
        for i, j in zip([standardize_protein_name(i) for i in prot.col_name],
                        ds['y'].T)
        if i in all_proteins
    }
    # preserve the same order of all_proteins
    y_true = np.hstack([y_true[i][:, np.newaxis] for i in all_proteins])
    prot = SingleCellOMIC(matrix=y_true,
                          rowname=ds['X_row'],
                          colname=all_proteins)
    # create a mixed Posterior
    pos = Posterior(model,
                    ds=dict(X_train=gene.X_train,
                            X_test=gene.X_test,
                            X_col=gene.col_name,
                            y_train=prot.X_train,
                            y_test=prot.X_test,
                            y_col=prot.col_name))
    # a lot of figures will be created, so a RuntimeWarning about the
    # maximum number of open figures will appear
    with catch_warnings_ignore(RuntimeWarning):
      # analysis
      pos.new_figure().plot_latents_binary_scatter(
          size=4).plot_latents_distance_heatmap(
          ).plot_correlation_marker_pairs()
      # protein series
      if model.is_semi_supervised:
        y_true = pos.y_test
        y_pred = model.predict_y(pos.X_test)
        y_pred = {
            i: j for i, j in zip(org_prot, y_pred.T) if i in all_proteins
        }
        y_pred = np.hstack([y_pred[i][:, np.newaxis] for i in all_proteins])
        pos.plot_protein_predicted_series(y_true_new=y_true,
                                          y_pred_new=y_pred,
                                          labels_new=all_proteins)
        for prot_name in all_proteins:
          pos.plot_protein_scatter(protein_name=prot_name,
                                   y_true_new=y_true,
                                   y_pred_new=y_pred,
                                   labels_new=all_proteins)
      # save plot and show log
      pos.save_plots(os.path.join(outpath, '%s.pdf' % name), dpi=80)
def plot_gaussian_mixture(x,
                          gmm,
                          bins=80,
                          fontsize=12,
                          linewidth=2,
                          show_pdf=False,
                          show_probability=False,
                          show_components=True,
                          legend=True,
                          ax=None,
                          title=None):
  import seaborn as sns
  from odin.utils import as_tuple, catch_warnings_ignore
  from scipy import stats
  from sklearn.mixture import GaussianMixture
  ax = to_axis(ax, is_3D=False)
  n_points = int(bins * 12)
  assert gmm.means_.shape[1] == 1, "Only support plotting 1-D series GMM"
  x = x.ravel()
  order = np.argsort(gmm.means_.ravel())
  means_ = gmm.means_.ravel()[order]
  precision_ = gmm.precisions_.ravel()[order]
  colors = sns.color_palette(n_colors=gmm.n_components + 2)
  # ====== Histogram ====== #
  count, bins = plot_histogram(x=x,
                               bins=int(bins),
                               ax=ax,
                               normalize=False,
                               kde=False,
                               range_0_1=False,
                               covariance_factor=0.25,
                               centerlize=False,
                               fontsize=fontsize,
                               alpha=0.25,
                               title=title)
  ax.set_ylabel("Histogram Count", fontsize=fontsize)
  ax.set_xlim((np.min(x), np.max(x)))
  ax.set_xticks(
      np.linspace(start=np.min(x), stop=np.max(x), num=5, dtype='float32'))
  ax.set_yticks(
      np.linspace(start=np.min(count), stop=np.max(count), num=5,
                  dtype='int32'))
  # ====== GMM PDF ====== #
  x_ = np.linspace(np.min(bins), np.max(bins), n_points)
  y_ = np.exp(gmm.score_samples(x_[:, np.newaxis]))
  y_ = (y_ - np.min(y_)) / (np.max(y_) - np.min(y_)) * np.max(count)
  if show_pdf:
    ax.plot(x_,
            y_,
            color='red',
            linestyle='-',
            linewidth=linewidth * 1.2,
            alpha=0.6,
            label="GMM log-likelihood")
  # ====== GMM probability ====== #
  twinx = None
  ymax = 0.0
  if show_probability:
    if twinx is None:
      twinx = ax.twinx()
    y_ = gmm.predict_proba(x_[:, np.newaxis])
    for idx, (c, j) in enumerate(zip(colors, y_.T)):
      twinx.plot(x_,
                 j,
                 color=c,
                 linestyle='--',
                 linewidth=linewidth,
                 alpha=0.8,
                 label=r"$p_{\#%d}(x)$" % idx)
    ymax = max(ymax, np.max(y_))
  # ====== draw each Gaussian bell ====== #
  if show_components:
    if twinx is None:
      twinx = ax.twinx()
    for idx, (c, m, p) in enumerate(zip(colors, means_, precision_)):
      with catch_warnings_ignore(Warning):
        j = stats.norm.pdf(x_, m, np.sqrt(1 / p))
      twinx.plot(x_,
                 j,
                 color=c,
                 linestyle='-',
                 linewidth=linewidth,
                 label=r"$PDF_{\#%d}$" % idx)
      # mean, top of the bell
      twinx.scatter(x_[np.argmax(j)],
                    np.max(j),
                    s=88,
                    alpha=0.8,
                    linewidth=0,
                    color=c)
      ymax = max(ymax, np.max(j))
    twinx.set_ylabel("Probability Density", fontsize=fontsize)
    twinx.grid(False)
  # set the limit for twinx
  if twinx is not None:
    twinx.set_ylim(0.0, ymax * 1.05)
  # ====== show legend ====== #
  if twinx is not None:
    twinx.yaxis.label.set_color(colors[0])
    twinx.tick_params(axis='y', colors=colors[0])
  if legend:
    ax.legend(fontsize=fontsize)
    if twinx is not None:
      twinx.legend(fontsize=fontsize)
  return ax
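
# Usage sketch with synthetic data (the module-level helpers `to_axis` and
# `plot_histogram` are assumed importable alongside this function):
# import numpy as np
# from sklearn.mixture import GaussianMixture
# x = np.concatenate([np.random.normal(0., 1., 500),
#                     np.random.normal(4., .5, 500)])
# gmm = GaussianMixture(n_components=2, random_state=1).fit(x[:, np.newaxis])
# ax = plot_gaussian_mixture(x, gmm, show_pdf=True, show_probability=True)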
def plot_correlation_marker_pairs(self, test=True, fontsize=8):
  start_time = time.time()
  from scipy.stats import pearsonr, spearmanr
  n_system = len(self)
  data_type = 'test' if test else 'train'

  # OrderedDict(name -> series)
  original_series = None
  imputed_series = []
  for pos in self:
    if test:
      v, x, y = pos.V_test, pos.X_test_org, pos.y_test
    else:
      v, x, y = pos.V_train, pos.X_train_org, pos.y_train
    if original_series is None:
      original_series = correlation_scores(X=x,
                                           y=y,
                                           gene_name=pos.gene_name,
                                           protein_name=pos.labels,
                                           return_series=True)
    imputed_series.append(
        correlation_scores(X=v,
                           y=y,
                           gene_name=pos.gene_name,
                           protein_name=pos.labels,
                           return_series=True))

  # ====== plotting ====== #
  n_pair = len(original_series)
  fig = plt.figure(figsize=(20, 5 * n_pair), constrained_layout=True)
  width = 4
  grids = fig.add_gridspec(n_pair, (n_system + 1) * width)

  for row_idx, prot_gene in enumerate(original_series.keys()):
    prot_name, gene_name = prot_gene.split('/')
    original_gene, prot = original_series[prot_gene]
    # gather all series
    gene = [original_gene]
    system_name = ["Original"]
    for s, posterior in zip(imputed_series, self.posteriors):
      i, j = s[prot_gene]
      assert np.all(prot == j)
      gene.append(i)
      system_name.append(posterior.short_id_lines)
    # plotting each series
    for col_idx, (name, g) in enumerate(zip(system_name, gene)):
      ax = fig.add_subplot(grids[row_idx,
                                 width * col_idx:(width * col_idx + width)])
      ax.scatter(prot, g, s=25, alpha=0.6, linewidths=0)
      plot_aspect('auto', 'box', ax)
      title = (data_type + ' - ' + prot_gene + ' - %s'
               if col_idx == 0 else "%s")
      title += '\nPearson:%.2f Spearman:%.2f'
      ax.set_title(title % (name, pearsonr(g, prot)[0],
                            spearmanr(g, prot).correlation),
                   fontsize=fontsize + (2 if col_idx == 0 else 0))
      if col_idx == 0:
        ax.set_xlabel('[Protein] %s' % prot_name, fontsize=fontsize)
        ax.set_ylabel('[Gene] %s' % gene_name, fontsize=fontsize)
      if np.mean(g) < 0.1:
        for tick in ax.yaxis.get_major_ticks():
          tick.label.set_fontsize(6)
      # draw a boxplot of the gene series on a twin axis
      ax = ax.twiny()
      ax.boxplot(g)
      ax.set_xticks(())

  with catch_warnings_ignore(UserWarning):
    plt.tight_layout()
  self.add_figure('correlation_%s' % data_type, fig)
  return self._log('plot_correlation_marker_pairs[%s] %s(s)' %
                   (data_type, ctext(time.time() - start_time, 'lightyellow')))
def plot_imputation_scatter(self, test=True, pca=False, color_by_library=True):
  start_time = time.time()
  n_system = len(self) + 2  # add the original and the corrupted
  data_type = 'test' if test else 'train'

  if n_system <= 5:
    nrow = 1
    ncol = n_system
  else:
    nrow = 2
    ncol = int(np.ceil(n_system / 2))

  X_org = (self.posteriors[0].X_test_org
           if test else self.posteriors[0].X_train_org)
  X_crr = self.posteriors[0].X_test if test else self.posteriors[0].X_train
  y = self.posteriors[0].y_test if test else self.posteriors[0].y_train
  labels = self.posteriors[0].labels
  is_binary_classes = self.posteriors[0].is_binary_classes

  allV = [X_org, X_crr] + \
         [pos.V_test if test else pos.V_train for pos in self.posteriors]
  assert X_org.shape == X_crr.shape and all(
      v.shape == X_org.shape for v in allV)
  all_names = ["[%s]Original" % data_type,
               "[%s]Corrupted" % data_type] + \
              [i.short_id_lines for i in self.posteriors]

  # subsample to at most 5000 cells, then log-normalize everything
  if len(X_org) > 5000:
    np.random.seed(5218)
    ids = np.random.permutation(X_org.shape[0])[:5000]
    allV = [v[ids] for v in allV]
    y = y[ids]
  if is_binary_classes:
    y = np.argmax(y, axis=-1)
  else:
    y = ProbabilisticEmbedding().fit_transform(y)
    y = np.argmax(y, axis=-1)
  allV = [log_norm(v) for v in allV]

  fig = plt.figure(figsize=(min(20, 5 * ncol) + 2, nrow * 5))
  for idx, (name, v) in enumerate(zip(all_names, allV)):
    ax = plt.subplot(nrow, ncol, idx + 1)
    n = np.sum(v, axis=-1)
    v = fast_pca(v, n_components=2) if pca else fast_tsne(v, n_components=2)
    with catch_warnings_ignore(Warning):
      if color_by_library:
        plot_scatter(x=v,
                     val=n,
                     ax=ax,
                     size=8,
                     legend_enable=False,
                     grid=False,
                     title=name)
      else:
        plot_scatter(x=v,
                     color=[labels[i] for i in y],
                     marker=[labels[i] for i in y],
                     ax=ax,
                     size=8,
                     legend_enable=True if idx == 0 else False,
                     grid=False,
                     title=name)
  with catch_warnings_ignore(Warning):
    plt.tight_layout()
  self.add_figure(
      'imputation_scatter_%s_%s' %
      ('lib' if color_by_library else 'cell', data_type), fig)
  return self._log('plot_imputation_scatter[%s] %s(s)' %
                   (data_type, ctext(time.time() - start_time, 'lightyellow')))
def main(model, ds1, ds2, batch_size, score_enable, plot_enable,
         override=False):
  print("Start evaluation:")
  print(f" - model : {model}")
  print(f" - dataset1 : {ds1}")
  print(f" - dataset2 : {ds2}")
  print(f" - batch_size: {batch_size}")
  print(f" - override : {override}")
  print(f" - plot:{plot_enable} score:{score_enable}")
  result_dir = SE.get_result_dir()
  if len(ds2) == 0:
    outpath = os.path.join(result_dir, f"{model}_{ds1}")
  else:
    outpath = os.path.join(result_dir, f"{model}_{ds1}_{ds2}")
  # overriding existing paths
  if override and os.path.exists(outpath):
    print(f"Override path '{outpath}'")
    shutil.rmtree(outpath)
  if not os.path.exists(outpath):
    os.makedirs(outpath)
  ### Load the model and dataset
  hash1, cfg1, m1 = SE.get_models(f"dataset.name={ds1} model.name={model}",
                                  load_models=True,
                                  return_hash=True)[0]
  test1: SingleCellOMIC = m1.test
  vae1: SingleCellModel = m1.model
  is_semi = vae1.is_semi_supervised
  if len(ds2) > 0:
    hash2, cfg2, m2 = SE.get_models(f"dataset.name={ds2} model.name={model}",
                                    load_models=True,
                                    return_hash=True)[0]
    test2: SingleCellOMIC = m2.test
    vae2: SingleCellModel = m2.model
  else:
    test2 = None
    vae2 = None
    cfg2 = None
    hash2 = None
  # Create the posterior
  kw = dict(batch_size=batch_size, verbose=True)
  # mapping from:
  if vae2 is None:
    posterior = Posterior(vae1, test1, name=f"{model}_{ds1}", **kw)
  else:
    posterior = Posterior(vae1, test2, name=f"{model}_{ds1}_{ds2}", **kw)
  ### running the evaluation
  train_ds = ds1
  test_ds = ds2
  with catch_warnings_ignore(UserWarning):
    # calculating the scores
    if score_enable:
      robust_run("evaluate_scoring",
                 f"model:{model} train:{train_ds} test:{test_ds}", scoring,
                 posterior, outpath, train_ds, test_ds)
    # plotting the figures
    if plot_enable:
      robust_run("evaluate_plotting",
                 f"model:{model} train:{train_ds} test:{test_ds}", plotting,
                 posterior, outpath, train_ds, test_ds)
def streamline_classifier(Z_train,
                          y_train,
                          Z_test,
                          y_test,
                          labels_name,
                          mode='ovr',
                          title='',
                          plot_train_results=False,
                          show_plot=True,
                          return_figure=False):
  r"""
  Arguments:
    fig : Figure or tuple (`float`, `float`), optional (default=`None`)
      width, height in inches

  Returns:
    (results_train, results_test), (fig_train, fig_test)
    results is a dictionary of scores
    {
      F1micro=f1_micro * 100,
      F1macro=f1_macro * 100,
      F1weight=f1_weight * 100,
      F1_[classname]=...
    }
  """
  mode = mode.strip().lower()
  assert mode in ('ovr', 'ovo'), \
      "Only support 'ovr' (one vs rest) and 'ovo' (one vs one) mode for " \
      "the streamline classifier"
  labels_name = [standardize_protein_name(i) for i in labels_name]

  results_train = {}
  results_test = {}
  labels_name = np.array(labels_name)

  with catch_warnings_ignore(FutureWarning):
    with catch_warnings_ignore(RuntimeWarning):
      n_classes = len(labels_name)
      # ====== preprocessing ====== #
      if y_train.ndim == 1 or y_train.shape[1] == 1:
        y_train = one_hot(y_train.ravel(), nb_classes=n_classes)
      if y_test.ndim == 1 or y_test.shape[1] == 1:
        y_test = one_hot(y_test.ravel(), nb_classes=n_classes)
      is_binary_classes = sorted(np.unique(
          y_train.astype('float32'))) == [0., 1.]
      # ====== not binary classes ====== #
      if not is_binary_classes:
        gmm = ProbabilisticEmbedding()
        gmm.fit(np.concatenate((y_train, y_test), axis=0))
        y_train = gmm.predict(y_train)
        y_test = gmm.predict(y_test)
      # kernel : 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
      if mode == 'ovr':
        classifier = OneVsRestClassifier(SVC(
            kernel='linear', random_state=UNIVERSAL_RANDOM_SEED),
                                         n_jobs=n_classes)
        classifier.fit(X=Z_train, y=y_train)
      else:
        # the ovo mode is currently disabled; the code below is unreachable
        raise NotImplementedError
        classifier = SVC(kernel='linear',
                         decision_function_shape='ovo',
                         random_state=UNIVERSAL_RANDOM_SEED)
        classifier.fit(X=Z_train, y=y_train)
      # ====== return ====== #
      from sklearn.exceptions import UndefinedMetricWarning
      with catch_warnings_ignore(UndefinedMetricWarning):
        results_train = plot_evaluate_classifier(
            y_pred=classifier.predict(Z_train),
            y_true=y_train,
            labels=labels_name,
            title='[train]' + title,
            show_plot=show_plot and plot_train_results,
            return_figure=True)
        results_test = plot_evaluate_classifier(
            y_pred=classifier.predict(Z_test),
            y_true=y_test,
            labels=labels_name,
            title='[test]' + title,
            show_plot=show_plot,
            return_figure=True)

      if show_plot:
        if plot_train_results:
          results_train, fig_train = results_train[0], results_train[1]
        else:
          fig_train = None
        results_test, fig_test = results_test[0], results_test[1]

      results_train = OrderedDict(
          sorted(results_train.items(), key=lambda x: x[0]))
      results_test = OrderedDict(
          sorted(results_test.items(), key=lambda x: x[0]))
      results = (results_train, results_test)
      if show_plot and return_figure:
        return results, (fig_train, fig_test)
      return results
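
# Usage sketch (hypothetical arrays): one-vs-rest linear SVC on train/test
# latent codes with protein labels, returning scores and figures:
# (res_train, res_test), (fig_train, fig_test) = streamline_classifier(
#     Z_train, y_train, Z_test, y_test,
#     labels_name=protein_names, mode='ovr',
#     title='latent', show_plot=True, return_figure=True)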
def test_clustering(self):
  ds = get_dataset('8kmy')
  with catch_warnings_ignore(EfficiencyWarning):
    ds.clustering(algo='kmeans')
    ds.clustering(algo='knn')
           linestyle=linestyles[i % len(linestyles)],
           label=dsname)
plt.legend()
plt.suptitle("[%s]Mean" % title)

V.plot_figure(nrow=6, ncol=20)
for i, dsname in enumerate(all_dataset):
  _, std = _map[dsname]
  plt.plot(std,
           linewidth=1.,
           linestyle=linestyles[i % len(linestyles)],
           label=dsname)
plt.legend()
plt.suptitle("[%s]StandardDeviation" % title)

with catch_warnings_ignore(RuntimeWarning), \
     catch_warnings_ignore(FutureWarning):
  data_map = {}
  stats_map = {}
  spk_map = {}
  for dsname, text, data, stats, spk_stats in mpi.MPI(jobs=all_dataset,
                                                      func=dataset_statistics,
                                                      ncpu=None,
                                                      batch=1):
    data_map[dsname] = data
    stats_map[dsname] = stats
    spk_map[dsname] = spk_stats
    print(text)

  for dsname in all_dataset:
    print("Plotting ...", ctext(dsname, 'cyan'))
    data = data_map[dsname]
    V.plot_figure(nrow=2, ncol=20)
    ax = plt.subplot(1, n_col, 1)
def plot_series(self,
                omic1=OMIC.transcriptomic,
                omic2=OMIC.proteomic,
                var_names1='auto',
                var_names2='auto',
                log1=True,
                log2=True,
                fontsize=10,
                title='',
                return_figure=False):
  r""" Plot lines of 2 OMICs sorted in ascending order of `omic1` """
  import seaborn as sns
  ## prepare
  omic1 = OMIC.parse(omic1)
  omic2 = OMIC.parse(omic2)
  omic1_ids = self.get_var_indices(omic1)
  omic2_ids = self.get_var_indices(omic2)
  if isinstance(var_names1, string_types) and var_names1 == 'auto':
    var_names1 = omic1.markers
  if isinstance(var_names2, string_types) and var_names2 == 'auto':
    var_names2 = omic2.markers
  ## filtering variables
  ids1 = []
  ids2 = []
  for v1, v2 in zip(var_names1, var_names2):
    i1 = omic1_ids.get(v1, None)
    i2 = omic2_ids.get(v2, None)
    if i1 is not None and i2 is not None:
      ids1.append(i1)
      ids2.append(i2)
  assert len(ids1) > 0, \
      (f"No variables found for omic1={omic1} var1={var_names1} "
       f"and omic2={omic2} var2={var_names2}")
  x1 = self.get_omic(omic1)[:, ids1]
  x2 = self.get_omic(omic2)[:, ids2]
  if log1:
    x1 = np.log1p(x1)
  if log2:
    x2 = np.log1p(x2)
  names1 = self.get_var_names(omic1)[ids1]
  names2 = self.get_var_names(omic2)[ids2]
  n_series = len(names1)
  ### prepare the plot
  colors = sns.color_palette(n_colors=2)
  fig = plt.figure(figsize=(12, n_series * 4))
  for idx in range(n_series):
    y1 = x1[:, idx]
    y2 = x2[:, idx]
    order = np.argsort(y1)
    ax = plt.subplot(n_series, 1, idx + 1)
    ## the first series
    ax.plot(y1[order],
            linewidth=1.8,
            color=colors[0],
            label=f"{omic1.name}-{names1[idx]}")
    ax.set_ylabel(f"{'log' if log1 else 'raw'}-{omic1.name}-{names1[idx]}",
                  color=colors[0])
    ax.set_xlabel(f"Cell in ascending order of {omic1.name}")
    ax.tick_params(axis='y', colors=colors[0], labelcolor=colors[0])
    ax.grid(False)
    ## the second series
    ax = ax.twinx()
    ax.plot(y2[order],
            linestyle='--',
            alpha=0.88,
            linewidth=1.2,
            color=colors[1])
    ax.set_ylabel(f"{'log' if log2 else 'raw'}-{omic2.name}-{names2[idx]}",
                  color=colors[1])
    ax.tick_params(axis='y', colors=colors[1], labelcolor=colors[1])
    ax.grid(False)
  ### finalize the figure style
  if len(title) > 0:
    plt.suptitle(title, fontsize=fontsize + 2)
  with catch_warnings_ignore(UserWarning):
    plt.tight_layout(rect=[0., 0.02, 1., 0.98])
  if return_figure:
    return fig
  return self.add_figure(f'series_{omic1.name}_{omic2.name}', fig)
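
# Usage sketch (hypothetical dataset): plot each marker gene against its
# matching protein, with cells sorted by the gene series:
# sco = get_dataset('8kmy')
# sco.plot_series(omic1=OMIC.transcriptomic, omic2=OMIC.proteomic,
#                 var_names1='auto', var_names2='auto', title='markers')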