def plot_histogram(
    self,
    histogram_bins: int = 120,
    original_factors: bool = True,
    return_figure: bool = False,
):
    """Draw one histogram for every factor column and every latent column.

    Arguments:
      histogram_bins : number of bins used by each histogram (cast to `int`).
      original_factors : a Boolean. If True the columns of
        `self.factors_original` are plotted, otherwise those of `self.factors`.
      return_figure : a Boolean. If True the figure is returned, otherwise it
        is handed to `self.add_figure` and that call's result is returned.
    """
    latents = self.dist_to_tensor(self.latents).numpy()
    if original_factors:
        factors = self.factors_original
    else:
        factors = self.factors
    # one entry per column: factors first, then latents
    columns = list(factors.T) + list(latents.T)
    titles = self.factor_names + self.latent_names
    # subplot grid: roughly square with one extra column
    n_columns = int(np.ceil(np.sqrt(len(columns)))) + 1
    n_rows = int(np.ceil(len(columns) / n_columns))
    fig = vs.plot_figure(nrow=12, ncol=20, dpi=100)
    for position, (values, title) in enumerate(zip(columns, titles), start=1):
        vs.plot_histogram(values,
                          ax=(n_rows, n_columns, position),
                          bins=int(histogram_bins),
                          title=title,
                          alpha=0.8,
                          color='blue',
                          fontsize=16)
    fig.tight_layout()
    if return_figure:
        return fig
    suffix = 'original' if original_factors else 'discretized'
    return self.add_figure(f"histogram_{suffix}", fig)
def plot_histogram(self, histogram_bins=120, original_factors=True):
    r""" Draw one histogram for every factor column and every code column.

    Arguments:
      histogram_bins : number of bins used by each histogram (cast to `int`).
      original_factors : a Boolean. If True the batches in
        `self.original_factors` are used, otherwise those in `self.factors`.

    Returns:
      `self`, after the figure has been stored with `self.add_figure`.
    """
    self.assert_sampled()
    from matplotlib import pyplot as plt
    # concatenate the stored batches along the first axis
    codes = np.concatenate(self.representations_mean, axis=0)
    factor_batches = (self.original_factors
                      if original_factors else self.factors)
    factors = np.concatenate(factor_batches, axis=0)
    # one entry per column: factors first, then codes
    columns = list(factors.T) + list(codes.T)
    titles = self.factors_name.tolist() + self.codes_name.tolist()
    # subplot grid: roughly square with one extra column
    n_columns = int(np.ceil(np.sqrt(len(columns)))) + 1
    n_rows = int(np.ceil(len(columns) / n_columns))
    fig = vs.plot_figure(nrow=18, ncol=25, dpi=80)
    for position, (values, title) in enumerate(zip(columns, titles), start=1):
        vs.plot_histogram(values,
                          ax=(n_rows, n_columns, position),
                          bins=int(histogram_bins),
                          title=title,
                          alpha=0.8,
                          color='blue',
                          fontsize=16)
    plt.tight_layout()
    suffix = "original" if original_factors else "discretized"
    self.add_figure("histogram_%s" % suffix, fig)
    return self
def plot_histogram(self,
                   omic=OMIC.proteomic,
                   bins=80,
                   log_norm=True,
                   var_names=None,
                   max_plots=100,
                   fig=None,
                   return_figure=False):
    r""" Plot histogram for each variable of given OMIC type

    Arguments:
      omic : the OMIC type, anything accepted by `OMIC.parse`.
      bins : requested number of bins. It is capped at half the number of
        rows of the data and never goes below one.
      log_norm : a Boolean. If True, `np.log1p` is applied to the non-zero
        values before plotting.
      var_names : optional iterable of variable names to plot. Names missing
        from `self.get_var_indices(omic)` are dropped; if None, all
        variables are used.
      max_plots : maximum number of variables plotted. When more are
        available, a subset is drawn with a fixed random seed.
      fig : optional existing figure; a new one is created when None.
      return_figure : a Boolean. If True the figure is returned, otherwise
        it is handed to `self.add_figure` and that call's result is returned.

    Raises:
      AssertionError : if none of the requested variables exist.
    """
    omic = OMIC.parse(omic)
    x = self.numpy(omic)
    # With fewer than two rows `x.shape[0] // 2` is 0, which is not a valid
    # bin count, so keep at least one bin.
    bins = max(1, min(int(bins), x.shape[0] // 2))
    max_plots = int(max_plots)
    ### prepare the data
    var_ids = self.get_var_indices(omic)
    if var_names is None:
        var_names = var_ids.keys()
    var_names = np.array([i for i in var_names if i in var_ids])
    assert len(var_names) > 0, \
      f"No matching variables found for {omic.name}"
    # randomly select variables (fixed seed keeps the selection reproducible)
    if len(var_names) > max_plots:
        rand = np.random.RandomState(seed=1)
        ids = rand.permutation(len(var_names))[:max_plots]
        var_names = var_names[ids]
    ids = [var_ids[i] for i in var_names]
    x = x[:, ids]
    ### the figures
    ncol = 8
    nrow = int(np.ceil(x.shape[1] / ncol))
    if fig is None:
        fig = vs.plot_figure(nrow=nrow * 2, ncol=ncol * 3, dpi=80)
    # plot
    for idx, (y, name) in enumerate(zip(x.T, var_names)):
        # sparsity is measured on the full column, before zeros are removed
        sparsity = sparsity_percentage(y, batch_size=2048)
        y = y[y != 0.]
        if log_norm:
            y = np.log1p(y)
        vs.plot_histogram(x=y,
                          bins=bins,
                          alpha=0.8,
                          ax=(nrow, ncol, idx + 1),
                          title=f"{name}\n({sparsity*100:.1f}% zeros)")
        fig.gca().tick_params(axis='y', labelleft=False)
    ### adjust and return
    fig.suptitle(f"{omic.name}")
    fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
    if return_figure:
        return fig
    return self.add_figure(f"histogram_{omic.name}", fig)
def _draw_hist(x, ax, title, n_bins, show_yticks=True):
    """Plot a histogram of `x` on `ax` and set the tick marks.

    The x-axis is limited to the range of `x` and gets 5 evenly spaced ticks.
    The y-axis gets 5 evenly spaced integer ticks over the range of the bin
    counts, or no ticks at all when `show_yticks` is false.

    Returns:
      the `(count, bins)` pair returned by `plot_histogram`.
    """
    count, bins = plot_histogram(x=x,
                                 bins=n_bins,
                                 ax=ax,
                                 normalize=False,
                                 kde=False,
                                 range_0_1=False,
                                 covariance_factor=0.25,
                                 centerlize=False,
                                 fontsize=8,
                                 title=title)
    lower, upper = np.min(x), np.max(x)
    plt.xlim((lower, upper))
    x_ticks = np.linspace(start=lower, stop=upper, num=5, dtype='float32')
    plt.xticks(x_ticks, fontsize=6)
    if not show_yticks:
        plt.yticks([], [])
    else:
        y_ticks = np.linspace(start=np.min(count),
                              stop=np.max(count),
                              num=5,
                              dtype='int32')
        plt.yticks(y_ticks, fontsize=5)
    return count, bins
def plot_hist(hist, ax, name):
    """Plot a histogram of `hist` on `ax`, titled `name`, with 8 ticks per axis.

    `nbins` and `fontsize` are not parameters; they are looked up in the
    surrounding scope.

    The x ticks span the range of the bin edges and the y ticks span the range
    of the bin counts, both as evenly spaced `int32` values.
    """
    # Plot the data that was passed in. The previous body ignored `hist` and
    # always drew the outer variable `true`, so every call produced the same
    # histogram regardless of its argument.
    count, bins = plot_histogram(hist,
                                 bins=nbins,
                                 ax=ax,
                                 title=name,
                                 fontsize=fontsize)
    plt.xlim((np.min(bins), np.max(bins)))
    plt.xticks(np.linspace(start=np.min(bins),
                           stop=np.max(bins),
                           num=8,
                           dtype='int32'),
               fontsize=6)
    plt.yticks(np.linspace(start=np.min(count),
                           stop=np.max(count),
                           num=8,
                           dtype='int32'),
               fontsize=6)
def plot_percentile_histogram(self,
                              omic=OMIC.transcriptomic,
                              n_hist=10,
                              title="",
                              outlier=0.001,
                              non_zeros=False,
                              fig=None):
    r""" Split the data into `n_hist` value ranges and plot one histogram
    for each range.

    The ranges are `n_hist` equal-width intervals between the minimum and the
    maximum of the data (`np.linspace`). Both ends of an interval are
    inclusive, so a value lying exactly on a boundary is counted in both
    neighbouring intervals.

    Arguments:
      omic : the OMIC type, anything accepted by `OMIC.parse`.
      n_hist : number of ranges, i.e. number of histograms.
      title : optional figure title, only set when non-empty.
      outlier : a range holding less than this fraction of all values is
        drawn in red instead of blue.
      non_zeros : a Boolean. If True, zero entries are removed first.
      fig : optional existing figure; a new one is created when None.

    Returns:
      `self`, after the figure has been stored with `self.add_figure`.
    """
    omic = OMIC.parse(omic)
    arr = self.numpy(omic)
    if non_zeros:
        arr = arr[arr != 0]
    n_percentiles = n_hist + 1
    n_col = 5
    n_row = int(np.ceil(n_hist / n_col))
    if fig is None:
        fig = vs.plot_figure(nrow=int(n_row * 1.5), ncol=20)
    self.assert_figure(fig)
    percentile = np.linspace(start=np.min(arr),
                             stop=np.max(arr),
                             num=n_percentiles)
    # `arr[mask]` below is a flat selection of elements, so the reference
    # total must be the element count; `len(arr)` would only count rows when
    # the array has more than one dimension.
    n_samples = arr.size
    for i, (p_min, p_max) in enumerate(zip(percentile, percentile[1:])):
        mask = np.logical_and(arr >= p_min, arr <= p_max)
        a = arr[mask]
        _, bins = vs.plot_histogram(
            a,
            bins=120,
            ax=(n_row, n_col, i + 1),
            fontsize=8,
            color='red' if len(a) / n_samples < outlier else 'blue',
            title=f"{len(a)}(samples) Range:[{p_min:.2g},{p_max:.2g}]")
        plt.gca().set_xticks(np.linspace(np.min(bins), np.max(bins), num=8))
    if len(title) > 0:
        plt.suptitle(title)
    plt.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
    self.add_figure(f'histogram{n_hist}_{omic.name}', fig)
    return self
def plot_histogram(series, ax, title):
    """Plot `clipping_quartile(series)` as a histogram on `ax`.

    The bin count `n_bin` is not a parameter; it is looked up in the
    surrounding scope.
    """
    clipped = clipping_quartile(series)
    V.plot_histogram(x=clipped, bins=n_bin, ax=ax, title=title, fontsize=4)
def plot_disentanglement(
    self,
    factor_indices: Optional[Union[int, str, List[Union[int, str]]]] = None,
    n_bins_factors: int = 15,
    n_bins_codes: int = 80,
    corr_type: Union[Literal['spearman', 'pearson', 'lasso', 'average', 'mi'],
                     ndarray] = 'average',
    original_factors: bool = True,
    show_all_codes: bool = False,
    sort_pairs: bool = True,
    title: str = '',
    return_figure: bool = False,
    seed: int = 1,
):
    r""" To illustrate the disentanglement of the codes, the codes' histogram
    bars are colored by the value of factors.

    Each row of the figure belongs to one factor: the first plot is the
    histogram of the factor itself, the remaining plots are histograms of the
    selected latents colored by that factor.

    Arguments:
      factor_indices : an Integer, a String, or a list of them. Index or name
        of the factors used for visualization; if None, all factors are used.
      n_bins_factors : an Integer, passed to `vs.plot_histogram` as
        `bins_color`.
      n_bins_codes : an Integer, passed to `vs.plot_histogram` as `bins`.
      corr_type : {'spearman', 'pearson', 'lasso', 'average', 'mi', matrix}
        Type of correlation, with special case 'mi' for mutual information.
          - If a String other than 'mi', it is forwarded as `method` to
            `self.correlation_matrix`.
          - If an array, it must have shape `[n_factors, n_latents]`; an
            array whose first dimension equals `n_latents` is transposed
            (only when `n_latents != n_factors`).
      original_factors : a Boolean. If True use `self.factors_original`,
        otherwise `self.factors`.
      show_all_codes : a Boolean. If False, only the first
        `len(factor_indices)` latents (after the optional sorting) are shown,
        otherwise all latents are shown for each factor.
      sort_pairs : a Boolean. If True, latents are ordered by
        `diagonal_linear_assignment` on the absolute scores, otherwise the
        original latent order is kept.
      title : a String appended to the figure title.
      return_figure : a Boolean. If True the figure is returned, otherwise it
        is handed to `self.add_figure` and that call's result is returned.
      seed : an Integer, forwarded to the correlation / mutual information
        computation.

    Raises:
      ValueError : if `corr_type` is neither a String nor an array.
      AssertionError : if a given matrix has the wrong shape.
    """
    ### prepare styled plot
    styles = dict(fontsize=12,
                  cbar_horizontal=False,
                  bins_color=int(n_bins_factors),
                  bins=int(n_bins_codes),
                  color='bwr',
                  alpha=0.8)
    # get all relevant factors
    if factor_indices is None:
        factor_indices = list(range(self.n_factors))
    factor_indices = [
        int(i) if isinstance(i, Number) else self.factor_names.index(i)
        for i in as_tuple(factor_indices)
    ]
    ### correlation
    if isinstance(corr_type, string_types):
        if corr_type == 'mi':
            corr = self.mutualinfo_matrix(
                convert_to_tensor=self.dist_to_tensor, seed=seed)
            score_type = 'mutual-info'
        else:
            corr = self.correlation_matrix(
                convert_to_tensor=self.dist_to_tensor,
                method=corr_type,
                seed=seed)
            score_type = corr_type
        # [n_factors, n_codes]
        corr = corr.T[factor_indices]
    ### directly give the correlation matrix
    elif isinstance(corr_type, ndarray):
        corr = corr_type
        if self.n_latents != self.n_factors and \
            corr.shape[0] == self.n_latents:
            corr = corr.T
        # the message reports the same attribute the check uses
        assert corr.shape == (self.n_factors, self.n_latents), \
          (f"Correlation matrix expect shape (n_factors={self.n_factors}, "
           f"n_codes={self.n_latents}) but given shape: {corr.shape}")
        score_type = 'score'
        corr = corr[factor_indices]
    ### exception
    else:
        # None is not handled by any branch above, so it is not advertised
        raise ValueError(
            f"corr_type could be string or a matrix but given: {type(corr_type)}"
        )
    ### sorting the latents
    if sort_pairs:
        latent_indices = diagonal_linear_assignment(np.abs(corr),
                                                    nan_policy=0)
    else:
        latent_indices = np.arange(self.n_latents, dtype=np.int32)
    if not show_all_codes:
        latent_indices = latent_indices[:len(factor_indices)]
    corr = corr[:, latent_indices]
    ### prepare the data
    # factors
    F = (self.factors_original
         if original_factors else self.factors)[:, factor_indices]
    factor_names = np.asarray(self.factor_names)[factor_indices]
    # codes
    Z = self.dist_to_tensor(self.latents).numpy()[:, latent_indices]
    latent_names = np.asarray(self.latent_names)[latent_indices]
    ### create the figure
    nrow = F.shape[1]
    ncol = Z.shape[1] + 1
    fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 2.8, dpi=100)
    count = 1
    for fidx, (f, fname) in enumerate(zip(F.T, factor_names)):
        # the first plot show how the factor clustered
        ax, _, _ = vs.plot_histogram(x=f,
                                     color_val=f,
                                     ax=(nrow, ncol, count),
                                     cbar=False,
                                     title=f"{fname}",
                                     **styles)
        ax.tick_params(axis='y', labelleft=False)
        count += 1
        # the rest of the row show how the codes align with the factor
        for zidx, (score, z, zname) in enumerate(
                zip(corr[fidx], Z.T, latent_names)):
            # the pair on the diagonal is marked with a star and a bold title
            on_diagonal = fidx == zidx
            text = "*" if on_diagonal else ""
            ax, _, _ = vs.plot_histogram(
                x=z,
                color_val=f,
                ax=(nrow, ncol, count),
                cbar=False,
                title=f"{text}{fname}-{zname} (${score:.2f}$)",
                bold_title=on_diagonal,
                **styles)
            ax.tick_params(axis='y', labelleft=False)
            count += 1
    ### fine tune the plot
    fig.suptitle(f"[{score_type}]{title}", fontsize=12)
    fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
    if return_figure:
        return fig
    return self.add_figure(
        f"disentanglement_{'original' if original_factors else 'discretized'}",
        fig)
def plot_histogram_heatmap(self,
                           factors=None,
                           factor_bins=15,
                           histogram_bins=80,
                           n_codes_per_factor=6,
                           corr_method='average',
                           original_factors=True):
    r""" The histogram bars are colored by the value of factors

    Each row of the figure belongs to one factor: the first plot is the
    histogram of the factor itself, followed by the histograms of the selected
    codes colored by that factor. Every code title is prefixed by its score
    for the factor.

    Arguments:
      factors : which factors will be used, resolved by `self._check_factors`.
      factor_bins : an Integer, passed to `vs.plot_histogram` as `val_bins`.
      histogram_bins : an Integer, passed to `vs.plot_histogram` as `bins`.
      n_codes_per_factor : number of codes shown for each factor. If None, or
        not smaller than `self.n_codes`, all codes are shown in index order.
        Otherwise the codes with the highest scores are shown first (highest
        first), followed by the codes with the lowest scores.
      corr_method : forwarded as `method` to `self.cal_correlation_matrix`.
      original_factors : a Boolean. If True use `self.original_factors`
        (factors before discretized by `Criticizer`), otherwise
        `self.factors`.

    Returns:
      `self`, after the figure has been stored with `self.add_figure`.
    """
    self.assert_sampled()
    import seaborn as sns
    sns.set()
    if n_codes_per_factor is None:
        n_codes_per_factor = self.n_codes
    else:
        # Asking for more codes than exist would plot some codes twice, and a
        # negative count makes no sense, so clamp to [0, n_codes].
        n_codes_per_factor = max(0, min(int(n_codes_per_factor), self.n_codes))
    styles = dict(fontsize=12,
                  val_bins=int(factor_bins),
                  color='bwr',
                  bins=int(histogram_bins),
                  alpha=0.8)
    ## correlation
    train_corr, test_corr = self.cal_correlation_matrix(mean=True,
                                                        method=corr_method,
                                                        decode=False)
    corr = (train_corr + test_corr) / 2
    ## prepare the data
    factors = self._check_factors(factors)
    # Select the score columns exactly like the factor columns below, so that
    # column `fidx` of `corr` belongs to column `fidx` of `F` even when only a
    # subset of the factors is plotted.
    corr = corr[:, factors]
    Z = np.concatenate(self.representations_mean, axis=0)
    F = np.concatenate(
        self.original_factors if original_factors else self.factors,
        axis=0)[:, factors]
    # annotations
    factors_name = self.factors_name[factors]
    codes_name = self.codes_name
    # create the figure
    nrow = F.shape[1]
    ncol = int(1 + n_codes_per_factor)
    fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 3, dpi=80)
    n_bottom = n_codes_per_factor // 2
    n_top = n_codes_per_factor - n_bottom
    plot_count = 1
    for fidx, (f, fname) in enumerate(zip(F.T, factors_name)):
        c = corr[:, fidx]
        vs.plot_histogram(f,
                          val=f,
                          ax=(nrow, ncol, plot_count),
                          cbar=True,
                          cbar_horizontal=False,
                          title=fname,
                          **styles)
        plot_count += 1
        # all codes are visualized
        if n_codes_per_factor == self.n_codes:
            all_codes = range(self.n_codes)
        # lower to higher correlation
        else:
            zids = np.argsort(c)
            bottom = zids[:n_bottom]
            # an explicit start index avoids `zids[-0:]`, which would select
            # every code when no top code is requested
            top = zids[len(zids) - n_top:]
            all_codes = (top.tolist()[::-1] + bottom.tolist()[::-1])
        for i in all_codes:
            z = Z[:, i]
            zname = codes_name[i]
            vs.plot_histogram(z,
                              val=f,
                              ax=(nrow, ncol, plot_count),
                              title='[%.2g]%s' % (c[i], zname),
                              **styles)
            plot_count += 1
    fig.tight_layout()
    self.add_figure(
        "histogram_%s" % ("original" if original_factors else "discretized"),
        fig)
    return self
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' tf.random.set_seed(8) np.random.seed(8) sns.set() shape = (1024, 1) total_figures = 1 + 4 * 2 ncol = nrow = int(np.ceil(np.sqrt(total_figures))) hist_bins = 120 for dist, fn in [('uniform', np.random.rand), ('normal', np.random.randn)]: x = fn(*shape) vs.plot_figure(nrow=12, ncol=12, dpi=120) ax = vs.subplot(nrow, ncol, 1) ax = vs.plot_histogram(x, bins=hist_bins, title=dist, ax=ax) idx = 2 for strategy in ('gmm', 'uniform', 'quantile', 'kmeans'): for n_bins in (5, 10): y = discretizing(x, n_bins=n_bins, strategy=strategy) title = '%s-%d' % (strategy, n_bins) ax = vs.subplot(nrow, ncol, idx) vs.plot_histogram(y, bins=hist_bins, ax=ax, title=title) idx += 1 plt.tight_layout() # ====== special case: GMM discretizing ====== # vs.plot_figure() y, gmm = discretizing(x, n_bins=2, strategy='gmm', return_model=True) gmm = gmm[0] vs.plot_gaussian_mixture(x,
def plot_disentanglement(self,
                         factor_names=None,
                         n_bins_factors=15,
                         n_bins_codes=80,
                         corr_type='average',
                         original_factors=True,
                         show_all_codes=False,
                         title='',
                         return_figure=False):
    r""" To illustrate the disentanglement of the codes, the codes' histogram
    bars are colored by the value of factors.

    Each row of the figure belongs to one factor: the first plot is the
    histogram of the factor itself, the remaining plots are histograms of the
    selected codes colored by that factor.

    Arguments:
      factor_names : list of String or Integer.
        Name or index of which factors will be used for visualization,
        resolved by `self._check_factors`.
      n_bins_factors : an Integer, passed to `vs.plot_histogram` as
        `bins_color`.
      n_bins_codes : an Integer, passed to `vs.plot_histogram` as `bins`.
      corr_type : {'spearman', 'pearson', 'lasso', 'average', 'mi', None, matrix}
        Type of correlation, with special case 'mi' for mutual information.
          - If None, the 'spearman' scores are shown and the codes keep their
            original order (no sorting by correlation).
          - If an array, it must have shape `[n_factors, n_codes]`; an array
            whose first dimension equals `n_codes` is transposed (only when
            `n_codes != n_factors`).
      original_factors : a Boolean. If True use `self.original_factors`
        (factors before discretized by `Criticizer`), otherwise
        `self.factors`.
      show_all_codes : a Boolean. if False, only show most correlated
        codes-factors, otherwise, all codes are shown for each factor.
        This option only in effect when `corr_type` is not `None`.
      title : a String appended to the figure title.
      return_figure : a Boolean. If True the figure is returned, otherwise it
        is handed to `self.add_figure` and that call's result is returned.

    Raises:
      ValueError : if `corr_type` is not a String, None or an array.
      AssertionError : if a given matrix has the wrong shape.
    """
    self.assert_sampled()
    ### prepare styled plot
    from matplotlib import pyplot as plt
    import seaborn as sns
    sns.set()
    styles = dict(fontsize=12,
                  cbar_horizontal=False,
                  bins_color=int(n_bins_factors),
                  bins=int(n_bins_codes),
                  color='bwr',
                  alpha=0.8)
    # get all relevant factors
    factor_ids = self._check_factors(factor_names)
    ### correlation, every branch produces `corr` as [n_factors, n_codes]
    if isinstance(corr_type, string_types):
        if corr_type == 'mi':
            train_corr, test_corr = self.create_mutualinfo_matrix(mean=True)
            score_type = 'mutual-info'
        else:
            train_corr, test_corr = self.create_correlation_matrix(
                mean=True, method=corr_type)
            score_type = corr_type
        corr = ((train_corr + test_corr) / 2.).T
        sort_codes = True
    # directly give the correlation matrix
    elif isinstance(corr_type, np.ndarray):
        corr = corr_type
        if self.n_codes != self.n_factors and corr.shape[0] == self.n_codes:
            corr = corr.T
        assert corr.shape == (self.n_factors, self.n_codes), \
          (f"Correlation matrix expect shape (n_factors={self.n_factors}, "
           f"n_codes={self.n_codes}) but given shape: {corr.shape}")
        score_type = 'score'
        sort_codes = True
    # no correlation provided
    elif corr_type is None:
        train_corr, test_corr = self.create_correlation_matrix(
            mean=True, method='spearman')
        score_type = 'spearman'
        corr = ((train_corr + test_corr) / 2.).T
        sort_codes = False
    # exception
    else:
        raise ValueError(
            f"corr_type could be string, None or a matrix but given: {type(corr_type)}"
        )
    # Keep only the rows of the selected factors. This is done for every
    # branch (including `corr_type=None`) so that row `fidx` of `corr` always
    # belongs to column `fidx` of `F` in the plotting loop below.
    corr = corr[factor_ids]
    if sort_codes:
        code_ids = diagonal_linear_assignment(np.abs(corr), nan_policy=0)
        if not show_all_codes:
            code_ids = code_ids[:len(factor_ids)]
    else:
        code_ids = np.arange(self.n_codes, dtype=np.int32)
    # applying the indexing
    corr = corr[:, code_ids]
    ### prepare the data
    # factors
    F = np.concatenate(
        self.original_factors if original_factors else self.factors,
        axis=0,
    )[:, factor_ids]
    factor_names = self.factor_names[factor_ids]
    # codes
    Z = np.concatenate(self.representations_mean, axis=0)[:, code_ids]
    code_names = self.code_names[code_ids]
    ### create the figure
    nrow = F.shape[1]
    ncol = Z.shape[1] + 1
    fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 2.8, dpi=80)
    count = 1
    for fidx, (f, fname) in enumerate(zip(F.T, factor_names)):
        # the first plot show how the factor clustered
        vs.plot_histogram(x=f,
                          color_val=f,
                          ax=(nrow, ncol, count),
                          cbar=False,
                          title=f"{fname}",
                          **styles)
        plt.gca().tick_params(axis='y', labelleft=False)
        count += 1
        # the rest of the row show how the codes align with the factor
        for zidx, (score, z, zname) in enumerate(
                zip(corr[fidx], Z.T, code_names)):
            # the pair on the diagonal is marked with a star and a bold title
            on_diagonal = fidx == zidx
            text = "*" if on_diagonal else ""
            vs.plot_histogram(x=z,
                              color_val=f,
                              ax=(nrow, ncol, count),
                              cbar=False,
                              title=f"{text}{fname}-{zname} (${score:.2f}$)",
                              bold_title=on_diagonal,
                              **styles)
            plt.gca().tick_params(axis='y', labelleft=False)
            count += 1
    ### fine tune the plot
    fig.suptitle(f"[{score_type}]{title}", fontsize=12)
    fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
    if return_figure:
        return fig
    return self.add_figure(
        f"disentanglement_{'original' if original_factors else 'discretized'}",
        fig)