def plot_divergence(self,
                     X=OMIC.transcriptomic,
                     omic=OMIC.proteomic,
                     algo='tsne',
                     n_pairs=18,
                     ncol=6):
     r""" Plot the least-correlated variable pairs of `omic` on a 2-D
     embedding of `X`.

     Every pair of columns of `omic` is scored by the average of the values
     returned by `spearmanr` and `pearsonr`. The `n_pairs` pairs with the
     smallest score are kept; for each pair both variables are min-max
     scaled and their difference is used as the intensity of one scatter
     subplot drawn on the embedding.

     Arguments:
       X : OMIC type (anything accepted by `OMIC.parse`) providing the
         coordinates, reduced to 2 components by `self.dimension_reduce`.
       omic : OMIC type whose variables are compared pairwise.
       algo : dimension reduction algorithm name, forwarded to
         `self.dimension_reduce`.
       n_pairs : number of pairs to plot.
       ncol : number of subplot columns.

     Returns:
       `self`; the figure is registered through `self.add_figure` under the
       name "divergence_<X name>_<omic name>_<algo>".
     """
     om1 = OMIC.parse(X)
     om2 = OMIC.parse(omic)
     ## prepare the coordinate
     X = self.dimension_reduce(om1, n_components=2, algo=algo)
     n_points = X.shape[0]
     ## prepare the value
     y = self.numpy(om2)
     varnames = self.get_var_names(om2)
     ## pair score: mean of the two correlation coefficients
     # NOTE(review): NaNs are omitted only for `spearmanr`; `pearsonr` is
     # called without any NaN handling -- confirm this asymmetry is intended.
     corr_fn = lambda m, n: (spearmanr(m, n, nan_policy='omit').correlation
                             + pearsonr(m, n)[0]) / 2
     ## create the correlation matrix (upper triangle only, i < j)
     corr_ids = []
     corr = []
     for i in range(y.shape[1]):
         for j in range(i + 1, y.shape[1]):
             corr_ids.append((i, j))
             corr.append(corr_fn(y[:, i], y[:, j]))
     ## sorting and select the smallest correlated pairs
     # ascending sort, so the first `n_pairs` entries are the lowest scores
     sort_ids = np.argsort(corr)[:int(n_pairs)]
     corr = np.array(corr)[sort_ids]
     corr_ids = np.array(corr_ids)[sort_ids]
     ## plotting
     # the grid is sized from the requested `n_pairs`, not from the number
     # of pairs actually available
     nrow = int(np.ceil((n_pairs / ncol)))
     fig = plt.figure(figsize=(ncol * 3, nrow * 3))
     for idx, ((i, j), c) in enumerate(zip(corr_ids, corr)):
         name1 = varnames[i]
         name2 = varnames[j]
         # min-max scale both variables to [0, 1] so the difference is
         # comparable across pairs
         # NOTE(review): divides by zero when a variable is constant.
         y1 = y[:, i]
         y1 = (y1 - np.min(y1)) / (np.max(y1) - np.min(y1))
         y2 = y[:, j]
         y2 = (y2 - np.min(y2)) / (np.max(y2) - np.min(y2))
         # positive where the first variable dominates, negative where the
         # second one does
         val = y1 - y2
         vs.plot_scatter(X,
                         color='bwr',
                         size=20 if n_points < 1000 else
                         (100000 / n_points),
                         val=val,
                         alpha=0.6,
                         cbar=True,
                         cbar_ticks=[name2, 'Others', name1],
                         cbar_horizontal=True,
                         fontsize=8,
                         ax=(nrow, ncol, idx + 1))
     ## adjust and save
     self.add_figure("divergence_%s_%s_%s" % (om1.name, om2.name, algo),
                     fig)
     return self
def _process_varnames(sco, input_omic, var_names):
    r""" Resolve `var_names` into a sorted list of variable names that exist
    in `input_omic`.

    Arguments:
      sco : the dataset; it must provide `_swap_omic`, `top_vars` and
        `var_names`.
      input_omic : anything accepted by `OMIC.parse`.
      var_names : selects the variables,
        - 'auto' : replaced by `input_omic.markers`
        - a number : the result of `sco.top_vars(n_vars=int(var_names))`
        - None : the predefined marker list of the OMIC type (all variables
          for OMIC types without such a list), restricted to the variables
          present in the dataset
        - a string or a list of strings : the given names, restricted to the
          variables present in the dataset

    Returns:
      a tuple `(input_omic, names)` with the parsed OMIC and the sorted list
      of selected variable names.

    Raises:
      AssertionError : if no variable is selected.
    """
    input_omic = OMIC.parse(input_omic)
    if isinstance(var_names, string_types) and var_names == 'auto':
        var_names = input_omic.markers
    original_varnames = var_names
    with sco._swap_omic(input_omic) as sco:
        # select top variables
        if isinstance(var_names, Number):
            var_names = sco.top_vars(n_vars=int(var_names))
        # provided markers
        elif var_names is None:
            if input_omic == OMIC.transcriptomic:
                markers = set(MARKER_GENES)
            elif input_omic == OMIC.proteomic:
                markers = set(MARKER_ADTS)
            elif input_omic == OMIC.atac:
                markers = set(MARKER_ATAC)
            else:  # just take all variables
                markers = set(sco.var_names)
            var_names = [i for i in sco.var_names if i in markers]
        # given list of specific var_names
        else:
            # build the lookup set once instead of once per candidate name
            available = set(sco.var_names)
            var_names = [
                i for i in as_tuple(var_names, t=string_types)
                if i in available
            ]
    # check all var names are exist
    assert len(var_names) > 0, \
      (f"Cannot find appropriate variables name for OMIC type {input_omic.name}"
       f" given var_names={original_varnames}")
    return input_omic, sorted(var_names)
示例#3
0
 def labels(self, omic=OMIC.proteomic):
     r""" Return the labels column stored in `obs` for the given OMIC.

     The members of the parsed OMIC are tried in order and the first one
     whose labels column exists in `obs` is returned.

     Raises:
       ValueError : if no member has a labels column in `obs`.
     """
     parsed = OMIC.parse(omic)
     candidates = (self.get_labels_name(member) for member in list(parsed))
     for column in candidates:
         if column in self.obs:
             return self.obs[column]
     message = "OMIC not found, give: '%s', support: '%s'" % (parsed,
                                                              self.omics)
     raise ValueError(message)
示例#4
0
    def stats(self, omic=None):
        r""" Return the statistics stored in `obsm` for the given OMIC.

        The entry is read from the key '<omic name>_stats'; it is expected
        to be a matrix of shape `[n_obs, 4]` whose columns are
        'total_counts', 'log_counts', 'local_mean', 'local_var'.

        Arguments:
          omic : OMIC type; the current OMIC is used when None.
        """
        selected = OMIC.parse(self._current_omic if omic is None else omic)
        key = selected.name + '_stats'
        return self.obsm[key]
示例#5
0
 def get_var(self, omic=None) -> pd.DataFrame:
     r""" Return the variable table stored in `uns` for the given OMIC.

     The members of the parsed OMIC are tried in order and the first entry
     found under the key '<member name>_var' is returned.

     Arguments:
       omic : OMIC type; the current OMIC is used when None.

     Raises:
       ValueError : if no member has a '_var' entry in `uns`.
     """
     target = OMIC.parse(self.current_omic if omic is None else omic)
     for member in list(target):
         key = member.name + '_var'
         if key in self.uns:
             return self.uns[key]
     raise ValueError("OMIC not found, give: '%s', support: '%s'" %
                      (target, self.omics))
def _validate_arguments(kw):
  r""" Check the plotting arguments in `kw` and build a figure title.

  The title joins, with '_', the name of `X`, the name of `group_by`
  ('none' when absent), the string form of `clustering`, 'rank' when
  `rank_genes` is truthy, and 'log' or 'raw' depending on `log`.

  Raises:
    AssertionError : if `rank_genes` is truthy and `X` is not transcriptomic.
  """
  X = OMIC.parse(kw.get('X'))
  group_by = kw.get('group_by')
  group_name = 'none' if group_by is None else OMIC.parse(group_by).name
  rank_genes = kw.get('rank_genes')
  if rank_genes:
    assert X == OMIC.transcriptomic, \
      f"Only visualize transcriptomic in case of rank_genes>0, but given: {X.name}"
  parts = [
      X.name,
      group_name,
      str(kw.get('clustering')),
      'rank' if rank_genes else '',
      'log' if kw.get('log') else 'raw',
  ]
  # empty fragments are dropped so the title has no doubled separators
  return '_'.join(part for part in parts if len(part) > 0)
 def plot_histogram(self,
                    omic=OMIC.proteomic,
                    bins=80,
                    log_norm=True,
                    var_names=None,
                    max_plots=100,
                    fig=None,
                    return_figure=False):
     r""" Plot one histogram of the non-zero values for each variable of the
     given OMIC type.

     Arguments:
       omic : OMIC type (anything accepted by `OMIC.parse`).
       bins : requested number of bins, capped at half the number of rows
         of the data.
       log_norm : if True, `log1p` is applied to the non-zero values before
         plotting.
       var_names : names of the variables to plot; all variables when None.
         Names that are not variables of `omic` are ignored.
       max_plots : maximum number of variables plotted; when exceeded a
         random subset is drawn with a fixed seed.
       fig : figure to draw on; a new one is created when None.
       return_figure : if True, return the figure instead of storing it.

     Returns:
       the figure when `return_figure` is True, otherwise the result of
       `self.add_figure`.

     Raises:
       AssertionError : if none of `var_names` is a variable of `omic`.
     """
     omic = OMIC.parse(omic)
     x = self.numpy(omic)
     # never use more bins than half the number of rows
     bins = min(int(bins), x.shape[0] // 2)
     max_plots = int(max_plots)
     ### prepare the data
     var_ids = self.get_var_indices(omic)
     if var_names is None:
         var_names = var_ids.keys()
     # keep only the names that exist for this OMIC
     var_names = np.array([i for i in var_names if i in var_ids])
     assert len(var_names) > 0, \
       f"No matching variables found for {omic.name}"
     # randomly select variables
     if len(var_names) > max_plots:
         # fixed seed: the same subset is selected on every call
         rand = np.random.RandomState(seed=1)
         ids = rand.permutation(len(var_names))[:max_plots]
         var_names = var_names[ids]
     # translate names to column indices and slice the data
     ids = [var_ids[i] for i in var_names]
     x = x[:, ids]
     ### the figures
     ncol = 8
     nrow = int(np.ceil(x.shape[1] / ncol))
     if fig is None:
         fig = vs.plot_figure(nrow=nrow * 2, ncol=ncol * 3, dpi=80)
     # plot
     for idx, (y, name) in enumerate(zip(x.T, var_names)):
         # measured on the full column, before the zeros are removed;
         # presumably a fraction in [0, 1] since it is shown as a percentage
         sparsity = sparsity_percentage(y, batch_size=2048)
         y = y[y != 0.]
         if log_norm:
             y = np.log1p(y)
         vs.plot_histogram(x=y,
                           bins=bins,
                           alpha=0.8,
                           ax=(nrow, ncol, idx + 1),
                           title=f"{name}\n({sparsity*100:.1f}% zeros)")
         # hide the y tick labels of the current axes
         fig.gca().tick_params(axis='y', labelleft=False)
     ### adjust and return
     fig.suptitle(f"{omic.name}")
     fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
     if return_figure:
         return fig
     return self.add_figure(f"histogram_{omic.name}", fig)
 def get_var_indices(self, omic=None) -> dict:
   r""" Return the mapping from variable name to its integer position
   (i.e. column index of the data matrix) for the given OMIC.

   The mapping is built from the index of `self.get_var(omic)` on first use
   and cached in `uns` under the key '<omic name>_var_indices'.

   Arguments:
     omic : OMIC type; the current OMIC is used when None.
   """
   omic = self._current_omic if omic is None else OMIC.parse(omic)
   cache_key = f"{omic.name}_var_indices"
   if cache_key in self.uns:
     return self.uns[cache_key]
   index = self.get_var(omic).index
   self.uns[cache_key] = dict(zip(index, range(len(index))))
   return self.uns[cache_key]
 def get_rv(self, omic, distribution=None) -> RandomVariable:
   r""" Build the `RandomVariable` describing the given OMIC type.

   Arguments:
     omic : OMIC type (anything accepted by `OMIC.parse`).
     distribution : name of the posterior distribution. When None a default
       is chosen: 'zinb' for transcriptomic and atac, 'nb' for proteomic,
       'onehot' for celltype, disease and progenitor.

   Raises:
     ValueError : if `distribution` is None and the OMIC has no default.
   """
   omic = OMIC.parse(omic)
   posterior = distribution
   if posterior is None:
     if omic == OMIC.transcriptomic or omic == OMIC.atac:
       posterior = 'zinb'
     elif omic == OMIC.proteomic:
       posterior = 'nb'
     elif (omic == OMIC.celltype or omic == OMIC.disease or
           omic == OMIC.progenitor):
       posterior = 'onehot'
     else:
       raise ValueError(f"No default distribution for OMIC {omic.name}")
   rv_kwargs = dict(event_shape=self.get_dim(omic),
                    posterior=posterior,
                    projection=True,
                    name=omic.name)
   return RandomVariable(**rv_kwargs)
示例#10
0
 def _calculate_statistics(self, omic=None):
     r""" Compute the per-row statistics of an OMIC and store them in `obsm`.

     The row sums and the three values returned by `get_library_size` are
     stacked horizontally and written to `obsm['<omic name>_stats']`.

     Arguments:
       omic : OMIC type; the current OMIC is used when None.
     """
     omic = self.current_omic if omic is None else OMIC.parse(omic)
     data = self.numpy(omic)
     if not sparse.issparse(data):
         total_counts = np.sum(data, axis=1, keepdims=True)
     else:
         total_counts = np.sum(data, axis=1)
         # make sure the row sums form a column
         if total_counts.ndim < 2:
             total_counts = np.expand_dims(total_counts, axis=-1)
     log_counts, local_mean, local_var = get_library_size(
         data, return_log_count=True)
     columns = [total_counts, log_counts, local_mean, local_var]
     self.obsm[omic.name + '_stats'] = np.hstack(columns)
示例#11
0
 def numpy(self, omic=None):
   r""" Return the observation array stored in `obs` or `obsm`.

   A column of `obs` whose name matches the request is looked up first;
   otherwise the members of the parsed OMIC are tried in order against
   `obsm`.

   Arguments:
     omic : OMIC type or column name; the current OMIC is used when None.

   Raises:
     ValueError : if nothing matches in `obs` or `obsm`.
   """
   requested = self._current_omic if omic is None else omic
   try:
     key = requested.name
   except AttributeError:
     key = str(requested)
   # a plain column of `obs` takes precedence
   if key in self.obs:
     values = self.obs[key].values
     return values.to_numpy() if hasattr(values, 'to_numpy') else values
   # otherwise the first member of the OMIC that is stored in `obsm`
   parsed = OMIC.parse(requested)
   for member in list(parsed):
     if member.name in self.obsm:
       return self.obsm[member.name]
   raise ValueError(f"OMIC not found, give: {parsed}, support: {self.omics}")
 def plot_percentile_histogram(self,
                               omic=OMIC.transcriptomic,
                               n_hist=10,
                               title="",
                               outlier=0.001,
                               non_zeros=False,
                               fig=None):
     r""" Split the value range of the data into `n_hist` equal-width
     intervals and plot one histogram per interval.

     Note that, despite the name, the interval edges are spaced linearly
     between the minimum and the maximum of the data; they are not
     percentiles.

     Arguments:
       omic : OMIC type (anything accepted by `OMIC.parse`).
       n_hist : number of intervals, i.e. number of histograms.
       title : figure title; nothing is set when empty.
       outlier : an interval holding less than this fraction of all the
         values is drawn in red, the others in blue.
       non_zeros : if True, zeros are removed before plotting.
       fig : figure to draw on; a new one is created when None.

     Returns:
       `self`; the figure is registered through `self.add_figure` under the
       name 'histogram<n_hist>_<omic name>'.
     """
     omic = OMIC.parse(omic)
     arr = self.numpy(omic)
     if non_zeros:
         arr = arr[arr != 0]
     n_percentiles = n_hist + 1
     n_col = 5
     n_row = int(np.ceil(n_hist / n_col))
     if fig is None:
         fig = vs.plot_figure(nrow=int(n_row * 1.5), ncol=20)
     self.assert_figure(fig)
     # equal-width interval edges between the smallest and largest value
     percentile = np.linspace(start=np.min(arr),
                              stop=np.max(arr),
                              num=n_percentiles)
     # The boolean mask below selects individual values, so the fraction
     # must be taken over the number of values; `len(arr)` would count rows
     # only and inflate the ratio for a 2-D matrix.
     n_samples = np.size(arr)
     for i, (p_min, p_max) in enumerate(zip(percentile, percentile[1:])):
         # both edges are inclusive
         min_mask = arr >= p_min
         max_mask = arr <= p_max
         mask = np.logical_and(min_mask, max_mask)
         a = arr[mask]
         _, bins = vs.plot_histogram(
             a,
             bins=120,
             ax=(n_row, n_col, i + 1),
             fontsize=8,
             color='red' if len(a) / n_samples < outlier else 'blue',
             title=f"{len(a)}(samples)  Range:[{p_min:.2g},{p_max:.2g}]")
         # 8 evenly spaced ticks over the range of the returned bin edges
         plt.gca().set_xticks(np.linspace(np.min(bins), np.max(bins),
                                          num=8))
     if len(title) > 0:
         plt.suptitle(title)
     plt.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
     self.add_figure(f'histogram{n_hist}_{omic.name}', fig)
     return self
示例#13
0
 def add_omic(self, omic: OMIC, X: np.ndarray, var_names=None):
     r""" Attach the data of a new OMIC type to this dataset.

     The matrix is stored in `obsm['<omic name>']`, its variable table in
     `uns['<omic name>_var']`, the OMIC is added to the set of available
     OMICs and its statistics are computed.

     Arguments:
       omic : OMIC type (anything accepted by `OMIC.parse`).
       X : data matrix with as many rows as the dataset.
       var_names : names of the columns of `X`. When None, names are
         generated from the OMIC name and the column index. Names of
         proteomic and celltype OMICs (including their 'i' variants) are
         passed through `standardize_protein_name`.

     Returns:
       `self`.

     Raises:
       AssertionError : if the number of rows of `X`, or the number of
         given names, does not match.
     """
     self._record('add_omic', locals())
     omic = OMIC.parse(omic)
     assert X.shape[0] == self.X.shape[0], \
       "Number of samples of new omic type mismatch, given: %s, require: %s" % \
         (str(X.shape), self.X.shape[0])
     self.obsm[omic.name] = X
     # variable names: generated when absent, validated otherwise
     if var_names is None:
         var_names = ['%s%d' % (omic.name, i) for i in range(X.shape[1])]
     else:
         var_names = np.array(var_names).ravel()
         assert len(var_names) == X.shape[1]
         protein_like = (OMIC.proteomic | OMIC.celltype | OMIC.iproteomic
                         | OMIC.icelltype)
         if omic in protein_like:
             var_names = standardize_protein_name(var_names)
     self.uns[omic.name + '_var'] = pd.DataFrame(index=var_names)
     # register the new OMIC and compute its statistics
     self._omics |= omic
     self._calculate_statistics(omic)
     return self
示例#14
0
 def _swap_omic(self, omic):
     r""" Temporarily make `omic` the main OMIC type of this dataset.

     This is a generator yielding `self` exactly once. It is consumed with a
     `with` statement elsewhere in this file, so it is presumably wrapped by
     a context-manager decorator that is not part of this block.

     While suspended at the `yield`, `_X`, `_var`, `_n_vars` and
     `_current_omic` describe `omic`. On resumption -- including when the
     caller's block raised -- they are restored to the previously current
     OMIC.

     Arguments:
       omic : OMIC type (anything accepted by `OMIC.parse`).
     """
     omic = OMIC.parse(omic)
     last_omic = self._current_omic
     # nothing to swap when the requested OMIC is already the current one
     if omic == last_omic:
         yield self
         return
     # look everything up before mutating, so a failed lookup leaves the
     # dataset untouched
     x = self.numpy(omic)
     var = self.get_var(omic)
     self._X = x
     self._var = var
     self._n_vars = self._X.shape[1]
     self._current_omic = omic
     try:
         yield self
     finally:
         # always switch back, otherwise an exception raised by the caller
         # would leave the dataset permanently swapped
         self._X = self.numpy(last_omic)
         self._var = self.get_var(last_omic)
         self._n_vars = self._X.shape[1]
         self._current_omic = last_omic
示例#15
0
 def set_omic(self, omic, X, recalculate_statistics=True):
     r""" Replace the stored data of an existing OMIC with `X`.

     Arguments:
       omic : OMIC type (anything accepted by `OMIC.parse`); it must already
         be available in this dataset.
       X : the new data, with the same shape as the stored one.
       recalculate_statistics : if True, the statistics of the OMIC are
         recomputed after the update.

     Returns:
       `self`.

     Raises:
       AssertionError : if the OMIC is not available or the shapes differ.
     """
     self._record('set_omic', locals())
     omic = OMIC.parse(omic)
     assert omic in self.omics, \
       (f"Cannot set value for omic='{omic}', "
        f"all available omics are: {self.omics}")
     assert X.shape == self.numpy(omic).shape, \
       (f"Dimensions mismatch, {omic} has dim={self.numpy(omic).shape} "
        f"but given: {X.shape}")
     # the very same object is already stored: nothing to update
     if X is self.get_omic(omic):
         print("SKIP!")
         return self
     key = f'{omic.name}'
     self.obsm[key] = X
     # keep the main matrix in sync when the current OMIC is the one updated
     if omic == self._current_omic:
         self._X = X
     if recalculate_statistics:
         self._calculate_statistics(omic)
     return self
示例#16
0
 def __init__(self,
              X: Union[np.ndarray, sparse.spmatrix],
              cell_id: Optional[List[str]] = None,
              gene_id: Optional[List[str]] = None,
              dtype: Optional[str] = None,
              omic: OMIC = OMIC.transcriptomic,
              name: Optional[str] = None,
              duplicated_var: bool = False,
              **kwargs):
     r""" Create the dataset from a raw matrix, from an existing `AnnData`
     or from a file.

     Three paths are distinguished:
       - `filename` is given in `kwargs`: `X` is ignored and loading is left
         to the parent constructor;
       - `X` is an `sc.AnnData`: the OMIC set, the history, the name and (if
         present) the current OMIC are taken from it;
       - otherwise a completely new dataset is built from the matrix `X`.

     Arguments:
       X : data matrix, or an `sc.AnnData` instance.
       cell_id : row names; 'Cell#<i>' is generated when None.
       gene_id : column names; 'Gene#<i>' is generated when None.
       dtype : data type; the dtype of `X` is used when None.
       omic : OMIC type of `X`.
       name : dataset name; "scOMICS" when None (new dataset only).
       duplicated_var : if False, every variable whose name occurs more than
         once is removed, together with its column of `X`.
       **kwargs : forwarded to the parent constructor.
     """
     omic = OMIC.parse(omic)
     # directly first time init from file
     if 'filename' in kwargs:
         X = None
         kwargs['dtype'] = dtype
     # init as view or copy of created SCO
     elif isinstance(X, sc.AnnData):
         self._omics = get_all_omics(X)
         self._history = IndexedList(X._history) if hasattr(X, '_history') else \
           IndexedList()
         name = X._name
         if hasattr(X, '_current_omic'):
             omic = X._current_omic
     # init as completely new dataset
     else:
         self._omics = omic
         self._history = IndexedList()
         if cell_id is None:
             cell_id = ['Cell#%d' % i for i in range(X.shape[0])]
         if gene_id is None:
             gene_id = ['Gene#%d' % i for i in range(X.shape[1])]
         if dtype is None:
             dtype = X.dtype
         if name is None:
             name = "scOMICS"
         if not duplicated_var:
             # drop every variable whose name is not unique (all of its
             # occurrences are removed, none is kept)
             gene_id = np.asarray(gene_id)
             u, c = np.unique(gene_id, return_counts=True)
             # boolean keep-mask; built with `np.isin` instead of a Python
             # loop and without the `np.bool` alias, which was removed from
             # NumPy
             ids = ~np.isin(gene_id, u[c > 1])
             gene_id = gene_id[ids]
             X = _check_array(X)[:, ids]
         kwargs['dtype'] = dtype
         kwargs['obs'] = pd.DataFrame(index=cell_id)
         kwargs['var'] = pd.DataFrame(index=gene_id)
         kwargs['asview'] = False
     # init
     super().__init__(X, **kwargs)
     self._name = str(name)
     self._verbose = False
     self._current_omic = omic
     # store given omic
     if omic.name + '_var' not in self.uns:
         self.uns[omic.name + '_var'] = self.var
     if not kwargs.get('asview', False):
         self.obsm[omic.name] = self._X
     # The class is created for first time
     if not isinstance(X, sc.AnnData):
         self.obs['indices'] = np.arange(self.X.shape[0], dtype='int64')
         self._calculate_statistics(omic)
示例#17
0
    def create_dataset(self,
                       omics: OMIC = None,
                       labels_percent=0,
                       batch_size=64,
                       drop_remainder=False,
                       shuffle=1000,
                       cache='',
                       framework='tensorflow',
                       seed=1) -> tf.data.Dataset:
        r""" Create dataset for training using one or multiple OMIC data

    Each element of the dataset is a dictionary with the keys `inputs` (the
    data of every OMIC), `library` (the concatenated output of
    `self.get_library_size` for every OMIC) and `mask`. `inputs` and
    `library` are single tensors when only one OMIC is given, tuples
    otherwise.

    Arguments:
      omics : `OMIC` or list of `OMIC`. Specify all the OMIC types will be
        included in the dataset. The current OMIC is used when None.
      labels_percent : a Scalar, clipped to [0., 1.]. If > 0, `mask` is drawn
        at random and is True with this probability; otherwise `mask` is
        False. Forced to 0 when a single OMIC is given.
      batch_size : an Integer, size of each batch.
      drop_remainder : a Boolean, forwarded to `Dataset.batch`.
      shuffle : an Integer, shuffle buffer size. No shuffling when None or
        not positive.
      cache : passed as `str(cache)` to `Dataset.cache`. No caching when None.
      framework : one of 'tf', 'pt', 'tensorflow', 'pytorch'. It is only
        validated here; the returned object is always a `tf.data.Dataset`.
      seed : an Integer, seed of the random generator used for `mask`.

    Raises:
      AssertionError : if `framework` is not one of the supported names.
    """
        if omics is None:
            omics = self.current_omic
        framework = str(framework).lower().strip()
        assert framework in ('tf', 'pt', 'tensorflow', 'pytorch'), \
          f"Only support tensorflow or pytorch framework, given: {framework}"
        # a single OMIC value is expanded into the list of its members
        if isinstance(omics, OMIC):
            omics = list(omics)
        omics = [OMIC.parse(o) for o in tf.nest.flatten(omics)]
        inputs = [self.get_omic(o) for o in omics]
        # library size
        library = []
        for o in omics:
            library.append(np.concatenate(self.get_library_size(o), axis=-1))
        # create the dataset
        # element layout: all the inputs first, then all the library arrays
        ds = [tf.data.Dataset.from_tensor_slices(i) for i in inputs] + \
          [tf.data.Dataset.from_tensor_slices(i) for i in library]
        if len(ds) > 0:
            ds = tf.data.Dataset.zip(tuple(ds))
        # for labels_percent
        labels_percent = np.clip(labels_percent, 0., 1.)
        if len(omics) == 1:
            labels_percent = 0.
        gen = tf.random.experimental.Generator.from_seed(seed=seed)

        def masking(*data):
            # `data` follows the layout above: len(omics) inputs followed by
            # the library arrays
            if labels_percent == 0.:
                mask = False
            else:
                mask = gen.uniform(shape=(1, )) < labels_percent
            inputs = data[:len(omics)]
            library = data[len(omics):]
            return dict(inputs=inputs[0] if len(inputs) == 1 else inputs,
                        library=library[0] if len(library) == 1 else library,
                        mask=mask)

        ds = ds.map(masking, tf.data.experimental.AUTOTUNE)
        # post processing
        if cache is not None:
            ds = ds.cache(str(cache))
        # shuffle must be called after cache
        if shuffle is not None and shuffle > 0:
            ds = ds.shuffle(int(shuffle))
        ds = ds.batch(batch_size, drop_remainder)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
        return ds
    def plot_scatter(self,
                     X=OMIC.transcriptomic,
                     color_by=OMIC.proteomic,
                     marker_by=None,
                     clustering='kmeans',
                     legend=True,
                     dimension_reduction='tsne',
                     max_scatter_points=5000,
                     ax=None,
                     fig=None,
                     title='',
                     return_figure=False):
        r""" Scatter plot of dimension using binarized protein labels

    Arguments:
      X : instance of OMIC.
        which OMIC data used for coordinates
      color_by : instance of OMIC.
        which OMIC data will be used for coloring the points
      marker_by : instance of OMIC.
        which OMIC data will be used for selecting the marker type
        (e.g. dot, square, triangle ...)
      clustering : {'kmeans', 'knn', 'pca', 'tsne', 'umap', 'louvain'}.
        Clustering algorithm, in case algorithm in ('pca', 'tsne', 'umap'),
        perform dimension reduction before clustering.
        Note: clustering is only applied in case of continuous data.
      legend : a Boolean. Enable the legend of the plot.
      dimension_reduction : {'tsne', 'umap', 'pca', None}.
        Dimension reduction algorithm. If None, just take the first 2
        dimension
      max_scatter_points : an Integer. If > 0, at most this many randomly
        selected points are plotted.
      ax : the axes to draw on, resolved together with `fig` by
        `vs.to_axis2D`.
      fig : the figure used for resolving `ax`.
      title : a String appended to the generated title.
      return_figure : a Boolean. If True, return the figure instead of
        storing it.

    Returns:
      the figure if `return_figure` is True, otherwise `self` after the
      figure has been registered via `self.add_figure`.
    """
        ax = vs.to_axis2D(ax, fig=fig)
        omic = OMIC.parse(X)
        omic_name = omic.name
        max_scatter_points = int(max_scatter_points)
        ## prepare data
        X = self.dimension_reduce(omic,
                                  n_components=2,
                                  algo=dimension_reduction)
        # both helpers return (None, None) when the corresponding OMIC is None
        color_name, colors = _process_omics(self,
                                            color_by,
                                            clustering=clustering,
                                            allow_none=True)
        marker_name, markers = _process_omics(self,
                                              marker_by,
                                              clustering=clustering,
                                              allow_none=True)
        ## downsampling
        if max_scatter_points > 0:
            # the same random subset is applied to coordinates, colors and
            # markers so they stay aligned
            ids = np.random.permutation(X.shape[0])[:max_scatter_points]
            X = X[ids]
            if colors is not None:
                colors = colors[ids]
            if markers is not None:
                markers = markers[ids]
        n_points = X.shape[0]
        ## ploting
        # default: every point in blue
        kw = dict(color='b')
        if colors is not None:
            if is_categorical_dtype(colors):  # categorical values
                kw['color'] = colors
            else:  # integral values
                kw['val'] = colors
                kw['color'] = 'bwr'
        name = '_'.join(str(i) for i in [omic_name, color_name, marker_name])
        title = f"[{dimension_reduction}-{name}]{title}"
        vs.plot_scatter(X,
                        marker='.' if markers is None else markers,
                        size=88 if n_points < 1000 else (120000 / n_points),
                        alpha=0.8,
                        legend_enable=bool(legend),
                        grid=False,
                        ax=ax,
                        title=title,
                        **kw)
        # the figure is taken from the axes, whatever `fig` was given
        fig = ax.get_figure()
        if return_figure:
            return fig
        self.add_figure(f"scatter_{name}_{str(dimension_reduction).lower()}",
                        fig)
        return self
示例#19
0
 def get_labels_name(self, omic=OMIC.proteomic):
     r""" Return the labels key of the given OMIC, i.e. its name followed
     by '_labels'. """
     return OMIC.parse(omic).name + '_labels'
def _process_omics(sco, omic, clustering=None, allow_none=False):
    r""" Return the name of the observation and the extracted observation

    The observation is resolved in this order:
      1. `omic` is the name of a column of `sco.obs`: that column is used;
      2. `omic` is one of the OMICs of `sco`:
         - if every total count equals 1 (one-hot data), the labels stored
           in `obs`, or otherwise derived from the position of the maximum
           of each row, are used;
         - else if `clustering` is a string, the OMIC is clustered (Louvain
           community detection when the name contains 'louvain');
         - else the labels obtained after `sco.probabilistic_embedding`.

    Arguments:
      sco : the dataset.
      omic : OMIC type, or the name of a column of `sco.obs`.
      clustering : name of the clustering algorithm, or None.
      allow_none : if True, `(None, None)` is returned when `omic` is None.

    Returns:
      a tuple `(name, x)` of the observation name and its values.

    Raises:
      ValueError : if `omic` is neither a column of `obs` nor an available
        OMIC.
    """
    if allow_none and (omic is None):
        return None, None
    if isinstance(omic, OMIC):
        omic = omic.name
    else:
        try:
            omic = OMIC.parse(omic).name
        except ValueError:
            # deliberate best effort: `omic` may be the name of a column of
            # `obs` rather than an OMIC
            pass
    omic = str(omic)
    x = None
    ## the omic provided already in observation
    if omic in sco.obs:
        x = sco.obs[omic].to_numpy()
    ## processing of multi-dimensional OMIC for labeling and clustering
    elif omic in sco.omics:
        # binary classes
        if np.all(sco.total_counts(omic).ravel() == 1.):
            label_name = f"{omic}_labels"
            if label_name in sco.obs:  # already stored labels
                x = sco.obs[label_name]
            else:  # one-hot encoded to labels vector
                labels = sco.get_var_names(omic)
                x = np.array([
                    labels[i] for i in np.argmax(sco.get_omic(omic), axis=-1)
                ])
                sco.obs[label_name] = x
            omic = label_name
        # Use Louvain community detection
        elif isinstance(clustering, string_types):
            clustering = clustering.lower().strip()
            if 'louvain' in clustering:  # community detection
                _, x = sco.louvain(omic)
                omic = omic + '_louvain'
            else:  # clustering
                n_clusters = None
                # for transcriptomic data, the first available protein or
                # celltype OMIC is passed as `n_clusters`
                if omic == 'transcriptomic':
                    for om in (OMIC.proteomic, OMIC.celltype, OMIC.iproteomic,
                               OMIC.icelltype):
                        if om in sco:
                            n_clusters = om
                            break
                omic = sco.clustering(omic,
                                      n_clusters=n_clusters,
                                      algo=clustering,
                                      return_key=True)
                x = sco.obs[omic].to_numpy()
        # probabilistic embedding
        else:
            _, prob, _ = sco.probabilistic_embedding(omic)
            try:
                x = sco.labels(omic).to_numpy()
                omic = sco.get_labels_name(omic)
            # `labels` reports a missing labels column with ValueError, so
            # it is caught together with KeyError; otherwise this fallback
            # could never run
            except (KeyError, ValueError):
                # no variable name, just use raw integer values
                x = np.argmax(prob, axis=1)
    ## Exception
    else:
        raise ValueError("No support for omic: '%s' and clustering: '%s'" %
                         (omic, str(clustering)))
    return omic, x
    def _plot_heatmap_matrix(self,
                             matrix,
                             figname,
                             omic1=OMIC.transcriptomic,
                             omic2=OMIC.proteomic,
                             var_names1=MARKER_ADT_GENE.values(),
                             var_names2=MARKER_ADT_GENE.keys(),
                             is_marker_pairs=True,
                             title='',
                             return_figure=False):
        r""" Plot a heatmap of `matrix`, whose rows are the variables of
        `omic1` and whose columns are the variables of `omic2`.

        Arguments:
          matrix : array of shape `(n_vars of omic1, n_vars of omic2)`.
          figname : a String, used in the title and in the stored figure
            name.
          omic1, omic2 : OMIC types of the rows (y-axis) and of the columns
            (x-axis).
          var_names1, var_names2 : names of the variables to keep. 'auto'
            means the `markers` of the OMIC; None keeps everything.
          is_marker_pairs : a Boolean. If True, `var_names1` and `var_names2`
            are matched element by element and only the pairs with both
            names available are kept. Forced to False if either list is None.
          title : a String appended to the generated title.
          return_figure : a Boolean. If True, return the figure instead of
            storing it.

        Returns:
          the figure if `return_figure` is True, otherwise `self` after the
          figure has been registered via `self.add_figure`.

        Raises:
          AssertionError : if the shape of `matrix` does not match the number
            of variables of the two OMICs.
        """
        omic1 = OMIC.parse(omic1)
        omic2 = OMIC.parse(omic2)
        if isinstance(var_names1, string_types) and var_names1 == 'auto':
            var_names1 = omic1.markers
        if isinstance(var_names2, string_types) and var_names2 == 'auto':
            var_names2 = omic2.markers
        # pairing needs both lists
        if var_names1 is None or var_names2 is None:
            is_marker_pairs = False
        names1 = self.get_var_names(omic1)
        names2 = self.get_var_names(omic2)
        # name -> row / column position in `matrix`
        om1_idx = {j: i for i, j in enumerate(names1)}
        om2_idx = {j: i for i, j in enumerate(names2)}
        assert matrix.shape == (len(names1), len(names2)), \
          (f"Given OMIC {omic1.name}({len(names1)} variables) and "
           f"OMIC {omic2.name}({len(names2)} variables) "
           f"mistmach matrix shape {matrix.shape}")
        ## filter the variables
        if is_marker_pairs:
            # keep a pair only if both of its names are available
            pairs = [(v1, v2) for v1, v2 in zip(var_names1, var_names2)
                     if v1 in om1_idx and v2 in om2_idx]
            var_names1 = [i for i, _ in pairs]
            var_names2 = [i for _, i in pairs]
        if var_names1 is not None:
            names1 = np.array([i for i in var_names1 if i in om1_idx])
            matrix = matrix[[om1_idx[i] for i in names1]]
        if var_names2 is not None:
            names2 = np.array([i for i in var_names2 if i in om2_idx])
            matrix = matrix[:, [om2_idx[i] for i in names2]]
        ## find the best diagonal match
        if is_marker_pairs:
            # the pairs are already aligned, keep the column order
            ids2 = list(range(len(names2)))
        else:
            # presumably a column order that puts the best matches on the
            # diagonal -- verify against `search.diagonal_linear_assignment`
            ids2 = search.diagonal_linear_assignment(matrix, nan_policy=0)
        matrix = matrix[:, ids2]
        # NOTE(review): when `var_names1` / `var_names2` is None, the names
        # come straight from `get_var_names`; the indexing and `.tolist()`
        # below assume that it returns a numpy array -- confirm.
        names2 = names2[ids2].tolist()
        names1 = names1.tolist()
        n1 = len(names1)
        n2 = len(names2)

        ## helper for marking the marker
        def _mark(ax):
            # write the largest value of every row on top of its cell
            # row is yaxis and col is xaxis
            for y, row in enumerate(matrix):
                # sort descending order
                order = np.argsort(row)[::-1]
                x = order[0]
                ax.text(x + 0.02,
                        y + 0.03,
                        s=f"{matrix[y, x]:.2f}",
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=32 / np.log1p(max(n1, n2)),
                        color='magenta',
                        alpha=0.8,
                        weight='regular')

        ## plotting
        styles = dict(cmap="bwr",
                      xticklabels=names2,
                      yticklabels=names1,
                      xlabel=omic2.name,
                      ylabel=omic1.name,
                      gridline=0.1,
                      fontsize=10,
                      cbar=True)
        # width grows with the number of columns (capped at 25); the height
        # follows the row/column ratio of the matrix
        width = min(25, matrix.shape[1] / 1.2)
        fig = plt.figure(figsize=(width,
                                  width * matrix.shape[0] / matrix.shape[1]))
        # the return value of `vs.plot_heatmap` is used as the axes to
        # annotate
        _mark(
            vs.plot_heatmap(
                matrix,
                **styles,
                ax=None,
                title=f"[{figname}_x:{omic2.name}_y:{omic1.name}]{title}"))
        with catch_warnings_ignore(UserWarning):
            fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
        ## store and return
        if return_figure:
            return fig
        self.add_figure(f"{figname.lower()}_{omic1.name}_{omic2.name}", fig)
        return self
    def plot_correlation_scatter(self,
                                 omic1=OMIC.transcriptomic,
                                 omic2=OMIC.proteomic,
                                 var_names1='auto',
                                 var_names2='auto',
                                 is_marker_pairs=True,
                                 log1=True,
                                 log2=True,
                                 max_scatter_points=200,
                                 top=3,
                                 bottom=3,
                                 title='',
                                 return_figure=False):
        r""" Mapping from omic1 to omic2

    Arguments:
      omic1, omic2 : instance of OMIC.
        With `omic1` represent the x-axis, and `omic2` represent the y-axis.
      var_names1 : list of all variable name for `omic1`
    """
        omic1 = OMIC.parse(omic1)
        omic2 = OMIC.parse(omic2)
        if isinstance(var_names1, string_types) and var_names1 == 'auto':
            var_names1 = omic1.markers
        if isinstance(var_names2, string_types) and var_names2 == 'auto':
            var_names2 = omic2.markers
        if var_names1 is None or var_names2 is None:
            is_marker_pairs = False
        max_scatter_points = int(max_scatter_points)
        # get all correlations
        corr = self.get_correlation(omic1, omic2)
        corr_map = {(x[0], x[1]): (0 if np.isnan(x[2]) else x[2],
                                   0 if np.isnan(x[3]) else x[3])
                    for x in corr}
        om1_names = self.get_var_names(omic1)
        om2_names = self.get_var_names(omic2)
        om1_idx = {j: i for i, j in enumerate(om1_names)}
        om2_idx = {j: i for i, j in enumerate(om2_names)}
        # extract the data and normalization
        X1 = self.numpy(omic1)
        library = np.sum(X1, axis=1, keepdims=True)
        library = discretizing(library, n_bins=10, strategy='quantile').ravel()
        if log1:
            s = np.sum(X1, axis=1, keepdims=True)
            X1 = np.log1p(X1 / s * np.median(s))
        X2 = self.numpy(omic2)
        if log2:
            s = np.sum(X2, axis=1, keepdims=True)
            X2 = np.log1p(X2 / s * np.median(s))
        ### getting the marker pairs
        all_pairs = []
        # coordinate marker pairs
        if is_marker_pairs:
            pairs = [(i1, i2) for i1, i2 in zip(var_names1, var_names2)
                     if i1 in om1_idx and i2 in om2_idx]
            var_names1 = [i for i, _ in pairs]
            var_names2 = [i for _, i in pairs]
        # filter omic2
        if var_names2 is not None:
            var_names2 = [i for i in var_names2 if i in om2_names]
        else:
            var_names2 = om2_names
        assert len(var_names2) > 0, \
          (f"None of the variables {var_names2} is contained in variable list "
           f"of OMIC {omic2.name}")
        nrow = len(var_names2)
        # filter omic1
        if var_names1 is not None:
            var_names1 = [i for i in var_names1 if i in om1_names]
            ncol = len(var_names1)
            assert len(var_names1) > 0, \
              (f"None of the variables {var_names1} is contained in variable list "
               f"of OMIC {omic1.name}")
            for name2 in var_names2:
                for name1 in var_names1:
                    all_pairs.append((om1_idx[name1], om2_idx[name2]))
        else:
            # top and bottom correlation pairs
            top = int(top)
            bottom = int(bottom)
            ncol = top + bottom
            # pick all top and bottom of omic1 coordinated to omic2
            for name in var_names2:
                i2 = om2_idx[name]
                pairs = sorted([[sum(corr_map[(i1, i2)]), i1]
                                for i1 in range(len(om1_names))])
                for _, i1 in pairs[-top:][::-1] + pairs[:bottom][::-1]:
                    all_pairs.append((i1, i2))
        ### downsampling scatter points
        if max_scatter_points > 0:
            ids = np.random.permutation(len(X1))[:max_scatter_points]
        else:
            ids = np.arange(len(X1), dtype=np.int32)
        ### plotting
        fig = plt.figure(figsize=(ncol * 2, nrow * 2 + 2), dpi=80)
        for i, pair in enumerate(all_pairs):
            ax = plt.subplot(nrow, ncol, i + 1)
            p, s = corr_map[pair]
            idx1, idx2 = pair
            x1 = X1[:, idx1]
            x2 = X2[:, idx2]
            crow = i // ncol
            ccol = i % ncol
            if is_marker_pairs:
                color = 'salmon' if crow == ccol else 'blue'
            else:
                color = 'salmon' if ccol < top else 'blue'
            vs.plot_scatter(x=x1[ids],
                            y=x2[ids],
                            color=color,
                            ax=ax,
                            size=library[ids],
                            size_range=(6, 30),
                            legend_enable=False,
                            linewidths=0.,
                            cbar=False,
                            alpha=0.3)
            # additional title for first column
            ax.set_title(f"{om1_names[idx1]}\n$p={p:.2g}$ $s={s:.2g}$",
                         fontsize=8)
            # beginning of every column
            if i % ncol == 0:
                ax.set_ylabel(f"{om2_names[idx2]}", fontsize=8, weight='bold')
        ## big title
        plt.suptitle(f"[x:{omic1.name}_y:{omic2.name}]{title}", fontsize=10)
        fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
        ### store and return
        if return_figure:
            return fig
        self.add_figure(
            f"corr_{omic1.name}{'log' if log1 else 'raw'}_"
            f"{omic2.name}{'log' if log2 else 'raw'}", fig)
        return self
 def plot_series(self,
                 omic1=OMIC.transcriptomic,
                 omic2=OMIC.proteomic,
                 var_names1='auto',
                 var_names2='auto',
                 log1=True,
                 log2=True,
                 fontsize=10,
                 title='',
                 return_figure=False):
     r""" Plot paired variables of 2 OMICs as line series, one subplot per
     pair, with the points sorted in ascending order of the `omic1` variable.

     The `omic1` series is drawn on the left y-axis (solid line) and the
     `omic2` series on a twin right y-axis (dashed line), both using the
     ordering obtained from the `omic1` values.

     Arguments:
       omic1, omic2 : OMIC types (anything accepted by `OMIC.parse`).
       var_names1, var_names2 : iterables of variable names, paired
         position-wise. The string 'auto' selects the `markers` of the
         corresponding OMIC. A pair is dropped when either name is not
         found among the variables of its OMIC.
       log1, log2 : if True, apply `np.log1p` to the values of `omic1`,
         `omic2` respectively.
       fontsize : base font size, the figure title uses `fontsize + 2`.
       title : figure title, no title is drawn when it is empty.
       return_figure : if True, return the figure instead of storing it.

     Returns:
       The created figure if `return_figure` is True, otherwise the value
       returned by `self.add_figure`.

     Raises:
       AssertionError : if no pair of variables is found in both OMICs.
     """
     import seaborn as sns
     ## prepare
     omic1 = OMIC.parse(omic1)
     omic2 = OMIC.parse(omic2)
     omic1_ids = self.get_var_indices(omic1)
     omic2_ids = self.get_var_indices(omic2)
     if isinstance(var_names1, string_types) and var_names1 == 'auto':
         var_names1 = omic1.markers
     if isinstance(var_names2, string_types) and var_names2 == 'auto':
         var_names2 = omic2.markers
     ## filtering variables, only keep the pairs available in both OMICs
     ids1 = []
     ids2 = []
     for v1, v2 in zip(var_names1, var_names2):
         i1 = omic1_ids.get(v1, None)
         i2 = omic2_ids.get(v2, None)
         if i1 is not None and i2 is not None:
             ids1.append(i1)
             ids2.append(i2)
     assert len(ids1) > 0, \
       (f"No variables found for omic1={omic1} var1={var_names1} "
        f"and omic2={omic2} var2={var_names2}")
     x1 = self.get_omic(omic1)[:, ids1]
     x2 = self.get_omic(omic2)[:, ids2]
     if log1:
         x1 = np.log1p(x1)
     if log2:
         x2 = np.log1p(x2)
     names1 = self.get_var_names(omic1)[ids1]
     names2 = self.get_var_names(omic2)[ids2]
     n_series = len(names1)
     ### prepare the plot
     colors = sns.color_palette(n_colors=2)
     fig = plt.figure(figsize=(12, n_series * 4))
     for idx in range(n_series):
         y1 = x1[:, idx]
         y2 = x2[:, idx]
         # both series share the ordering induced by the first OMIC
         order = np.argsort(y1)
         ax = plt.subplot(n_series, 1, idx + 1)
         ## the first series
         ax.plot(y1[order],
                 linewidth=1.8,
                 color=colors[0],
                 label=f"{omic1.name}-{names1[idx]}")
         ax.set_ylabel(
             f"{'log' if log1 else 'raw'}-{omic1.name}-{names1[idx]}",
             color=colors[0])
         ax.set_xlabel(f"Cell in ascending order of {omic1.name}")
         ax.tick_params(axis='y', colors=colors[0], labelcolor=colors[0])
         ax.grid(False)
         ## the second series, drawn on a twin y-axis
         ax = ax.twinx()
         ax.plot(y2[order],
                 linestyle='--',
                 alpha=0.88,
                 linewidth=1.2,
                 color=colors[1])
         # the prefix must follow `log2`, the transformation applied to omic2
         ax.set_ylabel(
             f"{'log' if log2 else 'raw'}-{omic2.name}-{names2[idx]}",
             color=colors[1])
         ax.tick_params(axis='y', colors=colors[1], labelcolor=colors[1])
         ax.grid(False)
     ### finalize the figure style
     if title:
         plt.suptitle(title, fontsize=fontsize + 2)
     with catch_warnings_ignore(UserWarning):
         plt.tight_layout(rect=[0., 0.02, 1., 0.98])
     if return_figure:
         return fig
     return self.add_figure(f'series_{omic1.name}_{omic2.name}', fig)