Example #1
  def correlation_matrix(
      self,
      method: Literal['spearman', 'pearson', 'lasso', 'average'] = 'spearman',
      sort_pairs: bool = False,
      seed: int = 1,
  ) -> ndarray:
    """Correlation matrix of `latent codes` (row) and `groundtruth factors`
    (column).

    Parameters
    ----------
    method : {'spearman', 'pearson', 'lasso', 'average'}
        method for calculating the correlation,
        'spearman' - rank or monotonic correlation
        'pearson' - linear correlation
        'lasso' - lasso regression
        'average' - compute all of the above methods and take the average,
        by default 'spearman'
    sort_pairs : bool, optional
        If True, reorder the rows of the correlation matrix for the best
        code-factor match (i.e. the largest diagonal sum).
        Note: the decoding is computed on the train matrix, then applied to the
        test matrix, by default False
    seed : int, optional
        random state seed, by default 1

    Returns
    -------
    ndarray
        correlation matrix of shape `[n_latents, n_factors]`, all entries are in `[0, 1]`.
    OrderedDict (optional)
        mapping from decoded factor index to latent code index.
    """
    corr_mat = correlation_matrix(x1=self.dist_to_tensor(self.latents),
                                  x2=self.factors,
                                  method=method,
                                  seed=seed)
    ## decoding and return
    if sort_pairs:
      ids = diagonal_linear_assignment(corr_mat)
      corr_mat = corr_mat[ids, :]
      return corr_mat, OrderedDict(zip(range(self.n_factors), ids))
    return corr_mat
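The `sort_pairs` branch relies on `diagonal_linear_assignment`, whose implementation is not shown here. A minimal standalone sketch of the same row-matching idea, using `scipy.optimize.linear_sum_assignment` as a stand-in (the helper name and the handling of unmatched latents are assumptions):

import numpy as np
from scipy.optimize import linear_sum_assignment

def match_codes_to_factors(corr_mat: np.ndarray) -> np.ndarray:
  """Row indices that push the largest |correlations| onto the diagonal
  of a `[n_latents, n_factors]` matrix (n_latents >= n_factors)."""
  n_latents, n_factors = corr_mat.shape
  # the assignment solver minimizes cost, so negate the absolute correlations
  rows, cols = linear_sum_assignment(-np.abs(corr_mat))
  matched = rows[np.argsort(cols)]  # latent assigned to factor 0, 1, 2, ...
  unmatched = [i for i in range(n_latents) if i not in set(matched.tolist())]
  return np.concatenate([matched, unmatched]).astype(int)

corr = np.random.RandomState(1).rand(8, 4)  # toy [n_latents, n_factors]
ids = match_codes_to_factors(corr)
sorted_corr = corr[ids, :]                  # largest diagonal sum in the top block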
Example #2
    def correlation_matrix(
        self,
        method: Literal['spearman', 'pearson', 'lasso',
                        'average'] = 'spearman',
        sort_pairs: bool = False,
        seed: int = 1,
    ) -> ndarray:
        """Correlation matrix of `latent codes` (row) and `groundtruth factors`
    (column).

    Parameters
    ----------
    method : {'spearman', 'pearson', 'lasso', 'average'}
        method for calculating the correlation,
        'spearman' - rank or monotonic correlation
        'pearson' - linear correlation
        'lasso' - lasso regression
        'average' - compute all of the above methods and take the average,
        by default 'spearman'
    sort_pairs : bool, optional
        If True, reorder the rows of the correlation matrix for the best
        code-factor match (i.e. the largest diagonal sum).
        Note: the decoding is computed on the train matrix, then applied to the
        test matrix, by default False
    seed : int, optional
        random state seed, by default 1

    Returns
    -------
    ndarray
        correlation matrix of shape `[n_latents, n_factors]`, all entries are in `[0, 1]`.
    OrderedDict (optional)
        mapping from decoded factor index to latent code index.
    """
        method = str(method).strip().lower()
        all_corr = ['spearman', 'lasso', 'pearson', 'average']
        assert method in all_corr, \
          f"Support {all_corr} correlation but given method='{method}'"
        ### average mode
        if method == 'average':
            corr_mat = sum(
                self.correlation_matrix(method=corr,
                                        sort_pairs=False,
                                        seed=seed)
                for corr in ['spearman', 'pearson', 'lasso']) / 3
        ### specific mode
        else:
            # start from the correlation matrix
            z = self.dist_to_tensor(self.latents).numpy()
            f = self.factors
            # lasso
            if method == 'lasso':
                from sklearn.linear_model import Lasso
                model = Lasso(random_state=seed, alpha=0.1)
                model.fit(z, f)
                # coef_ is [n_targets, n_features], so we need a transpose here
                corr_mat = np.transpose(np.absolute(model.coef_))
            # spearman and pearson
            else:
                corr_mat = np.empty(shape=(self.n_latents, self.n_factors),
                                    dtype=np.float64)
                for code in range(self.n_latents):
                    for fact in range(self.n_factors):
                        x, y = z[:, code], f[:, fact]
                        if method == 'spearman':
                            corr = sp.stats.spearmanr(x, y,
                                                      nan_policy="omit")[0]
                        elif method == 'pearson':
                            corr = sp.stats.pearsonr(x, y)[0]
                        corr_mat[code, fact] = corr
        ## decoding and return
        if sort_pairs:
            ids = diagonal_linear_assignment(corr_mat)
            corr_mat = corr_mat[ids, :]
            return corr_mat, OrderedDict(zip(range(self.n_factors), ids))
        return corr_mat
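A quick shape check for the lasso branch above, on hypothetical toy data: scikit-learn stores `Lasso.coef_` as `[n_targets, n_features]` for a multi-output target, which is why the code transposes it into `[n_latents, n_factors]`.

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(1)
z = rng.randn(256, 10)                     # latent codes [n_samples, n_latents]
f = rng.randn(256, 4)                      # factors      [n_samples, n_factors]
model = Lasso(alpha=0.1, random_state=1).fit(z, f)
corr_mat = np.abs(model.coef_).T           # -> [n_latents, n_factors]
assert corr_mat.shape == (10, 4)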
Example #3
    def plot_disentanglement(
        self,
        factor_indices: Optional[Union[int, str, List[Union[int,
                                                            str]]]] = None,
        n_bins_factors: int = 15,
        n_bins_codes: int = 80,
        corr_type: Union[Literal['spearman', 'pearson', 'lasso', 'average',
                                 'mi'], ndarray] = 'average',
        original_factors: bool = True,
        show_all_codes: bool = False,
        sort_pairs: bool = True,
        title: str = '',
        return_figure: bool = False,
        seed: int = 1,
    ):
        r""" To illustrate the disentanglement of the codes, the codes' histogram
    bars are colored by the value of factors.

    Arguments:
      factor_indices : list of String or Integer.
        Name or index of the factors used for visualization.
      n_bins_factors : each factor is discretized into this number of bins, then a
        LogisticRegression model will predict the bin (with color) given the code
        as input.
      corr_type : {'spearman', 'pearson', 'lasso', 'average', 'mi', matrix}
        Type of correlation, with special case 'mi' for mutual information.
          - If an array, the array must have shape `[n_codes, n_factors]`
      show_all_codes : a Boolean.
        If False, only show the most correlated code-factor pairs; otherwise,
        all codes are shown for each factor.
      original_factors : a Boolean. If True, use the original factors before
        they were discretized by `Criticizer`.
    """
        ### prepare styled plot
        styles = dict(fontsize=12,
                      cbar_horizontal=False,
                      bins_color=int(n_bins_factors),
                      bins=int(n_bins_codes),
                      color='bwr',
                      alpha=0.8)
        # get all relevant factors
        if factor_indices is None:
            factor_indices = list(range(self.n_factors))
        factor_indices = [
            int(i) if isinstance(i, Number) else self.factor_names.index(i)
            for i in as_tuple(factor_indices)
        ]
        ### correlation
        if isinstance(corr_type, string_types):
            if corr_type == 'mi':
                corr = self.mutualinfo_matrix(
                    convert_to_tensor=self.dist_to_tensor, seed=seed)
                score_type = 'mutual-info'
            else:
                corr = self.correlation_matrix(method=corr_type, seed=seed)
                score_type = corr_type
            # [n_factors, n_codes]
            corr = corr.T[factor_indices]
        ### directly give the correlation matrix
        elif isinstance(corr_type, ndarray):
            corr = corr_type
            if self.n_latents != self.n_factors and corr.shape[0] == self.n_latents:
                corr = corr.T
            assert corr.shape == (self.n_factors, self.n_latents), \
              (f"Correlation matrix expects shape (n_factors={self.n_factors}, "
               f"n_latents={self.n_latents}) but given shape: {corr.shape}")
            score_type = 'score'
            corr = corr[factor_indices]
        ### exception
        else:
            raise ValueError(
                f"corr_type must be a string or an ndarray, but given: {type(corr_type)}"
            )
        ### sorting the latents
        if sort_pairs:
            latent_indices = diagonal_linear_assignment(np.abs(corr),
                                                        nan_policy=0)
        else:
            latent_indices = np.arange(self.n_latents, dtype=np.int32)
        if not show_all_codes:
            latent_indices = latent_indices[:len(factor_indices)]
        corr = corr[:, latent_indices]
        ### prepare the data
        # factors
        F = (self.factors_original
             if original_factors else self.factors)[:, factor_indices]
        factor_names = np.asarray(self.factor_names)[factor_indices]
        # codes
        Z = self.dist_to_tensor(self.latents).numpy()[:, latent_indices]
        latent_names = np.asarray(self.latent_names)[latent_indices]
        ### create the figure
        nrow = F.shape[1]
        ncol = Z.shape[1] + 1
        fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 2.8, dpi=100)
        count = 1
        for fidx, (f, fname) in enumerate(zip(F.T, factor_names)):
            # the first plot show how the factor clustered
            ax, _, _ = vs.plot_histogram(x=f,
                                         color_val=f,
                                         ax=(nrow, ncol, count),
                                         cbar=False,
                                         title=f"{fname}",
                                         **styles)
            ax.tick_params(axis='y', labelleft=False)
            count += 1
            # the rest of the row show how the codes align with the factor
            for zidx, (score, z,
                       zname) in enumerate(zip(corr[fidx], Z.T, latent_names)):
                text = "*" if fidx == zidx else ""
                ax, _, _ = vs.plot_histogram(
                    x=z,
                    color_val=f,
                    ax=(nrow, ncol, count),
                    cbar=False,
                    title=f"{text}{fname}-{zname} (${score:.2f}$)",
                    bold_title=True if fidx == zidx else False,
                    **styles)
                ax.tick_params(axis='y', labelleft=False)
                count += 1
        ### fine tune the plot
        fig.suptitle(f"[{score_type}]{title}", fontsize=12)
        fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
        if return_figure:
            return fig
        return self.add_figure(
            f"disentanglement_{'original' if original_factors else 'discretized'}",
            fig)
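`vs.plot_histogram(x=z, color_val=f, ...)` is a project-specific helper. As a rough standalone illustration of the coloring idea (an assumption, not the library's implementation): each bar of a latent code's histogram is colored by the mean factor value of the samples falling into that bin, so a smooth color gradient indicates a well-aligned code-factor pair.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(1)
f = rng.rand(2000)                          # a ground-truth factor
z = 2.0 * f + 0.1 * rng.randn(2000)         # a latent code aligned with it

counts, edges = np.histogram(z, bins=80)
bin_ids = np.clip(np.digitize(z, edges) - 1, 0, len(counts) - 1)
bin_mean = np.array([f[bin_ids == b].mean() if np.any(bin_ids == b) else np.nan
                     for b in range(len(counts))])
norm = (bin_mean - np.nanmin(bin_mean)) / \
       (np.nanmax(bin_mean) - np.nanmin(bin_mean) + 1e-10)
plt.bar(edges[:-1], counts, width=np.diff(edges), color=plt.cm.bwr(norm), align='edge')
plt.title('latent code, bars colored by factor value')
plt.show()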
Example #4
  def create_correlation_matrix(self,
                                method='spearman',
                                mean=True,
                                decode=False):
    r""" Correlation matrix of `latent codes` (row) and `groundtruth factors`
    (column).

    Arguments:
      mean : a Boolean. If True, use the mean as the statistic, otherwise use sampling.
      method : {'spearman', 'pearson', 'lasso', 'avg'}
        spearman - rank or monotonic correlation
        pearson - linear correlation
        lasso - lasso regression
        avg - compute all of the above methods and take the average
      decode : a Boolean. If True, reorder the rows of the correlation matrix
        for the best code-factor match (i.e. the largest diagonal sum).
        Note: the decoding is computed on the train matrix, then applied to the
        test matrix

    Returns:
      train, test : correlation matrices `[n_codes, n_factors]`
        for both training and testing data.
        All entries are in `[0, 1]`.
      (optional) OrderedDict mapping from decoded factor index to
        latent code index.
    """
    method = str(method).strip().lower()
    if method in ('avg', 'avr', 'average'):
      method = 'average'
    all_corr = ['spearman', 'lasso', 'pearson', 'average']
    assert isinstance(mean, bool), "mean is boolean but given: %s" % mean
    assert method in all_corr, \
      "Support %s correlation but given method='%s'" % (str(all_corr), method)
    # special average mode
    if method == 'average':
      mat = [
          self.create_correlation_matrix(mean=mean, method=corr, decode=False)
          for corr in ['spearman', 'pearson', 'lasso']
      ]
      n = len(all_corr) - 1
      train = sum(i[0] for i in mat) / n
      test = sum(i[1] for i in mat) / n
    else:
      # start from the correlation matrix
      z_train, z_test = self._latent_codes(mean)
      f_train, f_test = self.factors

      # helper function
      def fn_corr(x1, x2):
        if method == 'lasso':
          model = Lasso(random_state=self.randint, alpha=0.1)
          model.fit(x1, x2)
          # coef_ is [n_targets, n_features], so we need a transpose here
          corr_mat = np.transpose(np.absolute(model.coef_))
        else:
          corr_mat = np.empty(shape=(self.n_representations, self.n_factors),
                              dtype=np.float64)
          for code in range(self.n_representations):
            for fact in range(self.n_factors):
              x, y = x1[:, code], x2[:, fact]
              if method == 'spearman':
                corr = sp.stats.spearmanr(x, y, nan_policy="omit")[0]
              elif method == 'pearson':
                corr = sp.stats.pearsonr(x, y)[0]
              corr_mat[code, fact] = corr
        return corr_mat

      train, test = fn_corr(z_train, f_train), fn_corr(z_test, f_test)
    ## decoding and return
    if decode:
      ids = search.diagonal_linear_assignment(train.T)
      train = train[ids, :]
      test = test[ids, :]
      return train, test, OrderedDict(zip(range(self.n_factors), ids))
    return train, test
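A toy illustration of the docstring's distinction between the two non-lasso modes (hypothetical data, unrelated to the class above): spearman rewards any monotonic relation, while pearson only rewards a linear one.

import numpy as np
import scipy as sp
import scipy.stats

x = np.linspace(0.1, 5.0, 200)
y = np.exp(x)                               # monotonic but non-linear
print(sp.stats.spearmanr(x, y)[0])          # 1.0 : perfect rank correlation
print(sp.stats.pearsonr(x, y)[0])           # < 1 : the relation is not linear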
Example #5
  def create_divergence_matrix(self,
                               n_samples=1000,
                               lognorm=True,
                               n_components=2,
                               normalize_per_code=True,
                               decode=False):
    r""" Using GMM fitted on the factors to estimate the divergence to each
    latent code.

    It means calculating the divergence: `DKL(q(z|x)||p(y))`, where:
      - q(z|x) is a latent code, modeled as a Gaussian distribution
      - p(y) is a factor, modeled as a Gaussian mixture with `n_components`

    The calculation is repeated for each pair of (code, factor). This method is
    recommended for factors with continuous values.

    Returns:
      train, test : divergence matrices of shape `[n_codes, n_factors]`
        for training and testing data.
      (optional) the reordered row indices if `decode=True`.
    """
    n_samples = int(n_samples)
    n_codes = self.n_codes
    n_factors = self.n_factors
    matrices = []
    for qZ, y in zip(self.representations, self.original_factors):
      ### normalizing the factors
      if lognorm:
        y = np.log1p(y)
      # standardizing for each factor
      y = (y - np.mean(y, axis=0, keepdims=True)) / (
          np.std(y, axis=0, keepdims=True) + 1e-10)
      ### train the Gaussian mixture on the factors
      f_gmm = []
      for fidx, (f, fname) in enumerate(zip(y.T, self.factor_names)):
        gmm = tfd.GaussianMixture.init(f[:, np.newaxis],
                                       n_components=n_components,
                                       covariance_type='diag',
                                       batch_shape=None,
                                       dtype=tf.float64,
                                       name=fname)
        f_gmm.append(gmm)
      ### the code Gaussian
      z_gau = []
      for mean, stddev, code_name in zip(tf.transpose(qZ.mean()),
                                         tf.transpose(qZ.stddev()),
                                         self.code_names):
        mean = tf.cast(mean, tf.float64)
        stddev = tf.cast(stddev, tf.float64)
        z_gau.append(
            tfd.Independent(tfd.Normal(loc=mean, scale=stddev, name=code_name),
                            reinterpreted_batch_ndims=1))
      ### calculate the KL divergence
      density_matrix = np.empty(shape=(n_codes, n_factors), dtype=np.float64)
      for zidx, gau in enumerate(z_gau):
        for fidx, gmm in enumerate(f_gmm):
          # non-analytic KL(q=gau||p=gmm)
          samples = gau.sample(n_samples)
          with tf.device("/CPU:0"):
            qllk = gau.log_prob(samples)
            pllk = tf.reduce_sum(tf.reshape(
                gmm.log_prob(tf.reshape(samples, (-1, 1))), (n_samples, -1)),
                                 axis=1)
            kl = tf.reduce_mean(qllk - pllk)
          density_matrix[zidx, fidx] = kl.numpy()
      if bool(normalize_per_code):
        density_matrix = density_matrix / np.sum(
            density_matrix, axis=1, keepdims=True)
      matrices.append(density_matrix)
    ### decoding and return
    train, test = matrices
    if decode:
      ids = search.diagonal_linear_assignment(train.T)
      train = train[ids]
      test = test[ids]
      return train, test, ids
    return train, test
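The divergence above is estimated by Monte Carlo sampling rather than analytically. A minimal self-contained sketch of that estimate for a single (code, factor) pair, written against the public tensorflow_probability API rather than the project's `tfd.GaussianMixture.init` helper (an assumption, since that helper is not shown here):

import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

# q(z|x): one latent code, a 1-D Gaussian
q = tfd.Normal(loc=0., scale=1.)
# p(y): one factor, a 1-D Gaussian mixture with 2 components
p = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(probs=[0.3, 0.7]),
    components_distribution=tfd.Normal(loc=[-2., 1.], scale=[0.5, 1.]))
# non-analytic KL(q || p) ~ E_q[log q(z) - log p(z)]
samples = q.sample(1000)
kl = tf.reduce_mean(q.log_prob(samples) - p.log_prob(samples))
print(float(kl.numpy()))                    # non-negative up to Monte-Carlo noise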
Example #6
    def _plot_heatmap_matrix(self,
                             matrix,
                             figname,
                             omic1=OMIC.transcriptomic,
                             omic2=OMIC.proteomic,
                             var_names1=MARKER_ADT_GENE.values(),
                             var_names2=MARKER_ADT_GENE.keys(),
                             is_marker_pairs=True,
                             title='',
                             return_figure=False):
        omic1 = OMIC.parse(omic1)
        omic2 = OMIC.parse(omic2)
        if isinstance(var_names1, string_types) and var_names1 == 'auto':
            var_names1 = omic1.markers
        if isinstance(var_names2, string_types) and var_names2 == 'auto':
            var_names2 = omic2.markers
        if var_names1 is None or var_names2 is None:
            is_marker_pairs = False
        names1 = self.get_var_names(omic1)
        names2 = self.get_var_names(omic2)
        om1_idx = {j: i for i, j in enumerate(names1)}
        om2_idx = {j: i for i, j in enumerate(names2)}
        assert matrix.shape == (len(names1), len(names2)), \
          (f"Given OMIC {omic1.name}({len(names1)} variables) and "
           f"OMIC {omic2.name}({len(names2)} variables) "
           f"do not match the matrix shape {matrix.shape}")
        ## filter the variables
        if is_marker_pairs:
            pairs = [(v1, v2) for v1, v2 in zip(var_names1, var_names2)
                     if v1 in om1_idx and v2 in om2_idx]
            var_names1 = [i for i, _ in pairs]
            var_names2 = [i for _, i in pairs]
        if var_names1 is not None:
            names1 = np.array([i for i in var_names1 if i in om1_idx])
            matrix = matrix[[om1_idx[i] for i in names1]]
        if var_names2 is not None:
            names2 = np.array([i for i in var_names2 if i in om2_idx])
            matrix = matrix[:, [om2_idx[i] for i in names2]]
        ## find the best diagonal match
        if is_marker_pairs:
            ids2 = list(range(len(names2)))
        else:
            ids2 = search.diagonal_linear_assignment(matrix, nan_policy=0)
        matrix = matrix[:, ids2]
        names2 = names2[ids2].tolist()
        names1 = names1.tolist()
        n1 = len(names1)
        n2 = len(names2)

        ## helper for marking the marker
        def _mark(ax):
            # row is yaxis and col is xaxis
            for y, row in enumerate(matrix):
                # sort descending order
                order = np.argsort(row)[::-1]
                x = order[0]
                ax.text(x + 0.02,
                        y + 0.03,
                        s=f"{matrix[y, x]:.2f}",
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=32 / np.log1p(max(n1, n2)),
                        color='magenta',
                        alpha=0.8,
                        weight='regular')

        ## plotting
        styles = dict(cmap="bwr",
                      xticklabels=names2,
                      yticklabels=names1,
                      xlabel=omic2.name,
                      ylabel=omic1.name,
                      gridline=0.1,
                      fontsize=10,
                      cbar=True)
        width = min(25, matrix.shape[1] / 1.2)
        fig = plt.figure(figsize=(width,
                                  width * matrix.shape[0] / matrix.shape[1]))
        _mark(
            vs.plot_heatmap(
                matrix,
                **styles,
                ax=None,
                title=f"[{figname}_x:{omic2.name}_y:{omic1.name}]{title}"))
        with catch_warnings_ignore(UserWarning):
            fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
        ## store and return
        if return_figure:
            return fig
        self.add_figure(f"{figname.lower()}_{omic1.name}_{omic2.name}", fig)
        return self
Example #7
  def plot_disentanglement(self,
                           factor_names=None,
                           n_bins_factors=15,
                           n_bins_codes=80,
                           corr_type='average',
                           original_factors=True,
                           show_all_codes=False,
                           title='',
                           return_figure=False):
    r""" To illustrate the disentanglement of the codes, the codes' histogram
    bars are colored by the value of factors.

    Arguments:
      factor_names : list of String or Integer.
        Name or index of which factors will be used for visualization.
      n_bins_factors : each factor is discretized into this number of bins, then a
        LogisticRegression model will predict the bin (with color) given the code
        as input.
      corr_type : {'spearman', 'pearson', 'lasso', 'average', 'mi', None, matrix}
        Type of correlation, with special case 'mi' for mutual information.
          - If None, no sorting by correlation provided.
          - If an array, the array must have shape `[n_codes, n_factors]`
      show_all_codes : a Boolean.
        If False, only show the most correlated code-factor pairs; otherwise,
        all codes are shown for each factor.
        This option is only in effect when `corr_type` is not `None`.
      original_factors : a Boolean. If True, use the original factors before
        they were discretized by `Criticizer`.
    """
    self.assert_sampled()
    ### prepare styled plot
    from matplotlib import pyplot as plt
    import seaborn as sns
    sns.set()
    styles = dict(fontsize=12,
                  cbar_horizontal=False,
                  bins_color=int(n_bins_factors),
                  bins=int(n_bins_codes),
                  color='bwr',
                  alpha=0.8)
    # get all relevant factors
    factor_ids = self._check_factors(factor_names)
    ### correlation
    if isinstance(corr_type, string_types):
      if corr_type == 'mi':
        train_corr, test_corr = self.create_mutualinfo_matrix(mean=True)
        score_type = 'mutual-info'
      else:
        train_corr, test_corr = self.create_correlation_matrix(mean=True,
                                                               method=corr_type)
        score_type = corr_type
      # [n_factors, n_codes]
      corr = ((train_corr + test_corr) / 2.).T
      corr = corr[factor_ids]
      code_ids = diagonal_linear_assignment(np.abs(corr), nan_policy=0)
      if not show_all_codes:
        code_ids = code_ids[:len(factor_ids)]
    # directly give the correlation matrix
    elif isinstance(corr_type, np.ndarray):
      corr = corr_type
      if self.n_codes != self.n_factors and corr.shape[0] == self.n_codes:
        corr = corr.T
      assert corr.shape == (self.n_factors, self.n_codes), \
        (f"Correlation matrix expect shape (n_factors={self.n_factors}, "
         f"n_codes={self.n_codes}) but given shape: {corr.shape}")
      score_type = 'score'
      corr = corr[factor_ids]
      code_ids = diagonal_linear_assignment(np.abs(corr), nan_policy=0)
      if not show_all_codes:
        code_ids = code_ids[:len(factor_ids)]
    # no correlation provided
    elif corr_type is None:
      train_corr, test_corr = self.create_correlation_matrix(mean=True,
                                                             method='spearman')
      score_type = 'spearman'
      # [n_factors, n_codes]
      corr = ((train_corr + test_corr) / 2.).T
      code_ids = np.arange(self.n_codes, dtype=np.int32)
    # exception
    else:
      raise ValueError(
          f"corr_type could be string, None or a matrix but given: {type(corr_type)}"
      )
    # applying the indexing
    corr = corr[:, code_ids]
    ### prepare the data
    # factors
    F = np.concatenate(
        self.original_factors if original_factors else self.factors,
        axis=0,
    )[:, factor_ids]
    factor_names = self.factor_names[factor_ids]
    # codes
    Z = np.concatenate(self.representations_mean, axis=0)[:, code_ids]
    code_names = self.code_names[code_ids]
    ### create the figure
    nrow = F.shape[1]
    ncol = Z.shape[1] + 1
    fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 2.8, dpi=80)
    count = 1
    for fidx, (f, fname) in enumerate(zip(F.T, factor_names)):
      # the first plot show how the factor clustered
      ax = vs.plot_histogram(x=f,
                             color_val=f,
                             ax=(nrow, ncol, count),
                             cbar=False,
                             title=f"{fname}",
                             **styles)
      plt.gca().tick_params(axis='y', labelleft=False)
      count += 1
      # the rest of the row show how the codes align with the factor
      for zidx, (score, z, zname) in enumerate(zip(corr[fidx], Z.T,
                                                   code_names)):
        text = "*" if fidx == zidx else ""
        ax = vs.plot_histogram(x=z,
                               color_val=f,
                               ax=(nrow, ncol, count),
                               cbar=False,
                               title=f"{text}{fname}-{zname} (${score:.2f}$)",
                               bold_title=True if fidx == zidx else False,
                               **styles)
        plt.gca().tick_params(axis='y', labelleft=False)
        count += 1
    ### fine tune the plot
    fig.suptitle(f"[{score_type}]{title}", fontsize=12)
    fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
    if return_figure:
      return fig
    return self.add_figure(
        f"disentanglement_{'original' if original_factors else 'discretized'}",
        fig)