Example #1
    def __init__(self,
                 kernel='linear',
                 kernel_params=None,
                 n_components=None,
                 n_pv=None,
                 n_jobs=1):
        """
        Parameters
        ----------
        kernel : str, default to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...

        kernel_params : dict, default to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.

        n_components : int or dict, default to None
            Number of components for kernel PCA.
            <br/> If int, then indicates the same number of components for source and target.
            <br/> If dict, then must be of the form {'source':int, 'target':int}.

        n_pv : int, default to None
            Number of principal vectors.

        n_jobs : int, default to 1
            Number of concurrent threads to use for tasks that can be parallelized.
        """

        self.gamma_coef = None
        self.alpha_coef = None
        self.beta_coef = None
        self.canonical_angles = None

        self.kernel = kernel
        self.kernel_params_ = kernel_params or {}

        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_,
                                             n_jobs)

        # Put n_components in dictionary format.
        self.n_components = n_components
        if isinstance(self.n_components, int):
            self.n_components = {
                s: self.n_components
                for s in ['source', 'target']
            }
        self.n_pv = n_pv

        self.n_jobs = n_jobs
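
For reference, a minimal usage sketch of this constructor, showing how an integer n_components is expanded into the per-domain dictionary (the import path and parameter values are assumptions, not taken from the listing):

from transact.pv_computation import PVComputation  # hypothetical import path

pv = PVComputation(kernel='rbf', kernel_params={'gamma': 5e-4}, n_components=10)
# An int n_components is normalized to one entry per domain:
assert pv.n_components == {'source': 10, 'target': 10}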
Example #2
    def uncentered_rbf_kernel_computer(self, source_data, target_data):
        return KernelComputer('rbf').fit(source_data,
                                         target_data,
                                         center=False)
Example #3
    def rbf_kernel_computer(self):
        return KernelComputer('rbf')
Example #4
    def uncentered_linear_kernel_computer(self, source_data, target_data):
        return KernelComputer('linear').fit(source_data,
                                            target_data,
                                            center=False)
Example #5
    def linear_kernel_computer(self):
        return KernelComputer('linear')
Example #6
def linear_kernel_matrix(source_data, target_data):
    k = KernelComputer('linear')
    k.fit(source_data, target_data, center=False)

    return k
Example #7
def rbf_kernel_matrix(source_data, target_data):
    # rbf_params (e.g. {'gamma': ...}) is presumably defined elsewhere in the original module.
    k = KernelComputer('rbf', rbf_params)
    k.fit(source_data, target_data, center=False)

    return k
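
The helpers above exercise the small surface of KernelComputer that PVComputation (Example #8) relies on. A hedged sketch of that contract, with attribute names inferred from their use in Example #8 rather than from KernelComputer's own source:

import numpy as np

# Hypothetical data; KernelComputer is assumed importable from the package.
source_data = np.random.randn(20, 5)
target_data = np.random.randn(30, 5)

k = KernelComputer('rbf', {'gamma': 5e-4})
k.fit(source_data, target_data, center=True)

K_st = k.k_st                          # cross-kernel K(source, target), used for cosine similarities
X_s = k.data['source']                 # the stored source samples
K_ss = k.kernel_submatrices['source']  # within-domain kernel matrix (used for the 'mallow' kernel)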
Example #8
class PVComputation:
    """
    PVComputation handles the dimensionality reduction and alignment of the learned manifolds.
    <br/><br/>
    This class implements the following tasks and sub-routines:
    <ul>
        <li> Kernel PCA decomposition on source and target independently.
        <li> Kernel principal components comparison.
        <li> Computation of Principal Vectors (PVs).
    </ul>
    """
    def __init__(self,
                 kernel='linear',
                 kernel_params=None,
                 n_components=None,
                 n_pv=None,
                 n_jobs=1):
        """
        Parameters
        ----------
        kernel : str, default to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...

        kernel_params : dict, default to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.

        n_components : int or dict, default to None
            Number of components for kernel PCA.
            <br/> If int, then indicates the same number of components for source and target.
            <br/> If dict, then must be of the form {'source':int, 'target':int}.

        n_pv : int, default to None
            Number of principal vectors.

        n_jobs : int, default to 1
            Number of concurrent threads to use for tasks that can be parallelized.
        """

        self.gamma_coef = None
        self.alpha_coef = None
        self.beta_coef = None
        self.canonical_angles = None

        self.kernel = kernel
        self.kernel_params_ = kernel_params or {}

        # Kernel function used for out-of-sample projection in _project_PV_from_data;
        # assumed to come from sklearn.metrics.pairwise.kernel_metrics(). The custom
        # 'mallow' kernel goes through precomputed matrices instead.
        self.kernel_ = kernel_metrics().get(self.kernel)

        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_,
                                             n_jobs)

        # Put n_components in dictionary format.
        self.n_components = n_components
        if isinstance(self.n_components, int):
            self.n_components = {
                s: self.n_components
                for s in ['source', 'target']
            }
        self.n_pv = n_pv

        self.n_jobs = n_jobs

    def fit(self,
            source_data,
            target_data,
            method='two-stage',
            n_components=None,
            n_pv=None):
        """
        Computes the kernel principal vectors between source and target data.

        Parameters
        ----------
        source_data: numpy.ndarray, shape (n_samples, n_genes)
            Source data.

        target_data: numpy.ndarray, shape (n_samples, n_genes)
            Target data.

        method: str, default to "two-stage"
            Method used for computing the kernel PVs, either "two-stage" (first kernel PCA, then
            alignment) or "direct" (direct minimization).
            <br/>
            <b>NOT IMPLEMENTED:</b> the direct computation of the PVs has not been implemented.

        n_components: int, default to None
            Number of components taken into the decomposition.

        n_pv: int, default to None
            Number of Principal Vectors. If not set here or in __init__, the maximum number of PVs will be computed.

        Returns
        -------
        self : PVComputation
            Fitted instance.
        """

        # Compute kernel matrices
        self.kernel_values_.fit(source_data, target_data, center=True)

        if method == 'two-stage':
            self._two_stage_computation(n_components, n_pv)
        elif method == 'direct':
            self._direct_computation(n_components)
        else:
            raise ValueError('Unknown method for computing PVs: %s' % method)

        return self

    def transform(self, X, right_center=False):
        """
        Project data X on the source and target kernel principal vectors.

        Parameters
        ----------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Data to project.

        right_center: bool, default to False
            Whether the data should be implicitly mean-centered.

        Returns
        -------
        Dictionary with 'source' and 'target' as keys, and projected arrays as values.
        """

        X_projected = {}
        for t in ['source', 'target']:
            X_projected[t] = self._project_PV_from_data(X, t, right_center)

        return X_projected

    def fit_transform(self,
                      source_data,
                      target_data,
                      method='two-stage',
                      n_components=None,
                      n_pv=None):
        """
        Computes the kernel principal vectors between source and target data, and projects
        both datasets on them.

        Parameters
        ----------
        source_data: numpy.ndarray, shape (n_samples, n_genes)
            Source data.

        target_data: numpy.ndarray, shape (n_samples, n_genes)
            Target data.

        method: str, default to "two-stage"
            Method used for computing the kernel PVs, either "two-stage" (first kernel PCA, then
            alignment) or "direct" (direct minimization).
            <br/>
            <b>NOT IMPLEMENTED:</b> the direct computation of the PVs has not been implemented.

        n_components: int or dict, default to None
            Number of components taken into account for PCA. Can be an int (same number of
            components for source and target) or a dict of the form {'source': int, 'target': int}
            indicating the number of source and target principal components.

        n_pv: int, default to None
            Number of Principal Vectors. If not set here or in __init__, the maximum number of PVs will be computed.

        Returns
        -------
        source_projected: dict
            Source data projected on the source and target PVs, with 'source' and 'target' as keys.

        target_projected: dict
            Target data projected on the source and target PVs, with 'source' and 'target' as keys.
        """

        self.fit(source_data, target_data, method, n_components, n_pv)

        source_projected = {
            'source': self._project_PV_from_data(source_data, 'source'),
            'target': self._project_PV_from_data(source_data, 'target')
        }

        target_projected = {
            'source': self._project_PV_from_data(target_data, 'source'),
            'target': self._project_PV_from_data(target_data, 'target')
        }

        return source_projected, target_projected

    def _two_stage_computation(self, n_components=None, n_pv=None):

        self.n_components = n_components or self.n_components
        if self.n_components is None or isinstance(self.n_components, int):
            self.n_components = {
                s: self.n_components
                for s in ['source', 'target']
            }

        self.n_pv = n_pv or (self.n_pv or min(self.n_components.values()))

        ## First step: Kernel PCA
        self._dim_reduction()

        ## Second step: Align based on cosine similarity
        self._align_principal_components()

    def _dim_reduction(self):
        self.dim_reduc_clf_ = {}
        self.alpha_coef = {}

        # Independent processing of source and target
        for t in ['source', 'target']:
            # Reduce dimensionality using kernelPCA.
            self.dim_reduc_clf_[t] = KernelPCA(
                self.n_components[t],
                kernel='precomputed' if self.kernel == 'mallow' else self.kernel,
                n_jobs=self.n_jobs,
                **self.kernel_params_)
            if self.kernel == 'mallow':
                self.dim_reduc_clf_[t].fit(
                    self.kernel_values_.kernel_submatrices[t])
            else:
                self.dim_reduc_clf_[t].fit(self.kernel_values_.data[t])

            # Save kernel PCA coefficients, rescaled by the square root of the
            # eigenvalues (alphas_/lambdas_ were renamed eigenvectors_/eigenvalues_
            # in scikit-learn 1.0).
            self.alpha_coef[t] = self.dim_reduc_clf_[t].alphas_ / np.sqrt(
                self.dim_reduc_clf_[t].lambdas_)

    def _align_principal_components(self):
        self.cosine_similarity_ = self.alpha_coef['source'].T.dot(
            self.kernel_values_.k_st).dot(self.alpha_coef['target'])

        beta_s, theta, beta_t = np.linalg.svd(self.cosine_similarity_)
        self.beta_coef = {}
        self.beta_coef['source'] = beta_s
        self.beta_coef['target'] = beta_t.T  # np.linalg.svd returns V^T, not V.

        # Computation of gamma coefficients
        self.gamma_coef = {}
        for t in ['source', 'target']:
            self.gamma_coef[t] = self.beta_coef[t].T.dot(self.alpha_coef[t].T)
            self.gamma_coef[t] = self.gamma_coef[t][:self.n_pv]

        # Canonical angles
        self.canonical_angles = np.arccos(theta[:self.n_pv])

    def _direct_computation(self, n_components=None):
        raise NotImplementedError(
            'Direct computation of PVs has not been implemented.')

    def _project_PV_from_data(self, X, t, right_center=False):
        """
        Project data X on source and target kernel principal vectors

        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Data to project

        t: str
            Type, either 'source' or 'target'

        right_center: Boolean, default to False
            Whether data should be implicitly mean centered

        Returned Values
        -------
        Dictionary with 'source' and 'target' as keys, and projected arrays as values.
        Projected arrays are of size (n_samples, n_pv)
        """

        K = self.kernel_(self.kernel_values_.data[t], X, **self.kernel_params_)
        K = _left_center_kernel(K)
        if right_center:
            K = _right_center_kernel(K)

        return self._project_PV_from_kernel(K, t)

    def _project_PV_from_kernel(self, K, t):
        """
        Project kernel X on source and target kernel principal vectors

        -------
        K: numpy.ndarray, shape (n_samples, n_samples)
            Kernel matrix between data from type t and specific dataset.
            Source (or target) samples in the rows (same order as given to the algorithm)
            New dataset samples in the columns 

        t: str
            Type, either 'source' or 'target'

        Returned Values
        -------
        Dictionary with 'source' and 'target' as keys, and projected arrays as values.
        Projected arrays are of size (n_samples, n_pv)
        """

        return self.gamma_coef[t].dot(K).T
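
A hedged end-to-end sketch of the two-stage computation above, on synthetic data (shapes, parameter values, and import path are assumptions):

import numpy as np
from transact.pv_computation import PVComputation  # hypothetical import path

X_source = np.random.randn(100, 50)  # e.g. cell lines x genes
X_target = np.random.randn(80, 50)   # e.g. tumors x genes, same gene order

pv = PVComputation(kernel='rbf', kernel_params={'gamma': 1e-3},
                   n_components={'source': 20, 'target': 15}, n_pv=10)
pv.fit(X_source, X_target, method='two-stage')

# Cosines of the canonical angles measure the agreement between the
# source and target kernel PCA subspaces (1 = aligned, 0 = orthogonal).
print(np.cos(pv.canonical_angles))   # shape (10,), decreasing

# Project new data (same gene order) on the source and target PVs.
proj = pv.transform(X_target)
print(proj['source'].shape, proj['target'].shape)  # (80, 10) (80, 10)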
Example #9
    def __init__(self,
                 kernel='linear',
                 kernel_params=None,
                 n_components=None,
                 n_pv=None,
                 method='two-stage',
                 step=100,
                 n_jobs=1,
                 verbose=False):
        """
        Parameters
        ----------
        kernel : str, default to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...

        kernel_params : dict, default to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.

        n_components : int or dict, default to None
            Number of components for kernel PCA.
            <br/> If int, then indicates the same number of components for source and target.
            <br/> If dict, then must be of the form {'source':int, 'target':int}.

        n_pv : int, default to None
            Number of principal vectors.

        method : str, default to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage' has been implemented.

        step : int, default to 100
            Number of interpolation steps.

        n_jobs : int, default to 1
            Number of concurrent threads to use for tasks that can be parallelized.

        verbose : bool or int, default to False
            Degree of verbosity in joblib routines.
        """

        self.kernel = kernel
        self.kernel_params_ = kernel_params or {}
        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_,
                                             n_jobs)

        self.source_data_ = None
        self.target_data_ = None

        self.is_fitted = False

        self.n_components = n_components
        self.n_pv = n_pv
        self.method = method
        self.step = step

        self.predictive_clf = None

        self.n_jobs = n_jobs
        self.verbose = verbose
Example #10
class TRANSACT:
    """
    TRANSACT is a package designed to adapt predictors of drug response from pre-clinical models to the clinic.
    <br/><br/>
    This class contains all the tasks and sub-routines required for training the domain adaptation framework, i.e.:
    <ul>
        <li> Kernel PCA decomposition on source and target independently.
        <li> Kernel principal components comparison.
        <li> Computation of Principal Vectors (PVs).
        <li> Interpolation between source and target PVs and extraction of Consensus Features (CFs).
        <li> Out-of-sample extension: projection of new datasets onto the consensus features.
    </ul>
    """
    def __init__(self,
                 kernel='linear',
                 kernel_params=None,
                 n_components=None,
                 n_pv=None,
                 method='two-stage',
                 step=100,
                 n_jobs=1,
                 verbose=False):
        """
        Parameters
        ----------
        kernel : str, default to 'linear'
            Name of the kernel to be used in the algorithm. Has to be compliant with
            <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics">
            scikit-learn kernel</a>, e.g., "rbf", "polynomial", "laplacian", "linear", ...

        kernel_params : dict, default to None
            Parameters of the kernel (degree for polynomial kernel, gamma for RBF).
            Naming has to be compliant with scikit-learn, e.g., {"gamma": 0.0005}.

        n_components : int or dict, default to None
            Number of components for kernel PCA.
            <br/> If int, then indicates the same number of components for source and target.
            <br/> If dict, then must be of the form {'source':int, 'target':int}.

        n_pv : int, default to None
            Number of principal vectors.

        method : str, default to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage' has been implemented.

        step : int, default to 100
            Number of interpolation steps.

        n_jobs : int, default to 1
            Number of concurrent threads to use for tasks that can be parallelized.

        verbose : bool or int, default to False
            Degree of verbosity in joblib routines.
        """

        self.kernel = kernel
        self.kernel_params_ = kernel_params or {}
        self.kernel_values_ = KernelComputer(self.kernel, self.kernel_params_,
                                             n_jobs)

        self.source_data_ = None
        self.target_data_ = None

        self.is_fitted = False

        self.n_components = n_components
        self.n_pv = n_pv
        self.method = method
        self.step = step

        self.predictive_clf = None

        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self,
            source_data,
            target_data,
            n_components=None,
            n_pv=None,
            method='two-stage',
            step=100,
            with_interpolation=True,
            left_center=True):
        """
        Compute the Consensus Features (CFs) onto which predictive models can be trained.
        <br/> Specifically:
        <ul>
            <li> Compute the kernel matrices.
            <li> Compute the cosine similarity matrix.
            <li> Compute principal vectors.
            <li> Interpolate between the PVs.
            <li> Find optimal interpolation time.
        </ul>

        Parameters
        ----------
        source_data : np.ndarray, dtype=float
            Source data, matrix with samples in the rows, i.e. shape (n_source_samples, n_features).
            <br/> pandas.DataFrame is supported.

        target_data : np.ndarray, dtype=float
            Target data, matrix with samples in the rows, i.e. shape (n_target_samples, n_features).
            <br/> pandas.DataFrame is supported.
            <br/><b>WARNING</b>: features need to be ordered in the same way as in source_data.

        n_components: int, default to None
            Number of components. If not set here or in __init__, then use the maximum number of principal components
            possible for source and target.

        n_pv: int, default to None
            Number of Principal Vectors. If not set here or in __init__, then maximum number of PV will be computed.

        method : str, default to 'two-stage'
            Method used for computing the principal vectors. Only 'two-stage' has been implemented.

        step: int, default to 100
            Number of interpolation steps.

        with_interpolation: bool, default to True
            Whether the interpolation should also be fitted. Useful for computing only the PVs
            prior to null-distribution fitting (and the choice of the number of PVs).

        left_center: bool, default to True
            Whether the output should be mean-centered, i.e. whether source and target
            consensus-feature values (or PVs if no interpolation) are independently mean-centered.

        Returns
        -------
        self : TRANSACT
            Fitted instance.
        """

        # Save parameters
        self.source_data_ = source_data
        self.target_data_ = target_data
        self.method = method or self.method
        self.n_components = n_components or self.n_components
        self.n_pv = n_pv or self.n_pv
        self.step = step or self.step
        self.left_center = left_center

        # Compute kernel values
        self.kernel_values_.fit(source_data, target_data, center=False)

        # Compute principal vectors
        self.principal_vectors_ = PVComputation(self.kernel,
                                                self.kernel_params_,
                                                n_jobs=self.n_jobs)
        self.principal_vectors_.fit(self.source_data_,
                                    self.target_data_,
                                    method=self.method,
                                    n_components=self.n_components,
                                    n_pv=self.n_pv)

        # Stop here if interpolation should not be computed.
        if not with_interpolation:
            return self

        # Set up interpolation scheme
        self.interpolation_ = Interpolation(self.kernel, self.kernel_params_,
                                            self.n_jobs)
        self.interpolation_.fit(self.principal_vectors_, self.kernel_values_)

        # Compute optimal interpolation time
        self._compute_optimal_time(step=self.step,
                                   left_center=self.left_center)

        self.is_fitted = True

        return self

    def null_distribution_pv_similarity(self,
                                        method='gene_shuffling',
                                        n_iter=100):
        """
        Generate a null distribution for the PV similarity function:
        <ul>
            <li> Gene shuffling: genes get shuffled in the source data to destroy any structure
            existing at the gene level while preserving the sample structure. The PVs are
            recomputed and the similarities are saved.
        </ul>

        Parameters
        ----------
        method : str, default to 'gene_shuffling'
            Method used for generating the null distribution.
            Only implemented method: 'gene_shuffling'.

        n_iter: int, default to 100
            Number of iterations.

        Returns
        -------
        np.ndarray, dtype=float, shape (n_iter, n_pv)
            Array containing the distribution of similarity after shuffling. Each row
            contains the values of one shuffling across PVs.
        """

        if method.lower() == 'gene_shuffling':
            null_method = self._gene_shuffling
        else:
            raise NotImplementedError(
                '%s is not a proper method for generating null distribution' %
                (method))

        null_distribution = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(null_method)() for _ in range(n_iter))

        return np.array(null_distribution)

    def _gene_shuffling(self):
        perm = np.random.permutation(self.source_data_.shape[1])
        pv = PVComputation(self.kernel, self.kernel_params_)
        pv.fit(self.source_data_[:, perm],
               self.target_data_,
               method=self.method,
               n_components=self.n_components,
               n_pv=self.n_pv)

        return np.cos(pv.canonical_angles)

    def fit_predictor(self, X, y, alpha_values=None, l1_ratio=0.5):
        """
        Project X on consensus features and train a predictor of drug response.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features), dtype=float
            Dataset to project. Features should be ordered in the same way as in source_data
            and target_data.

        y : np.ndarray of shape (n_samples, 1), dtype=float
            Output to predict.

        alpha_values : np.ndarray, default to None
            Grid of alpha values for the Elastic Net. If None, np.logspace(-10, 5, 34) is used.

        l1_ratio : float, default to 0.5
            Currently unused; a fixed grid of l1_ratio values is cross-validated instead.

        Returns
        -------
        self : TRANSACT
            Instance with a fitted predictive model (predictive_clf).
        """
        self.alpha_values = alpha_values if alpha_values is not None else np.logspace(
            -10, 5, 34)
        self.l1_ratio_values = [0., .1, .2, .4, .5, .6, .8, .9, 1.]
        param_grid = {
            'regression__alpha': self.alpha_values,
            'regression__l1_ratio': self.l1_ratio_values
        }

        # Grid search setup
        self.predictive_clf = GridSearchCV(
            Pipeline([('regression', ElasticNet())]),
            cv=10,
            n_jobs=self.n_jobs,
            param_grid=param_grid,
            verbose=self.verbose,
            scoring='neg_mean_squared_error')
        self.predictive_clf.fit(self.transform(X, center=False), y)

        return self

    def compute_pred_performance(self, X, y, cv=10):
        """
        Compute predictive performance of predictive model by cross-validation
        on X and y.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features), dtype=float
            Dataset to project. Features should be ordered in the same way as in source_data
            and target_data.

        y : np.ndarray of shape (n_samples, 1), dtype=float
            Output to predict.

        cv : int, default to 10
            Number of cross-validation folds.

        Returns
        -------
        tuple of (float, float)
            Pearson correlation between the cross-validated predictions and y,
            and the corresponding p-value (output of scipy.stats.pearsonr).
        """

        kf = KFold(n_splits=cv, shuffle=True)
        X_projected = self.transform(X)

        if self.predictive_clf is None:
            print('BEWARE: NOT FITTED INSTANCE')
            self.fit_predictor(X, y)
        clf = clone(self.predictive_clf)

        y_predicted = np.zeros(X.shape[0])
        for train_index, test_index in kf.split(X_projected):
            clf.fit(X_projected[train_index], y[train_index])
            y_predicted[test_index] = clf.predict(X_projected[test_index])

        return scipy.stats.pearsonr(y_predicted, y)

    def predict(self, X):
        """
        Predict the drug response of a set of samples, i.e.:
        <ul>
            <li> Project data on consensus features.
            <li> Use the Elastic Net model to predict based on the consensus features.
        </ul>

        Parameters
        ----------
        X : np.ndarray, dtype=float
            Dataset to project, of shape (n_samples, n_features). Features should be ordered in same way as
            in source_data and target_data.

        Returns
        -------
        np.ndarray of shape (n_samples, 1), dtype=float
            Predicted drug response values.
        """
        return self.predictive_clf.predict(self.transform(X, center=False))

    def transform(self, X, center=False):
        """
        Project a dataset X onto the consensus features.

        Parameters
        ----------
        X : np.ndarray, dtype=float
            Dataset to project, of shape (n_samples, n_features). Features should be ordered in the same way as
            in source_data and target_data.

        center : bool, default to False
            Whether the data should be mean-centered when projected.

        Returns
        -------
        np.ndarray of shape (n_samples, n_pv), dtype=float
            Dataset projected on the consensus features.
        """
        return self.interpolation_.transform(X,
                                             self.optimal_time,
                                             center=center)

    def _compute_optimal_time(self, step=100, left_center=True):
        # Based on Kolmogorov Smirnov statistics, find interpolation time

        # Compute the interpolated values
        interpolated_values = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self.interpolation_.project_data)(s / step, center=left_center)
            for s in range(step + 1))
        interpolated_values = np.array(interpolated_values).transpose(2, 0, 1)
        n_source = self.source_data_.shape[0]
        source_interpolated_values = interpolated_values[:, :, :n_source]
        target_interpolated_values = interpolated_values[:, :, n_source:]

        self.optimal_time = []
        self.ks_statistics = []
        self.ks_p_values = []

        # For each PV, find the time when interpolation has the largest overlap.
        for source_pv, target_pv in zip(source_interpolated_values,
                                        target_interpolated_values):
            self.ks_statistics.append([])
            for s, t in zip(source_pv, target_pv):
                self.ks_statistics[-1].append(scipy.stats.ks_2samp(s, t))
            self.ks_statistics[-1] = list(zip(*self.ks_statistics[-1]))
            self.ks_p_values.append(self.ks_statistics[-1][-1])
            self.ks_statistics[-1] = self.ks_statistics[-1][0]
            self.optimal_time.append(np.argmin(self.ks_statistics[-1]) / step)

        # Save the different statistics
        self.optimal_time = np.array(self.optimal_time)      # Optimal tau for each PV.
        self.ks_statistics = np.array(self.ks_statistics)    # KS statistics per PV across interpolation times.
        self.ks_p_values = np.array(self.ks_p_values)        # Corresponding p-values.
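
A hedged end-to-end sketch of the workflow this class implements, from fitting the consensus features to predicting drug response (data, shapes, and import path are assumptions):

import numpy as np
from transact.TRANSACT import TRANSACT  # hypothetical import path

X_source = np.random.randn(200, 1000)  # e.g. cell lines x genes
y_source = np.random.randn(200)        # e.g. drug response of the cell lines
X_target = np.random.randn(100, 1000)  # e.g. tumors x genes, same gene order

clf = TRANSACT(kernel='rbf', kernel_params={'gamma': 5e-4},
               n_components={'source': 30, 'target': 20}, n_pv=15, step=100)
clf.fit(X_source, X_target, with_interpolation=True)

# Optional sanity check: compare the PV similarities to a shuffled null.
null = clf.null_distribution_pv_similarity(n_iter=10)  # shape (10, n_pv)

# Train the Elastic Net on the source data projected on the consensus
# features, then predict the response of the target samples.
clf.fit_predictor(X_source, y_source)
y_target_pred = clf.predict(X_target)  # shape (100,)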