Example #1
    def _compute_principal_vectors(self):
        n_factors = np.min([self.source_components_.shape[0],
                            self.target_components_.shape[0]])

        self.principal_vectors_ = PVComputation(n_factors, n_factors)
        self.principal_vectors_.compute_principal_vectors(self.source_components_,
                                                          self.target_components_)
Example #2
    def _compute_principal_vectors(self):
        n_pv = np.min([
            self.source_components_.shape[0], self.target_components_.shape[0]
        ])
        n_factors = {
            'source': self.source_components_.shape[0],
            'target': self.target_components_.shape[0]
        }

        self.principal_vectors_ = PVComputation(n_factors, n_pv)
        self.principal_vectors_.compute_principal_vectors(
            self.source_components_, self.target_components_)
Example #3
class IntermediateFactors:
    """
    Handle the intermediate representations between the source and target factors along the geodesic path.

    Attributes
    -------
    source_components_ : numpy.ndarray, shape (n_components, n_features)
        Loadings of the source factors, whether or not they are already aligned to the target.
    
    target_components_ : numpy.ndarray, shape (n_components, n_features)
        Loadings of the target factors, whether or not they are already aligned to the source.
    
    intermediate_factors_ : numpy.ndarray, shape (n_representations, n_components, n_features)
        Loadings of intermediate factors along the geodesic path. Components are ordered
        by similarity, i.e. first components correspond to path between first PVs, etc.
    
    n_representations: int
        Number of representations along the geodesic path. A value of -1 means the Geodesic
        Flow Kernel is used instead.
    
    geodesic_matrix_: numpy.ndarray, shape (n_features, n_features)
        Geodesic Matrix for geodesic flow kernel.
    
    geodesic_flow_ : callable, float -> numpy.ndarray
        Method that computes the geodesic flow (set of factors) at a given position t.
    """
    def __init__(self, n_representations, n_jobs=1):
        """
        Parameters
        -------
        n_representations : int
            Number of representations to pick between source and target.
        n_jobs: int (optional, default to 1)
            Number of jobs for computation.
        """
        self.n_representations = n_representations

        self.intermediate_factors_ = None
        self.source_components_ = None
        self.target_components_ = None

        self.n_jobs = n_jobs

    def _compute_principal_vectors(self):
        n_pv = np.min([
            self.source_components_.shape[0], self.target_components_.shape[0]
        ])
        n_factors = {
            'source': self.source_components_.shape[0],
            'target': self.target_components_.shape[0]
        }

        self.principal_vectors_ = PVComputation(n_factors, n_pv)
        self.principal_vectors_.compute_principal_vectors(
            self.source_components_, self.target_components_)

    @staticmethod
    def _compute_flow_time(t, principal_vectors):
        # Geodesic flow at position t in [0, 1]:
        # Phi(t) = U_S * sin((1 - t) * theta) / sin(theta) + U_T * sin(t * theta) / sin(theta)
        Pi = np.sin((1 - t) * principal_vectors.angles_) \
            / np.sin(principal_vectors.angles_)
        Pi[np.isnan(Pi)] = 1 - t  # Limit of sin((1 - t) * x) / sin(x) as x -> 0

        Xi = np.sin(t * principal_vectors.angles_) \
            / np.sin(principal_vectors.angles_)
        Xi[np.isnan(Xi)] = t  # Limit of sin(t * x) / sin(x) as x -> 0

        return (principal_vectors.source_components_.T * Pi
                + principal_vectors.target_components_.T * Xi).T

    def sample_flow(self,
                    source_components,
                    target_components,
                    already_aligned=False):
        """
        Sample intermediate subspaces (i.e. sets of factors) uniformly along the geodesic flow.

        IMPORTANT: the same genes must be given for source and target, in the same order.

        Parameters
        -------
        source_components : np.ndarray, shape (n_components, n_features)
            Source factors
        
        target_components : np.ndarray, shape (n_components, n_features)
            Target factors
        
        already_aligned : boolean (optional, default to False)
            Whether the components are already aligned (i.e. whether they are already principal vectors).

        Return values
        -------
        Intermediate subspace, numpy.ndarray of shape (n_representations + 1, n_components, n_features).
        """
        self.source_components_ = source_components
        self.target_components_ = target_components

        # Compute the principal vectors
        if not already_aligned:
            self._compute_principal_vectors()
        else:
            self.principal_vectors_.source_components_ = self.source_components_
            self.principal_vectors_.target_components_ = self.target_components_

        # Sample at different uniformly distributed time points
        if self.n_representations == -1:
            t_sample = np.array([1])
        else:
            t_sample = np.linspace(0, 1, self.n_representations + 1)

        if self.n_jobs >= 2:
            return np.array(
                Parallel(n_jobs=self.n_jobs)(
                    delayed(IntermediateFactors._compute_flow_time)(t, self.principal_vectors_)
                    for t in t_sample))
        else:
            return np.array([
                IntermediateFactors._compute_flow_time(t,
                                                       self.principal_vectors_)
                for t in t_sample
            ])

    def compute_geodesic_matrix(self, source_components, target_components):
        """
        Compute the matrix used for the domain-invariant kernel of the Geodesic Flow Kernel.

        Parameters
        -------
        source_components : np.ndarray, shape (n_components, n_features)
            Source factors
        
        target_components : np.ndarray, shape (n_components, n_features)
            Target factors

        Return values
        -------
        G matrix, numpy.ndarray of shape (2 * n_pv, 2 * n_pv), defining the domain-invariant
        scalar product of two vectors once projected on the concatenated source and target
        principal vectors.
        """
        self.source_components_ = source_components
        self.target_components_ = target_components

        self._compute_principal_vectors()

        angles = self.principal_vectors_.angles_
        diag_term = (angles - np.cos(angles) * np.sin(angles)) \
            / (2. * angles * np.power(np.sin(angles), 2))
        off_diag_term = (np.sin(angles) - np.cos(angles) * angles) \
            / (2. * np.power(np.sin(angles), 2) * angles)
        # Correct for extreme case when theta = 0
        diag_term[np.isnan(diag_term)] = 1. / 3.
        diag_term[np.isinf(diag_term)] = 1. / 3.
        off_diag_term[np.isnan(off_diag_term)] = 1. / 6.
        off_diag_term[np.isinf(off_diag_term)] = 1. / 6.
        diag_term = np.diag(diag_term)
        off_diag_term = np.diag(off_diag_term)

        self.G_matrix = np.block([[diag_term, off_diag_term],
                                  [off_diag_term, diag_term]])

        self.projection = np.block([
            self.principal_vectors_.source_components_.transpose(),
            self.principal_vectors_.target_components_.transpose()
        ])

        return self.G_matrix
        #return lambda x,y: IntermediateFactors._compute_domain_invariant_scalar_product(x, y, self.projection, self.G_matrix)

    @staticmethod
    def _compute_domain_invariant_scalar_product(x, y, projection, G_matrix):
        # Project both vectors onto the concatenated principal vectors,
        # then apply the geodesic flow kernel matrix G.
        x_p = x.dot(projection)
        y_p = y.dot(projection)

        return x_p.dot(G_matrix).dot(y_p.transpose())
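
The interpolation implemented in _compute_flow_time can be illustrated on a toy example. The sketch below is illustrative only (u_s, u_t and flow are hypothetical names, not part of the module): it applies the same sine-based formula to a single pair of unit vectors and checks that the path starts at the source direction and ends at the target direction.

import numpy as np

# Two directions in R^3, standing in for one pair of principal vectors.
u_s = np.array([1., 0., 0.])
u_t = np.array([np.cos(0.3), np.sin(0.3), 0.])

theta = np.arccos(np.clip(u_s.dot(u_t), -1., 1.))  # principal angle between them

def flow(t):
    # Same formula as _compute_flow_time, for a single principal angle:
    # Phi(t) = u_s * sin((1 - t) * theta) / sin(theta) + u_t * sin(t * theta) / sin(theta)
    return (np.sin((1 - t) * theta) * u_s + np.sin(t * theta) * u_t) / np.sin(theta)

print(flow(0.0))  # equals u_s
print(flow(1.0))  # equals u_t
print(flow(0.5))  # intermediate, unit-norm direction halfway along the geodesic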
Example #4
    def __init__(self, n_representations=100, method='consensus',
                mean_center=True,
                std_unit=False,
                n_factors=70,
                n_pv=40,
                dim_reduction='pca',
                dim_reduction_target=None,
                l1_ratio=0,
                source_data=None,
                target_data=None,
                n_jobs=1):
        """
        Parameters
        -------
        n_representations : int, default to 100
            Number of representations between source and target principal vectors for interpolation.
            0 means source only, -1 means target only.

        method : str, default to 'consensus'
            Scheme used for the domain adaptation step, i.e. 'consensus', 'elasticnet', or 'gfk'.

        mean_center : bool, default to True
            Whether the different datasets used in the implementation should be mean centered.

        std_unit : bool, default to False 
            Whether the different datasets used in the implementation should be standardized
            (feature-level variance to 1).

        n_factors : int, default to 70 
            Number of domain-specific factors to compute, e.g. PCs.

        n_pv : int, default to 40
            Number of principal vectors to compute from the domain-specific factors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'. If None, set to dim_reduction.

        l1_ratio : float, default to 0
            l1 ratio for elasticnet model (0 is Ridge, 1 is Lasso).

        source_data : np.ndarray, default to None
            Source data to use in the domain adaptation phase.

        target_data : np.ndarray, default to None
            Target data to use in the domain adaptation phase.

        n_jobs : int, default to 1
            Number of jobs used in parallelisation.
        """

        self.n_representations = n_representations
        self.mean_center = mean_center
        self.std_unit = std_unit
        self.method = method
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.l1_ratio = l1_ratio
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_jobs = n_jobs

        self.source_data = source_data
        self.target_data = target_data

        self.pv_computation = PVComputation(
            self.n_factors,
            self.n_pv,
            self.dim_reduction,
            self.dim_reduction_target
        )

        self.intermediate_factors = IntermediateFactors(
            self.n_representations
        )

        self.predictor = None
        
        # Default values for CV
        self.alpha_values = np.logspace(-6, 10, 34)
        self.cv_fold = 10
        self.verbose = 1
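
The constructor above stores default cross-validation settings (alpha_values, cv_fold, verbose) for a downstream predictor that is not shown in this excerpt. A minimal, hypothetical sketch of how such defaults could drive a regularisation search with scikit-learn (the class's actual predictor and CV routine may differ):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Hypothetical stand-ins for projected source data and drug response.
X = np.random.randn(100, 40)
y = np.random.randn(100)

alpha_values = np.logspace(-6, 10, 34)   # same grid as the default above
grid = GridSearchCV(Ridge(), param_grid={'alpha': alpha_values},
                    cv=10, verbose=1)    # cv_fold = 10 and verbose = 1, as above
grid.fit(X, y)
print(grid.best_params_)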
Example #5
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=100,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome readouts.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome readouts.

        n_factors: int
            Number of domain-invariant factors.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None 
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 100
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in domain-adaptation.

        mean_center : bool, optional, default to False
            Whether X_source features (i.e. genes) should be mean-centered.

        std_unit : bool, optional, default to False
            Whether X_source features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)
Example #6
class ConsensusRepresentation(BaseEstimator):
    """Consensus Representation computation.

    Compute the consensus representation between source and target data: principal vectors
    are first computed and interpolated along the geodesic; for each pair of principal
    vectors, the intermediate feature where the projected source and target data are most
    similar (smallest Kolmogorov-Smirnov statistic) is retained.

    Attributes
    -------
    """
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=1000,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome readouts.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome readouts.

        n_factors: int
            Number of domain-invariant factors.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None 
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 1000
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in domain-adaptation.

        mean_center : bool, optional, default to False
            Whether X_source features (i.e. genes) should be mean-centered.

        std_unit : bool, optional, default to False
            Whether X_source features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def _find_common_representation(self):
        # self.flow has shape (n_representations + 1, n_pv, n_genes);
        # reorder to (n_pv, n_representations + 1, n_genes) to iterate over PV pairs.
        flow_vectors = self.flow.transpose(1, 0, 2)
        self.consensus_representation = []

        for i in range(self.n_pv):
            # Project source and target samples on every intermediate feature of this PV pair.
            source_projected = flow_vectors[i].dot(
                self.source_data.transpose())
            target_projected = flow_vectors[i].dot(
                self.target_data.transpose())

            # Kolmogorov-Smirnov statistic between projected source and target samples.
            ks_stats = [
                ks_2samp(s, t)[0]
                for (s, t) in zip(source_projected, target_projected)
            ]

            # Keep the intermediate feature where the two distributions are most similar.
            self.consensus_representation.append(
                flow_vectors[i, np.argmin(ks_stats)])

        # Shape (n_genes, n_pv): one consensus feature per PV pair.
        self.consensus_representation = np.array(
            self.consensus_representation).transpose()

        return self.consensus_representation

    def fit(self, X, y=None):
        """
        Computes the principal vectors, interpolates between them, projects source and target
        data on the intermediate features and, for each pair of principal vectors, selects
        the point along the path where the projected source and target data are most
        comparable (smallest KS statistic).

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider

        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data (optional, just for compliance with BaseEstimator)

        Return values
        -------
        self: returns an instance of self.
        """

        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)

        # Compute principal vectors
        self.pv_computation.fit(self.source_data, self.target_data, y)

        # Compute intermediate features
        self.flow = self.intermediate_factors.sample_flow(
            self.pv_computation.source_components_,
            self.pv_computation.target_components_)

        # Compute the consensus representation between each PV
        self._find_common_representation()

        return self

    def transform(self, X, y=None):
        """
        Project data along the geodesic path. 

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data used for prediction.

        Return values
        -------
        X_projected: numpy.ndarray, shape (n_samples, n_pv)
            Genomics data projected on the consensus representation.
        """
        return self.standard_scaler_input_.fit_transform(X).dot(
            self.consensus_representation)
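
The core of _find_common_representation is the selection of the intermediate feature whose projected source and target distributions are closest. A self-contained sketch of that selection for a single pair of principal vectors, using synthetic data (all names below are illustrative, not part of the module):

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)

# Candidate intermediate features for one PV pair: (n_candidates, n_genes).
candidates = rng.normal(size=(11, 50))
candidates /= np.linalg.norm(candidates, axis=1, keepdims=True)

# Illustrative source and target data: (n_samples, n_genes).
source = rng.normal(size=(200, 50))
target = rng.normal(loc=0.5, size=(150, 50))

# Project both datasets on every candidate feature.
source_projected = candidates.dot(source.T)   # (n_candidates, n_source_samples)
target_projected = candidates.dot(target.T)   # (n_candidates, n_target_samples)

# KS statistic between source and target projections, per candidate.
ks_stats = [ks_2samp(s, t)[0] for s, t in zip(source_projected, target_projected)]

# The consensus feature is the one where the two distributions are most similar.
consensus = candidates[np.argmin(ks_stats)]
print(np.argmin(ks_stats), min(ks_stats))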
Example #7
class FlowProjector(BaseEstimator):
    """Project on the geodesic.

    Given source and target data, computes the domain-specific factors, aligns them
    to get the principal vectors and finally interpolates between source PVs and
    target PVs. Data can then be projected on all these intermediate features.

    Attributes
    -------
    """
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=100,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome readouts.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome readouts.

        n_factors: int
            Number of domain-invariant factors.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None 
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 100
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in domain-adaptation.

        mean_center : bool, optional, default to False
            Whether X_source features (i.e. genes) should be mean-centered.

        std_unit : bool, optional, default to False
            Whether X_source features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def fit(self, X, y=None):
        """
        Computes the intermediate features between the pairs of principal vectors.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider

        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data (optional, just for compliance with BaseEstimator)

        Return values
        -------
        self: returns an instance of self.
        """

        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)

        # Compute principal vectors
        self.pv_computation.fit(self.source_data, self.target_data, y)

        # Compute intermediate factors.
        self.flow = self.intermediate_factors.sample_flow(
            self.pv_computation.source_components_,
            self.pv_computation.target_components_)

        # Concatenate feature representations before projection
        self.flow = np.concatenate(self.flow).transpose()

        return self

    def transform(self, X, y=None):
        """
        Project data along the geodesic path. 

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data used for prediction.

        Return values
        -------
        X_projected: numpy.ndarray, shape (n_samples, n_pv * (n_representations + 1))
            Genomics data projected along the flow.
        """

        return self.standard_scaler_input_.fit_transform(X).dot(self.flow)
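
After fit, FlowProjector flattens the sampled flow into a single projection matrix. A small sketch of that last step on synthetic data, assuming the (n_representations + 1, n_pv, n_genes) shape returned by sample_flow (names and data are illustrative):

import numpy as np

rng = np.random.default_rng(1)

n_representations, n_pv, n_genes, n_samples = 5, 3, 20, 8

# Shape produced by sample_flow: (n_representations + 1, n_pv, n_genes).
flow = rng.normal(size=(n_representations + 1, n_pv, n_genes))

# Concatenate all intermediate features, then transpose to (n_genes, n_features),
# mirroring np.concatenate(self.flow).transpose() in fit.
flow_matrix = np.concatenate(flow).transpose()

X = rng.normal(size=(n_samples, n_genes))
X_projected = X.dot(flow_matrix)

# One column per (time point, principal vector) pair.
print(X_projected.shape)  # (8, 18) == (n_samples, n_pv * (n_representations + 1))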
Example #8
class GeodesicMatrixComputer(BaseEstimator):
    """ Geodesic Flow Kernel computation.

    Compute the geodesic flow kernel matrix. We use the equivalent definition
    derived in [1] to make it faster. Principal vectors are therefore first
    computed to project onto them.

    Attributes
    -------
    """
    def __init__(self,
                 source_data,
                 target_data,
                 n_factors,
                 n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=1000,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data used as source, e.g. cell line or PDX transcriptome readouts.

        target_data: np.ndarray (n_samples, n_genes)
            Data used as target, e.g. tumor transcriptome readouts.

        n_factors: int
            Number of domain-invariant factors.

        n_pv: int
            Number of principal vectors.

        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.

        dim_reduction_target : str, default to None 
            Dimensionality reduction method for the target data.

        n_representations: int, optional, default to 1000
            Number of interpolated features.

        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in domain-adaptation.

        mean_center : bool, optional, default to False
            Whether X_source features (i.e. genes) should be mean-centered.

        std_unit : bool, optional, default to False
            Whether X_source features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data

        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data

        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation_ = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def fit(self, X, y=None):
        """
        Computes the geodesic flow kernel matrix used in kernel ridge.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider

        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data (optional, just for compliance with BaseEstimator)

        Return values
        -------
        self: returns an instance of self.
        """

        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)
        self.training_data = self.standard_scaler_input_.transform(X)

        # Compute principal vectors
        self.pv_computation_.fit(self.source_data, self.target_data, y)

        # Compute G, kernel matrix
        self.G_ = self.intermediate_factors.compute_geodesic_matrix(
            self.pv_computation_.source_components_,
            self.pv_computation_.target_components_)

        # Compute projector
        self.projector_ = np.block([
            self.pv_computation_.source_components_.transpose(),
            self.pv_computation_.target_components_.transpose()
        ])

        return self

    def _compute_kernel_matrix(self, X1, X2=None):
        X1_projected = X1.dot(self.projector_)
        if X2 is None:
            X2_projected = X1_projected
        else:
            X2_projected = X2.dot(self.projector_)

        return X1_projected.dot(self.G_).dot(X2_projected.transpose())

    def transform(self, X, y=None):
        """
        Compute the domain-invariant kernel matrix

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data used for prediction.

        Return values
        -------
        K: numpy.ndarray, shape (n_samples, n_training_samples)
            Domain-invariant kernel matrix between X and the training data (fed in the fit method).
        """
        return self._compute_kernel_matrix(
            self.standard_scaler_input_.fit_transform(X), self.training_data)
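
The closed-form terms used in compute_geodesic_matrix and the projection in _compute_kernel_matrix can be illustrated for a single principal angle. The sketch below is a toy check, not part of the module (u_s, u_t, P and G are hypothetical names): it builds the 2x2 G matrix from the same diagonal and off-diagonal terms and evaluates the resulting domain-invariant scalar product for two vectors.

import numpy as np

theta = 0.4  # single principal angle between source and target PVs

# Same closed-form terms as in compute_geodesic_matrix (limits 1/3 and 1/6 at theta = 0).
diag_term = (theta - np.cos(theta) * np.sin(theta)) / (2. * theta * np.sin(theta) ** 2)
off_diag_term = (np.sin(theta) - theta * np.cos(theta)) / (2. * np.sin(theta) ** 2 * theta)

G = np.array([[diag_term, off_diag_term],
              [off_diag_term, diag_term]])

# Source and target principal vectors in R^3, an angle theta apart.
u_s = np.array([1., 0., 0.])
u_t = np.array([np.cos(theta), np.sin(theta), 0.])

# Projection onto the concatenated PVs, as in the projection / projector_ attributes.
P = np.column_stack([u_s, u_t])  # shape (n_genes, 2)

x = np.array([0.2, -1.0, 0.5])
y = np.array([1.5, 0.3, -0.7])

# Domain-invariant scalar product, as in _compute_domain_invariant_scalar_product.
x_p = x.dot(P)
y_p = y.dot(P)
print(x_p.dot(G).dot(y_p))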