def _compute_principal_vectors(self):
    """
    Align the stored source and target factors into principal vectors.

    Reads ``self.source_components_`` and ``self.target_components_``, builds
    a ``PVComputation`` sized by the smaller of the two factor counts (the
    same value is passed for both constructor arguments) and stores it in
    ``self.principal_vectors_`` before running the alignment.
    """
    source = self.source_components_
    target = self.target_components_

    # Number of factors on each side is the first dimension of the loadings.
    shared_rank = np.min([source.shape[0], target.shape[0]])

    pv_solver = PVComputation(shared_rank, shared_rank)
    self.principal_vectors_ = pv_solver
    pv_solver.compute_principal_vectors(source, target)
def _compute_principal_vectors(self):
    """
    Align the stored source and target factors into principal vectors.

    Reads ``self.source_components_`` and ``self.target_components_``. The
    per-domain factor counts are passed to ``PVComputation`` as a dictionary
    keyed by ``'source'`` and ``'target'``, and the number of principal
    vectors is the smaller of the two counts. The resulting object is stored
    in ``self.principal_vectors_`` before the alignment is run.
    """
    source = self.source_components_
    target = self.target_components_

    # Number of factors on each side is the first dimension of the loadings.
    n_source = source.shape[0]
    n_target = target.shape[0]

    factor_counts = {'source': n_source, 'target': n_target}
    smallest_count = np.min([n_source, n_target])

    pv_solver = PVComputation(factor_counts, smallest_count)
    self.principal_vectors_ = pv_solver
    pv_solver.compute_principal_vectors(source, target)
class IntermediateFactors:
    """
    Handle the intermediate representations between source and target factors.

    Two services are offered: sampling factors along the geodesic path that
    joins each source principal vector to its target counterpart
    (``sample_flow``), and computing the matrix of the Geodesic Flow Kernel
    (``compute_geodesic_matrix``).

    Attributes
    -------
    source_components_ : numpy.ndarray, shape (n_components, n_features)
        Loadings of the source factors, be them already aligned to target
        or not. Set by ``sample_flow`` and ``compute_geodesic_matrix``.
    target_components_ : numpy.ndarray, shape (n_components, n_features)
        Loadings of the target factors, be them already aligned to source
        or not. Set by ``sample_flow`` and ``compute_geodesic_matrix``.
    principal_vectors_ : object or None
        Container exposing ``source_components_``, ``target_components_``
        and ``angles_``. None until one of the public methods is called.
    intermediate_factors_ : None
        Placeholder initialised to None; the methods of this class do not
        populate it.
    n_representations : int
        Number of representations along the geodesic path. If -1, only the
        end of the path (t = 1) is sampled.
    n_jobs : int
        Number of jobs used when sampling the flow.
    G_matrix : numpy.ndarray
        Block matrix of the geodesic flow kernel, set by
        ``compute_geodesic_matrix``.
    projection : numpy.ndarray
        Source and target principal vectors stacked column-wise, set by
        ``compute_geodesic_matrix``.
    """

    def __init__(self, n_representations, n_jobs=1):
        """
        Parameters
        -------
        n_representations : int
            Number of representations to pick between source and target.
        n_jobs: int (optional, default to 1)
            Number of jobs for computation.
        """
        self.n_representations = n_representations
        self.intermediate_factors_ = None
        self.source_components_ = None
        self.target_components_ = None
        self.principal_vectors_ = None
        # The argument used to be discarded in favour of a hard-coded 1.
        self.n_jobs = n_jobs

    def _new_principal_vectors(self):
        """Build an empty PVComputation sized after the stored components."""
        n_source = self.source_components_.shape[0]
        n_target = self.target_components_.shape[0]
        n_pv = np.min([n_source, n_target])
        n_factors = {'source': n_source, 'target': n_target}
        return PVComputation(n_factors, n_pv)

    def _compute_principal_vectors(self):
        """Align the stored source and target factors into principal vectors."""
        self.principal_vectors_ = self._new_principal_vectors()
        self.principal_vectors_.compute_principal_vectors(
            self.source_components_,
            self.target_components_)

    def _use_aligned_components(self):
        """
        Register the stored components as principal vectors without
        re-aligning them.

        An existing ``principal_vectors_`` container is reused, otherwise a
        new one is created. The angles are recomputed from the components
        themselves: for aligned, unit-norm vectors the cosine of the i-th
        principal angle is the scalar product of the i-th source and target
        vectors.
        """
        if getattr(self, 'principal_vectors_', None) is None:
            self.principal_vectors_ = self._new_principal_vectors()

        pv = self.principal_vectors_
        pv.source_components_ = self.source_components_
        pv.target_components_ = self.target_components_

        cosines = np.sum(
            np.asarray(self.source_components_)
            * np.asarray(self.target_components_),
            axis=1)
        # Clip to absorb rounding errors that would push |cos| above 1.
        pv.angles_ = np.arccos(np.clip(cosines, -1., 1.))

    @staticmethod
    def _compute_flow_time(t, principal_vectors):
        """
        Compute the intermediate factors at position t of the geodesic path.

        Parameters
        -------
        t : float
            Position along the path; 0 yields the source vectors and 1 the
            target vectors.
        principal_vectors : object
            Container exposing ``angles_``, ``source_components_`` and
            ``target_components_``.

        Return values
        -------
        numpy.ndarray with one row per principal vector.
        """
        angles = principal_vectors.angles_

        # 0/0 is expected when an angle is zero; it is replaced below.
        with np.errstate(divide='ignore', invalid='ignore'):
            sin_angles = np.sin(angles)
            Pi = np.sin((1 - t) * angles) / sin_angles
            Xi = np.sin(t * angles) / sin_angles

        Pi[np.isnan(Pi)] = 1 - t  # Asymptotic value of sin/sin in 0
        Xi[np.isnan(Xi)] = t  # Asymptotic value of sin/sin in 0

        return (principal_vectors.source_components_.T * Pi
                + principal_vectors.target_components_.T * Xi).T

    def sample_flow(self, source_components, target_components,
                    already_aligned=False):
        """
        Sample intermediate subspaces (i.e. set of factors) uniformly along
        the geodesic flow.

        IMPORTANT: Same genes have to be given for source and target, and in
        same order.

        Parameters
        -------
        source_components : np.ndarray, shape (n_components, n_features)
            Source factors
        target_components : np.ndarray, shape (n_components, n_features)
            Target factors
        already_aligned : boolean (optional, default to False)
            Whether the components are already aligned (i.e. are they PV or
            not). When True, both inputs must have the same number of rows.

        Return values
        -------
        Intermediate subspace, numpy.ndarray of shape
        (n_representations + 1, n_components, n_features). When
        n_representations is -1, a single time point (t = 1) is returned.
        """
        self.source_components_ = source_components
        self.target_components_ = target_components

        # Compute the principal vectors
        if not already_aligned:
            self._compute_principal_vectors()
        else:
            self._use_aligned_components()

        # Sample at different uniformly distributed time points
        if self.n_representations == -1:
            t_sample = np.array([1])
        else:
            t_sample = np.linspace(0, 1, self.n_representations + 1)

        if self.n_jobs >= 2:
            return np.array(
                Parallel(n_jobs=self.n_jobs)(
                    delayed(IntermediateFactors._compute_flow_time)(
                        t, self.principal_vectors_)
                    for t in t_sample
                )
            )

        return np.array([
            IntermediateFactors._compute_flow_time(t, self.principal_vectors_)
            for t in t_sample
        ])

    def compute_geodesic_matrix(self, source_components, target_components):
        """
        Compute the matrix of the domain-invariant Geodesic Flow Kernel.

        Parameters
        -------
        source_components : np.ndarray, shape (n_components, n_features)
            Source factors
        target_components : np.ndarray, shape (n_components, n_features)
            Target factors

        Return values
        -------
        G_matrix : numpy.ndarray, shape (2 * n_pv, 2 * n_pv)
            Block matrix [[D, O], [O, D]] where D and O are diagonal. It is
            also stored in ``self.G_matrix``; the matching projection matrix
            is stored in ``self.projection``.
        """
        self.source_components_ = source_components
        self.target_components_ = target_components

        self._compute_principal_vectors()
        angles = self.principal_vectors_.angles_

        # Divisions by zero are expected for null angles and fixed below.
        with np.errstate(divide='ignore', invalid='ignore'):
            sin_angles = np.sin(angles)
            cos_angles = np.cos(angles)
            sin_squared = np.power(sin_angles, 2)

            diag_term = (angles - cos_angles * sin_angles) \
                / 2 / angles / sin_squared
            off_diag_term = (sin_angles - cos_angles * angles) \
                / 2 / sin_squared / angles

        # Correct for extreme case when theta = 0
        diag_term[np.isnan(diag_term)] = 1. / 3.
        diag_term[np.isinf(diag_term)] = 1. / 3.
        off_diag_term[np.isnan(off_diag_term)] = 1. / 6.
        off_diag_term[np.isinf(off_diag_term)] = 1. / 6.

        diag_term = np.diag(diag_term)
        off_diag_term = np.diag(off_diag_term)

        self.G_matrix = np.block([[diag_term, off_diag_term],
                                  [off_diag_term, diag_term]])
        self.projection = np.block([
            self.principal_vectors_.source_components_.transpose(),
            self.principal_vectors_.target_components_.transpose()
        ])

        return self.G_matrix

    @staticmethod
    def _compute_domain_invariant_scalar_product(x, y, projection, G_matrix):
        """
        Domain-invariant scalar product of x and y.

        Both inputs are projected with ``projection`` and then combined
        through ``G_matrix``.
        """
        x_p = x.dot(projection)
        y_p = y.dot(projection)

        return x_p.dot(G_matrix).dot(y_p.transpose())
def __init__(self,
             n_representations=100,
             method='consensus',
             mean_center=True,
             std_unit=False,
             n_factors=70,
             n_pv=40,
             dim_reduction='pca',
             dim_reduction_target=None,
             l1_ratio=0,
             source_data=None,
             target_data=None,
             n_jobs=1):
    """
    Store the configuration and build the helper objects.

    Parameters
    -------
    n_representations : int, default to 100
        Number of representations between source and target principal
        vectors for interpolation. 0 means source only, -1 means target
        only.
    method : str, default to 'consensus'
        Scheme used for the domain adaptation step, i.e. 'consensus',
        'elasticnet', or 'gfk'.
    mean_center : bool, default to True
        Whether the different datasets used in the implementation should be
        mean centered.
    std_unit : bool, default to False
        Whether the different datasets used in the implementation should be
        standardized (feature-level variance to 1).
    n_factors : int, default to 70
        Number of domain-specific factors to compute, e.g. PCs.
    n_pv : int, default to 40
        Number of principal vectors to compute from the domain-specific
        factors.
    dim_reduction : str, default to 'pca'
        Dimensionality reduction method for the source data, i.e. 'pca',
        'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
    dim_reduction_target : str, default to None
        Dimensionality reduction method for the target data, same choices
        as dim_reduction. If None, set to dim_reduction.
    l1_ratio : float, default to 0
        l1 ratio for elasticnet model (0 is Ridge, 1 is Lasso).
    source_data : np.ndarray, default to None
        Source data to use in domain adaptation phase.
    target_data : np.ndarray, default to None
        Target data to use in domain adaptation phase.
    n_jobs : int, default to 1
        Number of jobs used in parallelisation.
    """
    # Data handed over at construction time.
    self.source_data, self.target_data = source_data, target_data

    # Pre-processing switches.
    self.mean_center, self.std_unit = mean_center, std_unit

    # Domain-adaptation settings.
    self.method = method
    self.n_representations = n_representations
    self.n_factors, self.n_pv = n_factors, n_pv
    self.dim_reduction = dim_reduction
    self.dim_reduction_target = dim_reduction_target
    self.l1_ratio = l1_ratio
    self.n_jobs = n_jobs

    # Helpers: factor alignment, then interpolation between aligned factors.
    self.pv_computation = PVComputation(self.n_factors,
                                        self.n_pv,
                                        self.dim_reduction,
                                        self.dim_reduction_target)
    self.intermediate_factors = IntermediateFactors(self.n_representations)

    # No predictive model until one is trained.
    self.predictor = None

    # Default values for CV
    self.alpha_values = np.logspace(-6, 10, 34)
    self.cv_fold = 10
    self.verbose = 1
def __init__(self, source_data, target_data, n_factors, n_pv,
             dim_reduction='pca',
             dim_reduction_target=None,
             n_representations=100,
             use_data=False,
             mean_center=False,
             std_unit=False):
    """
    Parameters
    -------
    source_data: np.ndarray (n_samples, n_genes)
        Data use as source, e.g. cell line or PDX transcriptome read outs.
    target_data: np.ndarray (n_samples, n_genes)
        Data use as target, e.g. tumor transcriptome read outs.
    n_factors: int
        Number of domain-invariant factors.
    n_pv: int
        Number of principal vectors.
    dim_reduction : str, default to 'pca'
        Dimensionality reduction method for the source data,
        i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
    dim_reduction_target : str, default to None
        Dimensionality reduction method for the target data.
    n_representations: int, optional default to 100
        Number of interpolated features.
    use_data: bool, optional, default to False
        Whether data given additionally in fit should be used in
        domain-adaptation.
    mean_center : bool, optional, default to False
        Whether features (i.e. genes) should be mean-centered.
    std_unit : bool, optional, default to False
        Whether features (i.e. genes) should be standardized.
    """
    self.source_data = source_data
    self.target_data = target_data
    self.n_factors = n_factors
    self.n_pv = n_pv
    self.dim_reduction = dim_reduction
    self.dim_reduction_target = dim_reduction_target
    self.n_representations = n_representations
    self.use_data = use_data
    # Keep the scaling switches like every other constructor argument so
    # that the configuration can be read back from the instance.
    self.mean_center = mean_center
    self.std_unit = std_unit

    # One scaler per dataset: data passed to fit/transform, source, target.
    self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                 with_std=std_unit)
    self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                  with_std=std_unit)
    self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                  with_std=std_unit)

    self.pv_computation = PVComputation(
        n_factors=self.n_factors,
        n_pv=self.n_pv,
        dim_reduction=self.dim_reduction,
        dim_reduction_target=self.dim_reduction_target,
    )

    self.intermediate_factors = IntermediateFactors(
        n_representations=self.n_representations)
class ConsensusRepresentation(BaseEstimator):
    """Consensus Representation computation.

    Given source and target data, computes the principal vectors,
    interpolates between each pair of source and target principal vectors
    and keeps, for each pair, the interpolated feature on which projected
    source and target data are the most comparable (smallest two-sample
    Kolmogorov-Smirnov statistic).

    Attributes
    -------
    flow : numpy.ndarray
        Interpolated features returned by ``IntermediateFactors.sample_flow``
        (set in ``fit``).
    consensus_representation : numpy.ndarray
        Selected features, one column per principal vector (set in ``fit``).
    """

    def __init__(self, source_data, target_data, n_factors, n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=1000,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data use as source, e.g. cell line or PDX transcriptome read outs.
        target_data: np.ndarray (n_samples, n_genes)
            Data use as target, e.g. tumor transcriptome read outs.
        n_factors: int
            Number of domain-invariant factors.
        n_pv: int
            Number of principal vectors.
        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.
        n_representations: int, optional default to 1000
            Number of interpolated features.
        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.
        mean_center : bool, optional, default to False
            Whether features (i.e. genes) should be mean-centered.
        std_unit : bool, optional, default to False
            Whether features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data
        # BaseEstimator.get_params (hence clone and repr) reads every
        # constructor argument back from an attribute of the same name.
        self.mean_center = mean_center
        self.std_unit = std_unit

        # One scaler per dataset: data passed to fit/transform, source, target.
        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def _find_common_representation(self):
        """
        Select, for each principal vector, the interpolated feature that
        minimises the KS statistic between projected source and target data.

        Stores and returns ``self.consensus_representation``.
        """
        # Put the principal-vector axis first so that flow_vectors[i] holds
        # all the interpolated features of the i-th pair.
        flow_vectors = self.flow.transpose(1, 0, 2)

        selected = []
        for i in range(self.n_pv):
            source_projected = flow_vectors[i].dot(
                self.source_data.transpose())
            target_projected = flow_vectors[i].dot(
                self.target_data.transpose())

            # One KS statistic per interpolated feature.
            ks_stats = [
                ks_2samp(s, t)[0]
                for s, t in zip(source_projected, target_projected)
            ]

            selected.append(flow_vectors[i, np.argmin(ks_stats)])

        self.consensus_representation = np.array(selected).transpose()

        return self.consensus_representation

    def fit(self, X, y=None):
        """
        Computes the principal vectors, interpolates between them, projects
        source and target data, and finally computes, by comparing for each
        pair source and target projected data, the point where these two
        quantities are comparable (using KS statistics).

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider
        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data (optional, just for compliance with BaseEstimator)

        Returned Values
        -------
        self: returns an instance of self.
        """
        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)

        # Compute principal vectors
        self.pv_computation.fit(self.source_data, self.target_data, y)

        # Compute intermediate features
        self.flow = self.intermediate_factors.sample_flow(
            self.pv_computation.source_components_,
            self.pv_computation.target_components_)

        # Compute the consensus representation between each PV
        self._find_common_representation()

        return self

    def transform(self, X, y=None):
        """
        Project data on the consensus representation.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data use for prediction.
        y: ignored, present for API compatibility.

        Return values
        -------
        X_projected: numpy.ndarray
            Scaled data multiplied by ``consensus_representation`` (one
            column per principal vector).
        """
        # NOTE(review): fit_transform re-fits the input scaler on X at each
        # call, overwriting the statistics learnt in fit. Kept as is since
        # scaling new data with its own statistics may be intended; confirm.
        return self.standard_scaler_input_.fit_transform(X).dot(
            self.consensus_representation)
class FlowProjector(BaseEstimator):
    """Project on the geodesic.

    Given source and target data, computes the domain-specific factors,
    aligns them to get the principal vectors and finally interpolates between
    source PVs and target PVs. Data can then be projected on all these
    intermediate features.

    Attributes
    -------
    flow : numpy.ndarray
        All intermediate features stacked as columns (set in ``fit``).
    """

    def __init__(self, source_data, target_data, n_factors, n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=100,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data use as source, e.g. cell line or PDX transcriptome read outs.
        target_data: np.ndarray (n_samples, n_genes)
            Data use as target, e.g. tumor transcriptome read outs.
        n_factors: int
            Number of domain-invariant factors.
        n_pv: int
            Number of principal vectors.
        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.
        n_representations: int, optional default to 100
            Number of interpolated features.
        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.
        mean_center : bool, optional, default to False
            Whether features (i.e. genes) should be mean-centered.
        std_unit : bool, optional, default to False
            Whether features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data
        # BaseEstimator.get_params (hence clone and repr) reads every
        # constructor argument back from an attribute of the same name.
        self.mean_center = mean_center
        self.std_unit = std_unit

        # One scaler per dataset: data passed to fit/transform, source, target.
        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def fit(self, X, y=None):
        """
        Computes the intermediate features between the pairs of principal
        vectors.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider
        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data (optional, just for compliance with BaseEstimator)

        Returned Values
        -------
        self: returns an instance of self.
        """
        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)

        # Compute principal vectors
        self.pv_computation.fit(self.source_data, self.target_data, y)

        # Compute intermediate factors.
        self.flow = self.intermediate_factors.sample_flow(
            self.pv_computation.source_components_,
            self.pv_computation.target_components_)

        # Concatenate feature representations before projection: the sampled
        # time points are stacked, then transposed so features are columns.
        self.flow = np.concatenate(self.flow).transpose()

        return self

    def transform(self, X, y=None):
        """
        Project data along the geodesic path.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data use for prediction.
        y: ignored, present for API compatibility.

        Returned values
        -------
        X_projected: numpy.ndarray
            Scaled data multiplied by ``flow``, i.e. one column per
            interpolated feature (all sampled time points of all principal
            vectors).
        """
        # NOTE(review): fit_transform re-fits the input scaler on X at each
        # call, overwriting the statistics learnt in fit. Kept as is since
        # scaling new data with its own statistics may be intended; confirm.
        return self.standard_scaler_input_.fit_transform(X).dot(self.flow)
class GeodesicMatrixComputer(BaseEstimator):
    """Geodesic Flow Kernel computation.

    Compute the geodesic flow kernel matrix. An equivalent definition based
    on principal vectors is used to make it faster: principal vectors are
    first computed and data is projected onto them.

    Attributes
    -------
    training_data : numpy.ndarray
        Scaled version of the data given to ``fit``.
    G_ : numpy.ndarray
        Matrix returned by ``IntermediateFactors.compute_geodesic_matrix``.
    projector_ : numpy.ndarray
        Source and target components stacked column-wise.
    """

    def __init__(self, source_data, target_data, n_factors, n_pv,
                 dim_reduction='pca',
                 dim_reduction_target=None,
                 n_representations=1000,
                 use_data=False,
                 mean_center=False,
                 std_unit=False):
        """
        Parameters
        -------
        source_data: np.ndarray (n_samples, n_genes)
            Data use as source, e.g. cell line or PDX transcriptome read outs.
        target_data: np.ndarray (n_samples, n_genes)
            Data use as target, e.g. tumor transcriptome read outs.
        n_factors: int
            Number of domain-invariant factors.
        n_pv: int
            Number of principal vectors.
        dim_reduction : str, default to 'pca'
            Dimensionality reduction method for the source data,
            i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', 'pls'.
        dim_reduction_target : str, default to None
            Dimensionality reduction method for the target data.
        n_representations: int, optional default to 1000
            Number of interpolated features.
        use_data: bool, optional, default to False
            Whether data given additionally in fit should be used in
            domain-adaptation.
        mean_center : bool, optional, default to False
            Whether features (i.e. genes) should be mean-centered.
        std_unit : bool, optional, default to False
            Whether features (i.e. genes) should be standardized.
        """
        self.source_data = source_data
        self.target_data = target_data
        self.n_factors = n_factors
        self.n_pv = n_pv
        self.dim_reduction = dim_reduction
        self.dim_reduction_target = dim_reduction_target
        self.n_representations = n_representations
        self.use_data = use_data
        # BaseEstimator.get_params (hence clone and repr) reads every
        # constructor argument back from an attribute of the same name.
        self.mean_center = mean_center
        self.std_unit = std_unit

        # One scaler per dataset: data passed to fit/transform, source, target.
        self.standard_scaler_input_ = StandardScaler(with_mean=mean_center,
                                                     with_std=std_unit)
        self.standard_scaler_source_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)
        self.standard_scaler_target_ = StandardScaler(with_mean=mean_center,
                                                      with_std=std_unit)

        self.pv_computation_ = PVComputation(
            n_factors=self.n_factors,
            n_pv=self.n_pv,
            dim_reduction=self.dim_reduction,
            dim_reduction_target=self.dim_reduction_target,
        )

        self.intermediate_factors = IntermediateFactors(
            n_representations=self.n_representations)

    def fit(self, X, y=None):
        """
        Computes the geodesic flow kernel matrix used in kernel ridge.

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data to consider
        y: numpy.ndarray, shape(n_samples, 1), optional, default to None
            Response data (optional, just for compliance with BaseEstimator)

        Returned Values
        -------
        self: returns an instance of self.
        """
        # Add X to source data if use_data set to True
        if self.use_data:
            if self.source_data is None or self.source_data.shape[0] == 0:
                self.source_data = X
            else:
                self.source_data = np.concatenate([self.source_data, X])

        # Standardize data
        self.standard_scaler_input_.fit(X)
        self.source_data = self.standard_scaler_source_.fit_transform(
            self.source_data)
        self.target_data = self.standard_scaler_target_.fit_transform(
            self.target_data)
        # Kept to compute kernel values against the training samples later.
        self.training_data = self.standard_scaler_input_.transform(X)

        # Compute principal vectors
        self.pv_computation_.fit(self.source_data, self.target_data, y)

        # Compute G, kernel matrix
        self.G_ = self.intermediate_factors.compute_geodesic_matrix(
            self.pv_computation_.source_components_,
            self.pv_computation_.target_components_)

        # Compute projector
        self.projector_ = np.block([
            self.pv_computation_.source_components_.transpose(),
            self.pv_computation_.target_components_.transpose()
        ])

        return self

    def _compute_kernel_matrix(self, X1, X2=None):
        """
        Kernel values between the rows of X1 and the rows of X2.

        When X2 is None, the kernel of X1 against itself is returned.
        """
        X1_projected = X1.dot(self.projector_)
        if X2 is None:
            X2_projected = X1_projected
        else:
            X2_projected = X2.dot(self.projector_)

        return X1_projected.dot(self.G_).dot(X2_projected.transpose())

    def transform(self, X, y=None):
        """
        Compute the domain-invariant kernel matrix

        Parameters
        -------
        X: numpy.ndarray, shape (n_samples, n_genes)
            Genomics data use for prediction.
        y: ignored, present for API compatibility.

        Returned values
        -------
        numpy.ndarray
            Kernel matrix between X (rows) and the data fed in the fit
            method (columns).
        """
        # NOTE(review): fit_transform re-fits the input scaler on X at each
        # call, overwriting the statistics learnt in fit. Kept as is since
        # scaling new data with its own statistics may be intended; confirm.
        return self._compute_kernel_matrix(
            self.standard_scaler_input_.fit_transform(X),
            self.training_data)