def test_codependence_matrix(self):
    """
    Check the row count of dependence and distance matrices.

    Builds a dependence matrix with each of three dependence methods, turns
    each one into a distance matrix using a different distance metric, and
    verifies that every result has as many rows as ``self.X_matrix`` has
    columns.
    """
    # Dependence matrices, built in a fixed order so they can be unpacked by name.
    dependence_methods = ('information_variation', 'mutual_information', 'distance_correlation')
    vi_matrix, mi_matrix, corr_matrix = [
        get_dependence_matrix(self.X_matrix, dependence_method=method)
        for method in dependence_methods
    ]

    # Distance matrices: each metric is applied to a different dependence matrix.
    angl = get_distance_matrix(vi_matrix, distance_metric='angular')
    sq_angl = get_distance_matrix(mi_matrix, distance_metric='squared_angular')
    abs_angl = get_distance_matrix(corr_matrix, distance_metric='abs_angular')

    # Every matrix must have one row per feature (column of X_matrix).
    for matrix in (vi_matrix, mi_matrix, corr_matrix, angl, sq_angl, abs_angl):
        self.assertEqual(matrix.shape[0], self.X_matrix.shape[1])
def test_value_error_raise(self):
    """
    Check that invalid method/metric names are rejected with ValueError.
    """
    # Unknown dependence method.
    self.assertRaises(ValueError, get_dependence_matrix,
                      self.X_matrix, dependence_method='unknown')

    # Unknown distance metric.
    self.assertRaises(ValueError, get_distance_matrix,
                      self.X_matrix, distance_metric='unknown')
def test_codependence_matrix(self):
    """
    Check the row count of dependence and distance matrices.

    Builds a dependence matrix with each of six dependence methods, derives
    three distance matrices from the first three, and verifies that every
    result has as many rows as ``self.X_matrix`` has columns.
    """
    # TODO: add tests for values in matrix

    # Dependence matrices that need no extra keyword arguments.
    vi_matrix, mi_matrix, corr_matrix, rho_matrix = [
        get_dependence_matrix(self.X_matrix, dependence_method=method)
        for method in ('information_variation', 'mutual_information',
                       'distance_correlation', 'spearmans_rho')
    ]

    # Dependence matrices that take additional parameters.
    gpr_matrix = get_dependence_matrix(self.X_matrix, dependence_method='gpr_distance',
                                       theta=0.5)
    gnpr_matrix = get_dependence_matrix(self.X_matrix, dependence_method='gnpr_distance',
                                        theta=0.5, bandwidth=0.02)

    # Distance matrices: each metric is applied to a different dependence matrix.
    angl = get_distance_matrix(vi_matrix, distance_metric='angular')
    sq_angl = get_distance_matrix(mi_matrix, distance_metric='squared_angular')
    abs_angl = get_distance_matrix(corr_matrix, distance_metric='abs_angular')

    # Every matrix must have one row per feature (column of X_matrix).
    all_matrices = (vi_matrix, mi_matrix, corr_matrix, rho_matrix, gpr_matrix,
                    gnpr_matrix, angl, sq_angl, abs_angl)
    for matrix in all_matrices:
        self.assertEqual(matrix.shape[0], self.X_matrix.shape[1])
def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None,
                         linkage_method: str = None, n_clusters: int = None,
                         critical_threshold: float = 0.0) -> list:
    """
    Machine Learning for Asset Managers
    Snippet 6.5.2.1 , page 85. Step 1: Features Clustering

    Splits the given features into clustered subsets.

    :param X: (pd.DataFrame) Dataframe of features.
    :param dependence_metric: (str) Method used to build the dependence matrix. 'linear' uses
                              ``X.corr()``; any other value (e.g. 'information_variation',
                              'mutual_information', 'distance_correlation') is forwarded to
                              get_dependence_matrix.
    :param distance_metric: (str) Distance operator forwarded to get_distance_matrix, e.g.
                            'angular', 'squared_angular', 'abs_angular'. Leave as None to take
                            the clusters exactly as produced by the ONC algorithm.
    :param linkage_method: (str) Linkage method used for hierarchical clustering, e.g. 'single',
                           'ward', 'complete', 'average', 'weighted', 'centroid'. Leave as None
                           to take the clusters exactly as produced by the ONC algorithm.
    :param n_clusters: (int) Number of clusters to form. Must be less than the total number of
                       features. If None, the number of clusters found by the ONC algorithm
                       is used.
    :param critical_threshold: (float) Threshold for flagging a low silhouette score. Any real
                               number in [-1, +1]; the default of 0 means features whose
                               silhouette score is below 0 are identified as low-silhouette
                               and the required corrective transformation is applied to them.
    :return: (list) Feature subsets.
    :raises ValueError: If the number of clusters is not smaller than the number of features.
    """
    # Plain correlation for 'linear'; every other name is delegated to get_dependence_matrix.
    if dependence_metric == 'linear':
        dep_matrix = X.corr()
    else:
        dep_matrix = get_dependence_matrix(X, dependence_method=dependence_metric)

    # The check may hand back a different feature frame.
    # NOTE(review): dep_matrix is not recomputed from the returned X - confirm this is intended.
    X = _check_for_low_silhouette_scores(X, dep_matrix, critical_threshold)

    if n_clusters is None:
        onc_clusters = get_onc_clusters(dep_matrix.fillna(0))[1]
        # Without both a distance metric and a linkage method, the ONC result is final.
        if distance_metric is None or linkage_method is None:
            return list(onc_clusters.values())
        # Otherwise ONC only supplies the cluster count for the hierarchical step.
        n_clusters = len(onc_clusters)

    feature_names = X.columns
    if n_clusters >= len(feature_names):
        raise ValueError('Number of clusters must be less than the number of features')

    # Distance matrix -> condensed form -> linkage -> flat labels (1..n_clusters).
    dist_matrix = get_distance_matrix(dep_matrix, distance_metric=distance_metric)
    condensed = squareform(dist_matrix)
    link = linkage(condensed, method=linkage_method)
    labels = fcluster(link, t=n_clusters, criterion='maxclust')

    # Collect the features of each cluster id, in column order.
    clustered_subsets = []
    for cluster_id in range(1, n_clusters + 1):
        members = [feature for label, feature in zip(labels, feature_names)
                   if label == cluster_id]
        clustered_subsets.append(members)
    return clustered_subsets
def test_codependence_matrix(self):
    """
    Check the row count of dependence and distance matrices.

    Builds dependence matrices for the plain methods, the GPR/GNPR distances
    and six optimal-transport target dependences, derives three distance
    matrices, and verifies that every result has as many rows as
    ``self.X_matrix`` has columns.
    """
    # TODO: add tests for values in matrix

    # Dependence matrices that need no extra keyword arguments.
    vi_matrix, mi_matrix, corr_matrix, rho_matrix = [
        get_dependence_matrix(self.X_matrix, dependence_method=method)
        for method in ('information_variation', 'mutual_information',
                       'distance_correlation', 'spearmans_rho')
    ]

    # GPR / GNPR distances, both with theta=0.5.
    gpr_matrix = get_dependence_matrix(self.X_matrix, dependence_method='gpr_distance',
                                       theta=0.5)
    gnpr_matrix = get_dependence_matrix(self.X_matrix, dependence_method='gnpr_distance',
                                        theta=0.5)

    # Optimal-transport dependence: (target_dependence, extra keyword arguments).
    ot_targets = (
        ('comonotonicity', {}),
        ('countermonotonicity', {}),
        ('gaussian', {'gaussian_corr': 0.6}),
        ('positive_negative', {}),
        ('different_variations', {}),
        ('small_variations', {}),
    )
    ot_matrices = [
        get_dependence_matrix(self.X_matrix, dependence_method='optimal_transport',
                              target_dependence=target, **extra)
        for target, extra in ot_targets
    ]

    # Distance matrices: each metric is applied to a different dependence matrix.
    angl = get_distance_matrix(vi_matrix, distance_metric='angular')
    sq_angl = get_distance_matrix(mi_matrix, distance_metric='squared_angular')
    abs_angl = get_distance_matrix(corr_matrix, distance_metric='abs_angular')

    # Every matrix must have one row per feature (column of X_matrix).
    all_matrices = [vi_matrix, mi_matrix, corr_matrix, rho_matrix, gpr_matrix, gnpr_matrix,
                    *ot_matrices, angl, sq_angl, abs_angl]
    for matrix in all_matrices:
        self.assertEqual(matrix.shape[0], self.X_matrix.shape[1])