示例#1
0
    def test_codependence_matrix(self):
        '''
        Check that get_dependence_matrix and get_distance_matrix return
        matrices whose first dimension equals the number of columns of
        self.X_matrix.
        '''
        # Dependence matrices, one per method, built in a fixed order.
        methods = ('information_variation', 'mutual_information',
                   'distance_correlation')
        dependence = {
            method: get_dependence_matrix(self.X_matrix,
                                          dependence_method=method)
            for method in methods
        }

        # Distance matrices: each metric is applied to one dependence matrix.
        metric_sources = (
            ('angular', 'information_variation'),
            ('squared_angular', 'mutual_information'),
            ('abs_angular', 'distance_correlation'),
        )
        distances = [
            get_distance_matrix(dependence[source], distance_metric=metric)
            for metric, source in metric_sources
        ]

        # Every matrix must have one row per feature column.
        for matrix in (*dependence.values(), *distances):
            self.assertEqual(matrix.shape[0], self.X_matrix.shape[1])
示例#2
0
 def test_value_error_raise(self):
     '''
     Check that unrecognised method/metric names raise ValueError.
     '''
     # Unknown dependence method
     self.assertRaises(ValueError, get_dependence_matrix,
                       self.X_matrix, dependence_method='unknown')
     # Unknown distance metric
     self.assertRaises(ValueError, get_distance_matrix,
                       self.X_matrix, distance_metric='unknown')
示例#3
0
    def test_codependence_matrix(self):
        '''
        Check that get_dependence_matrix and get_distance_matrix return
        matrices whose first dimension equals the number of columns of
        self.X_matrix.
        '''

        # TODO: add tests for values in matrix

        # Dependence matrices: (method name, extra keyword arguments),
        # evaluated in the listed order.
        dependence_specs = [
            ('information_variation', {}),
            ('mutual_information', {}),
            ('distance_correlation', {}),
            ('spearmans_rho', {}),
            ('gpr_distance', {'theta': 0.5}),
            ('gnpr_distance', {'theta': 0.5, 'bandwidth': 0.02}),
        ]
        dependence = {
            method: get_dependence_matrix(self.X_matrix, dependence_method=method, **extra)
            for method, extra in dependence_specs
        }

        # Distance matrices: each metric is applied to one dependence matrix.
        metric_sources = [
            ('angular', 'information_variation'),
            ('squared_angular', 'mutual_information'),
            ('abs_angular', 'distance_correlation'),
        ]
        distances = [
            get_distance_matrix(dependence[source], distance_metric=metric)
            for metric, source in metric_sources
        ]

        # Every matrix must have one row per feature column.
        for matrix in (*dependence.values(), *distances):
            self.assertEqual(matrix.shape[0], self.X_matrix.shape[1])
def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None,
                         linkage_method: str = None, n_clusters: int = None, critical_threshold: float = 0.0) -> list:
    """
    Machine Learning for Asset Managers
    Snippet 6.5.2.1 , page 85. Step 1: Features Clustering

    Splits the columns of ``X`` into clusters of features.

    If ``n_clusters`` is None and either ``distance_metric`` or ``linkage_method`` is None, the clusters found by the
    ONC algorithm are returned directly. If ``n_clusters`` is None but both ``distance_metric`` and ``linkage_method``
    are given, the number of ONC clusters is used as ``n_clusters`` for hierarchical clustering. Otherwise
    hierarchical clustering is run with the given ``n_clusters``.

    :param X: (pd.DataFrame) Dataframe of features.
    :param dependence_metric: (str) Method used to build the dependence matrix. 'linear' uses ``X.corr()``; any other
                              value (e.g. 'information_variation', 'mutual_information', 'distance_correlation') is
                              forwarded to ``get_dependence_matrix``.
    :param distance_metric: (str) Distance operator forwarded to ``get_distance_matrix``, e.g. 'angular',
                            'squared_angular', 'abs_angular'. Leave as None to take the clusters as produced by
                            the ONC algorithm.
    :param linkage_method: (str) Linkage method forwarded to ``scipy`` ``linkage``, e.g. 'single', 'ward', 'complete',
                           'average', 'weighted', 'centroid'. Leave as None to take the clusters as produced by
                           the ONC algorithm.
    :param n_clusters: (int) Number of clusters to form. Must be less than the number of features. If None, the
                       number of clusters is decided by the ONC algorithm.
    :param critical_threshold: (float) Silhouette score threshold forwarded to ``_check_for_low_silhouette_scores``;
                               intended to be a real number in [-1, +1]. Default is 0.
    :return: (list) Feature subsets, one list of column names per cluster.
    :raises ValueError: If ``n_clusters`` is not less than the number of columns of ``X``.
    """

    # Build the dependence matrix: plain correlation for 'linear', otherwise delegate
    if dependence_metric == 'linear':
        dep_matrix = X.corr()
    else:
        dep_matrix = get_dependence_matrix(X, dependence_method=dependence_metric)

    # The silhouette check may return a different dataframe, which replaces X from here on
    # NOTE(review): dep_matrix is not recomputed after this call - confirm that is intended
    X = _check_for_low_silhouette_scores(X, dep_matrix, critical_threshold)

    if n_clusters is None:
        onc_clusters = get_onc_clusters(dep_matrix.fillna(0))[1]
        if distance_metric is None or linkage_method is None:
            # Not enough information for hierarchical clustering: return the ONC clusters as they are
            return list(onc_clusters.values())
        # Both distance and linkage given: only borrow the cluster count from ONC
        n_clusters = len(onc_clusters)

    # The number of clusters has to stay below the number of features
    if n_clusters >= len(X.columns):
        raise ValueError('Number of clusters must be less than the number of features')

    # Turn the dependence matrix into a distance matrix
    dist_matrix = get_distance_matrix(dep_matrix, distance_metric=distance_metric)

    # Hierarchical clustering on the condensed distance matrix, cut into at most n_clusters groups
    condensed = squareform(dist_matrix)
    link = linkage(condensed, method=linkage_method)
    labels = fcluster(link, t=n_clusters, criterion='maxclust')

    # Collect the column names belonging to each cluster label 1..n_clusters
    clustered_subsets = []
    for cluster_id in range(1, n_clusters + 1):
        members = [feature for label, feature in zip(labels, X.columns) if label == cluster_id]
        clustered_subsets.append(members)

    return clustered_subsets
示例#5
0
    def test_codependence_matrix(self):
        '''
        Check that get_dependence_matrix and get_distance_matrix return
        matrices whose first dimension equals the number of columns of
        self.X_matrix.
        '''

        # TODO: add tests for values in matrix

        # Keyword arguments for each dependence matrix, evaluated in the
        # listed order. The first three feed the distance matrices below.
        dependence_kwargs = [
            {'dependence_method': 'information_variation'},
            {'dependence_method': 'mutual_information'},
            {'dependence_method': 'distance_correlation'},
            {'dependence_method': 'spearmans_rho'},
            {'dependence_method': 'gpr_distance', 'theta': 0.5},
            {'dependence_method': 'gnpr_distance', 'theta': 0.5},
            {'dependence_method': 'optimal_transport',
             'target_dependence': 'comonotonicity'},
            {'dependence_method': 'optimal_transport',
             'target_dependence': 'countermonotonicity'},
            {'dependence_method': 'optimal_transport',
             'target_dependence': 'gaussian',
             'gaussian_corr': 0.6},
            {'dependence_method': 'optimal_transport',
             'target_dependence': 'positive_negative'},
            {'dependence_method': 'optimal_transport',
             'target_dependence': 'different_variations'},
            {'dependence_method': 'optimal_transport',
             'target_dependence': 'small_variations'},
        ]
        dependence_matrices = [
            get_dependence_matrix(self.X_matrix, **kwargs)
            for kwargs in dependence_kwargs
        ]

        # Distance matrices: pair each metric with one dependence matrix.
        vi_matrix, mi_matrix, corr_matrix = dependence_matrices[:3]
        metric_inputs = [
            (vi_matrix, 'angular'),
            (mi_matrix, 'squared_angular'),
            (corr_matrix, 'abs_angular'),
        ]
        distance_matrices = [
            get_distance_matrix(matrix, distance_metric=metric)
            for matrix, metric in metric_inputs
        ]

        # Every matrix must have one row per feature column.
        for matrix in dependence_matrices + distance_matrices:
            self.assertEqual(matrix.shape[0], self.X_matrix.shape[1])