def _test_u_distance_correlation_vector_generic(self,
                                                vector_type=None,
                                                type_cov=None,
                                                type_cor=None):
    """
    Helper for testing the U-distance covariance/correlation on vectors.

    Builds two six-element vectors whose entries are created with
    ``vector_type`` and checks that the squared U-distance covariance and
    correlation have the expected values *and* the expected result types.

    Parameters
    ----------
    vector_type : callable
        Constructor applied to every element of the input vectors.
    type_cov : type, optional
        Expected type of the covariance result; defaults to ``vector_type``.
    type_cor : type, optional
        Expected type of the correlation result; defaults to ``vector_type``.
    """
    cov_type = vector_type if type_cov is None else type_cov
    cor_type = vector_type if type_cor is None else type_cor

    first = np.array([vector_type(v) for v in (1, 2, 3, 4, 5, 6)])
    second = np.array([vector_type(v) for v in (1, 7, 5, 5, 6, 2)])

    # (x, y, expected covariance, covariance precision, expected correlation)
    cases = (
        (first, second, -0.88889, 5, -0.41613),
        (first, first, 1.5556, 4, 1),
    )

    for x, y, cov_expected, cov_places, cor_expected in cases:
        cov = dcor.u_distance_covariance_sqr(x, y)
        self.assertIsInstance(cov, cov_type)
        self.assertAlmostEqual(cov, cov_type(cov_expected),
                               places=cov_places)

        cor = dcor.u_distance_correlation_sqr(x, y)
        self.assertIsInstance(cor, cor_type)
        self.assertAlmostEqual(cor, cor_type(cor_expected), places=5)
def test_u_distance_covariance_avl_overflow(self):
    """Test potential overflow in fast distance correlation"""
    half = 500
    # 500 integer zeros followed by 500 integer ones.
    sample = np.repeat(np.array([0, 1], dtype=int), half)

    result = dcor.u_distance_covariance_sqr(
        sample,
        sample,
        method='avl',
        compile_mode=dcor.CompileMode.NO_COMPILE,
    )

    self.assertAlmostEqual(result, 0.25050, places=5)
def fit(self, x, **kwargs):
    """
    Estimate the (co-)moment selected by ``self.mode`` and ``self.est``.

    Parameters
    ----------
    x : array exposing ``.shape`` and ``len()``
        First data vector; a one-dimensional input is reshaped to a
        column ``np.matrix``.
    **kwargs
        Optional settings read by this method:

        ``y``            second data vector, required for every co-moment
                         type of mode.
        ``trimming``     default ``0``.
        ``biascorr``     default ``False``.
        ``alpha``        default ``1`` (forced to ``1`` when mode is
                         ``'corr'``).
        ``dmetric``      default ``'euclidean'``.
        ``calcmode``     default ``'fast'``.
        ``order``        default ``2``; overruled by the named modes.
        ``option``       default ``1``, only read when ``order > 2``
                         (otherwise ``0`` is used).
        ``Fisher``       default ``True``, read for ``'kurt'`` and ``'c*k'``.
        ``standardized`` default ``True`` for ``'cos'`` and ``'c*k'``,
                         ``False`` for ``'M3'``.

    Returns
    -------
    The value stored in ``self.moment_``.

    Raises
    ------
    MyException
        If ``x`` is empty, if ``y`` is required but missing, or (on the
        paths that validate it) if ``y`` is empty or differs in length
        from ``x``.

    Notes
    -----
    Modes ``'std'``, ``'skew'``, ``'kurt'``, ``'cos'``, ``'M3'`` and ``'c*k'``
    overwrite ``self.center`` with ``'mean'``. Depending on the path taken,
    ``x_moment_``, ``y_moment_`` and ``co_moment_`` are also set.
    """
    # ---- optional settings -------------------------------------------
    trimming = kwargs.get('trimming', 0)
    biascorr = kwargs.get('biascorr', False)
    alpha = kwargs.get('alpha', 1)
    dmetric = kwargs.get('dmetric', 'euclidean')
    calcmode = kwargs.get('calcmode', 'fast')
    order = kwargs.get('order', 2)

    # ---- map the user-facing mode onto moment ('mom') / co-moment ('com')
    if self.mode == 'var':
        mode = 'mom'
        order = 2
    elif self.mode == 'cov':
        mode = 'com'
        order = 2
    elif self.mode == 'std':
        self.center = 'mean'
        mode = 'mom'
        order = 2
        num_power = 0.5
    elif self.mode == 'skew':
        self.center = 'mean'
        mode = 'mom'
        order = 3
        num_power = 1.5
    elif self.mode == 'kurt':
        self.center = 'mean'
        mode = 'mom'
        order = 4
        num_power = 2
        Fisher = kwargs.get('Fisher', True)
    elif self.mode == 'cos':
        self.center = 'mean'
        mode = 'com'
        order = 3
        standardized = kwargs.get('standardized', True)
    elif self.mode == 'M3':
        self.center = 'mean'
        mode = 'com'
        order = 3
        standardized = kwargs.get('standardized', False)
    elif self.mode == 'c*k':
        self.center = 'mean'
        mode = 'com'
        order = 4
        standardized = kwargs.get('standardized', True)
        Fisher = kwargs.get('Fisher', True)
    else:
        mode = self.mode

    # `option` is only taken from kwargs for higher-order moments.
    option = kwargs.get('option', 1) if order > 2 else 0

    n = len(x)
    ntrim = round(n * (1 - trimming))

    if len(x.shape) == 1:
        x = np.matrix(x).reshape((n, 1))

    if mode == 'corr':
        alpha = 1

    if n == 0:
        raise MyException('Please feed data with length > 0')

    locest = np.median if self.center == 'median' else trim_mean

    # ---- classical / trimmed product-moment statistics ----------------
    if self.est == 'arithmetic':

        # Moment of x
        if mode != 'com':
            xmom = trim_mom(x, x, locest, order, trimming, option, biascorr)
            self.x_moment_ = xmom
            if self.mode in ('std', 'skew', 'kurt'):
                x2mom = trim_mom(x, x, locest, 2, trimming, option, False)
                xmom /= (x2mom**num_power)
                if biascorr:
                    if self.mode == 'skew':
                        xmom *= (ntrim - 1)**2
                        xmom /= np.sqrt(ntrim**2 - ntrim)
                    elif self.mode == 'kurt':
                        xmom = xmom * ntrim - xmom / ntrim
                        xmom -= 3 * (ntrim - 1)**2.0 / ((ntrim - 2) *
                                                        (ntrim - 3))
                        if not Fisher:
                            xmom += 3
            if mode == 'mom':
                self.moment_ = xmom

        # Co-moment of x and y
        if mode != 'mom':
            if 'y' not in kwargs:
                raise MyException('Please supply second data vector')
            y = kwargs['y']
            n1 = len(y)
            if n1 == 0:
                raise MyException('Please feed data with length > 0')
            if n1 != n:
                raise MyException(
                    'Please feed x and y data of equal length')
            if len(y.shape) == 1:
                y = np.matrix(y).reshape((n, 1))

            como = trim_mom(x, y, locest, order, trimming, option, biascorr)
            self.co_moment_ = como
            if biascorr and order > 2:
                como *= ntrim
            if mode == 'com':
                self.moment_ = como

            if self.mode in ('c*k', 'cos'):
                x2sd = np.sqrt(
                    trim_mom(x, x, locest, 2, trimming, option, biascorr))
                y2sd = np.sqrt(
                    trim_mom(y, y, locest, 2, trimming, option, biascorr))
                # biascorr is only exact if standardized
                if self.mode == 'c*k' and biascorr:
                    como -= como / (ntrim**2)
                    como -= 3 * (ntrim - 1)**2.0 / ((ntrim - 2) *
                                                    (ntrim - 3))

            if mode in ('corr', 'continuum'):
                ymom = trim_mom(y, y, locest, order, trimming, option,
                                biascorr)
                self.y_moment_ = ymom

    # ---- distance based statistics ------------------------------------
    elif self.est == 'distance':

        if mode in ('ballcov', 'mdd', 'mdc'):
            if 'y' not in kwargs:
                raise MyException('Please supply second data vector')
            y = kwargs['y']
            n1 = len(y)
            if n1 == 0:
                raise MyException('Please feed data with length > 0')
            if n1 != n:
                raise MyException(
                    'Please feed x and y data of equal length')

            if mode in ('mdd', 'mdc'):
                como = np.sqrt(
                    difference_divergence(x,
                                          y,
                                          center=self.center,
                                          trimming=trimming,
                                          biascorr=biascorr))
                self.co_moment_ = como
                if mode == 'mdd':
                    self.moment_ = como
                else:
                    xmom = difference_divergence(x,
                                                 x,
                                                 center=self.center,
                                                 trimming=trimming,
                                                 biascorr=biascorr)
                    ymom = difference_divergence(y,
                                                 y,
                                                 center=self.center,
                                                 trimming=trimming,
                                                 biascorr=biascorr)
                    self.x_moment_ = xmom
                    self.y_moment_ = ymom
                    # 'mdc' is finished by the generic 'corr' step below.
                    mode = 'corr'
            else:
                # The centered matrix of y is still computed here although
                # its result is not used by the ball covariance call.
                dmy, n2 = distance_matrix_centered(y,
                                                   biascorr=biascorr,
                                                   trimming=trimming,
                                                   center=self.center,
                                                   dmetric=dmetric)
                bcov_res = Ball.bcov_test(x, y, num_permutations=0)[0]
                self.moment_ = bcov_res

        elif (calcmode == 'fast' and self.center == 'mean'
              and trimming == 0 and order == 2):
            # Shortcut through the `dc` distance covariance routines.
            if mode != 'com':
                if biascorr:
                    xmom = np.sqrt(dc.u_distance_covariance_sqr(x, x))
                else:
                    xmom = np.sqrt(dc.distance_covariance_sqr(x, x))
                self.moment_ = xmom

            if mode != 'mom':
                if 'y' not in kwargs:
                    raise MyException('Please supply second data vector')
                y = kwargs['y']
                # Length is read but not validated on this path.
                n1 = len(y)
                if biascorr:
                    como = np.sqrt(dc.u_distance_covariance_sqr(x, y))
                else:
                    como = np.sqrt(dc.distance_covariance_sqr(x, y))
                if mode == 'com':
                    self.co_moment_ = como
                    self.moment_ = como
                elif mode in ('corr', 'continuum', 'cos', 'c*k'):
                    if biascorr:
                        ymom = np.sqrt(dc.u_distance_covariance_sqr(y, y))
                    else:
                        ymom = np.sqrt(dc.distance_covariance_sqr(y, y))
                    self.y_moment_ = ymom

        else:
            # Keyword arguments shared by every distance helper call below.
            robust_kw = {
                'biascorr': biascorr,
                'center': self.center,
                'trimming': trimming,
            }

            dmx, n1 = distance_matrix_centered(x, dmetric=dmetric,
                                               **robust_kw)

            # Moment of x
            if mode != 'com':
                xmom = distance_moment(dmx, dmx, n1=n1, order=order,
                                       option=option, **robust_kw)
                self.x_moment_ = xmom
                if self.mode in ('std', 'skew', 'kurt'):
                    x2mom = distance_moment(dmx, dmx, n1=n1, order=2,
                                            option=option, **robust_kw)
                    xmom /= x2mom**num_power
                if mode == 'mom':
                    self.moment_ = xmom

            # Co-moment of x and y
            if mode != 'mom':
                if 'y' not in kwargs:
                    raise MyException('Please supply second data vector')
                y = kwargs['y']
                # From here on n1 holds len(y), replacing the value returned
                # by distance_matrix_centered for x.
                n1 = len(y)
                if n1 == 0:
                    raise MyException('Please feed data with length > 0')
                if n1 != n:
                    raise MyException(
                        'Please feed x and y data of equal length')

                dmy, n2 = distance_matrix_centered(y, dmetric=dmetric,
                                                   **robust_kw)
                como = distance_moment(dmx, dmy, n1=n1, order=order,
                                       option=option, **robust_kw)
                self.co_moment_ = como
                if mode == 'com':
                    self.moment_ = como

                if self.mode in ('c*k', 'cos'):
                    x2sd = distance_moment(dmx, dmx, n1=n1, order=2,
                                           option=option, **robust_kw)
                    x2sd = np.sqrt(x2sd)
                    y2sd = distance_moment(dmy, dmy, n1=n1, order=2,
                                           option=option, **robust_kw)
                    y2sd = np.sqrt(y2sd)

                if mode in ('corr', 'continuum', 'c*k', 'cos'):
                    ymom = distance_moment(dmy, dmy, n1=n, order=order,
                                           option=option, **robust_kw)
                    self.y_moment_ = ymom

    # ---- combine the pieces -------------------------------------------
    if mode == 'corr':
        como /= (np.sqrt(xmom) * np.sqrt(ymom))
        self.moment_ = como
    elif mode == 'continuum':
        como *= como * (np.sqrt(xmom)**(alpha - 1))
        self.moment_ = como

    if self.mode in ('c*k', 'cos') and standardized:
        iter_stop_2 = option
        iter_stop_1 = order - option
        como /= np.power(x2sd, iter_stop_1)
        como /= np.power(y2sd, iter_stop_2)
        # Not very meaningful for co-moment
        if self.mode == 'c*k' and not Fisher:
            como += 3
        self.moment_ = como

    # Exact type match: only plain ndarrays are unwrapped, not subclasses.
    if type(self.moment_) is np.ndarray:
        self.moment_ = self.moment_[0]

    return self.moment_
def test_distance_correlation_comparison(self):
    """
    Compare all implementations of the distance covariance and correlation.

    Every member of ``dcor.DistanceCovarianceMethod`` is run under each
    applicable ``dcor.CompileMode`` and must reproduce the same reference
    values, for both the unbiased (``u_*``) and the biased estimators.
    """
    arr1 = np.array(((1.,), (2.,), (3.,), (4.,), (5.,), (6.,)))
    arr2 = np.array(((1.,), (7.,), (5.,), (5.,), (6.,), (2.,)))

    # One entry per estimator family:
    #   (covariance function, correlation function, stats function,
    #    ((x, y, expected covariance, expected correlation), ...),
    #    expected stats for (arr1, arr2))
    families = (
        # Unbiased versions
        (
            dcor.u_distance_covariance_sqr,
            dcor.u_distance_correlation_sqr,
            dcor.u_distance_stats_sqr,
            (
                (arr1, arr2, -0.88889, -0.41613),
                (arr1, arr1, 1.55556, 1),
                (arr2, arr2, 2.93333, 1),
            ),
            (-0.88889, -0.41613, 1.55556, 2.93333),
        ),
        # Biased
        (
            dcor.distance_covariance_sqr,
            dcor.distance_correlation_sqr,
            dcor.distance_stats_sqr,
            (
                (arr1, arr2, 0.68519, 0.30661),
                (arr1, arr1, 1.70679, 1),
                (arr2, arr2, 2.92593, 1),
            ),
            (0.68519, 0.30661, 1.70679, 2.92593),
        ),
    )

    for method in dcor.DistanceCovarianceMethod:
        with self.subTest(method=method):

            # COMPILE_CPU must stay out of the base list: otherwise the
            # guard below never excludes it for NAIVE and every other
            # method would run it twice.
            compile_modes = [
                dcor.CompileMode.AUTO,
                dcor.CompileMode.NO_COMPILE,
            ]

            if method is not dcor.DistanceCovarianceMethod.NAIVE:
                compile_modes.append(dcor.CompileMode.COMPILE_CPU)

            for compile_mode in compile_modes:
                with self.subTest(compile_mode=compile_mode):
                    options = {
                        'method': method,
                        'compile_mode': compile_mode,
                    }

                    for (cov_fn, cor_fn, stats_fn, cases,
                         expected_stats) in families:

                        for x, y, cov_expected, cor_expected in cases:
                            covariance = cov_fn(x, y, **options)
                            self.assertAlmostEqual(
                                covariance, cov_expected, places=5)

                            correlation = cor_fn(x, y, **options)
                            self.assertAlmostEqual(
                                correlation, cor_expected, places=5)

                        stats = stats_fn(arr1, arr2, **options)
                        np.testing.assert_allclose(
                            stats, expected_stats, rtol=1e-4)
def __call__(self, *, selected_variable, y, **kwargs):
    """
    Compare the squared U-distance covariance with the bound from
    ``self.chi_bound``.

    Parameters
    ----------
    selected_variable, y
        Samples forwarded unchanged to ``self.chi_bound`` and to
        ``dcor.u_distance_covariance_sqr``.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    The result of ``statistic < bound``, where ``bound`` is
    ``self.chi_bound(selected_variable, y, self.significance)``.
    """
    threshold = self.chi_bound(selected_variable, y, self.significance)
    statistic = dcor.u_distance_covariance_sqr(selected_variable, y)
    return statistic < threshold
def fit(self, x, **kwargs):
    """
    Fit a dicomo model

    Parameters
    ------------
        X : numpy array or pandas DataFrame
            input data

    Remarks:
    The `fit` function takes several optional input arguments. These are
    options that apply to individual settings:

        `biascorr`, Bool, when `True`, correct for bias. For classical
            product-moment statistics, this is the small sample
            correction. For energy statistics, this leads to the
            estimates that are unbiased in high dimension (but not
            preferred in low dimension). Defaults to `False`.
        `alpha`, float, parameter for continuum association. Has no
            effect for other options. Defaults to 1.
        `option`, int, determines which higher order co-moment to
            calculate, e.g. for co-skewness, `option=1` calculates
            CoS(x,x,y). Only read when the order exceeds 2, where it
            defaults to 1.
        `order`, int, which order (co-)moment to calculate. Can be
            overruled by `mode`, e.g. if `mode='var'`, `order` is set
            to 2. Defaults to 2.
        `calcmode`, str, to use the efficient or naive algorithm to
            calculate distance statistics. Defaults to `fast` when
            available.

    Further keyword arguments read by this method: `y` (second data
    vector, required for co-moment type modes), `trimming` (default 0),
    `dmetric` (default `'euclidean'`), `Fisher` (default `True`, read for
    modes `'kurt'` and `'c*k'`) and `standardized` (default `True` for
    `'cos'` and `'c*k'`, `False` for `'M3'`).

    Returns the value stored in `self.moment_`. Raises `MyException` when
    `x` is empty, when `y` is required but missing, or (on the paths that
    validate it) when `y` is empty or differs in length from `x`.
    """
    # ---- optional settings -------------------------------------------
    trimming = kwargs.get('trimming', 0)
    biascorr = kwargs.get('biascorr', False)
    alpha = kwargs.get('alpha', 1)
    dmetric = kwargs.get('dmetric', 'euclidean')
    calcmode = kwargs.get('calcmode', 'fast')
    order = kwargs.get('order', 2)

    # ---- map the user-facing mode onto moment ('mom') / co-moment ('com')
    # Several named modes also force self.center to 'mean'.
    if self.mode == 'var':
        mode = 'mom'
        order = 2
    elif self.mode == 'cov':
        mode = 'com'
        order = 2
    elif self.mode == 'std':
        self.center = 'mean'
        mode = 'mom'
        order = 2
        num_power = 0.5
    elif self.mode == 'skew':
        self.center = 'mean'
        mode = 'mom'
        order = 3
        num_power = 1.5
    elif self.mode == 'kurt':
        self.center = 'mean'
        mode = 'mom'
        order = 4
        num_power = 2
        Fisher = kwargs.get('Fisher', True)
    elif self.mode == 'cos':
        self.center = 'mean'
        mode = 'com'
        order = 3
        standardized = kwargs.get('standardized', True)
    elif self.mode == 'M3':
        self.center = 'mean'
        mode = 'com'
        order = 3
        standardized = kwargs.get('standardized', False)
    elif self.mode == 'c*k':
        self.center = 'mean'
        mode = 'com'
        order = 4
        standardized = kwargs.get('standardized', True)
        Fisher = kwargs.get('Fisher', True)
    else:
        mode = self.mode

    # `option` is only taken from kwargs for higher-order moments.
    option = kwargs.get('option', 1) if order > 2 else 0

    n = len(x)
    ntrim = round(n * (1 - trimming))

    if len(x.shape) == 1:
        x = np.matrix(x).reshape((n, 1))

    if mode == 'corr':
        alpha = 1

    if n == 0:
        raise MyException('Please feed data with length > 0')

    locest = np.median if self.center == 'median' else trim_mean

    # ---- classical / trimmed product-moment statistics ----------------
    if self.est == 'arithmetic':

        # Moment of x
        if mode != 'com':
            xmom = trim_mom(x, x, locest, order, trimming, option, biascorr)
            self.x_moment_ = xmom
            if self.mode in ('std', 'skew', 'kurt'):
                x2mom = trim_mom(x, x, locest, 2, trimming, option, False)
                xmom /= (x2mom**num_power)
                if biascorr:
                    if self.mode == 'skew':
                        xmom *= (ntrim - 1)**2
                        xmom /= np.sqrt(ntrim**2 - ntrim)
                    elif self.mode == 'kurt':
                        xmom = xmom * ntrim - xmom / ntrim
                        xmom -= 3 * (ntrim - 1)**2.0 / ((ntrim - 2) *
                                                        (ntrim - 3))
                        if not Fisher:
                            xmom += 3
            if mode == 'mom':
                self.moment_ = xmom

        # Co-moment of x and y
        if mode != 'mom':
            if 'y' not in kwargs:
                raise MyException('Please supply second data vector')
            y = kwargs['y']
            n1 = len(y)
            if n1 == 0:
                raise MyException('Please feed data with length > 0')
            if n1 != n:
                raise MyException(
                    'Please feed x and y data of equal length')
            if len(y.shape) == 1:
                y = np.matrix(y).reshape((n, 1))

            como = trim_mom(x, y, locest, order, trimming, option, biascorr)
            self.co_moment_ = como
            if biascorr and order > 2:
                como *= ntrim
            if mode == 'com':
                self.moment_ = como

            if self.mode in ('c*k', 'cos'):
                x2sd = np.sqrt(
                    trim_mom(x, x, locest, 2, trimming, option, biascorr))
                y2sd = np.sqrt(
                    trim_mom(y, y, locest, 2, trimming, option, biascorr))
                # biascorr is only exact if standardized
                if self.mode == 'c*k' and biascorr:
                    como -= como / (ntrim**2)
                    como -= 3 * (ntrim - 1)**2.0 / ((ntrim - 2) *
                                                    (ntrim - 3))

            if mode in ('corr', 'continuum'):
                ymom = trim_mom(y, y, locest, order, trimming, option,
                                biascorr)
                self.y_moment_ = ymom

    # ---- distance based statistics ------------------------------------
    elif self.est == 'distance':

        if mode in ('ballcov', 'mdd', 'mdc'):
            if 'y' not in kwargs:
                raise MyException('Please supply second data vector')
            y = kwargs['y']
            n1 = len(y)
            if n1 == 0:
                raise MyException('Please feed data with length > 0')
            if n1 != n:
                raise MyException(
                    'Please feed x and y data of equal length')

            if mode in ('mdd', 'mdc'):
                como = np.sqrt(
                    difference_divergence(x,
                                          y,
                                          center=self.center,
                                          trimming=trimming,
                                          biascorr=biascorr))
                self.co_moment_ = como
                if mode == 'mdd':
                    self.moment_ = como
                else:
                    xmom = difference_divergence(x,
                                                 x,
                                                 center=self.center,
                                                 trimming=trimming,
                                                 biascorr=biascorr)
                    ymom = difference_divergence(y,
                                                 y,
                                                 center=self.center,
                                                 trimming=trimming,
                                                 biascorr=biascorr)
                    self.x_moment_ = xmom
                    self.y_moment_ = ymom
                    # 'mdc' is finished by the generic 'corr' step below.
                    mode = 'corr'
            else:
                # The centered matrix of y is still computed here although
                # its result is not used by the ball covariance call.
                dmy, n2 = distance_matrix_centered(y,
                                                   biascorr=biascorr,
                                                   trimming=trimming,
                                                   center=self.center,
                                                   dmetric=dmetric)
                bcov_res = Ball.bcov_test(x, y, num_permutations=0)[0]
                self.moment_ = bcov_res

        elif (calcmode == 'fast' and self.center == 'mean'
              and trimming == 0 and order == 2):
            # Shortcut through the `dc` distance covariance routines.
            if mode != 'com':
                if biascorr:
                    xmom = np.sqrt(dc.u_distance_covariance_sqr(x, x))
                else:
                    xmom = np.sqrt(dc.distance_covariance_sqr(x, x))
                self.moment_ = xmom

            if mode != 'mom':
                if 'y' not in kwargs:
                    raise MyException('Please supply second data vector')
                y = kwargs['y']
                # Length is read but not validated on this path.
                n1 = len(y)
                if biascorr:
                    como = np.sqrt(dc.u_distance_covariance_sqr(x, y))
                else:
                    como = np.sqrt(dc.distance_covariance_sqr(x, y))
                if mode == 'com':
                    self.co_moment_ = como
                    self.moment_ = como
                elif mode in ('corr', 'continuum', 'cos', 'c*k'):
                    if biascorr:
                        ymom = np.sqrt(dc.u_distance_covariance_sqr(y, y))
                    else:
                        ymom = np.sqrt(dc.distance_covariance_sqr(y, y))
                    self.y_moment_ = ymom

        else:
            # Keyword arguments shared by every distance helper call below.
            robust_kw = {
                'biascorr': biascorr,
                'center': self.center,
                'trimming': trimming,
            }

            dmx, n1 = distance_matrix_centered(x, dmetric=dmetric,
                                               **robust_kw)

            # Moment of x
            if mode != 'com':
                xmom = distance_moment(dmx, dmx, n1=n1, order=order,
                                       option=option, **robust_kw)
                self.x_moment_ = xmom
                if self.mode in ('std', 'skew', 'kurt'):
                    x2mom = distance_moment(dmx, dmx, n1=n1, order=2,
                                            option=option, **robust_kw)
                    xmom /= x2mom**num_power
                if mode == 'mom':
                    self.moment_ = xmom

            # Co-moment of x and y
            if mode != 'mom':
                if 'y' not in kwargs:
                    raise MyException('Please supply second data vector')
                y = kwargs['y']
                # From here on n1 holds len(y), replacing the value returned
                # by distance_matrix_centered for x.
                n1 = len(y)
                if n1 == 0:
                    raise MyException('Please feed data with length > 0')
                if n1 != n:
                    raise MyException(
                        'Please feed x and y data of equal length')

                dmy, n2 = distance_matrix_centered(y, dmetric=dmetric,
                                                   **robust_kw)
                como = distance_moment(dmx, dmy, n1=n1, order=order,
                                       option=option, **robust_kw)
                self.co_moment_ = como
                if mode == 'com':
                    self.moment_ = como

                if self.mode in ('c*k', 'cos'):
                    x2sd = distance_moment(dmx, dmx, n1=n1, order=2,
                                           option=option, **robust_kw)
                    x2sd = np.sqrt(x2sd)
                    y2sd = distance_moment(dmy, dmy, n1=n1, order=2,
                                           option=option, **robust_kw)
                    y2sd = np.sqrt(y2sd)

                if mode in ('corr', 'continuum', 'c*k', 'cos'):
                    ymom = distance_moment(dmy, dmy, n1=n, order=order,
                                           option=option, **robust_kw)
                    self.y_moment_ = ymom

    # ---- combine the pieces -------------------------------------------
    if mode == 'corr':
        como /= (np.sqrt(xmom) * np.sqrt(ymom))
        self.moment_ = como
    elif mode == 'continuum':
        como *= como * (np.sqrt(xmom)**(alpha - 1))
        self.moment_ = como

    if self.mode in ('c*k', 'cos') and standardized:
        iter_stop_2 = option
        iter_stop_1 = order - option
        como /= np.power(x2sd, iter_stop_1)
        como /= np.power(y2sd, iter_stop_2)
        # Not very meaningful for co-moment
        if self.mode == 'c*k' and not Fisher:
            como += 3
        self.moment_ = como

    # Exact type match: only plain ndarrays are unwrapped, not subclasses.
    if type(self.moment_) is np.ndarray:
        self.moment_ = self.moment_[0]

    return self.moment_