def test_statistic(self, matrix_X, matrix_Y):
    """
    Computes the HHG correlation measure between two datasets.

    :param matrix_X: a ``[n*p]`` data matrix, a matrix with n samples in p dimensions
    :type matrix_X: 2D `numpy.array`

    :param matrix_Y: a ``[n*q]`` data matrix, a matrix with n samples in q dimensions
    :type matrix_Y: 2D `numpy.array`

    :return: returns a list of two items, that contains:

        - :test_statistic_: test statistic
        - :test_statistic_metadata_: (optional) a ``dict`` of metadata other than the p_value,
                                     that the independence test computes in the process
    :rtype: float, dict

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.hhg import HHG
    >>>
    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 1)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312,
    ...               1.35857623, -0.06729464, 0.16168344, -0.61048226, 0.41711113]).reshape(-1, 1)
    >>> hhg = HHG()
    >>> hhg_test_stat, _ = hhg.test_statistic(X, Y)
    """
    distance_matrix_X, distance_matrix_Y = compute_distance(matrix_X, matrix_Y, self.compute_distance_matrix)

    n = distance_matrix_X.shape[0]
    S = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            if i != j:
                # classify every point k by whether it falls within radius
                # d(i, j) of point i, in X-space and in Y-space respectively
                tmp1 = distance_matrix_X[i, :] <= distance_matrix_X[i, j]
                tmp2 = distance_matrix_Y[i, :] <= distance_matrix_Y[i, j]

                # 2x2 contingency table counts (the -2 excludes i and j themselves,
                # which always land in both balls)
                t11 = np.sum(tmp1 * tmp2) - 2
                t12 = np.sum(tmp1 * (1 - tmp2))
                t21 = np.sum((1 - tmp1) * tmp2)
                t22 = np.sum((1 - tmp1) * (1 - tmp2))

                denom = (t11 + t12) * (t21 + t22) * (t11 + t21) * (t12 + t22)
                if denom > 0:
                    # Pearson chi-square statistic of the 2x2 table
                    S[i, j] = (n - 2) * np.power((t12 * t21 - t11 * t22), 2) / denom
    corr = np.sum(S)

    # no metadata for HHG
    self.test_statistic_metadata_ = {}

    self.test_statistic_ = corr

    return self.test_statistic_, self.test_statistic_metadata_
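
# A minimal standalone sketch of the counting logic above, assuming SciPy for the
# pairwise distances (none of the mgcpy helpers are needed here). For each ordered
# pair (i, j), all points are cross-classified by whether they fall within radius
# d(i, j) of i in X-space and in Y-space, and a chi-square statistic is accumulated
# from the resulting 2x2 table. The function name is hypothetical.
import numpy as np
from scipy.spatial.distance import cdist

def hhg_statistic_sketch(X, Y):
    dx = cdist(X, X)
    dy = cdist(Y, Y)
    n = dx.shape[0]
    total = 0.0
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            in_ball_x = dx[i, :] <= dx[i, j]
            in_ball_y = dy[i, :] <= dy[i, j]
            t11 = np.sum(in_ball_x & in_ball_y) - 2  # exclude i and j themselves
            t12 = np.sum(in_ball_x & ~in_ball_y)
            t21 = np.sum(~in_ball_x & in_ball_y)
            t22 = np.sum(~in_ball_x & ~in_ball_y)
            denom = (t11 + t12) * (t21 + t22) * (t11 + t21) * (t12 + t22)
            if denom > 0:
                total += (n - 2) * (t12 * t21 - t11 * t22) ** 2 / denom
    return total

rng = np.random.default_rng(0)
print(hhg_statistic_sketch(rng.normal(size=(10, 1)), rng.normal(size=(10, 1))))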
def test_statistic(self, matrix_X, matrix_Y, p=None):
    """
    Computes the MGCX measure between two time series datasets.

    - It first computes the MGC statistic between the two series at each lag up to ``max_lag``.
    - Then, it sums the (sample-size weighted) lag statistics to form the full test statistic.

    :param matrix_X: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*p]`` data matrix, a matrix with ``n`` samples in ``p`` dimensions
    :type matrix_X: 2D numpy.array

    :param matrix_Y: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*q]`` data matrix, a matrix with ``n`` samples in ``q`` dimensions
    :type matrix_Y: 2D numpy.array

    :param p: bandwidth parameter for the Bartlett kernel.
    :type p: float

    :return: returns a list of two items, that contains:

        - :test_statistic: the sample mgc_ts statistic (not necessarily within [-1,1])
        - :test_statistic_metadata: a ``dict`` of metadata with the following keys:
                - :optimal_lag: the lag at which the dependence is maximized
                - :optimal_scale: the estimated optimal scale at the optimal lag
                - :dependence_by_lag: the test statistic broken down by lag
    :rtype: list

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.mgc.mgc_ts import MGC_TS
    >>>
    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 1)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312,
    ...               1.35857623, -0.06729464, 0.16168344, -0.61048226, 0.41711113]).reshape(-1, 1)
    >>> mgc_ts = MGC_TS()
    >>> mgc_ts_statistic, test_statistic_metadata = mgc_ts.test_statistic(X, Y)
    """
    assert matrix_X.shape[0] == matrix_Y.shape[0], "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p and q need not be equal"

    # Represent univariate data as column matrices.
    n = matrix_X.shape[0]
    if len(matrix_X.shape) == 1:
        matrix_X = matrix_X.reshape((n, 1))
    if len(matrix_Y.shape) == 1:
        matrix_Y = matrix_Y.reshape((n, 1))
    matrix_X, matrix_Y = compute_distance(matrix_X, matrix_Y, self.compute_distance_matrix)
    M = self.max_lag if self.max_lag is not None else math.ceil(math.sqrt(n))
    mgc = self.mgc

    # Collect the test statistic by lag, and sum them for the full test statistic.
    dependence_by_lag = np.zeros(M + 1)
    mgc_statistic, mgc_metadata = mgc.test_statistic(matrix_X, matrix_Y)
    dependence_by_lag[0] = np.maximum(0.0, mgc_statistic)
    max_dependence = dependence_by_lag[0]
    optimal_lag = 0
    optimal_scale = mgc_metadata['optimal_scale']

    # TODO: parallelize?
    for j in range(1, M + 1):
        # lag-j dependence: X at times j..n-1 against Y at times 0..n-j-1
        dist_mtx_X = matrix_X[j:n, j:n]
        dist_mtx_Y = matrix_Y[0:(n - j), 0:(n - j)]
        mgc_statistic, mgc_metadata = mgc.test_statistic(dist_mtx_X, dist_mtx_Y)
        dependence_by_lag[j] = (n - j) * np.maximum(0.0, mgc_statistic) / n
        if dependence_by_lag[j] > max_dependence:
            max_dependence = dependence_by_lag[j]
            optimal_lag = j
            optimal_scale = mgc_metadata['optimal_scale']

    # Report the optimal lag and scale.
    self.test_statistic_metadata_ = {'optimal_lag': optimal_lag,
                                     'optimal_scale': optimal_scale,
                                     'dependence_by_lag': dependence_by_lag}
    self.test_statistic_ = np.sum(dependence_by_lag)
    return self.test_statistic_, self.test_statistic_metadata_
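
# A small illustration (hypothetical toy data, numpy only) of how the lag loop
# above aligns the two series: taking the trailing block of X's distance matrix
# and the leading block of Y's compares X_{t+j} with Y_t on the n - j overlapping
# time points, and the (n - j) / n factor down-weights statistics estimated from
# fewer pairs.
import numpy as np

n, j = 6, 2
t = np.arange(n)
dist_X = np.abs(t[:, None] - t[None, :]).astype(float)  # stand-in distance matrix
dist_Y = dist_X.copy()
sub_X = dist_X[j:n, j:n]          # distances among X_j, ..., X_{n-1}
sub_Y = dist_Y[0:n - j, 0:n - j]  # distances among Y_0, ..., Y_{n-j-1}
assert sub_X.shape == sub_Y.shape == (n - j, n - j)
weight = (n - j) / n              # scaling applied to the lag-j statistic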
def test_statistic(self, matrix_X, matrix_Y, p=None):
    """
    Computes the (summed across lags) cross distance covariance estimate between two time series.

    :param matrix_X: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*p]`` data matrix, a matrix with ``n`` samples in ``p`` dimensions
    :type matrix_X: 2D numpy.array

    :param matrix_Y: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*q]`` data matrix, a matrix with ``n`` samples in ``q`` dimensions
    :type matrix_Y: 2D numpy.array

    :param p: bandwidth parameter for the Bartlett kernel.
    :type p: float

    :return: returns a list of two items, that contains:

        - :test_statistic: the sample cdcv statistic (not necessarily within [-1,1])
        - :test_statistic_metadata: a ``dict`` of metadata with the following keys:
                - :optimal_lag: the lag at which the dependence is maximized
                - :dependence_by_lag: the test statistic broken down by lag
    :rtype: list

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.cdcv import CDCV
    >>>
    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 1)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312,
    ...               1.35857623, -0.06729464, 0.16168344, -0.61048226, 0.41711113]).reshape(-1, 1)
    >>> cdcv = CDCV(which_test='unbiased')
    >>> cdcv_statistic, test_statistic_metadata = cdcv.test_statistic(X, Y)
    """
    assert matrix_X.shape[0] == matrix_Y.shape[0], "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p can be different from q"

    if self.which_test == "unbiased" and matrix_X.shape[0] <= 3:
        raise ValueError('Cannot use the unbiased estimator of distance covariance with n <= 3.')

    # Represent univariate data as column matrices.
    # The matrix shape and diagonal elements determine whether the given data is a distance matrix.
    n = matrix_X.shape[0]
    if len(matrix_X.shape) == 1:
        matrix_X = matrix_X.reshape((n, 1))
    if len(matrix_Y.shape) == 1:
        matrix_Y = matrix_Y.reshape((n, 1))
    matrix_X, matrix_Y = compute_distance(matrix_X, matrix_Y, self.compute_distance_matrix)
    M = self.max_lag if self.max_lag is not None else math.ceil(math.sqrt(n))
    dcorr = self.dcorr

    # Collect the test statistic by lag, and sum them for the full test statistic.
    dependence_by_lag = np.zeros(M + 1)
    dcorr_statistic, _ = dcorr.test_statistic(matrix_X, matrix_Y)
    dependence_by_lag[0] = np.maximum(0.0, dcorr_statistic)

    # TODO: parallelize?
    for j in range(1, M + 1):
        # lag-j dependence: X at times j..n-1 against Y at times 0..n-j-1
        dist_mtx_X = matrix_X[j:n, j:n]
        dist_mtx_Y = matrix_Y[0:(n - j), 0:(n - j)]
        dcorr_statistic, _ = dcorr.test_statistic(dist_mtx_X, dist_mtx_Y)
        dependence_by_lag[j] = (n - j) * np.maximum(0.0, dcorr_statistic) / n

    # Report the lag at which dependence is maximized.
    optimal_lag = np.argmax(dependence_by_lag)
    test_statistic_metadata = {'optimal_lag': optimal_lag,
                               'dependence_by_lag': dependence_by_lag}
    self.test_statistic_ = np.sum(dependence_by_lag)
    self.test_statistic_metadata_ = test_statistic_metadata
    return self.test_statistic_, test_statistic_metadata
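
# Why the guard on n <= 3: the unbiased estimator is built from U-centered
# distance matrices and divides by n * (n - 3). A minimal numpy sketch of the
# standard U-centering (Szekely & Rizzo), independent of mgcpy's helpers; the
# function names are illustrative, not mgcpy APIs.
import numpy as np
from scipy.spatial.distance import cdist

def u_centered(A):
    n = A.shape[0]
    row = A.sum(axis=1, keepdims=True)
    col = A.sum(axis=0, keepdims=True)
    U = A - row / (n - 2) - col / (n - 2) + A.sum() / ((n - 1) * (n - 2))
    np.fill_diagonal(U, 0.0)  # the U-centered matrix has a zero diagonal
    return U

def unbiased_dcov(dist_X, dist_Y):
    n = dist_X.shape[0]
    return np.sum(u_centered(dist_X) * u_centered(dist_Y)) / (n * (n - 3))

rng = np.random.default_rng(0)
X, Y = rng.normal(size=(10, 1)), rng.normal(size=(10, 1))
print(unbiased_dcov(cdist(X, X), cdist(Y, Y)))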
def test_statistic(self, matrix_X, matrix_Y, is_fast=False, fast_dcorr_data={}):
    """
    Computes the distance correlation between two datasets.

    :param matrix_X: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*p]`` data matrix, a matrix with ``n`` samples in ``p`` dimensions
    :type matrix_X: 2D numpy.array

    :param matrix_Y: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*q]`` data matrix, a matrix with ``n`` samples in ``q`` dimensions
    :type matrix_Y: 2D numpy.array

    :param is_fast: is a boolean flag which specifies if the test_statistic should be computed (approximated)
                    using the fast version of dcorr. This defaults to False.
    :type is_fast: boolean

    :param fast_dcorr_data: a ``dict`` of fast dcorr params, refer: self._fast_dcorr_test_statistic

        - :sub_samples: specifies the number of subsamples.
    :type fast_dcorr_data: dictionary

    :return: returns a list of two items, that contains:

        - :test_statistic: the sample dcorr statistic within ``[-1, 1]``
        - :independence_test_metadata: a ``dict`` of metadata with the following keys:
                - :variance_X: the variance of the data matrix X
                - :variance_Y: the variance of the data matrix Y
                - :covariance: the covariance between the two data matrices
    :rtype: list

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.dcorr import DCorr
    >>>
    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 1)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312,
    ...               1.35857623, -0.06729464, 0.16168344, -0.61048226, 0.41711113]).reshape(-1, 1)
    >>> dcorr = DCorr(which_test = 'unbiased')
    >>> dcorr_statistic, test_statistic_metadata = dcorr.test_statistic(X, Y)
    """
    assert matrix_X.shape[0] == matrix_Y.shape[0], "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p and q need not be equal"

    if is_fast:
        test_statistic, test_statistic_metadata = self._fast_dcorr_test_statistic(matrix_X, matrix_Y, **fast_dcorr_data)
    else:
        matrix_X, matrix_Y = compute_distance(matrix_X, matrix_Y, self.compute_distance_matrix)

        # perform distance transformation
        # transformed_dist_mtx_X, transformed_dist_mtx_Y = dist_transform(matrix_X, matrix_Y, self.which_test)
        transformed_distance_matrices = transform_distance_matrix(matrix_X, matrix_Y,
                                                                  base_global_correlation=self.which_test,
                                                                  is_ranked=False)
        transformed_dist_mtx_X = transformed_distance_matrices['centered_distance_matrix_A']
        transformed_dist_mtx_Y = transformed_distance_matrices['centered_distance_matrix_B']

        # transformed_dist_mtx need not be symmetric
        covariance = self.compute_global_covariance(transformed_dist_mtx_X, np.transpose(transformed_dist_mtx_Y))
        variance_X = self.compute_global_covariance(transformed_dist_mtx_X, np.transpose(transformed_dist_mtx_X))
        variance_Y = self.compute_global_covariance(transformed_dist_mtx_Y, np.transpose(transformed_dist_mtx_Y))

        # handle the case when one of the datasets has zero variance
        if variance_X <= 0 or variance_Y <= 0:
            correlation = 0
        else:
            if self.is_paired:
                n = transformed_dist_mtx_X.shape[0]
                correlation = (variance_X / n / (n - 1)) + (variance_Y / n / (n - 1)) \
                    - 2 * np.sum(np.multiply(transformed_dist_mtx_X, np.transpose(transformed_dist_mtx_Y)).diagonal()) / n
            else:
                correlation = covariance / np.real(np.sqrt(variance_X * variance_Y))

        # store the variance of X, variance of Y, and the covariance as metadata
        test_statistic_metadata = {'variance_X': variance_X,
                                   'variance_Y': variance_Y,
                                   'covariance': covariance}

        # use absolute value for mantel coefficients
        if self.which_test == 'mantel':
            test_statistic = np.abs(correlation)
        else:
            test_statistic = correlation

    self.test_statistic_ = test_statistic
    self.test_statistic_metadata_ = test_statistic_metadata
    return test_statistic, test_statistic_metadata
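
# For intuition, a self-contained sketch of the classical (biased) squared distance
# correlation using plain double-centering. mgcpy's transform_distance_matrix
# generalizes this centering to the other which_test variants; the normalization
# covariance / sqrt(variance_X * variance_Y) mirrors the code above. The function
# name is illustrative, not an mgcpy API.
import numpy as np
from scipy.spatial.distance import cdist

def dcorr_sketch(X, Y):
    A = cdist(X, X)
    B = cdist(Y, Y)
    # double-center each distance matrix (subtract row/column means, add grand mean)
    A = A - A.mean(axis=0) - A.mean(axis=1)[:, None] + A.mean()
    B = B - B.mean(axis=0) - B.mean(axis=1)[:, None] + B.mean()
    cov = np.mean(A * B)
    var_x = np.mean(A * A)
    var_y = np.mean(B * B)
    if var_x <= 0 or var_y <= 0:
        return 0.0  # degenerate case, matching the zero-variance guard above
    return cov / np.sqrt(var_x * var_y)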
def p_value_block(self, matrix_X, matrix_Y, replication_factor=1000):
    """
    Tests independence between two datasets using a block permutation test.

    :param matrix_X: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*p]`` data matrix, a matrix with ``n`` samples in ``p`` dimensions
    :type matrix_X: 2D numpy.array

    :param matrix_Y: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*q]`` data matrix, a matrix with ``n`` samples in ``q`` dimensions
    :type matrix_Y: 2D numpy.array

    :param replication_factor: specifies the number of replications to use for
                               the permutation test. Defaults to ``1000``.
    :type replication_factor: integer

    :return: returns a list of two items, that contains:

        - :p_value: the p-value of the block permutation test
        - :metadata: a ``dict`` of metadata with the following keys:
                - :null_distribution: numpy array representing the distribution
                                      of the test statistic under the null.
    :rtype: list

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.mgc.mgc_ts import MGC_TS
    >>>
    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 1)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312,
    ...               1.35857623, -0.06729464, 0.16168344, -0.61048226, 0.41711113]).reshape(-1, 1)
    >>> mgc_ts = MGC_TS()
    >>> p_value, metadata = mgc_ts.p_value_block(X, Y, replication_factor=100)
    """
    assert matrix_X.shape[0] == matrix_Y.shape[0], "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p and q need not be equal"

    # Compute the observed test statistic.
    n = matrix_X.shape[0]
    if len(matrix_X.shape) == 1:
        matrix_X = matrix_X.reshape((n, 1))
    if len(matrix_Y.shape) == 1:
        matrix_Y = matrix_Y.reshape((n, 1))
    matrix_X, matrix_Y = compute_distance(matrix_X, matrix_Y, self.compute_distance_matrix)
    test_statistic, test_statistic_metadata = self.test_statistic(matrix_X, matrix_Y)

    # Block bootstrap
    block_size = int(np.ceil(np.sqrt(n)))
    test_stats_null = np.zeros(replication_factor)
    for rep in range(replication_factor):
        # Generate a new time series sample for Y by permuting whole blocks,
        # which preserves the short-range dependence structure within blocks.
        permuted_indices = np.r_[[np.arange(t, t + block_size) for t in np.random.choice(n, n // block_size + 1)]].flatten()[:n]
        permuted_indices = np.mod(permuted_indices, n)
        permuted_Y = matrix_Y[np.ix_(permuted_indices, permuted_indices)]

        # Compute the test statistic on the permuted sample.
        test_stats_null[rep], _ = self.test_statistic(matrix_X, permuted_Y)

    self.p_value_ = np.sum(np.greater(test_stats_null, test_statistic)) / replication_factor
    if self.p_value_ == 0.0:
        # a finite permutation test should not report an exact zero p-value
        self.p_value_ = 1 / replication_factor
    self.p_value_metadata_ = {'null_distribution': test_stats_null}

    return self.p_value_, self.p_value_metadata_
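
# What the block permutation inside the loop produces, as a standalone snippet:
# contiguous blocks of length ~sqrt(n) are sampled at random start points,
# concatenated, truncated to length n, and wrapped around circularly, so the
# permuted series preserves short-range temporal dependence within blocks.
import numpy as np

n = 10
block_size = int(np.ceil(np.sqrt(n)))
starts = np.random.choice(n, n // block_size + 1)
permuted_indices = np.r_[[np.arange(t, t + block_size) for t in starts]].flatten()[:n]
permuted_indices = np.mod(permuted_indices, n)  # wrap indices that run past the end
print(permuted_indices)  # a circularly-wrapped sequence of blocks, e.g. [7 8 9 0 2 3 4 5 ...]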
def test_statistic(self, matrix_X, matrix_Y, permutations=0, individual=0, disttype='cityblock'):
    """
    Computes the MDMR Pseudo-F statistic between two datasets.

    - It first computes the distance matrix of Y.
    - Next, it regresses X into a portion due to Y and a portion due to residual.
    - The p-value is for the null hypothesis that the variables of X are not correlated with Y's distance matrix.

    :param matrix_X: is interpreted as:

        - a ``[n*d]`` data matrix, a matrix with n samples in d dimensions
    :type matrix_X: 2D `numpy.array`

    :param matrix_Y: is interpreted as:

        - a ``[n*d]`` data matrix, a matrix with n samples in d dimensions
    :type matrix_Y: 2D `numpy.array`

    :param permutations: the number of permutations used to compute the permutation p-values.
                         Defaults to ``0`` (no permutation p-values).
    :type permutations: integer

    :param individual: integer, ``0`` or ``1``:

        - with value ``0``, tests the entire X matrix (default)
        - with value ``1``, tests the entire X matrix and then each predictor variable individually
    :type individual: integer

    :param disttype: the distance metric for the distance matrix of Y. Defaults to ``'cityblock'``.
                     (Currently unused; the distance is computed via ``self.compute_distance_matrix``.)
    :type disttype: string

    :return: with individual = ``0``, returns 1 value; with individual = ``1``, returns 2 values, containing:

        - the test statistic of the entire X matrix
        - for individual = 1, an array with the variable of X in the first column, the test statistic
          in the second, and the permutation p-value in the third (which here will always be 1)
    :rtype: list
    """
    X = matrix_X
    Y = matrix_Y

    # calculate the distance matrix of Y
    D, _ = compute_distance(Y, np.identity(1), self.compute_distance_matrix)
    a = D.shape[0] ** 2
    D = D.reshape((a, 1))

    predictors = np.arange(X.shape[1])
    predsingle = X.shape[1]
    check_rank(X)

    # check that the number of subjects is compatible between X and D
    subjects = X.shape[0]
    if subjects != np.sqrt(D.shape[0]):
        raise Exception("# of subjects incompatible between X and D")

    # prepend an intercept column and shift predictor indices accordingly
    X = np.hstack((np.ones((X.shape[0], 1)), X))
    predictors = np.array(predictors)
    predictors += 1

    # Gower center the distance matrix of Y
    Gs = gower_center_many(D)

    m2 = float(X.shape[1] - predictors.shape[0])
    nm = float(subjects - X.shape[1])

    # form permutation indexes (row 0 is the identity permutation)
    permutation_indexes = np.zeros((permutations + 1, subjects), dtype=int)
    permutation_indexes[0, :] = range(subjects)
    for i in range(1, permutations + 1):
        permutation_indexes[i, :] = np.random.permutation(subjects)

    H2perms = gen_H2_perms(X, predictors, permutation_indexes)
    IHperms = gen_IH_perms(X, predictors, permutation_indexes)

    # calculate the test statistic
    F_perms = calc_ftest(H2perms, IHperms, Gs, m2, nm)

    # calculate the p-value
    p_vals = None
    if permutations > 0:
        p_vals = fperms_to_pvals(F_perms)
    F_permtotal = F_perms[0, :]
    self.test_statistic_ = F_permtotal
    # no additional metadata for MDMR
    self.test_statistic_metadata_ = {}
    if individual == 0:
        return self.test_statistic_, self.test_statistic_metadata_

    # code for individual tests: re-run the pseudo-F computation for each predictor of X separately
    if individual == 1:
        results = np.zeros((predsingle, 3))
        for predictors in range(1, predsingle + 1):
            predictors = np.array([predictors])
            Gs = gower_center_many(D)
            m2 = float(X.shape[1] - predictors.shape[0])
            nm = float(subjects - X.shape[1])
            permutation_indexes = np.zeros((permutations + 1, subjects), dtype=int)
            permutation_indexes[0, :] = range(subjects)
            for i in range(1, permutations + 1):
                permutation_indexes[i, :] = np.random.permutation(subjects)
            H2perms = gen_H2_perms(X, predictors, permutation_indexes)
            IHperms = gen_IH_perms(X, predictors, permutation_indexes)
            F_perms = calc_ftest(H2perms, IHperms, Gs, m2, nm)
            p_vals = None
            if permutations > 0:
                p_vals = fperms_to_pvals(F_perms)
            results[predictors - 1, 0] = predictors
            results[predictors - 1, 1] = F_perms[0, :]
            results[predictors - 1, 2] = p_vals
        return F_permtotal, results
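
# A compact sketch of the Gower centering step, assuming gower_center_many applies
# the standard Gower centering (McArdle & Anderson) to each permuted distance
# matrix: G = -1/2 * H (D * D) H with H = I - 11'/n, where D * D is elementwise.
# The pseudo-F then compares the explained trace against the residual trace,
# scaled by the degrees of freedom m2 and nm. Function names are illustrative.
import numpy as np

def gower_center(D):
    n = D.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n  # centering matrix
    return -0.5 * H @ (D ** 2) @ H

def pseudo_f(G, H2, IH, m2, nm):
    # for symmetric matrices, np.sum(A * B) equals trace(A @ B)
    return (np.sum(H2 * G) / m2) / (np.sum(IH * G) / nm)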
def test_statistic(self, matrix_X, matrix_Y, is_fast=False, fast_mgc_data={}):
    """
    Computes the MGC measure between two datasets.

    - It first computes all the local correlations.
    - Then, it returns the maximal statistic among all local correlations based on thresholding.

    :param matrix_X: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*p]`` data matrix, a matrix with ``n`` samples in ``p`` dimensions
    :type matrix_X: 2D numpy.array

    :param matrix_Y: is interpreted as either:

        - a ``[n*n]`` distance matrix, a square matrix with zeros on diagonal for ``n`` samples OR
        - a ``[n*q]`` data matrix, a matrix with ``n`` samples in ``q`` dimensions
    :type matrix_Y: 2D numpy.array

    :param is_fast: is a boolean flag which specifies if the test_statistic should be computed (approximated)
                    using the fast version of mgc. This defaults to False.
    :type is_fast: boolean

    :param fast_mgc_data: a ``dict`` of fast mgc params, refer: self._fast_mgc_test_statistic

        - :sub_samples: specifies the number of subsamples.
    :type fast_mgc_data: dictionary

    :return: returns a list of two items, that contains:

        - :test_statistic: the sample MGC statistic within ``[-1, 1]``
        - :independence_test_metadata: a ``dict`` of metadata with the following keys:
                - :local_correlation_matrix: a 2D matrix of all local correlations within ``[-1,1]``
                - :optimal_scale: the estimated optimal scale as an ``[x, y]`` pair.
    :rtype: list

    **Example:**

    >>> import numpy as np
    >>> from mgcpy.independence_tests.mgc.mgc import MGC
    >>>
    >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
    ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 1)
    >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312,
    ...               1.35857623, -0.06729464, 0.16168344, -0.61048226, 0.41711113]).reshape(-1, 1)
    >>> mgc = MGC()
    >>> mgc_statistic, test_statistic_metadata = mgc.test_statistic(X, Y)
    """
    assert matrix_X.shape[0] == matrix_Y.shape[0], "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p and q need not be equal"

    if is_fast:
        mgc_statistic, test_statistic_metadata = self._fast_mgc_test_statistic(matrix_X, matrix_Y, **fast_mgc_data)
    else:
        distance_matrix_X, distance_matrix_Y = compute_distance(matrix_X, matrix_Y, self.compute_distance_matrix)

        # compute all the local correlations
        local_correlation_matrix = local_correlations(distance_matrix_X, distance_matrix_Y,
                                                      base_global_correlation=self.base_global_correlation)["local_correlation_matrix"]
        m, n = local_correlation_matrix.shape
        if m == 1 or n == 1:
            # degenerate case: a single row/column of local correlations, nothing to threshold
            mgc_statistic = local_correlation_matrix[m - 1][n - 1]
            optimal_scale = m * n
        else:
            sample_size = len(matrix_X) - 1  # sample size minus 1

            # find a connected region of significant local correlations, by thresholding
            significant_connected_region = threshold_local_correlations(local_correlation_matrix, sample_size)

            # find the maximum within the significant region
            result = smooth_significant_local_correlations(significant_connected_region, local_correlation_matrix)
            mgc_statistic, optimal_scale = result["mgc_statistic"], result["optimal_scale"]

        test_statistic_metadata = {"local_correlation_matrix": local_correlation_matrix,
                                   "optimal_scale": optimal_scale}

    self.test_statistic_ = mgc_statistic
    self.test_statistic_metadata_ = test_statistic_metadata
    return mgc_statistic, test_statistic_metadata
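
# Reading the result, in the spirit of the docstring example above (the data here
# are hypothetical). The optimal scale picks out the neighborhood sizes [k, l] at
# which the local correlation is maximized; per the MGC papers, a scale near the
# full sample size suggests a close-to-linear relationship, while a smaller local
# scale indicates a nonlinear one.
import numpy as np
from mgcpy.independence_tests.mgc.mgc import MGC

np.random.seed(0)
X = np.random.uniform(-1, 1, size=(30, 1))
Y = X ** 2 + 0.1 * np.random.normal(size=(30, 1))  # a nonlinear dependence
mgc = MGC()
mgc_statistic, metadata = mgc.test_statistic(X, Y)
print(metadata["optimal_scale"])                    # [k, l] neighborhood sizes
print(metadata["local_correlation_matrix"].shape)   # one entry per candidate scale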