def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    """
    Energy distance statistics test.

    The two samples are stacked, standardized column-wise with the pooled
    mean and standard deviation, and the energy statistic of the observed
    split is compared against the statistic of `nboot` random re-splits.

    Parameters
    ----------
    x, y : array-like
        The two samples. They are combined with ``np.vstack``, so they must
        have the same number of columns.
    nboot : int
        Number of random re-splits.
    replace : bool
        If True, row indices are drawn with replacement (``randint``);
        otherwise a random permutation of all rows is used.
    method : str
        Passed unchanged to ``energy``.
    fitting : bool
        If True, a generalized extreme value distribution is fitted to the
        resampled statistics and the p-value is its survival function at
        the observed statistic.

    Returns
    -------
    p, en, extra : tuple
        ``p`` is the p-value and ``en`` the observed statistic. ``extra`` is
        the fitted ``genextreme`` parameters when ``fitting`` is True, else
        the array of resampled statistics.

    Reference
    ---------
    Aslan, B, Zech, G (2005) Statistical energy as a tool for
    binning-free multivariate goodness-of-fit tests, two-sample
    comparison and unfolding. Nuc Instr and Meth in Phys Res A 537: 626-636
    Szekely, G, Rizzo, M (2014) Energy statistics: A class of
    statistics based on distances. J Stat Planning & Infer 143: 1249-1272
    Brian Lau, multdist, https://github.com/brian-lau/multdist
    """
    n_first = len(x)
    n_total = n_first + len(y)

    pooled = np.vstack([x, y])
    pooled = (pooled - pooled.mean(0)) / pooled.std(0)

    # Pick the index generator once, outside the resampling loop.
    if replace:
        def draw(count):
            return random.randint(count, size=count)
    else:
        draw = random.permutation

    observed = energy(pooled[:n_first], pooled[n_first:], method)

    resampled = np.zeros(nboot, 'f')
    for k in range(nboot):
        order = draw(n_total)
        first_rows = pooled[order[:n_first]]
        second_rows = pooled[order[n_first:]]
        resampled[k] = energy(first_rows, second_rows, method)

    if not fitting:
        # Empirical p-value: share of resampled statistics at least as large.
        return (resampled >= observed).sum() / nboot, observed, resampled

    gev_params = genextreme.fit(resampled)
    return genextreme.sf(observed, *gev_params), observed, gev_params
def _compare_resamples(self, tvalues, null_max_tvalues, null_min_tvalues):
    """
    Convert statistics to tail probabilities using fitted GEV null models.

    One generalized extreme value distribution is fitted to the null
    maxima and another to the negated null minima. Each non-negative
    statistic is scored with the survival function of the first fit; each
    negative statistic is negated and scored with the second fit.

    Parameters
    ----------
    tvalues : iterable of float
        Observed statistics to score.
    null_max_tvalues : sequence of float
        Null-distribution maxima.
    null_min_tvalues : iterable of float
        Null-distribution minima (negated before fitting).

    Returns
    -------
    list
        One survival-function value per entry of `tvalues`, in order.
    """
    upper_fit = genextreme.fit(null_max_tvalues)
    lower_fit = genextreme.fit([-value for value in null_min_tvalues])

    def tail_probability(stat):
        if stat >= 0:
            return genextreme.sf(stat, *upper_fit)
        # Negative statistics are mirrored so the same upper-tail logic applies.
        return genextreme.sf(-stat, *lower_fit)

    return [tail_probability(stat) for stat in tvalues]
def extreme_values(weighted_residuals, confidence_interval):
    '''
    This function uses extreme value theory to calculate the number of
    standard deviations away from the mean at which we should expect to bracket
    *all* of our n data points at a certain confidence level.

    It then uses that value to identify which (if any) of the data points
    lie outside that region, and calculates the corresponding probabilities
    of finding a data point at least that many standard deviations away.

    Parameters
    ----------
    weighted_residuals : array-like of floats
        One-dimensional sequence of residuals weighted by the square root of
        their variances, wr_i = r_i/sqrt(var_i). Lists and tuples are
        accepted as well as numpy arrays. The approximations used for the
        distribution parameters are intended for more than 10 data points.

    confidence_interval : float
        Probability at which all the weighted residuals lie
        within the confidence bounds

    Returns
    -------
    confidence_bound : float
        Number of standard deviations at which we should expect to encompass all
        data at the user-defined confidence interval.

    indices : list of ints
        Positions of the weighted residuals whose absolute value exceeds
        confidence_bound, in increasing order.

    probabilities : array of floats
        The probabilities that the extreme data point of the distribution lies
        further from the mean than the observed position wr_i for each i in
        the "indices" output array.
    '''
    # Coerce once so that list/tuple input can be masked and indexed below.
    residuals = np.asarray(weighted_residuals)

    n = len(residuals)
    mean = norm.isf(1./n)
    # good approximation for > 10 data points
    scale = 0.8/np.power(np.log(n), 1./2.)
    # good approximation for > 10 data points
    c = 0.33/np.power(np.log(n), 3./4.)

    # We now need a 1-tailed probability from the given confidence_interval
    # p_total = 1. - confidence_interval = p_upper + p_lower - p_upper*p_lower
    # p_total = 1. - confidence_interval = 2p - p^2, therefore:
    p = 1. - np.sqrt(confidence_interval)
    confidence_bound = genextreme.isf(p, c, loc=mean, scale=scale)

    magnitudes = np.abs(residuals)
    outside = magnitudes > confidence_bound
    indices = np.flatnonzero(outside).tolist()

    # Convert back to 2-tailed probabilities
    one_tailed = genextreme.sf(magnitudes[outside], c, loc=mean, scale=scale)
    probabilities = 1. - np.power(one_tailed - 1., 2.)

    return confidence_bound, indices, probabilities
def estat(x, y, nboot=1000, maxt=60., replace=False, method='log', fitting=False):
    """
    Energy distance statistics test with a wall-clock budget.

    The two samples are stacked, standardized column-wise (NaN-aware), and
    the energy statistic of the observed split is compared against the
    statistic of up to `nboot` random re-splits.

    Parameters
    ----------
    x, y : array-like
        The two samples. They are combined with ``np.vstack``, so they must
        have the same number of columns.
    nboot : int
        Maximum number of random re-splits.
    maxt : float
        Time budget for the resampling loop, compared against differences of
        ``t.time()`` (presumably seconds; ``t`` is defined elsewhere in the
        file). The clock is checked after each resample, so at least one
        resample is always completed.
    replace : bool
        If True, row indices are drawn with replacement; otherwise a random
        permutation of all rows is used.
    method : str
        Passed unchanged to ``energy``.
    fitting : bool
        If True, a generalized extreme value distribution is fitted to the
        resampled statistics and the p-value is its survival function at
        the observed statistic.

    Returns
    -------
    p, en, extra : tuple
        ``p`` is the p-value and ``en`` the observed statistic. ``extra`` is
        the fitted ``genextreme`` parameters when ``fitting`` is True, else
        the array of resampled statistics that were actually computed.

    References
    ----------
    * Aslan, B, Zech, G (2005) Statistical energy as a tool for
      binning-free multivariate goodness-of-fit tests, two-sample
      comparison and unfolding. Nuc Instr and Meth in Phys Res A 537: 626-636
    * Szekely, G, Rizzo, M (2014) Energy statistics: A class of
      statistics based on distances. J Stat Planning & Infer 143: 1249-1272
    * Brian Lau, multdist, https://github.com/brian-lau/multdist
    """
    n, N = len(x), len(x) + len(y)
    stack = np.vstack([x, y])
    # NaN-aware standardization so missing entries do not poison the
    # column means and standard deviations.
    stack = (stack - np.nanmean(stack, 0)) / np.nanstd(stack, 0)

    if replace:
        def rand(size):
            return np.random.randint(size, size=size)
    else:
        rand = np.random.permutation

    en = energy(stack[:n], stack[n:], method)
    en_boot = np.zeros(nboot, 'f')
    start = t.time()
    for i in range(nboot):
        idx = rand(N)
        en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method)
        if t.time() - start > maxt:
            # Sample i has already been stored, so i + 1 resamples are valid.
            # Keep all of them and use the same count as the denominator.
            completed = i + 1
            print("Time consumed, exit bootstrap (N={})".format(completed))
            en_boot, nboot = en_boot[:completed], completed
            break

    if fitting:
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param
    else:
        p = (en_boot >= en).sum() / nboot
        return p, en, en_boot
def extreme_values(weighted_residuals, confidence_interval):
    '''
    This function uses extreme value theory to calculate the number of
    standard deviations away from the mean at which we should expect to bracket
    *all* of our n data points at a certain confidence level.

    It then uses that value to identify which (if any) of the data points
    lie outside that region, and calculates the corresponding probabilities
    of finding a data point at least that many standard deviations away.

    Parameters
    ----------
    weighted_residuals : array-like of floats
        One-dimensional sequence of residuals weighted by the square root of
        their variances, wr_i = r_i/sqrt(var_i). Any sequence that numpy can
        convert to an array is accepted. The approximations used for the
        distribution parameters are intended for more than 10 data points.

    confidence_interval : float
        Probability at which all the weighted residuals lie
        within the confidence bounds

    Returns
    -------
    confidence_bound : float
        Number of standard deviations at which we should expect to encompass all
        data at the user-defined confidence interval.

    indices : list of ints
        Positions of the weighted residuals whose absolute value exceeds
        confidence_bound, in increasing order.

    probabilities : array of floats
        The probabilities that the extreme data point of the distribution lies
        further from the mean than the observed position wr_i for each i in
        the "indices" output array.
    '''
    # Convert up front: plain lists cannot be indexed with a list of positions.
    residuals = np.asarray(weighted_residuals)
    n = len(residuals)

    mean = norm.isf(1./n)
    log_n = np.log(n)
    scale = 0.8/np.power(log_n, 1./2.)  # good approximation for > 10 data points
    c = 0.33/np.power(log_n, 3./4.)  # good approximation for > 10 data points

    # We now need a 1-tailed probability from the given confidence_interval
    # p_total = 1. - confidence_interval = p_upper + p_lower - p_upper*p_lower
    # p_total = 1. - confidence_interval = 2p - p^2, therefore:
    p = 1. - np.sqrt(confidence_interval)
    confidence_bound = genextreme.isf(p, c, loc=mean, scale=scale)

    distances = np.abs(residuals)
    exceeds = distances > confidence_bound
    indices = np.flatnonzero(exceeds).tolist()

    # Convert back to 2-tailed probabilities
    tail = genextreme.sf(distances[exceeds], c, loc=mean, scale=scale)
    probabilities = 1. - np.power(tail - 1., 2.)

    return confidence_bound, indices, probabilities
def calculate_adjusted_p_values_genextreme(in_master_table, c, loc, scale, in_alpha):
    """
    Append raw and multiple-testing-adjusted p-values to every table row.

    For each row, the value in column 7 is scored with the survival function
    of a generalized extreme value distribution with the given parameters,
    and the result is appended to the row. When there are at least two rows,
    the raw p-values are adjusted with ``multipletests`` (method 'fdr_bh')
    and the adjusted value is appended as well; otherwise the string "na"
    is appended instead.

    Parameters
    ----------
    in_master_table : list of lists
        Rows to annotate. Each row must have at least 8 entries and is
        modified in place.
    c, loc, scale : float
        Shape, location and scale passed to ``genextreme.sf``.
    in_alpha : float
        Passed to ``multipletests`` as ``alpha``.

    Returns
    -------
    list of lists
        The same `in_master_table` object, each row extended by two entries.
    """
    statistic_column = 7

    raw_p_values = []
    for row in in_master_table:
        p_value = genextreme.sf(row[statistic_column], c, loc, scale)
        raw_p_values.append(p_value)
        # Write to the table that was passed in, not to a module-level name.
        row.append(p_value)

    # adjust p-values
    if len(raw_p_values) >= 2:
        adjusted_p_values = multipletests(
            raw_p_values, alpha=in_alpha, method='fdr_bh', is_sorted=False
        )[1]
        for row, adjusted in zip(in_master_table, adjusted_p_values):
            row.append(adjusted)
    else:
        # A single p-value cannot be corrected for multiple testing.
        for row in in_master_table:
            row.append("na")

    return in_master_table
def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    """
    Energy distance statistics test.

    Compares d-dimensional data from two samples using a measure based on
    statistical energy. The test is non-parametric, does not require binning
    and easily scales to arbitrary dimensions.

    The analytic distribution of the statistic is unknown, and p-values
    are estimated using a permutation procedure, which works well
    according to simulations by Aslan & Zech.

    Parameters
    ----------
    x : [n1 x d] matrix
    y : [n2 x d] matrix
    nboot : int
        Number of resamples (default = 1000).
    replace : bool
        Sample row indices with replacement (default = False).
    method : str
        Forwarded unchanged to ``energy`` (default = 'log'). The accepted
        values are defined by ``energy``, which is not shown here.
    fitting : bool
        If True, fit a generalized extreme value distribution to the
        resampled statistics and take the p-value from its survival
        function instead of the empirical fraction (default = False).

    Returns
    -------
    p : p-value
    e_n : energy statistic of the observed split
    e_n_boot : resampled statistics, or the fitted ``genextreme``
        parameters when ``fitting`` is True

    References
    ----------
    * Aslan, B, Zech, G (2005) Statistical energy as a tool for
      binning-free multivariate goodness-of-fit tests, two-sample
      comparison and unfolding. Nuc Instr and Meth in Phys Res A 537: 626-636
    * Szekely, G, Rizzo, M (2014) Energy statistics: A class of
      statistics based on distances. J Stat Planning & Infer 143: 1249-1272
    * Brian Lau, multdist, https://github.com/brian-lau/multdist
    """
    size_x = len(x)
    size_all = size_x + len(y)

    # Standardize every column using the pooled mean and standard deviation.
    data = np.vstack([x, y])
    data = (data - data.mean(0)) / data.std(0)

    if replace:
        def resample(count):
            return np.random.randint(count, size=count)
    else:
        resample = np.random.permutation

    def split_energy(rows):
        # The first size_x rows play the role of x, the remainder of y.
        return energy(rows[:size_x], rows[size_x:], method)

    e_n = split_energy(data)

    e_n_boot = np.zeros(nboot, 'f')
    for trial in range(nboot):
        e_n_boot[trial] = split_energy(data[resample(size_all)])

    if fitting:
        fitted = genextreme.fit(e_n_boot)
        return genextreme.sf(e_n, *fitted), e_n, fitted

    p_value = (e_n_boot >= e_n).sum() / nboot
    return p_value, e_n, e_n_boot