def pearson_test(x, y):
    """Found in scipy.stats as pearsonr

    Used to evaluate the Pearson correlation between X and Y.

    Parameters
    ----------
    x: list or numpy array, 1-D
        Our "X" variable for determining the strength of our Pearson correlation with y
    y: list or numpy array, 1-D
        Our "Y" variable for determining the strength of our Pearson correlation with x

    Returns
    -------
    rho: float, -1 <= rho <= 1
        Our measure of Pearson correlation between x and y
    p: float, 0 <= p <= 1
        How significant our observed Pearson correlation is
    """
    x, y = _check_table(x, only_count=False), _check_table(y, only_count=False)
    if len(x) != len(y):
        raise ValueError("Cannot calculate correlation with datasets of different lengths")
    n = len(x)
    rho = (n * np.sum(x * y) - np.sum(x) * np.sum(y)) / (
        sqrt(n * np.sum(np.power(x, 2)) - pow(np.sum(x), 2)) *
        sqrt(n * np.sum(np.power(y, 2)) - pow(np.sum(y), 2)))
    t_stat = rho * sqrt((n - 2) / (1 - pow(rho, 2)))
    p = 2 * (1 - t.cdf(abs(t_stat), n - 2))
    return rho, p

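# Hedged usage sketch (not part of the original module): shows how pearson_test
# might be called on synthetic data. The helper name _example_pearson_test, the
# seed, and the sample size are illustrative assumptions only.
def _example_pearson_test():
    rng = np.random.default_rng(42)
    x = rng.normal(size=30)
    y = 2.0 * x + rng.normal(size=30)  # strongly, but not perfectly, correlated
    rho, p = pearson_test(x, y)
    # rho should be close to +1 and p should be very small for this data
    return rho, p
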
def rank_biserial_correlation_test(x, y):
    """Not found in scipy.stats or statsmodels

    Used to evaluate the correlation between an ordinal variable and a two-group
    variable.

    Parameters
    ----------
    x: list or numpy array, 1-D
        Our observations. These are expected to be ordinal
    y: list or numpy array, 1-D
        Our groupings variable, or masked array. Must only have two variables and be
        the same length as x

    Returns
    -------
    rho: float
        The measure of correlation between our two groups
    p: float
        The likelihood that our two groups would be correlated if both were derived
        from a normal distribution
    """
    x, y = _check_table(x, only_count=True), _check_table(y, only_count=True)
    if len(x) != len(y):
        raise ValueError("X and Y must be of the same length")
    if len(np.unique(y)) != 2:
        raise AttributeError("Need to have two groupings for biserial correlation")
    group_0, group_1 = x[y == np.unique(y)[0]], x[y == np.unique(y)[1]]
    mu_1, mu_0 = np.mean(group_1), np.mean(group_0)
    n, n_1, n_0 = len(x), len(group_1), len(group_0)
    s = sqrt(n_1 * n_0 * (n + 1) / 12)
    rho = 2 * ((mu_1 - mu_0) / (n_1 + n_0))
    u_min = min((1 + rho) * n_1 * n_0 / 2, (1 - rho) * n_1 * n_0 / 2)
    mu = n_1 * n_0 / 2
    z = (u_min - mu) / s
    p = 2 * (1 - norm.cdf(abs(z)))
    return rho, p

def spearman_test(x, y):
    """Found in scipy.stats as spearmanr

    Used to evaluate the correlation between the ranks of "X" and "Y", that is, if
    there exists a monotonic relationship between X and Y.

    Parameters
    ----------
    x: list or numpy array, 1-D
        Our "X" variable for determining the strength of monotonic correlation with y
    y: list or numpy array, 1-D
        Our "Y" variable for determining the strength of monotonic correlation with x

    Returns
    -------
    rho: float, -1 <= rho <= 1
        Our measure of monotonic correlation between x and y
    p: float, 0 <= p <= 1
        How significant our observed monotonic correlation is
    """
    x, y = _check_table(x, only_count=False), _check_table(y, only_count=False)
    if len(x) != len(y):
        raise ValueError("Cannot calculate correlation with datasets of different lengths")
    df = len(x) - 2
    rank_x, rank_y = rankdata(x), rankdata(y)
    std_x, std_y = np.std(rank_x, ddof=1), np.std(rank_y, ddof=1)
    cov = np.cov(rank_x, rank_y)[0][1]
    rho = cov / (std_x * std_y)
    t_stat = rho * sqrt(df / (1 - pow(rho, 2)))
    p = 2 * (1 - t.cdf(abs(t_stat), df))
    return rho, p

def chi_goodness_of_fit_test(observed, expected=None):
    """Found in scipy.stats as chisquare

    Used when we cannot divide the data cleanly into a contingency table or when we
    have actual expected results to compare to.

    Parameters
    ----------
    observed: list or numpy array, 1-D
        Our observed data points
    expected: list or numpy array, 1-D, default is None
        What we expected the results to be. If None given, then we expect all data
        points to be equally likely

    Returns
    -------
    X: float
        The Chi statistic, or the sum of squared differences between observed and expected
    p: float, 0 <= p <= 1
        The likelihood that our observed differences, given the amount of data, can be
        attributed to chance
    """
    observed = _check_table(observed, False)
    if expected is None:
        expected = np.repeat(np.mean(observed), len(observed))
    else:
        expected = _check_table(expected)
    df = len(observed) - 1
    X = np.sum(np.power(observed - expected, 2) / expected)
    p = 1 - chi2.cdf(X, df)
    return X, p

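# Hedged usage sketch (not part of the original module): a chi-square
# goodness-of-fit check of whether 60 die rolls look uniform. The counts and the
# helper name _example_chi_goodness_of_fit are illustrative assumptions only.
def _example_chi_goodness_of_fit():
    observed_counts = [8, 9, 12, 11, 10, 10]  # rolls of faces 1 through 6
    # With expected=None, every face is assumed equally likely (here, 10 each)
    X, p = chi_goodness_of_fit_test(observed_counts)
    # A large p suggests the observed counts are consistent with a fair die
    return X, p
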
def point_biserial_correlation_test(x, y):
    """Found in scipy.stats as pointbiserialr

    Used to evaluate the correlation between a continuous variable and a two-group
    variable.

    Parameters
    ----------
    x: list or numpy array, 1-D
        Our observations. These are expected to be continuous.
    y: list or numpy array, 1-D
        Our groupings variable, or masked array. Must only have two variables and be
        the same length as x

    Returns
    -------
    rho: float
        The measure of correlation between our two groups
    p: float
        The likelihood that our two groups would be correlated, evaluated against a
        t distribution
    """
    x = _check_table(x, only_count=False)
    y = _check_table(y, only_count=True)
    if len(x) != len(y):
        raise ValueError("X and Y must be of the same length")
    if len(np.unique(y)) != 2:
        raise AttributeError("Need to have two groupings for biserial correlation")
    group_0, group_1 = x[y == np.unique(y)[0]], x[y == np.unique(y)[1]]
    mu_1, mu_0 = np.mean(group_1), np.mean(group_0)
    n, n_1, n_0 = len(x), len(group_1), len(group_0)
    s = np.std(x, ddof=1)
    rho = ((mu_1 - mu_0) / s) * sqrt(n_1 * n_0 / (n * (n - 1)))
    t_val = rho * sqrt((n - 2) / (1 - pow(rho, 2)))
    p = 2 * (1 - t.cdf(abs(t_val), n - 2))
    return rho, p

def lepage_test(data_1, data_2):
    """Not found in either scipy.stats or statsmodels

    Used to compare the central tendency and variability in two samples. Computed as
    the sum of the squared standardized Wilcoxon rank-sum and Ansari-Bradley
    statistics.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        A list or array containing all observations from our first dataset
    data_2: list or numpy array, 1-D
        A list or array containing all observations from our second dataset

    Returns
    -------
    d: float
        Our measure of central tendency and variability among the two datasets
    p: float, 0 <= p <= 1
        The likelihood we would find this level of central tendency and variability
        among two datasets sampled from the same population
    """
    data_1, data_2 = _check_table(data_1, only_count=False), _check_table(data_2, only_count=False)
    n, m = len(data_1), len(data_2)
    N = n + m
    c, _ = ansari_bradley_test(data_1, data_2, alternative='two-sided')
    # Wilcoxon rank-sum statistic for the first sample (sum of its ranks in the pooled data)
    w = np.sum(rankdata(np.concatenate([data_1, data_2]))[:n])
    expected_w = n * (N + 1) / 2
    sd_w = sqrt(m * n * (N + 1) / 12)
    expected_c = n * pow(N + 1, 2) / (4 * N)
    sd_c = sqrt(m * n * (N + 1) * (3 + pow(N, 2)) / (48 * pow(N, 2)))
    d = pow((w - expected_w) / sd_w, 2) + pow((c - expected_c) / sd_c, 2)
    p = 1 - chi2.cdf(d, 2)
    return d, p

def g_goodness_of_fit_test(observed, expected=None):
    """Found in scipy.stats as power_divergence(lambda_="log-likelihood")

    Similar to chi_goodness_of_fit_test, used when we cannot divide the data cleanly
    into a contingency table or when we have actual expected results to compare to.

    Parameters
    ----------
    observed: list or numpy array, 1-D
        Our observed data
    expected: list or numpy array, default is None
        What we expected the results to be. If None given, then we expect all data
        points to be equally likely

    Returns
    -------
    g: float
        The G statistic, or the likelihood ratio of the difference between observed
        and expected
    p: float, 0 <= p <= 1
        The likelihood that our observed differences are due to chance
    """
    observed = _check_table(observed, False)
    if expected is None:
        expected = np.repeat(np.mean(observed), len(observed))
    else:
        expected = _check_table(expected)
    df = len(observed) - 1
    g = 2 * np.sum(observed * np.log(observed / expected))
    p = 1 - chi2.cdf(g, df)
    return g, p

def two_sample_t_test(data_1, data_2, alternative='two-sided', paired=False):
    """This test can be found in scipy.stats as either ttest_rel or ttest_ind

    Used when we want to compare the distributions of two samples, and while we assume
    that they both follow a normal distribution, their sample size is too small to
    reliably use a z-test.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    paired: bool, default is False
        Whether or not data_1 and data_2 are paired observations

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    data_1_mean, data_2_mean = np.mean(data_1), np.mean(data_2)
    if paired:
        # This case can be found in scipy.stats as ttest_rel
        if len(data_1) != len(data_2):
            raise AttributeError("The data types are not paired")
        n = len(data_1)
        df = n - 1
        squared_difference = np.sum(np.power(data_1 - data_2, 2))
        difference = np.sum(data_1 - data_2)
        std = sqrt((squared_difference - (np.power(difference, 2) / n)) / df)
        standard_error_difference = _standard_error(std, n)
    else:
        # We perform the Welch t-test due to the assumption that variances are not equal.
        # This case can be found in scipy.stats as ttest_ind(equal_var=False)
        data_1_var, data_2_var = np.var(data_1, ddof=1), np.var(data_2, ddof=1)
        data_1_n, data_2_n = len(data_1), len(data_2)
        df = np.power((data_1_var / data_1_n) + (data_2_var / data_2_n), 2) / \
            ((np.power(data_1_var, 2) / (np.power(data_1_n, 2) * (data_1_n - 1))) +
             (np.power(data_2_var, 2) / (np.power(data_2_n, 2) * (data_2_n - 1))))
        standard_error_difference = sqrt((data_1_var / data_1_n) + (data_2_var / data_2_n))
    t_value = (data_1_mean - data_2_mean) / standard_error_difference
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p

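# Hedged usage sketch (not part of the original module): compares a paired and an
# unpaired (Welch) call of two_sample_t_test on synthetic data. The helper name
# and the simulated samples are illustrative assumptions only.
def _example_two_sample_t_test():
    rng = np.random.default_rng(7)
    before = rng.normal(loc=100, scale=10, size=25)
    after = before + rng.normal(loc=2, scale=5, size=25)   # paired measurements
    t_paired, p_paired = two_sample_t_test(before, after, paired=True)
    other_group = rng.normal(loc=105, scale=12, size=40)   # independent sample
    t_welch, p_welch = two_sample_t_test(before, other_group, alternative='less')
    return (t_paired, p_paired), (t_welch, p_welch)
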
def yeun_welch_test(data_1, data_2, p=10, alternative='two-sided'):
    """Not found in scipy.stats or statsmodels.

    Used when we wish to perform a two-sample t-test, but cannot assume normality or
    equality of variances.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    p: float, 0 <= p <= 100
        The percentage of data we wish to drop from each sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if p < 0 or p > 100:
        raise ValueError("Percentage trimmed needs to be between 0 and 100")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    sort_data_1, sort_data_2 = np.sort(data_1), np.sort(data_2)
    n_1, n_2 = int(len(data_1) * p // 200), int(len(data_2) * p // 200)
    trim_data_1 = sort_data_1[n_1:len(sort_data_1) - n_1]
    trim_data_2 = sort_data_2[n_2:len(sort_data_2) - n_2]
    n_x, n_y = len(data_1), len(data_2)
    m_x, m_y = len(trim_data_1), len(trim_data_2)
    # Winsorize: replace each trimmed observation with the nearest retained value
    winsor_values_1 = np.append(np.repeat(trim_data_1[0], n_1), np.repeat(trim_data_1[-1], n_1))
    winsor_values_2 = np.append(np.repeat(trim_data_2[0], n_2), np.repeat(trim_data_2[-1], n_2))
    winsor_data_1 = np.append(trim_data_1, winsor_values_1)
    winsor_data_2 = np.append(trim_data_2, winsor_values_2)
    s_x, s_y = np.var(winsor_data_1, ddof=1), np.var(winsor_data_2, ddof=1)
    x_bar, y_bar = np.mean(trim_data_1), np.mean(trim_data_2)
    d_x, d_y = (n_x - 1) * s_x / (m_x * (m_x - 1)), (n_y - 1) * s_y / (m_y * (m_y - 1))
    df = pow(d_x + d_y, 2) / (pow(d_x, 2) / (m_x - 1) + pow(d_y, 2) / (m_y - 1))
    t_value = (x_bar - y_bar) / sqrt(d_x + d_y)
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p

def g_proportion_test(success_prob, n_total, expected=None):
    """Not found in either statsmodels or scipy.stats

    Used when we are given proportions of success (as well as total participants)
    instead of numbers of success.

    Parameters
    ----------
    success_prob: list or numpy array, 1-D
        A list containing the percentage of success for each successive group. Needs
        to be the same size as n_total and expected
    n_total: list or numpy array, 1-D
        A list containing the total count of each successive group. Needs to be the
        same size as success_prob and expected
    expected: (optional) list or numpy array, 1-D
        If None, then expected is the weighted average of success_prob. Else, a list
        containing the expected probabilities of each success group. Needs to be the
        same size as success_prob and n_total.

    Returns
    -------
    g: float
        Our measure of the difference between our observed and expected results
    p: float, 0 <= p <= 1
        The likelihood that we would observe these differences if each group was
        sampled from the same population
    """
    success_prob, n_total = _check_table(success_prob, only_count=False), _check_table(n_total, only_count=True)
    if len(success_prob) != len(n_total):
        raise ValueError("Success probability and N Total are not of same length")
    if expected is None:
        expected = np.sum(success_prob * n_total) / np.sum(n_total)
    else:
        expected = _check_table(expected, only_count=False)
        if len(expected) != len(success_prob):
            raise ValueError("Expected and Success probability are not of same length")
        if not np.all(expected < 1):
            raise ValueError("Cannot have percentage of expected greater than 1")
        elif not np.all(expected >= 0):
            raise ValueError("Cannot have negative percentage of expected")
    if not np.all(success_prob < 1):
        raise ValueError("Cannot have percentage of success greater than 1")
    elif not np.all(success_prob >= 0):
        raise ValueError("Cannot have negative percentage of success")
    n_success = success_prob * n_total
    n_failure = n_total - n_success
    n_expected_success = expected * n_total
    n_expected_failure = (1 - expected) * n_total
    df = len(n_total) - 1
    g = 2 * (np.sum(n_success * np.log(n_success / n_expected_success)) +
             np.sum(n_failure * np.log(n_failure / n_expected_failure)))
    p = 1 - chi2.cdf(g, df)
    return g, p

def calculate_c(data_1, data_2):
    """Helper that computes the C statistic used by the Cucconi location-scale test:
    u and v are standardized sums of the squared ranks and squared contrary ranks of
    the first sample, and c combines them while accounting for their correlation rho.
    """
    data_1, data_2 = _check_table(data_1, only_count=False), _check_table(data_2, only_count=False)
    all_data = np.concatenate([data_1, data_2])
    rank_data = rankdata(all_data)
    n, n_1, n_2 = len(all_data), len(data_1), len(data_2)
    r_1 = rank_data[:n_1]
    u = (6 * np.sum(np.power(r_1, 2)) - n_1 * (n + 1) * (2 * n + 1)) / \
        sqrt(n_1 * n_2 * (n + 1) * (2 * n + 1) * (8 * n + 11) / 5)
    v = (6 * np.sum(np.power(n + 1 - r_1, 2)) - n_1 * (n + 1) * (2 * n + 1)) / \
        sqrt(n_1 * n_2 * (n + 1) * (2 * n + 1) * (8 * n + 11) / 5)
    rho = 2 * (pow(n, 2) - 4) / ((2 * n + 1) * (8 * n + 11)) - 1
    c = (pow(u, 2) + pow(v, 2) - 2 * rho * u * v) / (2 * (1 - pow(rho, 2)))
    return c

def trimmed_means_test(data_1, data_2, p=10, alternative='two-sided'):
    """Not found in scipy.stats or statsmodels.

    Used when we wish to perform a two-sample t-test, but suspect that the data is
    being heavily influenced by outliers, i.e., we cannot assume normality.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    p: float, 0 <= p <= 100
        The percentage of data we wish to drop from each sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if p < 0 or p > 100:
        raise ValueError("Percentage trimmed needs to be between 0 and 100")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    sort_data_1, sort_data_2 = np.sort(data_1), np.sort(data_2)
    n_1, n_2 = int(len(data_1) * p // 200), int(len(data_2) * p // 200)
    trim_data_1 = sort_data_1[n_1:len(sort_data_1) - n_1]
    trim_data_2 = sort_data_2[n_2:len(sort_data_2) - n_2]
    n_x, n_y = len(data_1), len(data_2)
    m_x, m_y = len(trim_data_1), len(trim_data_2)
    # Winsorize: replace each trimmed observation with the nearest retained value
    winsor_values_1 = np.append(np.repeat(trim_data_1[0], n_1), np.repeat(trim_data_1[-1], n_1))
    winsor_values_2 = np.append(np.repeat(trim_data_2[0], n_2), np.repeat(trim_data_2[-1], n_2))
    winsor_data_1 = np.append(trim_data_1, winsor_values_1)
    winsor_data_2 = np.append(trim_data_2, winsor_values_2)
    s_x, s_y = np.var(winsor_data_1, ddof=1), np.var(winsor_data_2, ddof=1)
    x_bar, y_bar = np.mean(trim_data_1), np.mean(trim_data_2)
    pooled_var = ((n_x - 1) * s_x + (n_y - 1) * s_y) / ((m_x - 1) + (m_y - 1))
    t_value = (x_bar - y_bar) / np.sqrt(pooled_var * ((1 / m_x) + (1 / m_y)))
    df = m_x + m_y - 2
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p

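# Hedged illustration (not part of the original module): shows, on a tiny sample,
# what the 20% trimming and Winsorizing steps inside trimmed_means_test and
# yeun_welch_test produce. The helper name and the data are assumptions only.
def _example_trim_and_winsorize():
    data = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 50.0])
    g = int(len(data) * 20 // 200)            # observations trimmed from each tail
    trimmed = np.sort(data)[g:len(data) - g]  # drops the 1.0 and the outlying 50.0
    winsorized = np.concatenate([np.repeat(trimmed[0], g), trimmed, np.repeat(trimmed[-1], g)])
    # The trimmed mean ignores the outlier; the Winsorized sample keeps all n elements
    return np.mean(trimmed), np.var(winsorized, ddof=1)
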
def two_sample_wilcoxon_test(data_1, data_2, alternative='two-sided', handle_zero='wilcox'):
    """This test can be found in scipy.stats as wilcoxon

    Used when we want to compare two related or paired samples, or repeated
    measurements, and see if their population mean ranks differ. Also used when we
    cannot assume that the samples are normally distributed.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The first sample or repeated measure
    data_2: list or numpy array, 1-D
        The second sample or repeated measure
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    handle_zero: str, default is wilcox
        How we treat differences of zero. It can be either wilcox (ignore) or pratt

    Return
    ------
    w_value: float
        The W statistic for our observed differences in mean ranks
    p: float, 0 <= p <= 1
        The likelihood that the observed mean rank differences would be found in two
        datasets sampled from the same population
    """
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if handle_zero.casefold() not in ['wilcox', 'pratt']:
        raise ValueError("Cannot determine how to handle differences of zero")
    if len(data_1) != len(data_2):
        raise AttributeError("Cannot perform signed wilcoxon test on unpaired data")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    diff = data_1 - data_2
    if handle_zero.casefold() == 'wilcox':
        assert np.sum(diff == 0) != len(data_1), "Cannot perform wilcoxon test when all differences are zero"
        diff = np.compress(np.not_equal(diff, 0), diff)
    n = len(diff)
    abs_diff, sign_diff = np.abs(diff), np.sign(diff)
    rank = rankdata(abs_diff)
    if handle_zero.casefold() == "pratt":
        zero_ranks = np.not_equal(abs_diff, 0)
        sign_diff, rank = np.compress(zero_ranks, sign_diff), np.compress(zero_ranks, rank)
    w_value = np.sum(sign_diff * rank)
    std = sqrt(n * (n + 1) * (2 * n + 1) / 6)
    z_score = w_value / std
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return w_value, p

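# Hedged usage sketch (not part of the original module): a paired before/after
# comparison with two_sample_wilcoxon_test. The helper name and the data are
# illustrative assumptions; scipy.stats.wilcoxon is the analogous routine.
def _example_wilcoxon_signed_rank():
    before = np.array([125, 115, 130, 140, 140, 115, 140, 125, 140, 135])
    after = np.array([110, 122, 125, 120, 140, 124, 123, 137, 135, 145])
    # 'pratt' keeps the zero difference in the ranking before discarding it
    w, p = two_sample_wilcoxon_test(before, after, alternative='two-sided', handle_zero='pratt')
    return w, p
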
def binomial_sign_test(data_1, data_2, alternative='two-sided', success_prob=0.5):
    """Found in statsmodels as sign_test

    Used to determine whether the measured differences between two paired groups
    (X and Y) are significantly positive or negative. For instance, we might use this
    to determine if the weight loss for users who followed a certain diet is
    significant or not.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        A list of all observations for group X.
    data_2: list or numpy array, 1-D
        A list of all observations for group Y.
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    success_prob: float, 0 <= success_prob <= 1
        The probability of success. Default is 0.5

    Returns
    -------
    p: float, 0 <= p <= 1
        The probability that our observed differences would happen under a binomial
        distribution, assuming the given success probability.
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if len(data_1) != len(data_2):
        raise AttributeError("The two data sets are not paired data sets")
    if not isinstance(success_prob, float):
        raise TypeError("Probability of success needs to be a decimal value")
    if success_prob > 1 or success_prob < 0:
        raise ValueError("Cannot calculate probability of success, needs to be between 0 and 1")
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    diff = data_1 - data_2
    pos_diff, neg_diff = np.sum(diff > 0), np.sum(diff < 0)
    total = pos_diff + neg_diff
    if alternative.casefold() == 'greater':
        p = _right_extreme(pos_diff, total, success_prob)
    elif alternative.casefold() == 'less':
        p = _left_extreme(pos_diff, total, success_prob)
    else:
        p = _left_extreme(neg_diff, total, success_prob) + _right_extreme(pos_diff, total, success_prob)
    return p

def peirce_test(observed, expected, num_outliers=1, num_coef=1):
    """Not found in either scipy.stats or statsmodels

    Uses Peirce's criterion to identify outlying observations based on their squared
    deviation from the expected (model) values.

    Parameters
    ----------
    observed: list or numpy array
        Our observed observations
    expected: list or numpy array
        Our expected observations, or what the model outputted for "Y"
    num_outliers: int, default is 1
        The number of outliers we are trying to identify.
    num_coef: int, default is 1
        The number of regression variables we are thinking of including

    Returns
    -------
    An array, containing all values that we found to be an outlier according to
    Peirce's criteria.
    """
    if not isinstance(num_outliers, int):
        raise TypeError("Number of outliers needs to be an integer")
    if num_outliers < 0:
        raise ValueError("Number of outliers has to be a positive value")
    if not isinstance(num_coef, int):
        raise TypeError("Number of regression coefficients needs to be an integer")
    if num_coef < 0:
        raise ValueError("Number of regression coefficients has to be a positive value")
    observed, expected = _check_table(observed), _check_table(expected)
    if len(observed) != len(expected):
        raise ValueError("Length of observed and expected need to be the same")
    n = len(observed)
    if num_outliers > n:
        raise ValueError("Cannot have number of outliers greater than number of observations")
    if num_coef > n:
        raise Warning("Number of regressor variables is greater than number of observations")
    q = pow(num_outliers, num_outliers / n) * pow(n - num_outliers, (n - num_outliers) / n) / n
    r_new, r_old = 1.0, 0.0
    while abs(r_new - r_old) > (n * 2.0e-16):
        ldiv = pow(r_new, num_outliers) if pow(r_new, num_outliers) != 0 else 1.0e-6
        lambda1 = pow(pow(q, n) / ldiv, 1 / (n - num_coef))
        x2 = 1 + (n - num_coef - num_outliers) / num_outliers * (1.0 - pow(lambda1, 2))
        if x2 < 0:
            x2 = 0.0
            r_old = r_new
        else:
            r_old = r_new
            r_new = np.exp((x2 - 1) / 2.0) * erfc(np.sqrt(x2 / 2))
    mean_squared_error = np.sum(np.power(observed - expected, 2)) / n
    threshold = x2 * mean_squared_error
    return observed[np.power(observed - expected, 2) > threshold]

def two_sample_mann_whitney_test(data_1, data_2, alternative='two-sided'):
    """This test can be found in scipy.stats as mannwhitneyu

    Used when we want to test whether or not the distributions of two ordinal response
    variables are equal, assuming that each sample is independent of one another.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed sample for ordinal response variable 1
    data_2: list or numpy array, 1-D
        The observed sample for ordinal response variable 2
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    u: float
        The U statistic for our observed differences in the two ordinal responses
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    combined_data = rankdata(np.concatenate([data_1, data_2]))
    combined_data_len = len(combined_data)
    data_1_len, data_2_len = len(data_1), len(data_2)
    data_1_rank = np.sum(combined_data[:len(data_1)])
    data_2_rank = np.sum(combined_data[len(data_1):])
    u1 = data_1_rank - ((data_1_len * (data_1_len + 1)) / 2)
    u2 = data_2_rank - ((data_2_len * (data_2_len + 1)) / 2)
    u_mean = (u1 + u2) / 2
    if alternative.casefold() == 'two-sided':
        u = np.min([u1, u2])
    elif alternative.casefold() == 'greater':
        u = u1
    else:
        u = u2
    # Tie correction uses the counts of tied ranks in the pooled sample
    T = np.unique(combined_data, return_counts=True)[1]
    sum_T = np.sum(np.power(T, 3) - T) / (combined_data_len * (combined_data_len - 1))
    u_sd = sqrt((data_1_len * data_2_len / 12) * (combined_data_len + 1 - sum_T))
    z_score = (u - u_mean) / u_sd
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return u, p

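# Hedged usage sketch (not part of the original module): applies the Mann-Whitney
# test to two independent ordinal-style samples of unequal size. The helper name
# and the data are illustrative assumptions; scipy.stats.mannwhitneyu is the analogue.
def _example_mann_whitney():
    ratings_a = np.array([3, 4, 2, 5, 4, 3, 4, 5, 2, 3, 4])
    ratings_b = np.array([1, 2, 2, 3, 1, 2, 3, 2, 1])
    u, p = two_sample_mann_whitney_test(ratings_a, ratings_b, alternative='two-sided')
    return u, p
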
def bowker_test(cont_table):
    """Found in statsmodels as TableSymmetry or as bowker_symmetry

    Used to test if a given square table is symmetric about the main diagonal.

    Parameters
    ----------
    cont_table: list or numpy array, n x n
        A nxn contingency table

    Return
    ------
    x: float
        Our Chi statistic, or a measure of symmetry for our contingency table
    p: float, 0 <= p <= 1
        The probability that we would observe this level of asymmetry in a table that
        is truly symmetric
    """
    cont_table = _check_table(cont_table, only_count=True)
    n1, n2 = np.shape(cont_table)
    if n1 != n2:
        raise AttributeError("Contingency Table needs to be of a square shape")
    upper_diagonal = np.triu_indices(n1, 1)
    # Note: np.tril_indices(n1, -1) doesn't maintain the same ordering as
    # np.triu_indices, which we need to pair cells across the diagonal; transposing
    # the table and reusing the upper-triangle indices does.
    upper_triangle = cont_table[upper_diagonal]
    lower_triangle = cont_table.T[upper_diagonal]
    x = np.sum(np.power(lower_triangle - upper_triangle, 2) / (upper_triangle + lower_triangle))
    df = n1 * (n1 - 1) / 2
    p = 1 - chi2.cdf(x, df)
    return x, p

def brown_forsythe_test(*args):
    """Found in scipy.stats as levene(center='median')

    Used instead of the general Levene test if we believe our data to be non-normal.

    Parameters
    ----------
    args: list or numpy arrays, 1-D
        The observed variable/observations for each group, organized into lists or
        numpy arrays

    Return
    ------
    w: float
        The W statistic, our measure of difference in variability, which is
        approximately F-distributed.
    p: float, 0 <= p <= 1
        The probability that our observed differences in variances could occur due to
        random sampling from a population of equal variance.
    """
    k = len(args)
    if k < 2:
        raise AttributeError("Need at least two groups to perform a Brown-Forsythe Test")
    n_i, z_bar, all_z_ij, z_bar_condensed = [], [], [], []
    for obs in args:
        obs = _check_table(obs, False)
        n_i = np.append(n_i, len(obs))
        z_ij = abs(obs - np.median(obs))
        all_z_ij = np.append(all_z_ij, z_ij)
        z_bar = np.append(z_bar, np.repeat(np.mean(z_ij), len(obs)))
        z_bar_condensed = np.append(z_bar_condensed, np.mean(z_ij))
    scalar = (np.sum(n_i) - k) / (k - 1)
    w = scalar * np.sum(n_i * np.power(z_bar_condensed - np.mean(z_bar), 2)) / \
        np.sum(np.power(all_z_ij - z_bar, 2))
    p = 1 - f.cdf(w, k - 1, np.sum(n_i) - k)
    return w, p

def thompson_tau_test(data, alpha=0.05):
    """Not found in either scipy.stats or statsmodels.

    Uses the Thompson-Tau criteria to iteratively identify outliers until no more
    exist.

    Parameters
    ----------
    data: list or numpy array, 1-D
        Our dataset we are evaluating for outliers
    alpha: float, default is 0.05
        Our level of significance for detecting outliers

    Returns
    -------
    outliers_list: list
        A list containing all datapoints that we found to be an outlier by
        Thompson-Tau's criteria
    """
    data = _check_table(data, only_count=False)
    if alpha < 0 or alpha > 1:
        raise ValueError("Cannot have alpha level greater than 1 or less than 0")
    outlier_exist, outlier_table = True, []
    data_copy = np.copy(data)
    while outlier_exist:
        n, mu, s = len(data_copy), np.mean(data_copy), np.std(data_copy, ddof=1)
        ab_resid = np.abs(data_copy - mu) / s
        rejection = t.isf(alpha / 2, n - 2) * (n - 1) / \
            (sqrt(n) * sqrt(n - 2 + pow(t.isf(alpha / 2, n - 2), 2)))
        is_outlier = ab_resid > rejection
        if np.sum(is_outlier) != 0:
            outlier_table.append(data_copy[np.argsort(ab_resid)][-1:][0])
            data_copy = data_copy[np.argsort(ab_resid)][:-1]
        else:
            outlier_exist = False
    return outlier_table

def bartlett_test(*args):
    """Found in scipy.stats as bartlett

    This test is used to determine if multiple samples are from a population of equal
    variances. Note that this test is much more sensitive to data that is non-normal
    compared to Levene or Brown-Forsythe.

    Parameters
    ----------
    args: list or numpy arrays, 1-D
        The observed measurements for each group, organized into lists or numpy arrays

    Return
    ------
    X: float
        The Chi statistic, or a measure of the observed difference in variances
    p: float, 0 <= p <= 1
        The probability that our observed differences in variances could occur due to
        random sampling from a population of equal variance.
    """
    k = len(args)
    if k < 2:
        raise AttributeError("Need at least two groups to perform the Bartlett Test")
    n_i, var_i = [], []
    for obs in args:
        obs = _check_table(obs)
        n_i = np.append(n_i, len(obs))
        var_i = np.append(var_i, np.var(obs, ddof=1))
    pooled_variance = np.sum((n_i - 1) * var_i) / (np.sum(n_i) - k)
    top = (np.sum(n_i) - k) * np.log(pooled_variance) - np.sum((n_i - 1) * np.log(var_i))
    bottom = 1 + (1 / (3 * (k - 1))) * (np.sum(1 / (n_i - 1)) - (1 / (np.sum(n_i) - k)))
    X = top / bottom
    p = 1 - chi2.cdf(X, k - 1)
    return X, p

def skew_test(data):
    """Found in scipy.stats as skewtest.

    Used to determine the likelihood that our sample dataset comes from a normal
    distribution based on its skewness.

    Parameters
    ----------
    data: list or numpy array, 1-D
        Contains all observations from our sample to measure departure from normality

    Returns
    -------
    z: float
        Our test statistic, or the measure of difference of our skewness compared to a
        normal distribution
    p: float, 0 <= p <= 1
        The likelihood that we would see the observed differences in skewness from a
        normal population due to chance
    """
    data = _check_table(data, only_count=False)
    if len(data) < 8:
        raise AttributeError("Skew Test is not reliable on datasets with less than 8 observations")
    n = len(data)
    skew = _skew(data)
    y2 = (36 * (n - 7) * (pow(n, 2) + 2 * n - 5)) / ((n - 2) * (n + 5) * (n + 7) * (n + 9))
    u2 = 6 * (n - 2) / ((n + 1) * (n + 3))
    w2 = sqrt(2 * y2 + 4) - 1
    delta = 1 / sqrt(log(sqrt(w2)))
    alpha_2 = 2 / (w2 - 1)
    z = delta * asinh(skew / sqrt(alpha_2 * u2))
    p = 2 * (1 - norm.cdf(abs(z)))
    return z, p

def kurtosis_test(data):
    """Found in scipy.stats as kurtosistest.

    Used to determine the likelihood that our sample dataset comes from a normal
    distribution based on its kurtosis.

    Parameters
    ----------
    data: list or numpy array, 1-D
        Contains all observations from our sample to measure departure from normality

    Returns
    -------
    z: float
        Our test statistic, or the measure of difference of our kurtosis compared to a
        normal distribution
    p: float, 0 <= p <= 1
        The likelihood that we would see the observed differences in kurtosis from a
        normal population due to chance
    """
    data = _check_table(data, only_count=False)
    if len(data) < 20:
        raise AttributeError("Kurtosis Test is not reliable on datasets with less than 20 observations")
    n = len(data)
    kurtosis = _kurtosis(data) - 3
    mean_kurt = -6 / (n + 1)
    var_kurt = 24 * n * (n - 2) * (n - 3) / (pow(n + 1, 2) * (n + 3) * (n + 5))
    skew_kurt = (6 * (pow(n, 2) - 5 * n + 2) / ((n + 7) * (n + 9))) * \
        sqrt(6 * (n + 3) * (n + 5) / (n * (n - 2) * (n - 3)))
    a = 6 + ((8 / skew_kurt) * (2 / skew_kurt + sqrt(1 + 4 / pow(skew_kurt, 2))))
    z_top = 1 - 2 / a
    z_bottom = 1 + ((kurtosis - mean_kurt) / sqrt(var_kurt)) * sqrt(2 / (a - 4))
    z = sqrt(9 * a / 2) * (1 - 2 / (9 * a) - np.sign(z_bottom) * np.power(z_top / abs(z_bottom), 1 / 3.0))
    p = 2 * (1 - norm.cdf(abs(z)))
    return z, p

def box_pierce_test(data, num_lags=None):
    """Found in statsmodels as acorr_ljungbox(boxpierce=True)

    Used to determine if any group of autocorrelations in a time series dataset are
    different from zero.

    Parameters
    ----------
    data: list or numpy array, 1-D
        The time series dataset we are performing our test on
    num_lags: int or list, default is None
        If int, the maximum number of time lags
        If list, then the series of time lags we are performing
        If None, then use np.arange(1, 11)

    Returns
    -------
    q: float
        The Box-Pierce statistic, or our measure of autocorrelations differing from zero
    p: float, 0 <= p <= 1
        The likelihood that our observed autocorrelations would differ from zero due
        to chance
    """
    data = _check_table(data, only_count=False)
    if num_lags is None:
        h_lags = np.arange(1, 11)
    elif isinstance(num_lags, int):
        h_lags = np.arange(1, num_lags + 1)
    elif isinstance(num_lags, (list, np.ndarray, np.generic)):
        h_lags = _check_table(num_lags, only_count=False)
    else:
        raise ValueError("Cannot discern number of lags")
    h = np.max(h_lags)
    n = len(data)
    q = n * np.sum(pow(_autocorr(data, h_lags), 2))
    p = 1 - chi2.cdf(q, h)
    return q, p

def mcnemar_test(cont_table):
    """Found in statsmodels as mcnemar

    Used when we have paired nominal data that is organized in a 2x2 contingency
    table. It is used to test the assumption that the marginal column and row
    probabilities are equal, i.e., that the off-diagonal counts b and c are
    equivalent.

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A 2x2 contingency table

    Return
    ------
    chi_squared: float
        Our Chi statistic, or the sum of differences between b and c
    p: float, 0 <= p <= 1
        The probability that b and c aren't equivalent due to chance
    """
    cont_table = _check_table(cont_table, True)
    if cont_table.shape != (2, 2):
        raise AttributeError("McNemar's Test is meant for a 2x2 contingency table")
    b, c = cont_table[0, 1], cont_table[1, 0]
    if b + c > 25:
        chi_squared = pow(abs(b - c) - 1, 2) / (b + c)
        p = 1 - chi2.cdf(chi_squared, 1)
    else:
        # Exact binomial version for small off-diagonal counts
        chi_squared = min(b, c)
        p = 2 * binom.cdf(chi_squared, b + c, 0.5) - binom.pmf(binom.ppf(0.99, b + c, 0.5), b + c, 0.5)
    return chi_squared, p

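# Hedged usage sketch (not part of the original module): McNemar's test on a 2x2
# table of paired before/after outcomes. The counts and the helper name are
# illustrative assumptions; statsmodels.stats.contingency_tables.mcnemar is the
# analogous routine.
def _example_mcnemar():
    #                 after: pass  after: fail
    table = np.array([[59,          6],    # before: pass
                      [16,         80]])   # before: fail
    chi_squared, p = mcnemar_test(table)
    # Only the discordant cells (6 and 16) drive the statistic
    return chi_squared, p
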
def chi_squared_test(cont_table):
    """Found in scipy.stats as chi2_contingency.

    Determines the difference between what we expect the count of a group to be
    versus what was observed in our contingency table. Assuming our data follows a
    chi distribution (i.e., observations are independent), if the observed variances
    are found to be very high given the number of observations, then we reject our
    null hypothesis and conclude that this difference could not occur due to chance.

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A contingency table containing 2 counts of 2, or 4 counts total. As an example
        of expected output, refer to a confusion matrix for predicting a binary
        variable.

    Return
    ------
    X: float
        The Chi test statistic, or the variance of the difference of our observed
        results versus expected results.
    p: float, 0 <= p <= 1
        The likelihood that we would observe our X value given the number of
        observations we had.
    """
    cont_table = _check_table(cont_table, only_count=True)
    df = (cont_table.shape[0] - 1) * (cont_table.shape[1] - 1)
    row_sum, col_sum = np.sum(cont_table, axis=1), np.sum(cont_table, axis=0)
    expected = np.matmul(np.transpose(row_sum[np.newaxis]), col_sum[np.newaxis]) / np.sum(row_sum)
    X = np.sum(pow(cont_table - expected, 2) / expected)
    p = 1 - chi2.cdf(X, df)
    return X, p

def g_test(cont_table):
    """Found in scipy.stats as chi2_contingency(lambda_="log-likelihood")

    A likelihood ratio test used to determine whether the difference between our
    observed results and expected results in our contingency table is likely to
    happen due to chance.

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A contingency table containing 2 counts of 2, or 4 counts total. As an example
        of expected output, refer to a confusion matrix for predicting a binary
        variable.

    Return
    ------
    g: float
        The G statistic, or the likelihood ratio of the difference between observed
        and expected
    p: float, 0 <= p <= 1
        The likelihood that our observed differences are due to chance
    """
    cont_table = _check_table(cont_table, True)
    df = (cont_table.shape[0] - 1) * (cont_table.shape[1] - 1)
    row_sum, col_sum = np.sum(cont_table, axis=1), np.sum(cont_table, axis=0)
    expected = np.matmul(np.transpose(row_sum[np.newaxis]), col_sum[np.newaxis]) / np.sum(row_sum)
    g = 2 * np.sum(cont_table * np.log(cont_table / expected))
    p = 1 - chi2.cdf(g, df)
    return g, p

def fligner_policello_test(data_1, data_2, alternative='two-sided'):
    """Not found in either scipy.stats or statsmodels.

    Used to determine whether the population medians corresponding to two independent
    samples are equal.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed measurements for our first sample
    data_2: list or numpy array, 1-D
        The observed measurements for our second sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Returns
    -------
    z: float
        The z-score of our observed median differences
    p: float, 0 <= p <= 1
        The likelihood that we would observe these differences due to chance
    """
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine alternative hypothesis")
    m, n = len(data_1), len(data_2)
    if m < 12 or n < 12:
        warnings.warn("Datasets may be too small for accurate approximation of p")

    def compare_points(x, y):
        # For each point in x, count how many points in y it exceeds (ties count as 0.5)
        z = x - y[:, None]
        z = np.where(z > 0, 1, z)
        z = np.where(z == 0, 0.5, z)
        z = np.where(z < 0, 0, z)
        return np.sum(z, axis=0)

    n_x, n_y = compare_points(data_1, data_2), compare_points(data_2, data_1)
    Nx, Ny = np.sum(n_x), np.sum(n_y)
    m_x, m_y = np.mean(n_x), np.mean(n_y)
    ss_x, ss_y = np.sum(np.power(n_x - m_x, 2)), np.sum(np.power(n_y - m_y, 2))
    z = (Ny - Nx) / (2 * np.sqrt(ss_x + ss_y - (m_x * m_y)))
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z)
    else:
        p = norm.cdf(z)
    return z, p

def two_sample_proportion_z_test(data_1, data_2, alternative='two-sided'):
    """Found in statsmodels as proportions_ztest

    Used when we are comparing whether or not two proportion means are the same, given
    that both of them come from a normal distribution.

    Parameters
    ----------
    data_1: list or numpy array, must be binary, 1-D
        An array containing all observations, marked as a 0 for failure and a 1 for
        success, that we are comparing to data_2
    data_2: list or numpy array, must be binary, 1-D
        An array containing all observations, marked as a 0 for failure and a 1 for
        success, that we are comparing to data_1
    alternative: str, default is two-sided
        Our alternative hypothesis. It can be two-sided, less or greater

    Return
    ------
    z_score: float
        Our z-statistic to analyze the likelihood that our observed difference is due
        to chance
    p: float, 0 <= p <= 1
        The probability that the differences between two samples, assuming a normal
        distribution, is due to chance
    """
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    if not np.array_equal(data_1, data_1.astype(bool)):
        raise AttributeError("Cannot perform a proportion test on non-binary data for data_1")
    if not np.array_equal(data_2, data_2.astype(bool)):
        raise AttributeError("Cannot perform a proportion test on non-binary data for data_2")
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    n_1, n_2 = len(data_1), len(data_2)
    p_1, p_2 = np.mean(data_1), np.mean(data_2)
    p = (p_1 * n_1 + p_2 * n_2) / (n_1 + n_2)
    q = 1 - p
    se = sqrt((p * q) * ((1 / n_1) + (1 / n_2)))
    z_score = (p_1 - p_2) / se
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return z_score, p

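# Hedged usage sketch (not part of the original module): an A/B-style comparison
# of two conversion-rate samples with two_sample_proportion_z_test. The helper
# name and the simulated 0/1 outcomes are illustrative assumptions only.
def _example_proportion_z_test():
    rng = np.random.default_rng(3)
    control = (rng.random(500) < 0.10).astype(int)   # roughly 10% success rate
    variant = (rng.random(480) < 0.14).astype(int)   # roughly 14% success rate
    z, p = two_sample_proportion_z_test(variant, control, alternative='greater')
    return z, p
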
def trinomial_test(data_1, data_2, alternative='two-sided'):
    """Not found in scipy.stats or statsmodels

    Used on paired data when the sign test loses power, that is, when there exist
    instances of "zero observations", or differences of zero between the paired data.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed measurements for our first sample
    data_2: list or numpy array, 1-D
        The observed measurements for our second sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Returns
    -------
    d: int
        The number of positive instances minus the number of negative instances
    p: float, 0 <= p <= 1
        The likelihood that we would observe these sign differences due to random
        chance
    """
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    if len(data_1) != len(data_2):
        raise AttributeError("Cannot perform Trinomial Test on unpaired data")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine alternative hypothesis")
    n = len(data_1)
    diffs = data_1 - data_2
    pos_diff, neg_diff, zero_diff = np.sum(diffs > 0), np.sum(diffs < 0), np.sum(diffs == 0)
    p_0 = zero_diff / n
    probs = []

    def calculate_probs(n, z, k, p_0):
        return np.sum(factorial(n) / (st_factorial(n - z - 2 * k) * st_factorial(k + z) * st_factorial(k)) *
                      np.power(p_0, n - z - (2 * k)) * np.power((1 - p_0) / 2, z + 2 * k))

    for z in range(n + 1):
        k = np.arange(0, (n - z) // 2 + 1)
        probs.append(calculate_probs(n, z, k, p_0))
    d = pos_diff - neg_diff
    if alternative.casefold() == "two-sided":
        p = np.sum(probs[abs(d):]) * 2
    elif alternative.casefold() == 'greater':
        p = np.sum(probs[abs(d):])
    else:
        p = np.sum(probs[:abs(d)])
    return d, p

def mood_test(data_1, data_2, alternative='two-sided'):
    """Found in scipy.stats as mood

    Used to measure the level of dispersion (difference from median) of the ranks of
    the two datasets.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        A list or array containing all observations from our first dataset
    data_2: list or numpy array, 1-D
        A list or array containing all observations from our second dataset
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Returns
    -------
    z: float
        Our test statistic that measures the degree of normality of the rank
        dispersions
    p: float, 0 <= p <= 1
        The likelihood that our rank dispersion would occur from two datasets drawn
        from the same distribution
    """
    data_1, data_2 = _check_table(data_1, only_count=False), _check_table(data_2, only_count=False)
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    len_1, len_2 = len(data_1), len(data_2)
    n_obs = len_1 + len_2
    if n_obs < 3:
        raise AttributeError("Not enough observations to perform mood dispersion test")
    all_data = np.concatenate([data_1, data_2])
    rank_data = rankdata(all_data)
    r_1 = rank_data[:len_1]
    m = np.sum(np.power(r_1 - (n_obs + 1) / 2, 2))
    mu_m = len_1 * (pow(n_obs, 2) - 1) / 12
    var_m = len_1 * len_2 * (n_obs + 1) * (n_obs + 2) * (n_obs - 2) / 180
    z = (m - mu_m) / sqrt(var_m)
    if alternative.casefold() == 'two-sided':
        if z > 0:
            p = 2 * (1 - norm.cdf(z))
        else:
            p = 2 * norm.cdf(z)
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z)
    else:
        p = norm.cdf(z)
    return z, p
