def wincor(x, y, tr=.2):
    """
    Compute the winsorized correlation between `x` and `y`.
    This function also returns the winsorized covariance.

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param tr: float
    Proportion to winsorize (default is .2)

    :return:
    Dictionary of results

    cor: float
    Winsorized correlation

    nval: int
    Number of observations

    sig: float
    p-value

    wcov: float
    Winsorized covariance
    """

    if type(x) is not np.ndarray:
        x, y = pandas_to_arrays([x, y])

    m1 = np.c_[x, y]  # cbind
    m1 = m1[~np.isnan(m1).any(axis=1)]
    nval = m1.shape[0]
    x = m1[:, 0]
    y = m1[:, 1]
    g = np.floor(tr * len(x))

    xvec = winsorize(x, limits=(tr, tr))
    yvec = winsorize(y, limits=(tr, tr))
    wcor = np.corrcoef(xvec, yvec)[0, 1]
    wcov = np.cov(xvec, yvec)[0, 1]
    test = wcor * np.sqrt((len(x) - 2) / (1. - wcor ** 2))
    sig = 2 * (1 - t.cdf(abs(test), len(x) - 2 * g - 2))

    res = {'cor': wcor, 'wcov': wcov, 'sig': sig, 'nval': nval}
    return res
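
# A minimal usage sketch for wincor (illustrative, not part of the package
# API). It assumes this module's imports (numpy as np, pandas as pd) are in
# scope; the data are synthetic and `_demo_wincor` is a hypothetical helper.
def _demo_wincor():
    rng = np.random.default_rng(42)
    x = pd.Series(rng.normal(size=40))
    y = pd.Series(.5 * x.to_numpy() + rng.normal(size=40))
    res = wincor(x, y, tr=.2)  # 20% winsorized correlation
    print(res['cor'], res['wcov'], res['sig'], res['nval'])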
def corb(corfun, x, y, alpha, nboot, *args, seed=False):
    """
    Compute a 1-alpha confidence interval for a correlation using the
    percentile bootstrap method. The function `corfun` is any function
    that returns a correlation coefficient. The functions pbcor and
    wincor follow this convention. When using Pearson's correlation,
    and when n<250, use lsfitci instead (not yet implemented).

    Note that arguments up to and including `args` are positional arguments

    :param corfun: function
    corfun is any function that returns a correlation coefficient

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param alpha: float
    Alpha level

    :param nboot: int
    Number of bootstrap samples

    :param args: list/value
    List of arguments to corfun (e.g., .2)

    :param seed: bool
    Random seed for reproducible results. Default is `False`.

    :return:
    Dictionary of results

    ci: list
    Confidence interval

    cor: float
    Correlation estimate

    p_value: float
    p-value
    """

    x, y = pandas_to_arrays([x, y])
    m1 = np.c_[x, y]  # cbind
    m1 = m1[~np.isnan(m1).any(axis=1)]
    x = m1[:, 0]
    y = m1[:, 1]
    est = corfun(x, y, *args)['cor']

    if seed:
        np.random.seed(seed)

    data_inds = np.random.choice(len(x), size=(nboot, len(x)))
    bvec = np.array([corbsub(row_inds, x, y, corfun, *args)
                     for row_inds in data_inds])

    ihi = int(np.floor((1 - alpha / 2) * nboot + .5))
    ilow = int(np.floor((alpha / 2) * nboot + .5))
    bsort = sorted(bvec)
    corci = [bsort[ilow], bsort[ihi]]
    phat = sum(bvec < 0) / nboot
    sig = 2 * min(phat, 1 - phat)

    return {'ci': corci, 'p_value': sig, 'cor': est}
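
# A minimal sketch showing how corb composes with a correlation function
# such as wincor (synthetic data; `_demo_corb` is a hypothetical helper and
# assumes this module's imports are in scope). alpha, nboot, and wincor's
# trimming proportion are all passed positionally.
def _demo_corb():
    rng = np.random.default_rng(0)
    x = pd.Series(rng.normal(size=60))
    y = pd.Series(x.to_numpy() + rng.normal(size=60))
    res = corb(wincor, x, y, .05, 599, .2, seed=1)  # .2 is forwarded to wincor's tr
    print(res['cor'], res['ci'], res['p_value'])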
def pbcor(x, y, beta=.2):
    """
    Compute the percentage bend correlation between `x` and `y`.

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param beta: float
    `0 < beta < .5`. Beta is analogous to trimming in other functions
    and related to the measure of dispersion used in the percentage
    bend calculation.

    :return:
    Dictionary of results

    cor: float
    Correlation

    nval: int
    Number of observations

    p_value: float
    p-value

    test: float
    Test statistic
    """

    if type(x) is not np.ndarray:
        x, y = pandas_to_arrays([x, y])

    if len(x) != len(y):
        raise Exception("The arrays do not have equal lengths")

    m1 = np.c_[x, y]  # cbind
    m1 = m1[~np.isnan(m1).any(axis=1)]
    nval = m1.shape[0]
    x = m1[:, 0]
    y = m1[:, 1]

    temp = np.sort(abs(x - np.median(x)))
    omhatx = temp[int(np.floor((1 - beta) * len(x))) - 1]
    temp = np.sort(abs(y - np.median(y)))
    omhaty = temp[int(np.floor((1 - beta) * len(y))) - 1]

    a = (x - pbos(x, beta)) / omhatx
    b = (y - pbos(y, beta)) / omhaty
    a = np.where(a <= -1, -1, a)
    a = np.where(a >= 1, 1, a)
    b = np.where(b <= -1, -1, b)
    b = np.where(b >= 1, 1, b)

    pbcor_result = sum(a * b) / np.sqrt(sum(a ** 2) * sum(b ** 2))
    test = pbcor_result * np.sqrt((len(x) - 2) / (1 - pbcor_result ** 2))
    sig = 2 * (1 - t.cdf(abs(test), len(x) - 2))

    res = {'cor': pbcor_result, 'test': test, 'p_value': sig, 'nval': nval}
    return res
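
# A minimal usage sketch for pbcor (synthetic data; `_demo_pbcor` is a
# hypothetical helper and assumes this module's imports are in scope).
def _demo_pbcor():
    rng = np.random.default_rng(2)
    x = pd.Series(rng.normal(size=50))
    y = pd.Series(.7 * x.to_numpy() + rng.normal(size=50))
    res = pbcor(x, y, beta=.2)
    print(res['cor'], res['test'], res['p_value'])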
def lindepbt(x, tr=.2, con=None, alpha=.05, nboot=599, dif=True, seed=False):
    """
    Multiple comparisons on trimmed means with FWE controlled with Rom's
    method. A bootstrap-t method is used.

    :param x: Pandas DataFrame
    Each column in the data represents a different group

    :param tr: float
    Proportion to trim (default is .2)

    :param con: array
    `con` is a J (number of groups) by d (number of contrasts)
    matrix containing the contrast coefficients of interest.
    All linear contrasts can be created automatically by using the function [con1way](J)
    (the result of which can be used for `con`). The default is `None` and in this
    case all linear contrasts are created automatically.

    :param alpha: float
    Alpha level. Default is .05.

    :param nboot: int
    Number of bootstrap samples (default is 599)

    :param dif: bool
    When `True`, use difference scores, otherwise use marginal distributions

    :param seed: bool
    Random seed for reproducible results (default is `False`)

    :return:
    Dictionary of results

    con: array
    Contrast matrix

    num_sig: int
    Number of statistically significant results

    psihat: DataFrame
    Difference score and CI for each contrast

    test: DataFrame
    Test statistic, p-value, critical value, and standard error for each contrast
    """

    called_directly = False
    if type(x) is pd.DataFrame:
        x = pandas_to_arrays(x)
        x = remove_nans_based_on_design(x, design_values=len(x),
                                        design_type='dependent_groups')
        x = np.r_[x].T
        called_directly = True

    from hypothesize.measuring_associations import wincor

    if seed:
        np.random.seed(seed)

    if con is None:
        con = con2way(1, x.shape[1])[1]  # all pairwise contrasts

    ncon = con.shape[1]

    x = x[~np.isnan(x).any(axis=1)]
    n = x.shape[0]
    J = x.shape[1]
    nval = x.shape[0]
    h1 = nval - 2 * np.floor(tr * nval)
    xbar = trim_mean(x, tr)

    if alpha == .05:
        dvec = [.05, .025, .0169, .0127, .0102,
                .00851, .0073, .00639, .00568, .00511]
        if ncon > 10:
            avec = .05 / np.arange(11, ncon + 1)
            dvec = np.append(dvec, avec)

    elif alpha == .01:
        dvec = [.01, .005, .00334, .00251, .00201,
                .00167, .00143, .00126, .00112, .00101]
        if ncon > 10:
            avec = .01 / np.arange(11, ncon + 1)
            dvec = np.append(dvec, avec)

    else:
        dvec = alpha / np.arange(1, ncon + 1)

    psihat = np.zeros([ncon, 4])
    test = np.zeros([ncon, 5])
    temp1 = np.array([])

    for d in range(ncon):
        psihat[d, 0] = d

        if not dif:
            psihat[d, 1] = np.sum(con[:, d] * xbar)
            sejk = 0

            for j in range(J):
                for k in range(J):
                    djk = (nval - 1) * wincor(x[:, j], x[:, k], tr)['wcov'] \
                          / (h1 * (h1 - 1))
                    sejk = sejk + con[j, d] * con[k, d] * djk

            sejk = np.sqrt(sejk)
            test[d, 0] = d
            test[d, 1] = np.sum(con[:, d] * xbar) / sejk
            test[d, 4] = sejk

            data = np.random.randint(n, size=(nboot, n))

            xcen = np.full([x.shape[0], x.shape[1]], np.nan)
            for j in range(J):
                xcen[:, j] = x[:, j] - trim_mean(x[:, j], tr)

            bvec = [lindep_sub(data_row, xcen, con[:, d], tr=tr)
                    for data_row in data]

            bsort = np.sort(np.abs(bvec))
            ic = round((1 - alpha) * nboot) - 1  # "- 1" adjusts for Python's zero-based indexing
            psihat[d, 2] = psihat[d, 1] - bsort[ic] * test[d, 4]
            psihat[d, 3] = psihat[d, 1] + bsort[ic] * test[d, 4]
            p_value = np.mean(np.abs(test[d, 1]) <= np.abs(bvec))
            temp1 = np.append(temp1, p_value)

        elif dif:
            for j in range(J):
                if j == 0:
                    dval = con[j, d] * x[:, j]
                elif j > 0:
                    dval = dval + con[j, d] * x[:, j]

            temp = trimcibt(dval, tr=tr, alpha=alpha, nboot=nboot, seed=seed)
            temp1 = np.append(temp1, temp['p_value'])
            test[d, 0] = d
            test[d, 1] = temp['test_stat']  # note: this assignment appears to be missing in the R version
            test[d, 4] = trimse(dval, tr=tr)
            psihat[d, 1] = trim_mean(dval, tr)
            psihat[d, 2] = temp['ci'][0]
            psihat[d, 3] = temp['ci'][1]

    test[:, 2] = temp1
    temp2 = (-temp1).argsort()
    zvec = dvec[:ncon]
    test[temp2, 3] = zvec
    num_sig = np.sum(test[:, 2] <= test[:, 3])

    if called_directly:
        test = pd.DataFrame(
            test, columns=["con_num", "test", "p_value", "p_crit", "se"])
        psihat = pd.DataFrame(
            psihat, columns=["con_num", "psihat", "ci_lower", "ci_upper"])

    return {'test': test, 'psihat': psihat, 'con': con, 'num_sig': num_sig}
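
# A minimal sketch for lindepbt on a dependent-groups DataFrame (synthetic
# data; `_demo_lindepbt` is a hypothetical helper and assumes this module's
# imports are in scope). With con=None, all pairwise contrasts are built.
def _demo_lindepbt():
    rng = np.random.default_rng(7)
    df = pd.DataFrame(rng.normal(size=(30, 3)), columns=['t1', 't2', 't3'])
    res = lindepbt(df, tr=.2, seed=1)
    print(res['test'])
    print(res['psihat'])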
def pb2gen(x, y, est, *args, alpha=.05, nboot=2000, seed=False):
    """
    Compute a bootstrap confidence interval for the difference between
    any two parameters corresponding to two independent groups.

    Note that arguments up to and including `args` are positional arguments

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param est: function
    Measure of location (currently only `trim_mean` is supported)

    :param args: list/value
    Parameter(s) for measure of location (e.g., .2)

    :param alpha: float
    Alpha level (default is .05)

    :param nboot: int
    Number of bootstrap samples (default is 2000)

    :param seed: bool
    Random seed for reproducible results (default is `False`)

    :return:
    Dictionary of results

    ci: list
    Confidence interval

    est_1: float
    Estimated value (based on `est`) for group one

    est_2: float
    Estimated value (based on `est`) for group two

    est_dif: float
    Estimated difference between group one and two

    n1: int
    Number of observations in group one

    n2: int
    Number of observations in group two

    p_value: float
    p-value

    variance: float
    Variance of the bootstrap differences
    """

    x, y = pandas_to_arrays([x, y])
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    if seed:
        np.random.seed(seed)

    datax = np.random.choice(x, size=(nboot, len(x)))
    datay = np.random.choice(y, size=(nboot, len(y)))

    bvecx = est(datax, *args, axis=1)
    bvecy = est(datay, *args, axis=1)
    bvec = np.sort(bvecx - bvecy)
    low = round((alpha / 2) * nboot)
    up = nboot - low - 2
    temp = np.sum(bvec < 0) / nboot + np.sum(bvec == 0) / (2 * nboot)
    sig_level = 2 * (min(temp, 1 - temp))
    se = np.var(bvec)

    results = {'est_1': est(x, *args),
               'est_2': est(y, *args),
               'est_dif': est(x, *args) - est(y, *args),
               'ci': [bvec[low], bvec[up]],
               'p_value': sig_level,
               'variance': se,
               'n1': len(x),
               'n2': len(y)}

    return results
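
# A minimal sketch for pb2gen with trim_mean as the measure of location
# (synthetic data; `_demo_pb2gen` is a hypothetical helper). The .2 after
# trim_mean is forwarded as its trimming proportion via *args.
def _demo_pb2gen():
    rng = np.random.default_rng(3)
    x = pd.Series(rng.normal(size=50))
    y = pd.Series(rng.normal(loc=.5, size=45))  # unequal n is fine here
    res = pb2gen(x, y, trim_mean, .2, alpha=.05, nboot=2000, seed=1)
    print(res['est_dif'], res['ci'], res['p_value'])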
def l2drmci(x, y, est, *args, pairwise_drop_na=True,
            alpha=.05, nboot=2000, seed=False):
    """
    Compute a bootstrap confidence interval for a measure of location
    associated with the distribution of x-y. That is, compare x and y
    by looking at all possible difference scores in random samples of
    `x` and `y`. `x` and `y` are possibly dependent.

    Note that arguments up to and including `args` are positional arguments

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param est: function
    Measure of location (currently only `trim_mean` is supported)

    :param args: list/value
    Parameter(s) for measure of location (e.g., .2)

    :param pairwise_drop_na: bool
    If True, treat data as dependent and remove any row with missing data.
    If False, remove missing data for each group separately
    (cannot deal with unequal sample sizes)

    :param alpha: float
    Alpha level (default is .05)

    :param nboot: int
    Number of bootstrap samples (default is 2000)

    :param seed: bool
    Random seed for reproducible results (default is `False`)

    :return:
    Dictionary of results

    ci: list
    Confidence interval

    p_value: float
    p-value
    """

    x, y = pandas_to_arrays([x, y])

    if pairwise_drop_na:
        m1 = np.c_[x, y]  # cbind
        x = m1[~np.isnan(m1).any(axis=1)]

    else:
        x = x[~np.isnan(x)]
        y = y[~np.isnan(y)]

        if len(x) != len(y):
            raise Exception("With unequal sample sizes, you might consider wmwpb "
                            "(currently not implemented)")
        else:
            x = np.c_[x, y]  # cbind

    if seed:
        np.random.seed(seed)

    data = np.random.choice(x.shape[0], size=(nboot, len(x)))

    bvec = np.full(nboot, np.nan)
    for i in range(nboot):
        bvec[i] = loc2dif(x[data[i, :], 0], x[data[i, :], 1], est, *args,
                          drop_na=pairwise_drop_na)

    bvec = np.sort(bvec)
    low = int(np.round((alpha / 2) * nboot) + 1) - 1
    up = nboot - low - 2
    temp = np.sum(bvec < 0) / nboot + np.sum(bvec == 0) / (2 * nboot)
    sig_level = 2 * (np.min([temp, 1 - temp]))
    ci = [bvec[low], bvec[up]]

    return {'ci': ci, 'p_value': sig_level}
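
# A minimal sketch for l2drmci on paired data (synthetic; `_demo_l2drmci`
# is a hypothetical helper). With pairwise_drop_na=True, rows with missing
# values are removed before resampling.
def _demo_l2drmci():
    rng = np.random.default_rng(5)
    x = pd.Series(rng.normal(size=40))
    y = pd.Series(x.to_numpy() + rng.normal(scale=.5, size=40))
    res = l2drmci(x, y, trim_mean, .2, seed=1)
    print(res['ci'], res['p_value'])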
def yuenbt(x, y, tr=.2, alpha=.05, nboot=599, seed=False):
    """
    Compute a 1-alpha confidence interval for the difference between
    the trimmed means corresponding to two independent groups.
    The bootstrap-t method is used. During the bootstrapping,
    the absolute value of the test statistic is used
    (the "two-sided method").

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param tr: float
    Proportion to trim (default is .2)

    :param alpha: float
    Alpha level (default is .05)

    :param nboot: int
    Number of bootstrap samples (default is 599)

    :param seed: bool
    Random seed for reproducible results. Default is `False`.

    :return:
    Dictionary of results

    ci: list
    Confidence interval

    est_dif: float
    Estimated difference between the trimmed means of group one and two

    est_x: float
    Estimated trimmed mean for group one

    est_y: float
    Estimated trimmed mean for group two

    p_value: float
    p-value

    test_stat: float
    Test statistic
    """

    x, y = pandas_to_arrays([x, y])

    if seed:
        np.random.seed(seed)

    ci = []
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    xcen = x - trim_mean(x, tr)
    ycen = y - trim_mean(y, tr)

    test_stat = (trim_mean(x, tr) - trim_mean(y, tr)) / \
                np.sqrt(trimse(x, tr=tr) ** 2 + trimse(y, tr=tr) ** 2)

    datax = np.random.choice(xcen, size=(nboot, len(x)))
    datay = np.random.choice(ycen, size=(nboot, len(y)))

    # note: use tr throughout here; an earlier version hard-coded .2
    top = trim_mean(datax, tr, axis=1) - trim_mean(datay, tr, axis=1)
    botx = np.array([trimse(row, tr=tr) for row in datax])
    boty = np.array([trimse(row, tr=tr) for row in datay])
    tval = np.sort(np.abs(top / np.sqrt(botx ** 2 + boty ** 2)))
    icrit = int(np.floor((1 - alpha) * nboot + .5))

    se = np.sqrt(trimse(x, tr) ** 2 + trimse(y, tr) ** 2)
    ci.append(trim_mean(x, tr) - trim_mean(y, tr) - tval[icrit] * se)
    ci.append(trim_mean(x, tr) - trim_mean(y, tr) + tval[icrit] * se)
    p_value = np.sum(np.abs(test_stat) <= np.abs(tval)) / nboot
    est_x = trim_mean(x, tr)
    est_y = trim_mean(y, tr)
    est_dif = est_x - est_y

    results = {'ci': ci, 'test_stat': test_stat, 'p_value': p_value,
               'est_x': est_x, 'est_y': est_y, 'est_dif': est_dif}

    return results
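
# A minimal sketch for yuenbt comparing two independent groups (synthetic
# data; `_demo_yuenbt` is a hypothetical helper).
def _demo_yuenbt():
    rng = np.random.default_rng(11)
    x = pd.Series(rng.normal(size=40))
    y = pd.Series(rng.normal(loc=.8, size=50))
    res = yuenbt(x, y, tr=.2, seed=1)
    print(res['est_dif'], res['ci'], res['p_value'])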
def tmcppb(x, est, *args, con=None, bhop=False,
           alpha=.05, nboot=None, seed=False):
    """
    Multiple comparisons for J independent groups using trimmed means and
    the percentile bootstrap method. Rom's method is used to control the
    probability of one or more type I errors. For C > 10 hypotheses,
    or when the goal is to test at some level other than .05 and .01,
    Hochberg's method is used. Setting the argument `bhop` to `True`
    uses the Benjamini-Hochberg method instead.

    Note that arguments up to and including `args` are positional arguments

    :param x: Pandas DataFrame
    Each column represents a group of data

    :param est: function
    Measure of location (currently only `trim_mean` is supported)

    :param args: list/value
    Parameter(s) for measure of location (e.g., .2)

    :param con: array
    `con` is a J (number of columns) by d (number of contrasts)
    matrix containing the contrast coefficients of interest.
    All linear contrasts can be created automatically by using the function [con1way](J)
    (the result of which can be used for `con`). The default is `None` and in this
    case all linear contrasts are created automatically.

    :param bhop: bool
    If `True`, the Benjamini-Hochberg method is used to control FWE

    :param alpha: float
    Alpha level. Default is .05.

    :param nboot: int
    Number of bootstrap samples. The default is `None`, in which case
    nboot is chosen based on the number of groups (2000 when J <= 3,
    4000 when J <= 8, and 5000 otherwise).

    :param seed: bool
    Random seed for reproducible results. Default is `False`.

    :return:
    Dictionary of results

    con: array
    Contrast matrix

    num_sig: int
    Number of statistically significant results

    output: DataFrame
    Difference score, p-value, critical value, and CI for each contrast
    """

    x = pandas_to_arrays(x)
    x = remove_nans_based_on_design(x, len(x), 'independent_groups')
    J = len(x)

    mvec = [est(i, *args) for i in x]

    if con is None:
        con = con1way(J)

    ncon = con.shape[1]

    if not nboot:
        # smallest designs first; an earlier version used if/elif in the
        # opposite order, which made the J <= 3 branch unreachable
        if J <= 3:
            nboot = 2000
        elif J <= 8:
            nboot = 4000
        else:
            nboot = 5000

    if not bhop:

        if alpha == .05:
            dvec = [.05, .025, .0169, .0127, .0102,
                    .00851, .0073, .00639, .00568, .00511]
            if ncon > 10:
                avec = .05 / np.arange(11, ncon + 1)
                dvec = np.append(dvec, avec)

        elif alpha == .01:
            dvec = [.01, .005, .00334, .00251, .00201,
                    .00167, .00143, .00126, .00112, .00101]
            if ncon > 10:
                avec = .01 / np.arange(11, ncon + 1)
                dvec = np.append(dvec, avec)

        else:
            dvec = alpha / np.arange(1, ncon + 1)

    else:
        dvec = (ncon - np.arange(1, ncon + 1) + 1) * alpha / ncon

    if seed:
        np.random.seed(seed)

    bvec = np.full([J, nboot], np.nan)
    for i, j in enumerate(x):
        data = np.random.choice(j, size=(nboot, len(j)))
        bvec[i, :] = [est(row, *args) for row in data]

    bcon = con.T @ bvec
    tvec = con.T @ mvec
    test = np.full(ncon, np.nan)
    for d in range(ncon):
        tv = np.sum(bcon[d, :] == 0) / nboot
        test[d] = np.sum(bcon[d, :] > 0) / nboot + .5 * tv
        if test[d] > .5:
            test[d] = 1 - test[d]

    output = np.full([ncon, 6], np.nan)
    test = 2 * test
    temp2 = (-test).argsort()
    zvec = dvec[:ncon]
    output[temp2, 3] = zvec
    icl = int(np.round(dvec[-1] * nboot / 2) + 1) - 1
    icu = nboot - icl - 3

    for ic in range(ncon):
        output[ic, 0] = ic
        output[ic, 1] = tvec[ic]
        output[ic, 2] = test[ic]
        temp = np.sort(bcon[ic, :])
        output[ic, 4] = temp[icl]
        output[ic, 5] = temp[icu]

    num_sig = np.sum(output[:, 2] <= output[:, 3])

    cols = ["con_num", "psihat", "p_value", "p_crit", "ci_lower", "ci_upper"]
    output = pd.DataFrame(output, columns=cols)

    return {'output': output, 'con': con, 'num_sig': num_sig}
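
# A minimal sketch for tmcppb across three independent groups (synthetic
# data; `_demo_tmcppb` is a hypothetical helper). Leaving con=None builds
# all pairwise contrasts via con1way.
def _demo_tmcppb():
    rng = np.random.default_rng(9)
    df = pd.DataFrame({'g1': rng.normal(size=30),
                       'g2': rng.normal(loc=.5, size=30),
                       'g3': rng.normal(loc=1., size=30)})
    res = tmcppb(df, trim_mean, .2, seed=1)
    print(res['output'])
    print(res['num_sig'])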
def linconb(x, con, tr=.2, alpha=.05, nboot=599, seed=False):
    """
    Compute a 1-alpha confidence interval for a set of d linear contrasts
    involving trimmed means using the bootstrap-t method.
    Independent groups are assumed. CIs are adjusted to control FWE
    (p values are not adjusted).

    :param x: DataFrame
    Each column represents a group of data

    :param con: array
    `con` is a J (number of columns) by d (number of contrasts)
    matrix containing the contrast coefficients of interest.
    All linear contrasts can be created automatically by using the function [con1way](J)
    (the result of which can be used for `con`).

    :param tr: float
    Proportion to trim (default is .2)

    :param alpha: float
    Alpha level (default is .05)

    :param nboot: int
    Number of bootstrap samples (default is 599)

    :param seed: bool
    Random seed for reproducible results. Default is `False`.

    :return:
    Dictionary of results

    con: array
    Contrast matrix

    crit: float
    Critical value

    n: list
    Number of observations for each group

    psihat: DataFrame
    Difference score and CI for each contrast

    test: DataFrame
    Test statistic, standard error, and p-value for each contrast
    """

    x = pandas_to_arrays(x)
    J = len(x)
    x = np.asarray([j[~np.isnan(j)] for j in x])

    if con.shape[0] != len(x):
        raise Exception("The number of groups does not match "
                        "the number of contrast coefficients.")

    bvec = np.zeros([nboot, J, 2])

    if seed:
        np.random.seed(seed)

    nsam = [len(xi) for xi in x]
    for j in range(J):
        xcen = x[j] - trim_mean(x[j], tr)
        data = np.random.choice(xcen, size=(nboot, len(x[j])))

        for i, row in enumerate(data):
            bvec[i, j, :] = trimparts(row, tr)

    m1 = bvec[:, :, 0].T
    m2 = bvec[:, :, 1].T
    boot = np.zeros([con.shape[1], nboot])
    for d in range(con.shape[1]):
        top = np.asarray([trimpartt(row, con[:, d]) for row in m1.T])
        consq = con[:, d] ** 2
        bot = np.asarray([trimpartt(row, consq) for row in m2.T])
        boot[d, :] = np.abs(top) / np.sqrt(bot)

    testb = np.asarray([max(row) for row in boot.T])
    ic = int(np.floor((1 - alpha) * nboot) - 1)  # one less than R to account for zero indexing
    testb = np.sort(testb)
    psihat = np.zeros([con.shape[1], 4])
    test = np.zeros([con.shape[1], 4])

    for d in range(con.shape[1]):
        test[d, 0] = d
        psihat[d, 0] = d
        # lincon is called on a single column slice of the contrast matrix
        testit = lincon(x, np.array([con[:, d]]).T, tr, alpha)
        test[d, 1] = testit['test']['test'][0]
        pval = np.mean(abs(testit['test']['test'][0]) < boot[d, :])
        test[d, 3] = pval
        psihat[d, 2] = testit['psihat']['psihat'][0] - testb[ic] * testit['test']['se'][0]
        psihat[d, 3] = testit['psihat']['psihat'][0] + testb[ic] * testit['test']['se'][0]
        psihat[d, 1] = testit['psihat']['psihat'][0]
        test[d, 2] = testit['test']['se'][0]

    psihat_col_names = ['contrast_index', 'psihat', 'ci_low', 'ci_up']
    test_col_names = ['contrast_index', 'test', 'se', 'p_value']
    psihat = pd.DataFrame(psihat, columns=psihat_col_names)
    test = pd.DataFrame(test, columns=test_col_names)

    return {'n': nsam, 'psihat': psihat, 'test': test,
            'crit': testb[ic], 'con': con}
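
# A minimal sketch for linconb with an explicit contrast matrix built by
# con1way (synthetic data; `_demo_linconb` is a hypothetical helper).
def _demo_linconb():
    rng = np.random.default_rng(13)
    df = pd.DataFrame(rng.normal(size=(25, 3)), columns=['g1', 'g2', 'g3'])
    res = linconb(df, con1way(3), tr=.2, seed=1)
    print(res['test'])
    print(res['crit'])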
def ydbt(x, y, tr=.2, alpha=.05, nboot=599, side=True, seed=False):
    """
    Using the bootstrap-t method, compute a 1-alpha confidence interval
    for the difference between the marginal trimmed means of paired data.
    By default, 20% trimming is used with 599 bootstrap samples.

    :param x: Pandas Series
    Data for group one

    :param y: Pandas Series
    Data for group two

    :param tr: float
    Proportion to trim (default is .2)

    :param alpha: float
    Alpha level. Default is .05.

    :param nboot: int
    Number of bootstrap samples (default is 599)

    :param side: bool
    When `True`, the function returns a symmetric CI and a p value;
    otherwise the function returns an equal-tailed CI (no p value)

    :param seed: bool
    Random seed for reproducible results (default is `False`)

    :return:
    Dictionary of results

    ci: list
    Confidence interval

    dif: float
    Difference between group one and two

    p_value: float
    p-value
    """

    x = pandas_to_arrays([x, y])
    x = remove_nans_based_on_design(x, 2, 'dependent_groups')
    x, y = [x[0], x[1]]

    if seed:
        np.random.seed(seed)

    data = np.random.randint(len(x), size=(nboot, len(x)))

    xcen = x - trim_mean(x, tr)
    ycen = y - trim_mean(y, tr)

    bvec = [tsub(row, xcen, ycen, tr) for row in data]

    dotest = yuend(x, y, tr=tr)

    estse = dotest['se']
    p_value = np.nan
    dif = trim_mean(x, tr) - trim_mean(y, tr)
    ci = []

    if not side:
        print('p_value is only returned when side=True')
        ilow = round((alpha / 2) * nboot) - 1
        ihi = nboot - ilow - 2
        bsort = np.sort(bvec)
        ci.append(dif - bsort[ihi] * estse)
        ci.append(dif - bsort[ilow + 1] * estse)

    else:
        bsort = np.sort(np.abs(bvec))
        ic = round((1 - alpha) * nboot) - 1
        ci.append(dif - bsort[ic] * estse)
        ci.append(dif + bsort[ic] * estse)
        p_value = np.sum(np.abs(dotest['teststat']) <= np.abs(bvec)) / nboot

    return {'ci': ci, 'dif': dif, 'p_value': p_value}
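
# A minimal sketch for ydbt on paired data (synthetic; `_demo_ydbt` is a
# hypothetical helper). side=True returns a symmetric CI and a p value.
def _demo_ydbt():
    rng = np.random.default_rng(17)
    x = pd.Series(rng.normal(size=35))
    y = pd.Series(x.to_numpy() + rng.normal(scale=.6, size=35))
    res = ydbt(x, y, tr=.2, side=True, seed=1)
    print(res['dif'], res['ci'], res['p_value'])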