def FPvalue(*args):
    df_btwn, df_within = __degree_of_freedom_(*args)

    mss_btwn = __ss_between_(*args) / float(df_btwn)
    mss_within = __ss_within_(*args) / float(df_within)

    F = mss_btwn / mss_within
    P = special.fdtrc(df_btwn, df_within, F)

    return (F, P)
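# Quick sanity-check sketch for the fdtrc-based p-value used above: scipy.special.fdtrc
# is the survival function of the F-distribution, so it should agree with
# scipy.stats.f.sf for the same degrees of freedom. The numbers below are
# hypothetical and chosen only for illustration.
import numpy as np
from scipy import special, stats

F_example = 4.2           # hypothetical F statistic
df_num, df_den = 3, 36    # hypothetical numerator / denominator degrees of freedom

p_special = special.fdtrc(df_num, df_den, F_example)
p_stats = stats.f.sf(F_example, df_num, df_den)
assert np.isclose(p_special, p_stats)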
def welch_anova_np(args, var_equal=False):
    """
    args : array like of array likes
        A list of groups (lists of floats) which should be compared.

    var_equal : boolean
        The groups share a common variance.
    """
    # Define Welch's ANOVA, which is robust against unequal variances
    # see https://statisticsbyjim.com/anova/welchs-anova-compared-to-classic-one-way-anova/
    #     https://stackoverflow.com/questions/50964427/welchs-anova-in-python
    #     https://github.com/scipy/scipy/issues/11122
    args = [np.asarray(arg, dtype=float) for arg in args]
    k = len(args)
    ni = np.array([len(arg) for arg in args])
    mi = np.array([np.mean(arg) for arg in args])
    vi = np.array([np.var(arg, ddof=1) for arg in args])
    wi = ni / vi

    tmp = np.sum((1 - wi / np.sum(wi))**2 / (ni - 1))
    tmp /= (k**2 - 1)

    dfbn = k - 1
    dfwn = 1 / (3 * tmp)

    m = np.sum(mi * wi) / np.sum(wi)
    f = np.sum(wi * (mi - m)**2) / (dfbn * (1 + 2 * (dfbn - 1) * tmp))
    prob = fdtrc(dfbn, dfwn, f)
    return stats.stats.F_onewayResult(f, prob)
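# A minimal usage sketch for welch_anova_np above on three synthetic groups with
# unequal variances. It assumes the snippet's own imports are in scope (numpy as np,
# fdtrc from scipy.special, scipy.stats as stats) and a SciPy version that still
# exposes stats.stats.F_onewayResult; the group data are made up for illustration.
import numpy as np
from scipy import stats
from scipy.special import fdtrc

rng = np.random.default_rng(0)
groups = [rng.normal(0.0, 1.0, 20),
          rng.normal(0.3, 2.0, 15),
          rng.normal(0.6, 4.0, 30)]

f_welch, p_welch = welch_anova_np(groups)   # F_onewayResult is a namedtuple, so it unpacks
print(f_welch, p_welch)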
def FTest(chi2_1, dof_1, chi2_2, dof_2):
    """
    Run F-test.

    Compute an F-test to see if a model with extra parameters is
    significant compared to a simpler model. The input values are the
    (non-reduced) chi^2 values and the numbers of DOF for '1' the
    original model and '2' for the new model (with more fit params).

    The probability is computed exactly like Sherpa's F-test routine
    (in Ciao) and is also described in the Wikipedia article on the
    F-test: http://en.wikipedia.org/wiki/F-test

    The returned value is the probability that the improvement in chi2
    is due to chance (i.e. a low probability means that the new fit is
    quantitatively better, while a value near 1 means that the new
    model should likely be rejected).

    Parameters
    ----------
    chi2_1 : Float
        Chi-squared value of model with fewer parameters
    dof_1 : Int
        Degrees of freedom of model with fewer parameters
    chi2_2 : Float
        Chi-squared value of model with more parameters
    dof_2 : Int
        Degrees of freedom of model with more parameters

    Returns
    -------
    ft : Float
        F-test significance value for the model with the larger number
        of components over the other.
    """
    delta_chi2 = chi2_1 - chi2_2
    if delta_chi2 > 0 and dof_1 != dof_2:
        delta_dof = dof_1 - dof_2
        new_redchi2 = chi2_2 / dof_2
        F = np.float64((delta_chi2 / delta_dof) / new_redchi2)  # fdtr doesn't like float128
        ft = fdtrc(delta_dof, dof_2, F)
    else:
        if delta_chi2 <= 0:
            log.warning(
                "Chi^2 for Model 2 is larger than Chi^2 for Model 1, cannot perform F-test."
            )
        elif dof_1 == dof_2:
            log.warning(
                "Models have equal degrees of freedom, cannot perform F-test.")
        ft = False
    return ft
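# Worked example of the nested-model F-test that FTest implements, written directly
# against scipy.special.fdtrc so it is self-contained. The chi^2 and DOF values are
# hypothetical; model 2 adds two parameters to model 1.
from scipy.special import fdtrc

chi2_1, dof_1 = 130.0, 100   # simpler model
chi2_2, dof_2 = 112.0, 98    # model with two extra parameters

delta_chi2 = chi2_1 - chi2_2
delta_dof = dof_1 - dof_2
F = (delta_chi2 / delta_dof) / (chi2_2 / dof_2)
p = fdtrc(delta_dof, dof_2, F)   # small p favours keeping the more complex model
print(F, p)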
def anova(arr):
    def _square_of_sums(a):
        s = np.sum(a, 0)
        if not np.isscalar(s):
            return s.astype(float) * s
        else:
            return float(s) * s

    def _sum_of_squares(a):
        return np.sum(a * a, 0)

    # # If all inputs equivalent return 0, not nan as default behaviour
    # if sum([(np.asarray(arr[0]) == x).all() for x in arr[1:]]) == len(arr)-1:
    #     return 0

    args = [np.asarray(arg, dtype=float) for arg in arr]

    # ANOVA on N groups, each in its own array
    num_groups = len(args)
    alldata = np.concatenate(args)
    bign = len(alldata)

    # Determine the mean of the data, and subtract that from all inputs to a
    # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant
    # to a shift in location, and centering all data around zero vastly
    # improves numerical stability.
    offset = alldata.mean()
    alldata -= offset

    sstot = _sum_of_squares(alldata) - (_square_of_sums(alldata) / float(bign))
    ssbn = 0
    for a in args:
        ssbn += _square_of_sums(a - offset) / float(len(a))

    # Naming: variables ending in bn/b are for "between treatments", wn/w are
    # for "within treatments"
    ssbn -= (_square_of_sums(alldata) / float(bign))
    sswn = sstot - ssbn
    dfbn = num_groups - 1
    dfwn = bign - num_groups
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    if f < 0:  # correct rounding errors :/
        f = 0

    prob = special.fdtrc(dfbn, dfwn, f)  # equivalent to stats.f.sf
    return prob
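# Cross-check sketch for anova above: it follows the same sums-of-squares computation
# as scipy.stats.f_oneway, so the returned probability should match f_oneway's p-value.
# Assumes the anova helper is in scope together with numpy as np and scipy.special as
# special (imported below); the group data are made up for illustration.
import numpy as np
from scipy import special
from scipy.stats import f_oneway as scipy_f_oneway

g1 = [24.5, 23.5, 26.4, 27.1, 29.9]
g2 = [28.4, 34.2, 29.5, 32.2, 30.1]
g3 = [26.1, 28.3, 24.3, 26.2, 27.8]

p_anova = anova([g1, g2, g3])
p_scipy = scipy_f_oneway(g1, g2, g3).pvalue
assert np.isclose(p_anova, p_scipy)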
def compute_F_statistic_and_pvalue(*args):
    """ Return F statistic and p-value """
    # Compute degrees of freedom
    df_btwn, df_within = __degree_of_freedom_(*args)

    # Compute sums of squares
    mss_btwn = __ss_between_(*args) / float(df_btwn)
    mss_within = __ss_within_(*args) / float(df_within)

    # F statistic
    F = mss_btwn / mss_within
    pvalue = special.fdtrc(df_btwn, df_within, F)

    return (F, pvalue, df_btwn, df_within)
def _f_oneway_lower(lifted):
    """Performs a 1-way ANOVA.

    Parameters
    ----------
    lifted : FOnewayData
        The result of `to_monoid`.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.
    """
    classes = lifted.classes
    n_samples_per_class = lifted.n_samples_per_class
    n_samples = lifted.n_samples
    ss_alldata = lifted.ss_alldata
    sums_samples = lifted.sums_samples
    sums_alldata = lifted.sums_alldata

    n_classes = len(classes)
    square_of_sums_alldata = sums_alldata ** 2
    square_of_sums_args = {k: s ** 2 for k, s in sums_samples.items()}
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.0
    for k in n_samples_per_class:
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    # constant_features_idx = np.where(msw == 0.0)[0]
    # if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
    #     warnings.warn("Features %s are constant." % constant_features_idx,
    #                   UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
def linRegstats(x, y, conf):
    if size(shape(x)) == 1:
        p = 2
        n = len(x)                      # Sample size
    else:
        p, n = shape(x)                 # (samples size, # of parameters)
        p = p + 1                       # add one for intercept
    dof = n - p                         # Degrees of freedom
    nov = p - 1                         # number of variables
    ym = mean(y)                        # Mean of log recovery
    X = vstack((ones(n), x)).T          # observed x-variable matrix
    XTX = dot(X.T, X)
    iXTX = inv(XTX)
    bhat = dot(dot(iXTX, X.T), y)
    yhat = dot(X, bhat)                 # Linear fit line
    SSE = sum((y - yhat)**2)            # Sum of Squared Errors
    SST = sum((y - ym)**2)              # Sum of Squared Total
    SSR = sum((yhat - ym)**2)           # Sum of Squared Residuals (SSR = SST - SSE)
    R2 = SSR / SST                      # R^2 Statistic (rval**2)
    MSE = SSE / dof                     # Mean Squared Error (MSE)
    MSR = SSR / nov                     # Mean Squared Residual (MSR)
    F = MSR / MSE                       # F-Statistic
    F_p = fdtrc(nov, dof, F)            # F-Stat. p-value

    # variance of beta estimates :
    VARb = MSE * iXTX                   # diag(VARb) == varB
    varB = diag(VARb)                   # variance of beta hat
    seb = sqrt(varB)                    # vector of standard errors for beta hat

    ## variance of y estimates :
    #VARy = MSE * dot(dot(X, iXTX), X.T)
    #varY = diag(VARy)                  # variance of y hat
    #sey = sqrt(varY)                   # standard errors for yhat

    # calculate t-statistic :
    t_b = bhat / seb
    #t_y = yhat / sey

    # calculate p-values :
    pval = t.sf(abs(t_b), dof) * 2

    tbonf = t.ppf((1 + conf) / 2.0, dof)  # uncorrected t*-value

    ci_b = [bhat - tbonf * seb,         # Confidence intervals for betas
            bhat + tbonf * seb]         # in 2 columns (lower,upper)

    #ci_y = [yhat - tbonf*sey,          # Confidence intervals for estimates
    #        yhat + tbonf*sey]          # in 2 columns (lower,upper)

    resid = y - yhat

    #vara = { 'SSR'   : SSR,
    #         'SSE'   : SSE,
    #         'SST'   : SST,
    #         'df'    : (nov, dof, n-1),
    #         'MSR'   : MSR,
    #         'MSE'   : MSE,
    #         'F'     : F,
    #         'F_pval': F_p,
    #         'varY'  : varY,
    #         'varB'  : varB,
    #         'SEB'   : seb,
    #         'SEY'   : sey,
    #         'tbonf' : tbonf,
    #         't_beta': t_b,
    #         't_y'   : t_y,
    #         'pval'  : pval,
    #         'CIB'   : array(ci_b),
    #         'CIY'   : array(ci_y),
    #         'bhat'  : bhat,
    #         'yhat'  : yhat,
    #         'R2'    : R2,
    #         'resid' : resid}

    vara = {'SSR'    : SSR,
            'SSE'    : SSE,
            'SST'    : SST,
            'df'     : (nov, dof, n - 1),
            'MSR'    : MSR,
            'MSE'    : MSE,
            'F'      : F,
            'F_pval' : F_p,
            'varB'   : varB,
            'SEB'    : seb,
            'tbonf'  : tbonf,
            't_beta' : t_b,
            'pval'   : pval,
            'CIB'    : array(ci_b),
            'bhat'   : bhat,
            'yhat'   : yhat,
            'R2'     : R2,
            'resid'  : resid}
    return vara
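# A minimal usage sketch for the single-predictor path of linRegstats above. It assumes
# the snippet's implied imports are in scope (star-style numpy names such as size,
# shape, mean, vstack, ones, dot, diag, sqrt, array; inv from numpy.linalg; t from
# scipy.stats; fdtrc from scipy.special). The data are synthetic.
import numpy as np

x_demo = np.linspace(0.0, 10.0, 25)
y_demo = 2.0 + 0.5 * x_demo + np.random.normal(0.0, 0.3, x_demo.size)

out = linRegstats(x_demo, y_demo, 0.95)
print(out['bhat'])     # intercept and slope estimates
print(out['F_pval'])   # p-value of the overall regression F-test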
def glm(x, y, w=1.0):
    p, n = shape(x)                     # sample size
    p += 1                              # add one for intercept
    dof = n - p                         # degrees of freedom
    sig = var(y)                        # variance
    mu = (y + mean(y)) / 2.0            # initial mean estimate
    eta = log(mu)                       # initial predictor
    X = vstack((ones(n), x)).T          # observed x-variable matrix

    # Newton-Raphson :
    converged = False
    rtol = 1e-12
    dtol = 1e-12
    lmbda = 1.0
    nIter = 0
    deviance = 1
    D = 1
    ahat = zeros(p)                     # initial parameters
    rel_res = zeros(p)                  # initial relative residual
    maxIter = 100

    rel_a = []
    dev_a = []

    while not converged and nIter < maxIter:
        W = diags(w * mu**2 / sig, 0)   # compute weights
        z = eta + (y - mu) / mu         # adjusted dependent variable

        WX = W.dot(X)
        XTWX = dot(X.T, WX)
        iXTWX = inv(XTWX)
        Wz = W.dot(z)

        ahat_n = dot(iXTWX, dot(X.T, Wz))

        eta = dot(X, ahat_n)            # compute estimates
        mu = exp(eta)                   # linear predictor

        # calculate residual :
        rel_res = norm(ahat - ahat_n, inf)
        rel_a.append(rel_res)
        ahat = ahat_n

        D_n = sum((y - mu)**2)
        deviance = abs(D_n - D)
        D = D_n
        dev_a.append(deviance)

        if rel_res < rtol or deviance < dtol:
            converged = True
        nIter += 1

        string = "Newton iteration %d: d (abs) = %.2e, (tol = %.2e) r (rel) = %.2e (tol = %.2e)"
        print(string % (nIter, deviance, dtol, rel_res, rtol))

    # calculate statistics :
    varA = diag(iXTWX)                  # variance of alpha hat
    sea = sqrt(varA)                    # vector of standard errors for alpha hat
    t_a = ahat / sea
    pval = t.sf(abs(t_a), dof) * 2
    conf = 0.95                         # 95% confidence interval
    tbonf = t.ppf((1 - conf / p), dof)  # bonferroni corrected t-value
    ci = tbonf * sea                    # confidence interval for ahat
    resid = (y - mu)                    # 'working' residual

    RSS = sum((y - mu)**2)              # residual sum of squares
    TSS = sum((y - mean(y))**2)         # total sum of squares
    R2 = (TSS - RSS) / TSS              # R2
    F = (TSS - RSS) / (p - 1) * (n - p) / RSS   # F-statistic
    F_p = fdtrc(p - 1, dof, F)          # F-Stat. p-value

    # log-likelihood :
    L = sum((y * mu - mu**2 / 2) / (2 * sig) - y**2 / (2 * sig) - 0.5 * log(2 * pi * sig))
    AIC = (-2 * L + 2 * p) / n          # AIC statistic

    # estimated error variance :
    sighat = 1 / (n - p) * RSS

    vara = {'ahat'   : ahat,
            'yhat'   : mu,
            'sea'    : sea,
            'ci'     : ci,
            'dof'    : dof,
            'resid'  : resid,
            'rel_a'  : rel_a,
            'dev_a'  : dev_a,
            'R2'     : R2,
            'F'      : F,
            'AIC'    : AIC,
            'sighat' : sighat}
    return vara
def anovan(x, y, factor_names, conf, interaction=False):
    ym = mean(y)
    SST = sum((y - ym)**2)

    # find the indexes to each of the groups within each treatment :
    # n-way analysis
    if type(x) == list:
        tmt_names = []
        tmt_idxs = []
        tmt_means = []
        tmt_lens = []
        X = []                          # design matrix
        na = float(shape(x)[1])         # Sample size
        X.append(ones(na))              # tack on intercept
        for x_i in x:
            types = unique(x_i)
            tmt_names.append(types)
            ii = []
            means = []
            lens = []
            for t in types:
                i = where(x_i == t)[0]
                x_c = zeros(na)
                x_c[i] = 1.0
                ii.append(i)
                lens.append(len(i))
                means.append(mean(y[i]))
                X.append(x_c)
            X = X[:-1]                  # remove the redundant information
            tmt_idxs.append(array(ii))
            tmt_means.append(array(means))
            tmt_lens.append(array(lens))
        tmt_names = array(tmt_names)
        tmt_idxs = array(tmt_idxs)
        tmt_means = array(tmt_means)
        tmt_lens = array(tmt_lens)

        # sum of squares between cells :
        SSB = 0
        a = len(tmt_idxs[0])
        b = len(tmt_idxs[1])
        dfT = len(y) - 1
        dfA = a - 1
        dfB = b - 1
        dfAB = dfA * dfB
        dfE = len(y) - a * b
        cell_means = []
        for l1 in tmt_idxs[0]:
            c_m = []
            for l2 in tmt_idxs[1]:
                ii = intersect1d(l1, l2)
                if ii.size != 0:
                    c_m.append(mean(y[ii]))
                    SSB += len(y[ii]) * (mean(y[ii]) - ym)**2
            cell_means.append(array(c_m))
        cell_means = array(cell_means)

    # one-way analysis
    else:
        na = float(len(x))
        tmt_names = unique(x)
        X = []                          # design matrix
        X.append(ones(na))              # tack on intercept
        for t in tmt_names:
            ii = where(x == t)[0]
            # form a column of the design matrix :
            x_c = zeros(na)
            x_c[ii] = 1.0
            # append to the lists :
            X.append(x_c)
        X = X[:-1]                      # ensure non-singular matrix

    # add rows for interaction terms :
    if interaction and type(x) == list:
        k = 0
        for t in tmt_names[:-1]:
            k += len(t)
        for i, x1 in enumerate(X[1:k]):
            for x2 in X[k:]:
                X.append(x1 * x2)

    X = array(X).T                      # design matrix is done

    # calculate statistics :
    SS = array([])
    inter_names = []
    for il, nl, name, mul in zip(tmt_idxs, tmt_lens, factor_names, tmt_means):
        SS = append(SS, sum(nl * (mul - ym)**2))
        inter_names.append(name)
    if interaction:
        inter_names.append(inter_names[0] + ' x ' + inter_names[1])
        SS = append(SS, SSB - SS[0] - SS[1])

    # fit the data to the model :
    muhat = dot(dot(inv(dot(X.T, X)), X.T), y)
    yhat = dot(X, muhat)
    resid = y - yhat

    SSE = SST - sum(SS)

    # calculate mean-squares :
    MSA = SS[0] / dfA
    MSB = SS[1] / dfB
    MSE = SSE / dfE

    # calculate F-statistics :
    FA = MSA / MSE
    FB = MSB / MSE

    # calculate p-values:
    pA = fdtrc(dfA, dfE, FA)
    pB = fdtrc(dfB, dfE, FB)

    if interaction:
        MSAB = SS[2] / dfAB
        FAB = MSAB / MSE
        pAB = fdtrc(dfAB, dfE, FAB)
        vara = {'tmt_names'  : tmt_names,
                'tmt_means'  : tmt_means,
                'tmt_lens'   : tmt_lens,
                'tmt_idxs'   : tmt_idxs,
                'cell_means' : cell_means,
                'SST'        : SST,
                'SSB'        : SSB,
                'SSE'        : SSE,
                'SS'         : SS,
                'MSA'        : MSA,
                'MSB'        : MSB,
                'MSAB'       : MSAB,
                'MSE'        : MSE,
                'FA'         : FA,
                'FB'         : FB,
                'FAB'        : FAB,
                'dfA'        : dfA,
                'dfB'        : dfB,
                'dfAB'       : dfAB,
                'dfE'        : dfE,
                'dfT'        : dfT,
                'pA'         : pA,
                'pB'         : pB,
                'pAB'        : pAB,
                'i_names'    : inter_names,
                'muhat'      : muhat,
                'yhat'       : yhat,
                'resid'      : resid}
    else:
        vara = {'tmt_names'  : tmt_names,
                'tmt_means'  : tmt_means,
                'tmt_lens'   : tmt_lens,
                'tmt_idxs'   : tmt_idxs,
                'cell_means' : cell_means,
                'SST'        : SST,
                'SSB'        : SSB,
                'SSE'        : SSE,
                'SS'         : SS,
                'MSA'        : MSA,
                'MSB'        : MSB,
                'MSE'        : MSE,
                'FA'         : FA,
                'FB'         : FB,
                'dfA'        : dfA,
                'dfB'        : dfB,
                'dfE'        : dfE,
                'dfT'        : dfT,
                'pA'         : pA,
                'pB'         : pB,
                'i_names'    : inter_names,
                'muhat'      : muhat,
                'yhat'       : yhat,
                'resid'      : resid}
    return vara
def nonlinRegstats(x, y, f, beta0, conf):

    def residual(beta, x, y, f):
        err = y - f(x, beta)
        return err

    out = leastsq(residual, beta0, args=(x, y, f), full_output=True)

    bhat = out[0]
    J = out[1]
    nfo = out[2]
    fjac = nfo['fjac']
    ipvt = nfo['ipvt']
    msg = out[3]
    ier = out[4]

    n = float(len(x))                   # Sample size
    p = float(len(beta0))               # number of parameters
    dof = max(0, n - p)                 # Degrees of freedom
    nov = p - 1                         # number of variables
    xm = mean(x)                        # Mean of time values
    ym = mean(y)                        # Mean of log recovery
    yhat = f(x, bhat)                   # non-linear fit line
    SSE = sum((y - yhat)**2)            # Sum of Squared Errors
    SST = sum((y - ym)**2)              # Sum of Squared Total
    SSR = sum((yhat - ym)**2)           # Sum of Squared Residuals (SSR = SST - SSE)
    R2 = SSR / SST                      # R^2 Statistic (rval**2)
    MSE = SSE / dof                     # Mean Squared Error (MSE)
    MSR = SSR / nov                     # Mean Squared Residual (MSR)
    F = MSR / MSE                       # F-Statistic
    F_p = fdtrc(nov, dof, F)            # F-Stat. p-value

    # covariance matrix:
    covB = MSE * J

    # Vector of standard errors for beta hat (seb) and yhat (sey)
    seb = sqrt(diag(covB))
    sey = sqrt(MSE * (1.0 / n + (x - xm)**2 / sum((x - xm)**2)))

    tbonf = t.ppf((1 + conf) / 2.0, dof)  # uncorrected t*-value

    # calculate t-statistic :
    t_b = bhat / seb
    t_y = yhat / sey

    # calculate p-values :
    pval = t.sf(abs(t_b), dof) * 2

    # Confidence intervals
    ci_b = [bhat - tbonf * seb, bhat + tbonf * seb]
    ci_y = [yhat - tbonf * sey, yhat + tbonf * sey]

    resid = y - yhat

    vara = {'SSR'    : SSR,
            'SSE'    : SSE,
            'SST'    : SST,
            'df'     : (nov, dof, n - 1),
            'MSR'    : MSR,
            'MSE'    : MSE,
            'F'      : F,
            'F_p'    : F_p,
            'SEB'    : seb,
            'SEY'    : sey,
            't_beta' : t_b,
            't_y'    : t_y,
            'pval'   : pval,
            't'      : tbonf,
            'CIB'    : array(ci_b),
            'CIY'    : array(ci_y),
            'bhat'   : bhat,
            'yhat'   : yhat,
            'R2'     : R2,
            'covB'   : covB,
            'J'      : J,
            'fjac'   : fjac,
            'resid'  : resid}
    return vara
def linRegstats(x, y, conf):
    if size(shape(x)) == 1:
        p = 2
        n = float(len(x))               # Sample size
    else:
        p, n = shape(x)                 # (samples size, # of parameters)
        n = float(n)
        p = p + 1                       # add one for intercept
    dof = n - p                         # Degrees of freedom
    nov = p - 1                         # number of variables
    ym = mean(y)                        # Mean of log recovery
    X = vstack((ones(n), x)).T          # observed x-variable matrix
    bhat = dot(dot(inv(dot(X.T, X)), X.T), y)
    yhat = dot(X, bhat)                 # Linear fit line
    SSE = sum((y - yhat)**2)            # Sum of Squared Errors
    SST = sum((y - ym)**2)              # Sum of Squared Total
    SSR = sum((yhat - ym)**2)           # Sum of Squared Residuals (SSR = SST - SSE)
    R2 = SSR / SST                      # R^2 Statistic (rval**2)
    MSE = SSE / dof                     # Mean Squared Error (MSE)
    MSR = SSR / nov                     # Mean Squared Residual (MSR)
    F = MSR / MSE                       # F-Statistic
    F_p = fdtrc(nov, dof, F)            # F-Stat. p-value

    # variance of beta estimates :
    VARb = MSE * inv(dot(X.T, X))       # diag(VARb) == varB
    varB = diag(VARb)                   # variance of beta hat
    seb = sqrt(varB)                    # vector of standard errors for beta hat

    # variance of y estimates :
    VARy = MSE * dot(dot(X, inv(dot(X.T, X))), X.T)
    varY = diag(VARy)                   # variance of y hat
    sey = sqrt(varY)                    # standard errors for yhat

    # calculate t-statistic :
    t_b = bhat / seb
    t_y = yhat / sey

    # calculate p-values :
    pval = t.sf(abs(t_b), dof) * 2

    tbonf = t.ppf((1 + conf) / 2.0, dof)  # uncorrected t*-value

    ci_b = [bhat - tbonf * seb,         # Confidence intervals for betas
            bhat + tbonf * seb]         # in 2 columns (lower,upper)

    ci_y = [yhat - tbonf * sey,         # Confidence intervals for y estimates
            yhat + tbonf * sey]         # in 2 columns (lower,upper)

    resid = y - yhat

    vara = {'SSR'    : SSR,
            'SSE'    : SSE,
            'SST'    : SST,
            'df'     : (nov, dof, n - 1),
            'MSR'    : MSR,
            'MSE'    : MSE,
            'F'      : F,
            'F_pval' : F_p,
            'varY'   : varY,
            'varB'   : varB,
            'SEB'    : seb,
            'SEY'    : sey,
            'tbonf'  : tbonf,
            't_beta' : t_b,
            't_y'    : t_y,
            'pval'   : pval,
            'CIB'    : array(ci_b),
            'CIY'    : array(ci_y),
            'bhat'   : bhat,
            'yhat'   : yhat,
            'R2'     : R2,
            'resid'  : resid}
    return vara
def durbin(*args):
    # taken verbatim from scipy.stats._support.abut
    def _abut(source, *args):
        source = np.asarray(source)
        if len(source.shape) == 1:
            width = 1
            source = np.resize(source, [source.shape[0], width])
        else:
            width = source.shape[1]
        for addon in args:
            if len(addon.shape) == 1:
                width = 1
                addon = np.resize(addon, [source.shape[0], width])
            else:
                width = source.shape[1]
            if len(addon) < len(source):
                addon = np.resize(addon, [source.shape[0], addon.shape[1]])
            elif len(addon) > len(source):
                source = np.resize(source, [addon.shape[0], source.shape[1]])
            source = np.concatenate((source, addon), 1)
        return source

    # also taken from scipy.stats, but ignores everything under 0.
    def _rankposdata(a):
        a = np.ravel(a)
        b = np.argsort(a)
        a = a[b]
        n = len(a)
        dupcount = 0
        oldrank = -1
        sumranks = 0
        newarray = np.zeros(n, float)
        for i in range(n):
            if a[i] <= 0.:
                newarray[b[i]] = 0.
                continue
            oldrank += 1
            sumranks += oldrank
            dupcount += 1
            if i == n - 1 or a[i] != a[i + 1]:
                averrank = float(sumranks) / float(dupcount) + 1
                for j in range(i - dupcount + 1, i + 1):
                    newarray[b[j]] = averrank
                sumranks = 0
                dupcount = 0
        return newarray

    b = len(args)
    if b < 3:
        raise ValueError('Less than 3 levels. Durbin test is not appropriate')
    k = len(args[0])
    for i in range(1, b):
        if len(args[i]) != k:
            raise ValueError('Unequal N in durbin. Aborting.')

    data = _abut(*args)
    data = data.astype(float)

    A = 0.
    t = data.shape[1]
    R = np.zeros(t, float)
    rs = np.zeros(t, int)
    for i in range(len(data)):
        data[i] = _rankposdata(data[i])
        for j in range(len(data[i])):
            A += pow(data[i, j], 2.)
            R[j] += data[i, j]
            if data[i, j] > 0.:
                rs[j] += 1

    r = np.mean(rs)
    t = float(t)
    b = float(b)
    k = float(k)

    C = b * k * pow(k + 1, 2) / 4
    T1 = (t - 1) * sum([pow(x, 2) - r * C for x in R]) / (A - C)
    T2 = (T1 / (t - 1)) / ((b * k - b - T1) / (b * k - b - t + 1))

    print(data)
    print(R)
    print("r = %g, t = %g, b = %g, k = %g, C = %g, A = %g, T1 = %g" %
          (r, t, b, k, C, A, T1))

    return T2, fdtrc(k - 1, b * k - b - t + 1, T2)
def grpStats(x, y, alpha, interaction=False):
    grp = unique(x)
    sig = std(y)
    t = float(len(grp))
    na = float(len(y))
    ymt = mean(y)

    idx = []                            # index list corresponding to group
    yms = []                            # mean of groups
    ss = []                             # standard deviations of groups
    nums = []                           # number of elements in group
    sems = []                           # standard errors of groups
    X = []                              # design matrix
    X.append(ones(na))                  # tack on intercept
    for g in grp:
        ii = where(x == g)[0]
        # form a column of the design matrix :
        x_c = zeros(na)
        x_c[ii] = 1.0
        # collect necessary statistics :
        n = float(len(ii))
        ym = mean(y[ii])
        s = std(y[ii])
        sem = s / sqrt(n)
        # append to the lists :
        X.append(x_c)
        idx.append(ii)
        yms.append(ym)
        ss.append(s)
        nums.append(n)
        sems.append(sem)

    X = array(X[:-1]).T                 # remove the redundant information
    idx = array(idx)
    yms = array(yms)
    ss = array(ss)
    nums = array(nums)
    sems = array(sems)

    pair = []
    pair_mean = []
    hsd_ci = []

    # sort from largest to smallest
    srt = argsort(yms)[::-1]
    grp_s = grp[srt]
    yms_s = yms[srt]
    num_s = nums[srt]

    # calculate the Tukey confidence intervals :
    for i, (g1, ym1, n1) in enumerate(zip(grp_s, yms_s, num_s)):
        for g2, ym2, n2 in zip(grp_s[i+1:][::-1], yms_s[i+1:][::-1], num_s[i+1:][::-1]):
            p_m = ym1 - ym2
            c = qsturng(alpha, t, na - t) / sqrt(2) * sig * sqrt(1/n1 + 1/n2)
            pair_mean.append(p_m)
            pair.append(g1 + ' - ' + g2)
            hsd_ci.append(c)

    srt = argsort(pair_mean)
    pair = array(pair)[srt]
    pair_mean = array(pair_mean)[srt]
    hsd_ci = array(hsd_ci)[srt]

    # calculate more statistics :
    SSB = sum(nums * (yms - ymt)**2)
    SSW = sum((nums - 1) * ss**2)
    MSW = SSW / (na - t)
    MSB = SSB / (t - 1)
    f = MSB / MSW
    p = fdtrc((t - 1), (na - t), f)

    # fit the data to the model :
    muhat = dot(dot(inv(dot(X.T, X)), X.T), y)
    yhat = dot(X, muhat)
    resid = y - yhat

    vara = {'grp_means'  : yms,
            'grp_SDs'    : ss,
            'grp_SEMs'   : sems,
            'grp_names'  : grp,
            'grp_lens'   : nums,
            'grp_dof'    : t - 1,
            'dof'        : na - t,
            'F'          : f,
            'MSW'        : MSW,
            'MSB'        : MSB,
            'alpha'      : alpha,
            'pval'       : p,
            'pairs'      : pair,
            'pair_means' : pair_mean,
            'HSD_CIs'    : hsd_ci,
            'muhat'      : muhat,
            'yhat'       : yhat,
            'resid'      : resid}
    return vara
def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    *args : array_like, sparse matrices
        sample1, sample2... The sample measurements should be given as
        arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still
    be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_)
    although with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard. "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W. Research Methods in Statistics. 2002.
    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
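# The function above is the vectorized, per-feature variant of the classic one-way
# ANOVA used for univariate feature selection; its docstring notes it should agree
# with scipy.stats.f_oneway. A self-contained sketch of that per-column behaviour,
# using only SciPy (recent releases broadcast f_oneway over columns, axis=0).
# The random data are for illustration only.
import numpy as np
from scipy.stats import f_oneway as scipy_f_oneway

rng = np.random.RandomState(42)
class_a = rng.normal(0.0, 1.0, size=(30, 4))   # 30 samples, 4 features
class_b = rng.normal(0.5, 1.0, size=(25, 4))   # 25 samples, 4 features

# column-wise ANOVA over the two classes: one F and one p-value per feature
F_cols, p_cols = scipy_f_oneway(class_a, class_b)

# the same test on a single feature should match the first column
F_one, p_one = scipy_f_oneway(class_a[:, 0], class_b[:, 0])
assert np.isclose(F_cols[0], F_one) and np.isclose(p_cols[0], p_one)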
def h2o_f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like, H2OFrames, shape=(n_classes,)
        The sample measurements should be given as varargs (*args).
        A slice of the original input frame for each class in the target
        feature.

    Returns
    -------
    f : float
        The computed F-value of the test.

    prob : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still
    be possible to use the Kruskal-Wallis H-test (``scipy.stats.kruskal``)
    although with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` and ``sklearn.feature_selection.f_oneway``.

    References
    ----------
    .. [1] Lowry, Richard. "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W. Research Methods in Statistics. 2002.
    """
    n_classes = len(args)

    # sklearn converts everything to float here. Rather than do so,
    # we will test for total numericism and fail out if it's not 100%
    # numeric.
    if not all([all([X.isnumeric() for X in args])]):
        raise ValueError("All features must be entirely numeric for F-test")

    n_samples_per_class = [X.shape[0] for X in args]
    n_samples = np.sum(n_samples_per_class)

    # compute the sum of squared values in each column, and then compute the column
    # sums of all of those intermittent rows rbound together
    ss_alldata = rbind_all(*[
        X.apply(lambda x: (x * x).sum()) for X in args
    ]).apply(lambda x: x.sum())

    # compute the sum of each column for each X in args, then rbind them all
    # and sum them up, finally squaring them. Tantamount to the squared sum
    # of each complete column. Note that we need to add a tiny fraction to ensure
    # all are real numbers for the rbind...
    sum_args = [X.apply(lambda x: x.sum() + 1e-12).asnumeric() for X in args]  # col sums
    square_of_sums_alldata = rbind_all(*sum_args).apply(lambda x: x.sum())
    square_of_sums_alldata *= square_of_sums_alldata

    square_of_sums_args = [s * s for s in sum_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)

    ssbn = None  # h2o frame
    for k, _ in enumerate(args):
        tmp = square_of_sums_args[k] / n_samples_per_class[k]
        ssbn = tmp if ssbn is None else (ssbn + tmp)

    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)

    constant_feature_idx = (msw == 0)
    constant_feature_sum = constant_feature_idx.sum()  # sum of ones
    nonzero_size = (msb != 0).sum()
    if nonzero_size != msb.shape[1] and constant_feature_sum:
        warnings.warn(
            "Features %s are constant." %
            np.arange(msw.shape[1])[constant_feature_idx],
            UserWarning)

    f = (msb / msw)

    # convert to numpy ndarray for special
    f = f.as_data_frame(use_pandas=True).iloc[0].values

    # compute prob
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
def glm(x, y, w=1.0):
    p, n = shape(x)                     # sample size
    p += 1                              # add one for intercept
    dof = n - p                         # degrees of freedom
    sig = var(y)                        # variance
    mu = (y + mean(y)) / 2.0            # initial mean estimate
    eta = log(mu)                       # initial predictor
    X = vstack((ones(n), x)).T          # observed x-variable matrix

    # Newton-Raphson :
    converged = False
    rtol = 1e-15
    dtol = 1e-15
    lmbda = 1.0
    nIter = 0
    deviance = 1
    D = 1
    ahat = zeros(p)                     # initial parameters
    rel_res = zeros(p)                  # initial relative residual
    maxIter = 65

    rel_a = []
    dev_a = []

    while not converged and nIter < maxIter:
        W = diags(w * mu**2 / sig, 0)   # compute weights
        z = eta + (y - mu) / mu         # adjusted dependent variable

        WX = W.dot(X)
        XTWX = dot(X.T, WX)
        iXTWX = inv(XTWX)
        Wz = W.dot(z)

        ahat_n = dot(iXTWX, dot(X.T, Wz))

        eta = dot(X, ahat_n)            # compute estimates
        mu = exp(eta)                   # linear predictor

        # calculate residual :
        rel_res = norm(ahat - ahat_n, inf)
        rel_a.append(rel_res)
        ahat = ahat_n

        D_n = sum((y - mu)**2)
        deviance = abs(D_n - D)
        D = D_n
        dev_a.append(deviance)

        if rel_res < rtol or deviance < dtol:
            converged = True
        nIter += 1

        string = "Newton iteration %d: d (abs) = %.2e, (tol = %.2e) r (rel) = %.2e (tol = %.2e)"
        print(string % (nIter, deviance, dtol, rel_res, rtol))

    # calculate statistics :
    varA = diag(iXTWX)                  # variance of alpha hat
    sea = sqrt(varA)                    # vector of standard errors for alpha hat
    t_a = ahat / sea
    pval = t.sf(abs(t_a), dof) * 2
    conf = 0.95                         # 95% confidence interval
    tbonf = t.ppf((1 - conf / p), dof)  # bonferroni corrected t-value
    ci = tbonf * sea                    # confidence interval for ahat
    resid = (y - mu)                    # 'working' residual

    RSS = sum((y - mu)**2)              # residual sum of squares
    TSS = sum((y - mean(y))**2)         # total sum of squares
    R2 = (TSS - RSS) / TSS              # R2
    F = (TSS - RSS) / (p - 1) * (n - p) / RSS   # F-statistic
    F_p = fdtrc(p - 1, dof, F)          # F-Stat. p-value

    # log-likelihood :
    L = sum((y * mu - mu**2 / 2) / (2 * sig) - y**2 / (2 * sig) - 0.5 * log(2 * pi * sig))
    AIC = (-2 * L + 2 * p) / n          # AIC statistic

    # estimated error variance :
    sighat = 1 / (n - p) * RSS

    vara = {'ahat'   : ahat,
            'yhat'   : mu,
            'sea'    : sea,
            'ci'     : ci,
            'dof'    : dof,
            'resid'  : resid,
            'rel_a'  : rel_a,
            'dev_a'  : dev_a,
            'R2'     : R2,
            'F'      : F,
            'AIC'    : AIC,
            'sighat' : sighat}
    return vara
def polynomtest(x, y, degree=2):
    y_prime = y - y.mean()
    ss_tot = np.sum(y_prime**2)
    print('ss_tot: ' + str(ss_tot))

    ### first linear regression
    b_linear, rsq_linear = linreg.linreg(x, y)
    df_linear = len(y) - 2
    ypred_linear = b_linear[1] * x + b_linear[0]
    error_linear = y - ypred_linear
    error_ss_linear = np.sum(error_linear**2)
    rsq_linear2 = 1. - (error_ss_linear / ss_tot)
    print('df_linear: ' + str(df_linear))
    print('b_linear: ' + str(b_linear))
    print('rsq_linear: ' + str(rsq_linear))
    print('rsq_linear2: ' + str(rsq_linear2))
    print('error_ss_linear: ' + str(error_ss_linear))
    print('')

    if degree <= 1:
        return b_linear
    else:
        ### next quadratic
        quadratic_b = np.polyfit(x, y, 2)
        ypred_quadratic = np.polyval(quadratic_b, x)
        error_quadratic = y - ypred_quadratic
        error_ss_quadratic = np.sum(error_quadratic**2)
        rsq_quadratic = 1. - (error_ss_quadratic / ss_tot)
        df_quadratic = len(y) - 3
        fstat2_quadratic = (error_ss_quadratic - error_ss_linear) / (
            error_ss_quadratic / df_quadratic)
        f_stat_quadratic = df_quadratic * (rsq_quadratic - rsq_linear) / (1. - rsq_quadratic)
        p_quadratic = special.fdtrc(2, df_quadratic, f_stat_quadratic)
        print('df_quadratic: ' + str(df_quadratic))
        print('quadratic_b: ' + str(quadratic_b))
        print('rsq_quadratic: ' + str(rsq_quadratic))
        print('error_ss_quadratic: ' + str(error_ss_quadratic))
        print('fstat2_quadratic: ' + str(fstat2_quadratic))
        print('f_stat_quadratic: ' + str(f_stat_quadratic))
        print('p_quadratic: ' + str(p_quadratic))
        print('')

        if degree <= 2:
            return quadratic_b
        else:
            ### next cubic
            cubic_b = np.polyfit(x, y, 3)
            ypred_cubic = np.polyval(cubic_b, x)
            error_cubic = y - ypred_cubic
            error_ss_cubic = np.sum(error_cubic**2)
            rsq_cubic = 1. - (error_ss_cubic / ss_tot)
            df_cubic = len(y) - 4
            f_stat_cubic = df_cubic * (rsq_cubic - rsq_quadratic) / (1. - rsq_cubic)
            p_cubic = special.fdtrc(2, df_cubic, f_stat_cubic)
            print('cubic_b: ' + str(cubic_b))
            print('rsq_cubic: ' + str(rsq_cubic))
            print('f_stat_cubic: ' + str(f_stat_cubic))
            print('p_cubic: ' + str(p_cubic))
            return cubic_b