def FPvalue(*args):
    df_btwn, df_within = __degree_of_freedom_(*args)
    mss_btwn = __ss_between_(*args) / float(df_btwn)
    mss_within = __ss_within_(*args) / float(df_within)
    F = mss_btwn / mss_within
    P = special.fdtrc(df_btwn, df_within, F)
    return (F, P)
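A minimal usage sketch (not part of the original project): it assumes the private helpers __degree_of_freedom_, __ss_between_ and __ss_within_ are defined in the same module and compute the usual one-way ANOVA degrees of freedom and sums of squares, and that special is scipy.special.

group_a = [24.5, 23.5, 26.4, 27.1, 29.9]   # illustrative data only
group_b = [28.4, 34.2, 29.5, 32.2, 30.1]
group_c = [26.1, 28.3, 24.3, 26.2, 27.8]

F, P = FPvalue(group_a, group_b, group_c)
print("F = %.3f, p = %.4f" % (F, P))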
Example #2
def welch_anova_np(args, var_equal=False):
    """
    args : array like of array likes
        A list of groups (lists of floats) which should be compared.
    var_equal : boolean
        The groups share a common variance.
    """
    # Define Welch's ANOVA, which is robust against unequal variances
    # see https://statisticsbyjim.com/anova/welchs-anova-compared-to-classic-one-way-anova/
    # https://stackoverflow.com/questions/50964427/welchs-anova-in-python
    # https://github.com/scipy/scipy/issues/11122
    args = [np.asarray(arg, dtype=float) for arg in args]
    k = len(args)
    ni = np.array([len(arg) for arg in args])
    mi = np.array([np.mean(arg) for arg in args])
    vi = np.array([np.var(arg, ddof=1) for arg in args])
    wi = ni / vi

    tmp = np.sum((1 - wi / np.sum(wi))**2 / (ni - 1))
    tmp /= (k**2 - 1)

    dfbn = k - 1
    dfwn = 1 / (3 * tmp)

    m = np.sum(mi * wi) / np.sum(wi)
    f = np.sum(wi * (mi - m)**2) / (dfbn * (1 + 2 * (dfbn - 1) * tmp))
    prob = fdtrc(dfbn, dfwn, f)
    return stats.stats.F_onewayResult(f, prob)
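A hedged usage sketch: assuming numpy as np, fdtrc imported from scipy.special, and scipy's stats imported as used above (older SciPy exposes the F_onewayResult container under stats.stats; newer releases may require taking it from scipy.stats directly), the function is called with a list of groups rather than varargs.

g1 = [14.2, 15.1, 13.8, 14.9, 15.3]   # illustrative groups with unequal variances
g2 = [18.4, 22.9, 16.1, 24.3, 19.8]
g3 = [10.2, 10.5, 10.1, 10.4, 10.3]

res = welch_anova_np([g1, g2, g3])
print(res.statistic, res.pvalue)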
def FPvalue(*args):
    df_btwn, df_within = __degree_of_freedom_(*args)
    mss_btwn = __ss_between_(*args) / float(df_btwn)
    mss_within = __ss_within_(*args) / float(df_within)
    F = mss_btwn / mss_within
    P = special.fdtrc(df_btwn, df_within, F)
    return (F, P)
Example #4
def f_oneway(*args):

    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args)**2
    square_of_sums_args = [s**2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
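A brief usage sketch, assuming the imports this snippet relies on are in scope: numpy as np, warnings, special from scipy, and as_float_array / safe_sqr from sklearn.utils (as in scikit-learn's own feature_selection module). Each argument is a 2-D (n_samples, n_features) block for one class, and one F/p pair is returned per feature column.

rng = np.random.RandomState(0)
class_a = rng.normal(0.0, 1.0, size=(20, 3))
class_b = rng.normal(0.5, 1.0, size=(25, 3))
class_c = rng.normal(1.0, 1.0, size=(15, 3))

F, p = f_oneway(class_a, class_b, class_c)
print(F.shape, p.shape)   # (3,), (3,) -- one test per feature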
Example #5
File: utils.py  Project: juliadeneva/PINT
def FTest(chi2_1, dof_1, chi2_2, dof_2):
    """
    Run F-test.

    Compute an F-test to see if a model with extra parameters is
    significant compared to a simpler model.  The input values are the
    (non-reduced) chi^2 values and the numbers of DOF for '1' the
    original model and '2' for the new model (with more fit params).
    The probability is computed exactly like Sherpa's F-test routine
    (in Ciao) and is also described in the Wikipedia article on the
    F-test:  http://en.wikipedia.org/wiki/F-test
    The returned value is the probability that the improvement in
    chi2 is due to chance (i.e. a low probability means that the
    new fit is quantitatively better, while a value near 1 means
    that the new model should likely be rejected).

    Parameters
    -----------
    chi2_1 : Float
        Chi-squared value of model with fewer parameters
    dof_1 : Int
        Degrees of freedom of model with fewer parameters
    chi2_2 : Float
        Chi-squared value of model with more parameters
    dof_2 : Int
        Degrees of freedom of model with more parameters

    Returns
    --------
    ft : Float
        F-test significance value for the model with the larger number of
        components over the other.
    """
    delta_chi2 = chi2_1 - chi2_2
    if delta_chi2 > 0 and dof_1 != dof_2:
        delta_dof = dof_1 - dof_2
        new_redchi2 = chi2_2 / dof_2
        F = np.float64((delta_chi2 / delta_dof) /
                       new_redchi2)  # fdtr doesn't like float128
        ft = fdtrc(delta_dof, dof_2, F)
    else:
        if delta_chi2 <= 0:
            log.warning(
                "Chi^2 for Model 2 is larger than Chi^2 for Model 1, cannot preform F-test."
            )
        elif dof_1 == dof_2:
            log.warning(
                "Models have equal degrees of freedom, cannot preform F-test.")
        ft = False
    return ft
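A small worked example (the values are made up, not from PINT): assuming numpy as np and fdtrc from scipy.special are imported, compare a simpler fit against a fit with one extra parameter.

# chi^2 drops from 150 (100 dof) to 140 (99 dof) after adding one parameter.
ft = FTest(chi2_1=150.0, dof_1=100, chi2_2=140.0, dof_2=99)
print(ft)   # roughly 0.01 here; a small probability favours the larger model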
Example #6
def anova(arr):
    def _square_of_sums(a):
        s = np.sum(a, 0)
        if not np.isscalar(s):
            return s.astype(float) * s
        else:
            return float(s) * s

    def _sum_of_squares(a):
        return np.sum(a * a, 0)

    # # If all inputs equivalent return 0, not nan as default behaviour
    # if sum([(np.asarray(arr[0]) == x).all() for x in arr[1:]]) == len(arr)-1:
    #     return 0

    args = [np.asarray(arg, dtype=float) for arg in arr]

    # ANOVA on N groups, each in its own array
    num_groups = len(args)
    alldata = np.concatenate(args)
    bign = len(alldata)

    # Determine the mean of the data, and subtract that from all inputs to a
    # variance (via sum_of_sq / sq_of_sum) calculation.  Variance is invariant
    # to a shift in location, and centering all data around zero vastly
    # improves numerical stability.
    offset = alldata.mean()
    alldata -= offset

    sstot = _sum_of_squares(alldata) - (_square_of_sums(alldata) / float(bign))
    ssbn = 0
    for a in args:
        ssbn += _square_of_sums(a - offset) / float(len(a))

    # Naming: variables ending in bn/b are for "between treatments", wn/w are
    # for "within treatments"
    ssbn -= (_square_of_sums(alldata) / float(bign))
    sswn = sstot - ssbn
    dfbn = num_groups - 1
    dfwn = bign - num_groups
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    if f < 0:  # correct rounding errors :/
        f = 0
    prob = special.fdtrc(dfbn, dfwn, f)  # equivalent to stats.f.sf

    return prob
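A quick usage sketch, assuming numpy as np and special from scipy are imported; for the same groups the returned probability should match scipy.stats.f_oneway.

groups = [[6.9, 5.4, 5.8, 4.6, 4.0],
          [8.3, 6.8, 7.8, 9.2, 6.5],
          [8.0, 10.5, 8.1, 6.9, 9.3]]
print(anova(groups))   # compare with scipy.stats.f_oneway(*groups).pvalue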
Example #7
def compute_F_statistic_and_pvalue(*args):
    """ 
    Return F statistic an p-value
    """
    # Compute degrees of freedom
    df_btwn, df_within = __degree_of_freedom_(*args)

    # Compute sums of squares
    mss_btwn = __ss_between_(*args) / float(df_btwn)
    mss_within = __ss_within_(*args) / float(df_within)

    # F statistic
    F = mss_btwn / mss_within

    pvalue = special.fdtrc(df_btwn, df_within, F)

    return (F, pvalue, df_btwn, df_within)
Example #8
def _f_oneway_lower(lifted):
    """Performs a 1-way ANOVA.

    Parameters
    ----------
    lifted : FOnewayData
        The result of `to_monoid`.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.
    """
    classes = lifted.classes
    n_samples_per_class = lifted.n_samples_per_class
    n_samples = lifted.n_samples
    ss_alldata = lifted.ss_alldata
    sums_samples = lifted.sums_samples
    sums_alldata = lifted.sums_alldata
    n_classes = len(classes)
    square_of_sums_alldata = sums_alldata ** 2
    square_of_sums_args = {k: s ** 2 for k, s in sums_samples.items()}
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.0
    for k in n_samples_per_class:
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    # constant_features_idx = np.where(msw == 0.0)[0]
    # if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
    #     warnings.warn("Features %s are constant." % constant_features_idx,
    #                   UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
Example #9
def linRegstats(x, y, conf):

    if size(shape(x)) == 1:
        p = 2
        n = len(x)  # Sample size
    else:
        p, n = shape(x)  # (samples size, # of parameters)
        p = p + 1  # add one for intercept
    dof = n - p  # Degrees of freedom
    nov = p - 1  # number of variables
    ym = mean(y)  # Mean of log recovery
    X = vstack((ones(n), x)).T  # observed x-variable matrix
    XTX = dot(X.T, X)
    iXTX = inv(XTX)
    bhat = dot(dot(iXTX, X.T), y)
    yhat = dot(X, bhat)  # Linear fit line
    SSE = sum((y - yhat)**2)  # Sum of Squared Errors
    SST = sum((y - ym)**2)  # Sum of Squared Total
    SSR = sum((yhat - ym)**2)  # Sum of Squared Residuals (SSR = SST - SSE)
    R2 = SSR / SST  # R^2 Statistic (rval**2)
    MSE = SSE / dof  # Mean Squared Error (MSE)
    MSR = SSR / nov  # Mean Squared Residual (MSR)
    F = MSR / MSE  # F-Statistic
    F_p = fdtrc(nov, dof, F)  # F-Stat. p-value

    # variance of beta estimates :
    VARb = MSE * iXTX  # diag(VARb) == varB
    varB = diag(VARb)  # variance of beta hat
    seb = sqrt(varB)  # vector of standard errors for beta hat

    ## variance of y estimates :
    #VARy   = MSE * dot(dot(X, iXTX), X.T)
    #varY   = diag(VARy)             # variance of y hat
    #sey    = sqrt(varY)             # standard errors for yhat

    # calculate t-statistic :
    t_b = bhat / seb
    #t_y    = yhat / sey

    # calculate p-values :
    pval = t.sf(abs(t_b), dof) * 2

    tbonf = t.ppf((1 + conf) / 2.0, dof)  # uncorrected t*-value

    ci_b = [
        bhat - tbonf * seb,  # Confidence intervals for betas
        bhat + tbonf * seb
    ]  #   in 2 columns (lower,upper)

    #ci_y   = [yhat - tbonf*sey,     # Confidence intervals for estimates
    #          yhat + tbonf*sey]     #   in 2 columns (lower,upper)

    resid = y - yhat

    #vara = { 'SSR'   : SSR,
    #         'SSE'   : SSE,
    #         'SST'   : SST,
    #         'df'    : (nov, dof, n-1),
    #         'MSR'   : MSR,
    #         'MSE'   : MSE,
    #         'F'     : F,
    #         'F_pval': F_p,
    #         'varY'  : varY,
    #         'varB'  : varB,
    #         'SEB'   : seb,
    #         'SEY'   : sey,
    #         'tbonf' : tbonf,
    #         't_beta': t_b,
    #         't_y'   : t_y,
    #         'pval'  : pval,
    #         'CIB'   : array(ci_b),
    #         'CIY'   : array(ci_y),
    #         'bhat'  : bhat,
    #         'yhat'  : yhat,
    #         'R2'    : R2,
    #         'resid' : resid}
    vara = {
        'SSR': SSR,
        'SSE': SSE,
        'SST': SST,
        'df': (nov, dof, n - 1),
        'MSR': MSR,
        'MSE': MSE,
        'F': F,
        'F_pval': F_p,
        'varB': varB,
        'SEB': seb,
        'tbonf': tbonf,
        't_beta': t_b,
        'pval': pval,
        'CIB': array(ci_b),
        'bhat': bhat,
        'yhat': yhat,
        'R2': R2,
        'resid': resid
    }
    return vara
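A hedged usage sketch for this version of linRegstats, assuming the module's star-style imports provide the NumPy names used above (size, shape, mean, vstack, ones, dot, diag, sqrt, array), plus inv from numpy.linalg, t from scipy.stats and fdtrc from scipy.special.

import numpy as np

x = np.linspace(0.0, 10.0, 50)
y = 2.0 + 0.5 * x + np.random.normal(0.0, 0.3, size=50)   # synthetic line plus noise

out = linRegstats(x, y, conf=0.95)
print(out['bhat'])     # fitted intercept and slope
print(out['F_pval'])   # p-value of the overall regression F-test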
Example #10
File: linear_model.py  Project: pf4d/cslvr
def glm(x,y,w=1.0):

  p,n    = shape(x)                    # sample size
  p     += 1                           # add one for intercept
  dof    = n - p                       # degrees of freedom
  
  sig    = var(y)                      # variance
  mu     = (y + mean(y))/2.0           # initial mean estimate
  eta    = log(mu)                     # initial predictor
  X      = vstack((ones(n), x)).T      # observed x-variable matrix

  # Newton-Raphson :
  converged = False
  rtol      = 1e-12
  dtol      = 1e-12
  lmbda     = 1.0
  nIter     = 0
  deviance  = 1
  D         = 1
  ahat      = zeros(p)   # initial parameters
  rel_res   = zeros(p)   # initial relative residual
  maxIter   = 100

  rel_a = []
  dev_a = []

  while not converged and nIter < maxIter:
    W       = diags(w*mu**2/sig, 0)         # compute weights
    z       = eta + (y - mu)/mu             # adjusted dependent variable

    WX      = W.dot(X)
    XTWX    = dot(X.T, WX)
    iXTWX   = inv(XTWX)
    Wz      = W.dot(z)

    ahat_n  = dot(iXTWX, dot(X.T, Wz))
    
    eta     = dot(X, ahat_n)               # compute estimates
    mu      = exp(eta)                     # linear predictor

    # calculate residual :
    rel_res  = norm(ahat - ahat_n, inf)
    rel_a.append(rel_res)
    ahat     = ahat_n

    D_n      = sum((y - mu)**2)
    deviance = abs(D_n - D)
    D        = D_n
    dev_a.append(deviance)
    
    if rel_res < rtol or deviance < dtol: converged = True
    nIter +=  1

    string = "Newton iteration %d: d (abs) = %.2e, (tol = %.2e) r (rel) = %.2e (tol = %.2e)"
    print string % (nIter, deviance, dtol, rel_res, rtol)
  
  # calculate statistics :
  varA   = diag(iXTWX)            # variance of alpha hat
  sea    = sqrt(varA)             # vector of standard errors for alpha hat
  t_a    = ahat / sea
  pval   = t.sf(abs(t_a), dof) * 2
  conf   = 0.95                        # 95% confidence interval
  tbonf  = t.ppf((1 - conf/p), dof)    # bonferroni corrected t-value
  ci     = tbonf*sea                   # confidence interval for ahat
  resid  = (y - mu)                    # 'working' residual
                                       
  RSS    = sum((y - mu)**2)            # residual sum of squares
  TSS    = sum((y - mean(y))**2)       # total sum of squares
  R2     = (TSS-RSS)/TSS               # R2
  F      = (TSS-RSS)/(p-1) * (n-p)/RSS # F-statistic
  F_p    = fdtrc(p-1, dof, F)          # F-Stat. p-value

  # log-likelihood :
  L      = sum((y*mu - mu**2/2)/(2*sig) - y**2/(2*sig) - 0.5*log(2*pi*sig))
  AIC    = (-2*L + 2*p)/n              # AIC statistic

  # estimated error variance :
  sighat = 1/(n-p) * RSS
                                        
  vara = { 'ahat'  : ahat,              
           'yhat'  : mu,                
           'sea'   : sea,               
           'ci'    : ci,                
           'dof'   : dof,               
           'resid' : resid,             
           'rel_a' : rel_a,
           'dev_a' : dev_a,
           'R2'    : R2,
           'F'     : F,
           'AIC'   : AIC,
           'sighat': sighat}
  return vara
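A usage sketch under the same import assumptions as the example above (NumPy names via star imports, diags from scipy.sparse, inv and norm from numpy.linalg, t from scipy.stats, fdtrc from scipy.special). x is expected with shape (n_predictors, n_samples) and y should be non-negative with a positive mean, since the model works on a log link.

import numpy as np

n = 200
x = np.random.uniform(0.0, 2.0, size=(1, n))       # one predictor, shape (p, n)
y = np.random.poisson(np.exp(0.5 + 0.8 * x[0])).astype(float)

out = glm(x, y)
print(out['ahat'])   # fitted intercept and slope on the log scale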
Example #11
def anovan(x, y, factor_names, conf, interaction=False):
  
  ym  = mean(y)
  SST = sum((y - ym)**2)

  # find the indexes to each of the groups within each treatment :
  # n-way analysis 
  if type(x) == list:
    tmt_names = []
    tmt_idxs  = []
    tmt_means = []
    tmt_lens  = []
    X         = []                     # design matrix
    na        = float(shape(x)[1])     # Sample size
    X.append(ones(na))                 # tack on intercept
    for x_i in x:
      types = unique(x_i)
      tmt_names.append(types)
      ii    = []
      means = []
      lens  = []
      for t in types:
        i       = where(x_i == t)[0]
        x_c     = zeros(na)
        x_c[i]  = 1.0
        ii.append(i)
        lens.append(len(i))
        means.append(mean(y[i]))
        X.append(x_c)
      X = X[:-1]         # remove the redundant information 
      tmt_idxs.append(array(ii))
      tmt_means.append(array(means))
      tmt_lens.append(array(lens))
    tmt_names = array(tmt_names)
    tmt_idxs  = array(tmt_idxs)
    tmt_means = array(tmt_means)
    tmt_lens  = array(tmt_lens)
    
    # sum of squares between cells :
    SSB = 0
    a   = len(tmt_idxs[0])
    b   = len(tmt_idxs[1])
    dfT = len(y) - 1
    dfA = a - 1
    dfB = b - 1
    dfAB = dfA * dfB
    dfE  = len(y) - a * b
    cell_means = []
    for l1 in tmt_idxs[0]:
      c_m = []
      for l2 in tmt_idxs[1]:
        ii = intersect1d(l1, l2)
        if ii.size != 0:
          c_m.append(mean(y[ii]))
          SSB += len(y[ii]) * (mean(y[ii]) - ym)**2
      cell_means.append(array(c_m))
    cell_means = array(cell_means)


  # one-way analysis
  else:
    na        = float(len(x))
    tmt_names = unique(x)
    X         = []     # design matrix
    X.append(ones(na)) # tack on intercept
    for t in tmt_names:
      ii  = where(x == t)[0]
      
      # form a column of the design matrix :
      x_c     = zeros(na)
      x_c[ii] = 1.0
      
      # append to the lists :
      X.append(x_c) 
    X = X[:-1]         # ensure non-singular matrix 

  # add rows for interaction terms :
  if interaction and type(x) == list:
    k = 0
    for t in tmt_names[:-1]:
      k += len(t)
      for i, x1 in enumerate(X[1:k]):
        for x2 in X[k:]:
          X.append(x1 * x2)
  X = array(X).T       # design matrix is done

  # calculate statistics :
  SS = array([])
  inter_names = []
  for il, nl, name, mul in zip(tmt_idxs, tmt_lens, factor_names, tmt_means):
    SS = append(SS, sum( nl*(mul - ym)**2))
    inter_names.append(name)
  if interaction:
    inter_names.append(inter_names[0] + ' x ' + inter_names[1])
    SS = append(SS, SSB - SS[0] - SS[1])
  
  # fit the data to the model :
  muhat = dot( dot( inv(dot(X.T, X)), X.T), y)
  yhat  = dot(X, muhat)
  resid = y - yhat
  SSE   = SST - sum(SS)

  # calculate mean-squares :
  MSA  = SS[0] / dfA
  MSB  = SS[1] / dfB
  MSE  = SSE   / dfE
 
  # calculate F-statistics :
  FA  = MSA  / MSE
  FB  = MSB  / MSE

  # calculate p-values:
  pA   = fdtrc(dfA, dfE, FA)
  pB   = fdtrc(dfB, dfE, FB)
  
  if interaction :
    MSAB = SS[2] / dfAB
    FAB  = MSAB / MSE
    pAB  = fdtrc(dfAB, dfE, FAB)
    vara = {'tmt_names' : tmt_names,
            'tmt_means' : tmt_means,
            'tmt_lens'  : tmt_lens,
            'tmt_idxs'  : tmt_idxs,
            'cell_means': cell_means,
            'SST'       : SST,
            'SSB'       : SSB,
            'SSE'       : SSE,
            'SS'        : SS,
            'MSA'       : MSA,
            'MSB'       : MSB,
            'MSAB'      : MSAB,
            'MSE'       : MSE,
            'FA'        : FA,
            'FB'        : FB,
            'FAB'       : FAB,
            'dfA'       : dfA,
            'dfB'       : dfB,
            'dfAB'      : dfAB,
            'dfE'       : dfE,
            'dfT'       : dfT,
            'pA'        : pA,
            'pB'        : pB,
            'pAB'       : pAB,
            'i_names'   : inter_names,
            'muhat'     : muhat,
            'yhat'      : yhat,
            'resid'     : resid}
  else :  
    vara = {'tmt_names' : tmt_names,
            'tmt_means' : tmt_means,
            'tmt_lens'  : tmt_lens,
            'tmt_idxs'  : tmt_idxs,
            'cell_means': cell_means,
            'SST'       : SST,
            'SSB'       : SSB,
            'SSE'       : SSE,
            'SS'        : SS,
            'MSA'       : MSA,
            'MSB'       : MSB,
            'MSE'       : MSE,
            'FA'        : FA,
            'FB'        : FB,
            'dfA'       : dfA,
            'dfB'       : dfB,
            'dfE'       : dfE,
            'dfT'       : dfT,
            'pA'        : pA,
            'pB'        : pB,
            'i_names'   : inter_names,
            'muhat'     : muhat,
            'yhat'      : yhat,
            'resid'     : resid}
  return vara
Example #12
def nonlinRegstats(x, y, f, beta0, conf):

  def residual(beta, x, y, f):
    err = y - f(x, beta)
    return err


  out    = leastsq(residual, beta0, args=(x,y,f), full_output=True)

  bhat   = out[0]
  J      = out[1]
  nfo    = out[2]
  fjac   = nfo['fjac']
  ipvt   = nfo['ipvt']
  msg    = out[3]
  ier    = out[4]
   
  n      = float(len(x))          # Sample size
  p      = float(len(beta0))      # number of parameters
  dof    = max(0, n - p)          # Degrees of freedom
  nov    = p - 1                  # number of variables
  xm     = mean(x)                # Mean of time values
  ym     = mean(y)                # Mean of log recovery
  yhat   = f(x,  bhat)            # non-linear fit line
  SSE    = sum((y    - yhat)**2)  # Sum of Squared Errors
  SST    = sum((y    - ym  )**2)  # Sum of Squared Total
  SSR    = sum((yhat - ym  )**2)  # Sum of Squared Residuals (SSR = SST - SSE)
  R2     = SSR / SST              # R^2 Statistic (rval**2)
  MSE    = SSE / dof              # Mean Squared Error (MSE)
  MSR    = SSR / nov              # Mean Squared Residual (MSR)
  F      = MSR / MSE              # F-Statistic
  F_p    = fdtrc(nov, dof, F)     # F-Stat. p-value
  
  # covariance matrix:
  covB   = MSE * J

  # Vector of standard errors for beta hat (seb) and yhat (sey)
  seb    = sqrt(diag(covB))
  sey    = sqrt(MSE * (1.0/n + (x - xm)**2 / sum((x - xm)**2) ) )
  tbonf  = t.ppf((1+conf)/2.0, dof)  # uncorrected t*-value
  
  # calculate t-statistic :
  t_b    = bhat / seb
  t_y    = yhat / sey
 
  # calculate p-values :
  pval   = t.sf(abs(t_b), dof) * 2
  
  # Confidence intervals
  ci_b   = [bhat - tbonf*seb,
            bhat + tbonf*seb]

  ci_y   = [yhat - tbonf*sey,
            yhat + tbonf*sey]
  
  resid  = y - yhat
  
  vara = { 'SSR'   : SSR,
           'SSE'   : SSE,
           'SST'   : SST,
           'df'    : (nov, dof, n-1),
           'MSR'   : MSR,
           'MSE'   : MSE,
           'F'     : F,
           'F_p'   : F_p,
           'SEB'   : seb,
           'SEY'   : sey,
           't_beta': t_b,
           't_y'   : t_y,
           'pval'  : pval,
           't'     : tbonf,
           'CIB'   : array(ci_b),
           'CIY'   : array(ci_y),
           'bhat'  : bhat,
           'yhat'  : yhat,
           'R2'    : R2,
           'covB'  : covB,
           'J'     : J,
           'fjac'  : fjac,
           'resid' : resid}
  return vara
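A usage sketch, assuming leastsq from scipy.optimize, t from scipy.stats, fdtrc from scipy.special and the usual NumPy names are imported as in the rest of this module. The model callable f(x, beta) is supplied by the caller; here a simple exponential decay is used for illustration.

import numpy as np

def decay(x, beta):
  # illustrative model: y = b0 * exp(-b1 * x)
  return beta[0] * np.exp(-beta[1] * x)

x = np.linspace(0.0, 5.0, 40)
y = 3.0 * np.exp(-0.8 * x) + np.random.normal(0.0, 0.05, size=40)

out = nonlinRegstats(x, y, decay, beta0=[1.0, 1.0], conf=0.95)
print(out['bhat'])   # fitted parameters
print(out['R2'])     # goodness of fit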
Example #13
def linRegstats(x, y, conf):

  if size(shape(x)) == 1: 
    p    = 2
    n    = float(len(x))          # Sample size
  else:
    p,n  = shape(x)               # (samples size, # of parameters)
    n    = float(n)
    p    = p + 1                  # add one for intercept
  dof    = n - p                  # Degrees of freedom
  nov    = p - 1                  # number of variables
  ym     = mean(y)                # Mean of log recovery
  X      = vstack((ones(n), x)).T # observed x-variable matrix
  bhat   = dot( dot( inv(dot(X.T, X)), X.T), y)
  yhat   = dot(X, bhat)           # Linear fit line
  SSE    = sum((y    - yhat)**2)  # Sum of Squared Errors
  SST    = sum((y    - ym  )**2)  # Sum of Squared Total
  SSR    = sum((yhat - ym  )**2)  # Sum of Squared Residuals (SSR = SST - SSE)
  R2     = SSR / SST              # R^2 Statistic (rval**2)
  MSE    = SSE / dof              # Mean Squared Error (MSE)
  MSR    = SSR / nov              # Mean Squared Residual (MSR)
  F      = MSR / MSE              # F-Statistic
  F_p    = fdtrc(nov, dof, F)     # F-Stat. p-value
  
  # variance of beta estimates :
  VARb   = MSE * inv(dot(X.T, X)) # diag(VARb) == varB
  varB   = diag(VARb)             # variance of beta hat
  seb    = sqrt(varB)             # vector of standard errors for beta hat
 
  # variance of y estimates : 
  VARy   = MSE * dot(dot(X, inv(dot(X.T, X))), X.T)
  varY   = diag(VARy)             # variance of y hat
  sey    = sqrt(varY)             # standard errors for yhat
 
  # calculate t-statistic :
  t_b    = bhat / seb
  t_y    = yhat / sey
 
  # calculate p-values :
  pval   = t.sf(abs(t_b), dof) * 2
  
  tbonf  = t.ppf((1+conf)/2.0, dof)  # uncorrected t*-value
  
  ci_b   = [bhat - tbonf*seb,     # Confidence intervals for betas
            bhat + tbonf*seb]     #   in 2 columns (lower,upper)
  
  ci_y   = [yhat - tbonf*sey,     # Confidence intervals for betas
            yhat + tbonf*sey]     #   in 2 columns (lower,upper)

  resid  = y - yhat

  vara = { 'SSR'   : SSR,
           'SSE'   : SSE,
           'SST'   : SST,
           'df'    : (nov, dof, n-1),
           'MSR'   : MSR,
           'MSE'   : MSE,
           'F'     : F,
           'F_pval': F_p,
           'varY'  : varY,
           'varB'  : varB,
           'SEB'   : seb,
           'SEY'   : sey,
           'tbonf' : tbonf,
           't_beta': t_b,
           't_y'   : t_y,
           'pval'  : pval,
           'CIB'   : array(ci_b),
           'CIY'   : array(ci_y),
           'bhat'  : bhat,
           'yhat'  : yhat,
           'R2'    : R2,
           'resid' : resid}
  return vara
Example #14
    def durbin(*args):
        # taken verbatim from scipy.stats._support.abut
        def _abut(source, *args):
            source = np.asarray(source)
            if len(source.shape) == 1:
                width = 1
                source = np.resize(source, [source.shape[0], width])
            else:
                width = source.shape[1]
            for addon in args:
                if len(addon.shape) == 1:
                    width = 1
                    addon = np.resize(addon, [source.shape[0], width])
                else:
                    width = source.shape[1]
                if len(addon) < len(source):
                    addon = np.resize(addon, [source.shape[0], addon.shape[1]])
                elif len(addon) > len(source):
                    source = np.resize(source, [addon.shape[0], source.shape[1]])
                source = np.concatenate((source, addon), 1)
            return source

        # also taken from scipy.stats, but ignores everything under 0.
        def _rankposdata(a):
            a = np.ravel(a)
            b = np.argsort(a)
            a = a[b]
            n = len(a)
            dupcount = 0
            oldrank = -1
            sumranks = 0
            newarray = np.zeros(n, float)
            for i in range(n):
                if a[i] <= 0.:
                    newarray[b[i]] = 0.
                    continue
                oldrank += 1
                sumranks += oldrank
                dupcount += 1
                if i == n - 1 or a[i] != a[i + 1]:
                    averrank = float(sumranks) / float(dupcount) + 1
                    for j in range(i - dupcount + 1, i + 1):
                        newarray[b[j]] = averrank
                    sumranks = 0
                    dupcount = 0
            return newarray

        b = len(args)
        if b < 3:
            raise ValueError('Less than 3 levels. Durbin test is not appropriate')
        k = len(args[0])
        for i in range(1, b):
            if len(args[i]) != k:
                raise ValueError('Unequal N in durbin. Aborting.')

        data = _abut(*args)
        data = data.astype(float)

        A = 0.
        t = data.shape[1]
        R = np.zeros(t, float)
        rs = np.zeros(t, int)
        for i in range(len(data)):
            data[i] = _rankposdata(data[i])
            for j in range(len(data[i])):
                A += pow(data[i, j], 2.)
                R[j] += data[i, j]
                if data[i, j] > 0.:
                    rs[j] += 1

        r = np.mean(rs)
        t = float(t)
        b = float(b)
        k = float(k)
        C = b * k * pow(k + 1, 2) / 4
        T1 = (t - 1) * sum([pow(x, 2) - r * C for x in R]) / (A - C)
        T2 = (T1 / (t - 1)) / ((b * k - b - T1) / (b * k - b - t + 1))

        print(data)
        print(R)
        print("r = %g, t = %g, b = %g, k = %g, C = %g, A = %g, T1 = %g" % (r, t, b, k, C, A, T1))

        return T2, fdtrc(k - 1, b * k - b - t + 1, T2)
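A usage sketch, with two caveats: the function appears nested (indented) in its original source, so this assumes it is reachable at module level, and numpy as np plus fdtrc from scipy.special must be imported. Each positional argument holds the observations for one treatment level, one value per block, with 0 marking a cell the block did not observe.

level_a = np.array([31.0, 31.0, 45.0, 21.0])
level_b = np.array([27.0, 28.0, 29.0, 18.0])
level_c = np.array([24.0, 22.0,  0.0, 23.0])   # 0 = missing cell

T2, p = durbin(level_a, level_b, level_c)
print(T2, p)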
Example #15
def grpStats(x, y, alpha, interaction=False):

  grp = unique(x)
  sig = std(y)
  t   = float(len(grp))
  na  = float(len(y))
  ymt = mean(y)

  idx   = [] # index list corresponding to group
  yms   = [] # mean of groups
  ss    = [] # standard deviations of groups
  nums  = [] # number of elements in group
  sems  = [] # standard errors of groups
  X     = [] # design matrix
  X.append(ones(na)) # tack on intercept
  for g in grp:
    ii  = where(x == g)[0]
    
    # form a column of the design matrix :
    x_c     = zeros(na)
    x_c[ii] = 1.0
    
    # collect necessary statistics :
    n   = float(len(ii))
    ym  = mean(y[ii])
    s   = std(y[ii])
    sem = s / sqrt(n)
   
    # append to the lists :
    X.append(x_c) 
    idx.append(ii)
    yms.append(ym)
    ss.append(s)
    nums.append(n)
    sems.append(sem)
  
  X    = array(X[:-1]).T         # remove the redundant information 
  idx  = array(idx)
  yms  = array(yms)
  ss   = array(ss)
  nums = array(nums)
  sems = array(sems)

  pair      = []
  pair_mean = []
  hsd_ci    = []

  # sort from largest to smallest
  srt   = argsort(yms)[::-1]
  grp_s = grp[srt]
  yms_s = yms[srt]
  num_s = nums[srt]

  # calculate the Tukey confidence intervals :
  for i,(g1, ym1, n1) in enumerate(zip(grp_s, yms_s, num_s)):
    for g2, ym2, n2 in zip(grp_s[i+1:][::-1], 
                           yms_s[i+1:][::-1], 
                           num_s[i+1:][::-1]):
      p_m = ym1 - ym2
      c   = qsturng(alpha, t, na - t) / sqrt(2) * sig * sqrt(1/n1 + 1/n2)
      pair_mean.append(p_m)
      pair.append(g1 + ' - ' + g2)
      hsd_ci.append(c)
  srt       = argsort(pair_mean)
  pair      = array(pair)[srt]
  pair_mean = array(pair_mean)[srt]
  hsd_ci    = array(hsd_ci)[srt]
  
  # calculate more statistics :
  SSB   = sum( nums * (yms - ymt)**2 )
  SSW   = sum((nums - 1) * ss**2)
  MSW   = SSW / (na - t)
  MSB   = SSB / (t - 1)
  f     = MSB / MSW
  p     = fdtrc((t - 1), (na - t), f) 

  # fit the data to the model :
  muhat = dot( dot( inv(dot(X.T, X)), X.T), y)
  yhat  = dot(X, muhat)
  resid = y - yhat
  
  vara = {'grp_means'  : yms,
          'grp_SDs'    : ss,
          'grp_SEMs'   : sems,
          'grp_names'  : grp,
          'grp_lens'   : nums,
          'grp_dof'    : t - 1,
          'dof'        : na - t,
          'F'          : f,
          'MSW'        : MSW,
          'MSB'        : MSB,
          'alpha'      : alpha,
          'pval'       : p,
          'pairs'      : pair,
          'pair_means' : pair_mean,
          'HSD_CIs'    : hsd_ci,
          'muhat'      : muhat,
          'yhat'       : yhat,
          'resid'      : resid}
  return vara
Example #16
def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    *args : array_like, sparse matrices
        sample1, sample2... The sample measurements should be given as
        arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args)**2
    square_of_sums_args = [s**2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
Example #17
def h2o_f_oneway(*args):
    """Performs a 1-way ANOVA.
    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------

    sample1, sample2, ... : array_like, H2OFrames, shape=(n_classes,)
        The sample measurements should be given as varargs (*args).
        A slice of the original input frame for each class in the
        target feature.

    Returns
    -------

    f : float
        The computed F-value of the test.

    prob : float
        The associated p-value from the F-distribution.

    Notes
    -----

    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (``scipy.stats.kruskal``) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.
    See ``scipy.stats.f_oneway`` and ``sklearn.feature_selection.f_oneway``.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    """
    n_classes = len(args)

    # sklearn converts everything to float here. Rather than do so,
    # we will test for total numericism and fail out if it's not 100%
    # numeric.
    if not all([all([X.isnumeric() for X in args])]):
        raise ValueError("All features must be entirely numeric for F-test")

    n_samples_per_class = [X.shape[0] for X in args]
    n_samples = np.sum(n_samples_per_class)

    # compute the sum of squared values in each column, and then compute the column
    # sums of all of those intermediate rows rbound together
    ss_alldata = rbind_all(*[X.apply(lambda x: (x * x).sum())
                             for X in args]).apply(lambda x: x.sum())

    # compute the sum of each column for each X in args, then rbind them all
    # and sum them up, finally squaring them. Tantamount to the squared sum
    # of each complete column. Note that we need to add a tiny fraction to ensure
    # all are real numbers for the rbind...
    sum_args = [X.apply(lambda x: x.sum() + 1e-12).asnumeric()
                for X in args]  # col sums
    square_of_sums_alldata = rbind_all(*sum_args).apply(lambda x: x.sum())
    square_of_sums_alldata *= square_of_sums_alldata

    square_of_sums_args = [s * s for s in sum_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)

    ssbn = None  # h2o frame
    for k, _ in enumerate(args):
        tmp = square_of_sums_args[k] / n_samples_per_class[k]
        ssbn = tmp if ssbn is None else (ssbn + tmp)

    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)

    constant_feature_idx = (msw == 0)
    constant_feature_sum = constant_feature_idx.sum()  # sum of ones
    nonzero_size = (msb != 0).sum()
    if nonzero_size != msb.shape[1] and constant_feature_sum:
        warnings.warn(
            "Features %s are constant." %
            np.arange(msw.shape[1])[constant_feature_idx], UserWarning)

    f = (msb / msw)

    # convert to numpy ndarray for special
    f = f.as_data_frame(use_pandas=True).iloc[0].values

    # compute prob
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
Example #18
    def durbin(*args):
        # taken verbatim from scipy.stats._support.abut
        def _abut(source, *args):
            source = np.asarray(source)
            if len(source.shape) == 1:
                width = 1
                source = np.resize(source, [source.shape[0], width])
            else:
                width = source.shape[1]
            for addon in args:
                if len(addon.shape) == 1:
                    width = 1
                    addon = np.resize(addon, [source.shape[0], width])
                else:
                    width = source.shape[1]
                if len(addon) < len(source):
                    addon = np.resize(addon, [source.shape[0], addon.shape[1]])
                elif len(addon) > len(source):
                    source = np.resize(source,
                                       [addon.shape[0], source.shape[1]])
                source = np.concatenate((source, addon), 1)
            return source

        # also taken from scipy.stats, but ignores everything under 0.
        def _rankposdata(a):
            a = np.ravel(a)
            b = np.argsort(a)
            a = a[b]
            n = len(a)
            dupcount = 0
            oldrank = -1
            sumranks = 0
            newarray = np.zeros(n, float)
            for i in range(n):
                if a[i] <= 0.:
                    newarray[b[i]] = 0.
                    continue
                oldrank += 1
                sumranks += oldrank
                dupcount += 1
                if i == n - 1 or a[i] != a[i + 1]:
                    averrank = float(sumranks) / float(dupcount) + 1
                    for j in range(i - dupcount + 1, i + 1):
                        newarray[b[j]] = averrank
                    sumranks = 0
                    dupcount = 0
            return newarray

        b = len(args)
        if b < 3:
            raise ValueError(
                'Less than 3 levels. Durbin test is not appropriate')
        k = len(args[0])
        for i in range(1, b):
            if len(args[i]) != k:
                raise ValueError('Unequal N in durbin. Aborting.')

        data = _abut(*args)
        data = data.astype(float)

        A = 0.
        t = data.shape[1]
        R = np.zeros(t, float)
        rs = np.zeros(t, int)
        for i in range(len(data)):
            data[i] = _rankposdata(data[i])
            for j in range(len(data[i])):
                A += pow(data[i, j], 2.)
                R[j] += data[i, j]
                if data[i, j] > 0.:
                    rs[j] += 1

        r = np.mean(rs)
        t = float(t)
        b = float(b)
        k = float(k)
        C = b * k * pow(k + 1, 2) / 4
        T1 = (t - 1) * sum([pow(x, 2) - r * C for x in R]) / (A - C)
        T2 = (T1 / (t - 1)) / ((b * k - b - T1) / (b * k - b - t + 1))

        print(data)
        print(R)
        print("r = %g, t = %g, b = %g, k = %g, C = %g, A = %g, T1 = %g" %
              (r, t, b, k, C, A, T1))

        return T2, fdtrc(k - 1, b * k - b - t + 1, T2)
Example #19
def glm(x,y,w=1.0):

  p,n    = shape(x)                    # sample size
  p     += 1                           # add one for intercept
  dof    = n - p                       # degrees of freedom
  
  sig    = var(y)                      # variance
  mu     = (y + mean(y))/2.0           # initial mean estimate
  eta    = log(mu)                     # initial predictor
  X      = vstack((ones(n), x)).T      # observed x-variable matrix

  # Newton-Raphson :
  converged = False
  rtol      = 1e-15
  dtol      = 1e-15
  lmbda     = 1.0
  nIter     = 0
  deviance  = 1
  D         = 1
  ahat      = zeros(p)   # initial parameters
  rel_res   = zeros(p)   # initial relative residual
  maxIter   = 65

  rel_a = []
  dev_a = []

  while not converged and nIter < maxIter:
    W       = diags(w*mu**2/sig, 0)         # compute weights
    z       = eta + (y - mu)/mu             # adjusted dependent variable

    WX      = W.dot(X)
    XTWX    = dot(X.T, WX)
    iXTWX   = inv(XTWX)
    Wz      = W.dot(z)

    ahat_n  = dot(iXTWX, dot(X.T, Wz))
    
    eta     = dot(X, ahat_n)               # compute estimates
    mu      = exp(eta)                     # linear predictor

    # calculate residual :
    rel_res  = norm(ahat - ahat_n, inf)
    rel_a.append(rel_res)
    ahat     = ahat_n

    D_n      = sum((y - mu)**2)
    deviance = abs(D_n - D)
    D        = D_n
    dev_a.append(deviance)
    
    if rel_res < rtol or deviance < dtol: converged = True
    nIter +=  1

    string = "Newton iteration %d: d (abs) = %.2e, (tol = %.2e) r (rel) = %.2e (tol = %.2e)"
    print string % (nIter, deviance, dtol, rel_res, rtol)
  
  # calculate statistics :
  varA   = diag(iXTWX)            # variance of alpha hat
  sea    = sqrt(varA)             # vector of standard errors for alpha hat
  t_a    = ahat / sea
  pval   = t.sf(abs(t_a), dof) * 2
  conf   = 0.95                      # 95% confidence interval
  tbonf  = t.ppf((1 - conf/p), dof)  # bonferroni corrected t-value
  ci     = tbonf*sea                 # confidence interval for ahat
  resid  = (y - mu)                  # 'working' residual
                                       
  RSS    = sum((y - mu)**2)            # residual sum of squares
  TSS    = sum((y - mean(y))**2)       # total sum of squares
  R2     = (TSS-RSS)/TSS               # R2
  F      = (TSS-RSS)/(p-1) * (n-p)/RSS # F-statistic
  F_p    = fdtrc(p-1, dof, F)          # F-Stat. p-value

  # log-likelihood :
  L      = sum((y*mu - mu**2/2)/(2*sig) - y**2/(2*sig) - 0.5*log(2*pi*sig))
  AIC    = (-2*L + 2*p)/n              # AIC statistic

  # estimated error variance :
  sighat = 1/(n-p) * RSS
                                        
  vara = { 'ahat'  : ahat,              
           'yhat'  : mu,                
           'sea'   : sea,               
           'ci'    : ci,                
           'dof'   : dof,               
           'resid' : resid,             
           'rel_a' : rel_a,
           'dev_a' : dev_a,
           'R2'    : R2,
           'F'     : F,
           'AIC'   : AIC,
           'sighat': sighat}
  return vara
Example #20
def h2o_f_oneway(*args):
    """Performs a 1-way ANOVA.
    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------

    sample1, sample2, ... : array_like, H2OFrames, shape=(n_classes,)
        The sample measurements should be given as varargs (*args).
        A slice of the original input frame for each class in the
        target feature.

    Returns
    -------

    f : float
        The computed F-value of the test.

    prob : float
        The associated p-value from the F-distribution.

    Notes
    -----

    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (``scipy.stats.kruskal``) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.
    See ``scipy.stats.f_oneway`` and ``sklearn.feature_selection.f_oneway``.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    """
    n_classes = len(args)

    # sklearn converts everything to float here. Rather than do so,
    # we will test for total numericism and fail out if it's not 100%
    # numeric.
    if not all([all([X.isnumeric() for X in args])]):
        raise ValueError("All features must be entirely numeric for F-test")

    n_samples_per_class = [X.shape[0] for X in args]
    n_samples = np.sum(n_samples_per_class)

    # compute the sum of squared values in each column, and then compute the column
    # sums of all of those intermediate rows rbound together
    ss_alldata = rbind_all(*[X.apply(lambda x: (x*x).sum()) for X in args]).apply(lambda x: x.sum())

    # compute the sum of each column for each X in args, then rbind them all
    # and sum them up, finally squaring them. Tantamount to the squared sum
    # of each complete column. Note that we need to add a tiny fraction to ensure
    # all are real numbers for the rbind...
    sum_args = [X.apply(lambda x: x.sum() + 1e-12).asnumeric() for X in args]  # col sums
    square_of_sums_alldata = rbind_all(*sum_args).apply(lambda x: x.sum())
    square_of_sums_alldata *= square_of_sums_alldata

    square_of_sums_args = [s*s for s in sum_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)

    ssbn = None  # h2o frame
    for k, _ in enumerate(args):
        tmp = square_of_sums_args[k] / n_samples_per_class[k]
        ssbn = tmp if ssbn is None else (ssbn + tmp)

    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)

    constant_feature_idx = (msw == 0)
    constant_feature_sum = constant_feature_idx.sum()  # sum of ones
    nonzero_size = (msb != 0).sum()
    if nonzero_size != msb.shape[1] and constant_feature_sum:
        warnings.warn("Features %s are constant." % np.arange(msw.shape[1])[constant_feature_idx], UserWarning)

    f = (msb / msw)

    # convert to numpy ndarray for special
    f = f.as_data_frame(use_pandas=True).iloc[0].values

    # compute prob
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
Example #21
def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    sample1, sample2, ... : array_like, sparse matrices
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
def polynomtest(x, y, degree=2):

    y_prime = y - y.mean()
    ss_tot = np.sum(y_prime**2)
    print('ss_tot: ' + str(ss_tot))

    ### first linear regression
    b_linear, rsq_linear = linreg.linreg(x, y)
    df_linear = len(y) - 2
    ypred_linear = b_linear[1] * x + b_linear[0]
    error_linear = y - ypred_linear
    error_ss_linear = np.sum(error_linear**2)
    rsq_linear2 = 1. - (error_ss_linear / ss_tot)
    df_linear = len(y) - 2
    print('df_linear: ' + str(df_linear))
    print('b_linear: ' + str(b_linear))
    print('rsq_linear: ' + str(rsq_linear))
    print('rsq_linear2: ' + str(rsq_linear2))
    print('error_ss_linear: ' + str(error_ss_linear))
    print('')

    if degree <= 1:
        return b_linear
    else:
        ### next quadratic
        quadratic_b = np.polyfit(x, y, 2)
        ypred_quadratic = np.polyval(quadratic_b, x)
        error_quadratic = y - ypred_quadratic
        error_ss_quadratic = np.sum(error_quadratic**2)
        rsq_quadratic = 1. - (error_ss_quadratic / ss_tot)
        df_quadratic = len(y) - 3
        fstat2_quadratic = (error_ss_quadratic - error_ss_linear) / (
            error_ss_quadratic / df_quadratic)
        f_stat_quadratic = df_quadratic * (rsq_quadratic -
                                           rsq_linear) / (1. - rsq_quadratic)
        p_quadratic = special.fdtrc(2, df_quadratic, f_stat_quadratic)

        print('df_quadratic: ' + str(df_quadratic))
        print('quadratic_b: ' + str(quadratic_b))
        print('rsq_quadratic: ' + str(rsq_quadratic))
        print('error_ss_quadratic: ' + str(error_ss_quadratic))
        print('fstat2_quadratic: ' + str(fstat2_quadratic))
        print('f_stat_quadratic: ' + str(f_stat_quadratic))
        print('p_quadratic: ' + str(p_quadratic))
        print('')

        if degree <= 2:
            return quadratic_b
        else:
            ### next cubic
            cubic_b = np.polyfit(x, y, 3)
            ypred_cubic = np.polyval(cubic_b, x)
            error_cubic = y - ypred_cubic
            error_ss_cubic = np.sum(error_cubic**2)
            rsq_cubic = 1. - (error_ss_cubic / ss_tot)
            df_cubic = len(y) - 4
            f_stat_cubic = df_cubic * (rsq_cubic - rsq_quadratic) / (1. -
                                                                     rsq_cubic)
            p_cubic = special.fdtrc(2, df_cubic, f_stat_cubic)
            print('cubic_b: ' + str(cubic_b))
            print('rsq_cubic: ' + str(rsq_cubic))
            print('f_stat_cubic: ' + str(f_stat_cubic))
            print('p_cubic: ' + str(p_cubic))

            return cubic_b
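A usage sketch, assuming numpy as np, special from scipy, and the project's linreg module (whose linreg.linreg(x, y) helper is taken here to return straight-line coefficients and an R^2 value) are importable.

import numpy as np

x = np.linspace(-3.0, 3.0, 60)
y = 1.0 + 2.0 * x + 0.5 * x**2 + np.random.normal(0.0, 0.2, size=60)

coeffs = polynomtest(x, y, degree=2)   # prints the fit diagnostics above
print(coeffs)                          # np.polyfit coefficients, highest power first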