def _call(self, dataset, labels=None):
    # This code is based on SciPy's stats.f_oneway()
    # Copyright (c) Gary Strangman.  All rights reserved
    # License: BSD
    #
    # However, it got tweaked and optimized to better fit into PyMVPA.

    # number of groups
    if labels is None:
        labels = dataset.targets

    ul = np.unique(labels)

    na = len(ul)
    bign = float(dataset.nsamples)
    alldata = dataset.samples

    # total squares of sums
    sostot = np.sum(alldata, axis=0)
    sostot *= sostot
    sostot /= bign

    # total sum of squares
    sstot = np.sum(alldata * alldata, axis=0) - sostot

    # between group sum of squares
    ssbn = 0
    for l in ul:
        # all samples for the respective label
        d = alldata[labels == l]
        sos = np.sum(d, axis=0)
        sos *= sos
        ssbn += sos / float(len(d))

    ssbn -= sostot
    # within
    sswn = sstot - ssbn

    # degrees of freedom
    dfbn = na - 1
    dfwn = bign - na

    # mean sums of squares
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # assure no NaNs -- otherwise it leads instead of
    # sane unittest failure (check of NaNs) to crazy
    #   File "mtrand.pyx", line 1661, in mtrand.shuffle
    #   TypeError: object of type 'numpy.int64' has no len()
    # without any sane backtrace
    f[np.isnan(f)] = 0

    if externals.exists('scipy'):
        from scipy.stats import fprob
        return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
    else:
        return Dataset(f[np.newaxis])
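# Note: every snippet in this file leans on scipy.stats.fprob(dfnum, dfden, F),
# the upper-tail probability (survival function) of the F distribution, which
# has been removed from modern SciPy. A minimal compatibility shim for running
# these snippets today (an addition, not part of any original source):

from scipy.stats import f as f_distribution

def fprob(dfnum, dfden, F):
    # Equivalent of the removed scipy.stats.fprob.
    return f_distribution.sf(F, dfnum, dfden)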
def if_classif(X_y, n_features):
    """Compute the ANOVA F-value for the provided samples.

    Parameters
    ----------
    X_y : iterable of tuples (X, y), with
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The set of regressors that will be tested sequentially.
        y : array of shape (n_samples,)
            The target labels.

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values.
    pval : array, shape = [n_features,]
        The set of p-values.
    """
    n_samples = 0
    n_samples_per_class = defaultdict(lambda: 0)
    sums_args_d = defaultdict(lambda: np.zeros(shape=(n_features)))
    ss_alldata = np.zeros(shape=(n_features))

    for X, y in X_y:
        if (n_samples % 100) == 0:
            logger.info("Processing doc #%d..." % n_samples)
        n_samples += 1
        n_samples_per_class[y] += 1
        ss_alldata[:] += X[:] ** 2
        sums_args_d[y][:] += X[:]

    n_classes = len(sums_args_d.keys())

    # Convert dictionary to numpy array
    sums_args = np.array(list(row for row in sums_args_d.itervalues()))

    square_of_sums_alldata = safe_sqr(reduce(lambda x, y: x + y, sums_args))
    square_of_sums_args = [safe_sqr(s) for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, y in enumerate(n_samples_per_class.keys()):
        ssbn += square_of_sums_args[k] / n_samples_per_class[y]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
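# Usage sketch for if_classif above, assuming a streamed iterable of
# (feature_vector, label) pairs; load_documents and vocabulary_size are
# hypothetical names, not part of the original source:
#
#     stream = ((np.asarray(vec), label) for vec, label in load_documents())
#     F, pval = if_classif(stream, n_features=vocabulary_size)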
def _call(self, dataset):
    # This code is based on SciPy's stats.f_oneway()
    # Copyright (c) Gary Strangman.  All rights reserved
    # License: BSD
    #
    # However, it got tweaked and optimized to better fit into PyMVPA.

    # number of groups
    targets_sa = dataset.sa[self.get_space()]
    labels = targets_sa.value
    ul = targets_sa.unique

    na = len(ul)
    bign = float(dataset.nsamples)
    alldata = dataset.samples

    # total squares of sums
    sostot = np.sum(alldata, axis=0)
    sostot *= sostot
    sostot /= bign

    # total sum of squares
    sstot = np.sum(alldata * alldata, axis=0) - sostot

    # between group sum of squares
    ssbn = 0
    for l in ul:
        # all samples for the respective label
        d = alldata[labels == l]
        sos = np.sum(d, axis=0)
        sos *= sos
        ssbn += sos / float(len(d))

    ssbn -= sostot
    # within
    sswn = sstot - ssbn

    # degrees of freedom
    dfbn = na - 1
    dfwn = bign - na

    # mean sums of squares
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # assure no NaNs -- otherwise it leads instead of
    # sane unittest failure (check of NaNs) to crazy
    #   File "mtrand.pyx", line 1661, in mtrand.shuffle
    #   TypeError: object of type 'numpy.int64' has no len()
    # without any sane backtrace
    f[np.isnan(f)] = 0

    if externals.exists('scipy'):
        from scipy.stats import fprob
        return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
    else:
        return Dataset(f[np.newaxis])
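# Usage sketch for the measure above in PyMVPA terms (a sketch; assumes a
# PyMVPA dataset with targets, and that OneWayAnova is the class this _call
# belongs to):
#
#     from mvpa2.measures.anova import OneWayAnova
#     f_ds = OneWayAnova()(dataset)   # one F value per feature
#     # with SciPy installed, f_ds.fa['fprob'] holds per-feature p-values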
def f_oneway_repeated_measures(M):
    """Calculate One-Way ANOVA for repeated measures.

    Models the difference between 'subjects' as random effect.

    Example code from Roger Lew:
    ---------------
    import numpy as np
    from scipy.stats import fprob

    # M contains subjects as rows and conditions as columns
    M = np.array([[21, 22, 8, 6, 6],
                  [20, 19, 10, 4, 4],
                  [17, 15, 5, 4, 5],
                  [25, 30, 13, 12, 17],
                  [30, 27, 13, 8, 6],
                  [19, 27, 8, 7, 4],
                  [26, 16, 5, 2, 5],
                  [17, 18, 8, 1, 5],
                  [26, 24, 14, 8, 9]], dtype='float')

    mu = np.mean(M)
    SS_total = np.sum([[(v - mu) ** 2 for v in row] for row in M])
    SS_subjects = np.sum([M.shape[1] * (np.mean(row) - mu) ** 2 for row in M])
    SS_conditions = np.sum([M.shape[0] * (np.mean(row) - mu) ** 2 for row in M.T])
    SS_error = SS_total - SS_subjects - SS_conditions

    df_total = M.size - 1
    df_conditions = M.shape[1] - 1
    df_subjects = M.shape[0] - 1
    df_error = df_total - df_subjects - df_conditions

    F = (SS_conditions / df_conditions) / (SS_error / df_error)
    p = fprob(df_conditions, df_error, F)
    print F, p
    ------------------

    :Parameters:
      M : array-like, 2d
        The array containing the data, 2d, subjects as rows, conditions
        as columns.
    """
    mu = np.mean(M)
    SS_total = np.sum([[(v - mu) ** 2 for v in row] for row in M])
    SS_subjects = np.sum([M.shape[1] * (np.mean(row) - mu) ** 2 for row in M])
    SS_conditions = np.sum([M.shape[0] * (np.mean(row) - mu) ** 2 for row in M.T])
    SS_error = SS_total - SS_subjects - SS_conditions

    df_total = M.size - 1
    df_conditions = M.shape[1] - 1
    df_subjects = M.shape[0] - 1
    df_error = df_total - df_subjects - df_conditions

    F = (SS_conditions / df_conditions) / (SS_error / df_error)
    p = fprob(df_conditions, df_error, F)
    return F, p
def __call__(self, dataset, labels=None):
    """Actually calculate the p-values."""
    f = OneWayAnova()(dataset)

    # number of groups
    if labels is None:
        labels = dataset.labels

    # Calculate degrees of freedom
    ul = np.unique(labels)
    na = len(ul)
    bign = float(dataset.nsamples)
    dfbn = na - 1
    dfwn = bign - na

    # Now probabilities
    ps = fprob(dfbn, dfwn, f)

    return ps
def compare_models(c1, c2):
    """
    Compare whether classification model c1 is significantly better than
    model c2. The comparison is based on the F-test; the p-value is
    returned.

    :param c1, c2: linear regression model objects.
    :type c1, c2: :class:`LinearRegression`
    """
    if c1 is None or c2 is None:
        return 1.0

    p1, p2, n = c1.m, c2.m, c1.n
    RSS1, RSS2 = c1.sse, c2.sse

    if RSS1 <= RSS2 or p2 <= p1 or n <= p2 or RSS2 <= 0:
        return 1.0

    F = ((RSS1 - RSS2) / (p2 - p1)) / (RSS2 / (n - p2))
    return stats.fprob(int(p2 - p1), int(n - p2), F)
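# compare_models only needs objects exposing m (number of parameters),
# n (number of samples) and sse (residual sum of squares); a hypothetical
# stand-in for illustration (not the real LinearRegression class), assuming
# stats.fprob is available:

from collections import namedtuple

_Model = namedtuple('_Model', 'm n sse')
simple = _Model(m=2, n=100, sse=40.0)
rich = _Model(m=4, n=100, sse=30.0)
# a small return value favors the richer (alternative) model
p_better = compare_models(simple, rich)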
def __init__(self, *args):
    super(ANOVA, self).__init__()
    samples = [np.asarray(x) for x in args]
    all_samples = np.concatenate(samples)
    self.grand_mean = np.mean(all_samples)

    self.sst = np.sum([(x - self.grand_mean) ** 2 for x in all_samples])
    self.ssb = np.sum(
        [len(x) * (np.mean(x) - self.grand_mean) ** 2 for x in samples])
    self.ssw = self.sst - self.ssb

    self.N = len(all_samples)
    self.k = len(samples)
    self.ssbdf = self.k - 1
    self.sswdf = self.N - self.k

    self.mssb = self.ssb / self.ssbdf
    self.mssw = self.ssw / self.sswdf
    self.fstat = self.mssb / self.mssw
    self.pvalue = fprob(self.ssbdf, self.sswdf, self.fstat)
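# Usage sketch for the ANOVA class above (assumes its base class and an
# fprob implementation, e.g. the shim near the top of this file, are in
# scope; the sample data is made up):

groups = ([4.1, 3.9, 4.3, 4.0],
          [5.0, 5.2, 4.8, 5.1],
          [3.2, 3.5, 3.1, 3.4])
result = ANOVA(*groups)
# result.fstat is the between/within mean-square ratio,
# result.pvalue its upper-tail F probability.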
def repeated_oneway(data):
    n = data.shape[0]
    k = data.shape[1]
    grand_mean = np.mean(data)
    measurement_mean = np.mean(data, axis=0)
    subject_mean = np.mean(data, axis=1)

    ssb = n * st.ss(measurement_mean - grand_mean)
    # ssw = st.ss(data-measurement_mean)
    ssw = np.sum(st.ss(data - measurement_mean))
    sss = k * st.ss(subject_mean - grand_mean)
    sse = ssw - sss

    dfb = k - 1
    dfe = (n - 1) * (k - 1)
    msb = ssb / float(dfb)
    mse = sse / float(dfe)
    f = msb / mse
    p = st.fprob(dfb, dfe, f)
    return f, p
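# Cross-check sketch: repeated_oneway and f_oneway_repeated_measures above
# implement the same repeated-measures decomposition, so on the Roger Lew
# matrix from the docstring they should report the same F and p. Assumes
# numpy as np, an old scipy.stats imported as st (st.ss and st.fprob were
# later removed from SciPy), and fprob in scope:

M = np.array([[21, 22, 8, 6, 6],
              [20, 19, 10, 4, 4],
              [17, 15, 5, 4, 5],
              [25, 30, 13, 12, 17],
              [30, 27, 13, 8, 6],
              [19, 27, 8, 7, 4],
              [26, 16, 5, 2, 5],
              [17, 18, 8, 1, 5],
              [26, 24, 14, 8, 9]], dtype='float')
F1, p1 = repeated_oneway(M)
F2, p2 = f_oneway_repeated_measures(M)
# F1 == F2 and p1 == p2 up to floating-point rounding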
def _f_oneway(*args):
    """
    Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in
    order for the associated p-value to be valid.

    1. The samples are independent.
    2. Each sample is from a normally distributed population.
    3. The population standard deviations of the groups are all equal.
       This property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still
    be possible to use the Kruskal-Wallis H-test (`stats.kruskal`_),
    although with some loss of power.

    The algorithm is from Heiman [2], pp. 394-7.

    See scipy.stats.f_oneway, which should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html
    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    """
    n_classes = len(args)
    n_samples_per_class = np.array([len(a) for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = reduce(lambda x, y: x + y,
                        [np.sum(a ** 2, axis=0) for a in args])
    sums_args = [np.sum(a, axis=0) for a in args]
    square_of_sums_alldata = reduce(lambda x, y: x + y, sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like, sparse matrices
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in
    order for the associated p-value to be valid.

    1. The samples are independent.
    2. Each sample is from a normally distributed population.
    3. The population standard deviations of the groups are all equal.
       This property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still
    be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_),
    although with some loss of power.

    The algorithm is from Heiman [2], pp. 394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html
    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    """
    n_classes = len(args)
    args = [safe_asarray(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = reduce(lambda x, y: x + y,
                        [safe_sqr(a).sum(axis=0) for a in args])
    sums_args = [a.sum(axis=0) for a in args]
    square_of_sums_alldata = safe_sqr(reduce(lambda x, y: x + y, sums_args))
    square_of_sums_args = [safe_sqr(s) for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like, sparse matrices
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in
    order for the associated p-value to be valid.

    1. The samples are independent.
    2. Each sample is from a normally distributed population.
    3. The population standard deviations of the groups are all equal.
       This property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still
    be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_),
    although with some loss of power.

    The algorithm is from Heiman [2], pp. 394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html
    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
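# Sanity-check sketch for the f_oneway above: on dense inputs it should
# agree with scipy.stats.f_oneway column by column (assumes its helpers
# as_float_array/safe_sqr are importable, e.g. from sklearn.utils, and that
# stats.fprob is available):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
a = rng.randn(10, 3)
b = rng.randn(12, 3) + 0.5
c = rng.randn(8, 3)

F, p = f_oneway(a, b, c)
for j in range(3):
    F_ref, p_ref = stats.f_oneway(a[:, j], b[:, j], c[:, j])
    assert np.allclose(F[j], F_ref) and np.allclose(p[j], p_ref)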
def ftest(Ernull, Eralt):
    """
    Extra sum-of-squares F test for two groups.

    This function allows comparing two NESTED models using the F test.
    The extra sum-of-squares F test is based on the difference between the
    sums of squares (residuals) of the two models. It also takes into
    account the number of data points and the number of parameters of each
    model (penalizing the more complicated model). It uses this information
    to compute the F ratio, from which it calculates a P value. If the
    simpler model (fewer parameters) is "better", the P value is greater
    than 0.05 (5%); otherwise, if the more complicated model is "better",
    the P value will be less than 0.05.

    Parameters
    ----------
    Ernull : array_like
        Residuals from the simpler model (null hypothesis).
    Eralt : array_like
        Residuals from the more complicated model (alternative hypothesis).

    Returns
    -------
    fRatio : float
        F test from the 2 groups.
    pValue : float
        P value from the F dist.

    References
    ----------
    .. [1] H.J. Motulsky and A. Christopoulos, Fitting Models to Biological
           Data using Linear and Nonlinear Regression: A practical guide to
           curve fitting. 2003, GraphPad Software Inc., San Diego CA,
           www.graphpad.com
    .. [2] http://vassarstats.net/textbook/ch14pt1.html
    """
    sAnull = sum(Ernull)
    Nnull = len(Ernull)
    Nalt = len(Eralt)
    sAalt = sum(Eralt)
    s2Anull = sum(Ernull ** 2)
    s2Aalt = sum(Eralt ** 2)
    Mnull = mean(Ernull)  # Mean of group 1.
    Malt = mean(Eralt)  # Mean of group 2.
    SSnull = s2Anull - (sAnull ** 2) / Nnull
    SSalt = s2Aalt - (sAalt ** 2) / Nalt
    Mt = mean([Mnull, Malt])
    Nt = Nnull + Nalt

    SSwg = SSnull + SSalt  # Variability that exists inside the 2 groups.
    SSbg = Nnull * (Mnull - Mt) ** 2 + Nalt * (Malt - Mt) ** 2  # measure of
    # the aggregate differences among the means of the 2 groups.

    dfbg = 1
    dfwg = (Nnull - 1) + (Nalt - 1)  # Degrees of freedom of the 2 groups.
    dft = Nt - 2  # Number of degrees of freedom for the entire data.

    MSbg = SSbg / dfbg
    MSwg = SSwg / dfwg

    fRatio = MSbg / MSwg  # F Ratio.
    # The F distribution is parameterized by degrees of freedom, not by
    # sums of squares.
    pValue = fprob(dfbg, dfwg, fRatio)

    return fRatio, pValue
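# Usage sketch for ftest with residuals from a pair of nested fits; the
# model names below are hypothetical:
#
#     resid_null = y - fit_null(x)    # simpler model
#     resid_alt = y - fit_alt(x)      # more parameters
#     fRatio, pValue = ftest(np.asarray(resid_null), np.asarray(resid_alt))
#     # pValue < 0.05 favors the more complicated model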
def err(p, fp_ref):
    # print p, fp_ref
    return abs(fp_ref - fprob(df1, df2, p[0]))