def __init__(self, X):
    """
    Prepare the fitter for a balanced model.

    Parameters
    ----------
    X : model
        Model which will be fitted to the data; converted with ``asmodel``.

    Raises
    ------
    NotImplementedError
        If the model is unbalanced, or if it contains random effects but
        is not fully specified (``df_error != 0``).
    ValueError
        If the module-level ``_lmf_lsq`` switch is neither 0 nor 1.
    """
    mdl = asmodel(X)
    self.X = mdl
    self.n_cases = len(mdl)
    if not isbalanced(mdl):
        raise NotImplementedError("Unbalanced models")

    self.X_ = mdl.full

    # a model without error df is treated as fully specified
    is_full = mdl.df_error == 0
    self.full_model = is_full
    if is_full:
        self.E_MS = hopkins_ems(mdl)
    elif hasrandom(mdl):
        raise NotImplementedError(
            "Models containing random effects need to be fully "
            "specified.")

    self.max_len = int(2 ** _max_array_size // mdl.df ** 2)

    # _lmf_lsq == 0 needs no extra preparation
    if _lmf_lsq == 1:
        self.Xsinv = mdl.Xsinv
    elif _lmf_lsq != 0:
        raise ValueError("version")
def __init__(self, Y, X, sub=None, v=False, lsq=0, title=None):
    """
    Fit the model X to Y and store the sums of squares (fixed effects
    only).

    Parameters
    ----------
    Y : var
        Dependent variable.
    X : model
        Model.
    sub : None | index
        If not None, both Y and X are indexed with ``sub`` before fitting.
    v : bool
        Not referenced in this constructor.
    lsq : int
        ``1`` uses the module's own ``_leastsq``; any other value uses
        ``np.linalg.lstsq``.
    title : None | str
        Stored as ``self.title``.
    """
    self.title = title
    self.results = {}  # filled in later

    # input preparation
    Y = asvar(Y)
    X = asmodel(X)  # .sorted()
    if sub is not None:
        Y, X = Y[sub], X[sub]

    assert Y.N == X.N
    assert X.df_error > 0

    if lsq == 1:
        # own least squares estimate
        beta = _leastsq(Y.x, X.full)
        self.values = beta * X.full
        Y_est = self.values.sum(1)
        self._residuals = Y.x - Y_est
        SS_res = np.sum(self._residuals ** 2)
        if Y.mu != Y_est.mean():
            logging.warning("Y.mu=%s != Y_est.mean()=%s" % (Y.mu, Y_est.mean()))
    else:
        # numpy; the residual array holds exactly one element when a
        # residual sum was computed, otherwise 0 is used
        beta, residues, rank, s = np.linalg.lstsq(X.full, Y.x)
        SS_res = residues[0] if len(residues) == 1 else 0

    # total
    self.SS_total = np.sum((Y.x - Y.mu) ** 2)
    self.df_total = X.df_total
    self.MS_total = self.SS_total / self.df_total

    # residuals
    self.SS_res = SS_res
    self.df_res = X.df_error
    self.MS_res = SS_res / self.df_res

    # explained: total minus residual
    self.SS_model = self.SS_total - SS_res
    self.df_model = X.df
    self.MS_model = self.SS_model / self.df_model

    # keep the inputs and the estimate
    self.Y = Y
    self.X = X
    self.beta = beta
def ancova(Y, factorial_model, covariate, interaction=None, sub=None, v=True,
           empty=True, ems=None):
    """
    OBSOLETE

    Fit and print a sequence of nested linear models to assess a covariate.

    Model 1 is the factorial model alone, model 2 adds the covariate, and
    (if ``interaction`` is given) model 3 adds ``covariate.__div__(interaction)``
    to the factorial model.  Model comparisons are printed with
    ``comparelm``.  Nothing is returned.

    Parameters
    ----------
    Y :
        Dependent variable.
    factorial_model :
        Factorial model (converted with ``asmodel``).
    covariate : var
        Covariate; must satisfy ``isvar``.
    interaction : None | factor
        Term from the factorial model to check for interaction with the
        covariate.
    sub : None | index
        If not None, Y, the model, the covariate and the interaction term
        are all indexed with ``sub`` first.
    v : bool
        Also print the ANOVA table of each model.
    empty, ems :
        Passed on to ``.anova()`` for models 1 and 2.

    Based on
    --------
    Exercise to STATISTICS: AN INTRODUCTION USING R
    http://www.bio.ic.ac.uk/research/crawley/statistics/exercises/R6Ancova.pdf
    """
    assert isvar(covariate)
    anova_kwargs = {"empty": empty, "ems": ems}

    # identity tests: ``!= None`` compares elementwise for array-like
    # indexes and the result cannot be used as a condition
    if sub is not None:
        Y = Y[sub]
        factorial_model = factorial_model[sub]
        covariate = covariate[sub]
        if interaction is not None:
            interaction = interaction[sub]

    factorial_model = asmodel(factorial_model)

    # model 1: factorial model only
    a1 = lm(Y, factorial_model)
    if v:
        print(a1.anova(title="MODEL 1", **anova_kwargs))
        print("\n")

    # model 2: factorial model + covariate main effect
    a2 = lm(Y, factorial_model + covariate)
    if v:
        print(a2.anova(title="MODEL 2: Main Effect Covariate", **anova_kwargs))
        print("\n")
    print('Model with "%s" Covariate > without Covariate' % covariate.name)
    print(comparelm(a1, a2))

    if interaction is not None:
        logging.debug("%s / %s", covariate.name, interaction.name)
        logging.debug("%s", covariate.__div__)
        # explicit __div__ call kept from the original (covariate / interaction)
        i_effect = covariate.__div__(interaction)

        # model 3: factorial model + covariate nested in the interaction term
        a3 = lm(Y, factorial_model + i_effect)
        if v:
            print("\n")
            # NOTE(review): unlike models 1 and 2, anova_kwargs are not
            # passed here -- confirm whether that is intended
            print(a3.anova(title="MODEL 3: Interaction"))

        # compare
        print('\n"%s"x"%s" Interaction > No Covariate:'
              % (covariate.name, interaction.name))
        print(comparelm(a1, a3))
        print('\n"%s"x"%s" Interaction > Main Effect:'
              % (covariate.name, interaction.name))
        print(comparelm(a2, a3))
def Ysub(self, effects):
    """
    Return a copy of ``self.Y.x`` with the contribution of ``effects``
    removed.

    For every effect in ``asmodel(effects)``, the row-wise sum of the
    columns of ``self.values`` selected by ``self.indexes[effect.name]``
    is subtracted.  ``self.Y.x`` itself is left untouched.
    """
    # FIXME: Ysub
    remaining = deepcopy(self.Y.x)
    for effect in asmodel(effects).effects:
        columns = self.indexes[effect.name]
        remaining -= self.values[:, columns].sum(1)
    return remaining
def __init__(self, X, v=False, title=False):
    """
    Precompute the matrices needed to fit the model X.

    Parameters
    ----------
    X : model
        Model; converted with ``asmodel``.
    v : bool
        Not referenced in this constructor.
    title :
        Stored as ``self.title``.
    """
    self.title = title
    self.results = {}  # filled in later

    model_ = asmodel(X)
    self.X = model_

    # inverse of the full model matrix (matrix ``.I``, returned as array)
    design = model_.full
    self.Xinv = np.matrix(design).I.A

    # (X'X)^-1 X' -- alternative still used for map
    gram_inv = np.matrix(np.dot(design.T, design)).I.A
    self.Xsinv = np.dot(gram_inv, design.T)

    # E(MS) table and residual degrees of freedom
    self.E_ms = _hopkins_ems(model_)
    self.df_res = model_.df_error
def __init__(self, Y='MEG', X='condition', sub=None, ds=None, p=.05,
             contours={.01: '.5', .001: '0'}):
    """
    Compute one p-map per effect yielded by the fitter.

    Parameters
    ----------
    Y : ndvar
        Dependent variable (resolved with ``asndvar``).
    X : model
        Model (resolved with ``asmodel``).
    sub, ds :
        Passed on to ``asndvar`` and ``asmodel``.
    p, contours :
        Passed on to ``_cs.get_sig`` to build the colorspace stored in the
        properties of each p-map.
    """
    self.name = "anova"
    Y = asndvar(Y, sub=sub, ds=ds)
    self.Y = Y
    X = asmodel(X, sub=sub, ds=ds)
    self.X = X

    fitter = _glm.lm_fitter(X)

    # every p-map shares Y's properties plus a significance colorspace
    properties = Y.properties.copy()
    properties['colorspace'] = _cs.get_sig(p=p, contours=contours)
    ndvar_kwargs = dict(dims=Y.dims[1:], properties=properties)

    self.all = []
    self.p_maps = {}
    for effect, _, p_values in fitter.map(Y.x):
        p_map = ndvar(p_values, name=effect.name, **ndvar_kwargs)
        self.all.append(p_map)
        self.p_maps[effect] = p_map
def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, lsq=0, showall=False):
    """
    Build the ANOVA results table for the linear model.

    Mixed effects models require full model specification so that E(MS)
    can be estimated according to Hopkins (1976)

    Random effects: If the model is fully specified, a Hopkins E(MS) table
    is used to determine error terms in the mixed effects model. Otherwise,
    random factors are treated as fixed factors.

    The results are stored in ``self._results_table`` (rows of
    ``(name, SS, df, MS, F, p)``) and ``self._test_table``; messages about
    skipped effects are collected in ``self._log``.

    Parameters
    ----------
    Y :
        Dependent variable (converted with ``asvar``).
    X :
        Model (converted with ``asmodel``).
    sub : None | index
        If not None, Y and X are indexed with ``sub`` first.
    title :
        Stored as ``self.title``.
    empty : bool
        include rows without F-Tests (True/False); not referenced in this
        constructor.
    ems : bool | None
        display source of E(MS) for F-Tests (True/False; None = use
        default); stored as ``self.show_ems``.
    lsq : int
        least square fitter, passed on to ``lm``:
        = 0 -> numpy.linalg.lstsq
        = 1 -> after Fox
    showall : bool
        show SS, df and MS for effects without F test

    Raises
    ------
    ValueError
        If ``X.df_error`` is negative.

    TODO
    ----
    - sort model
    - reuse lms which are used repeatedly
    - provide threshold for including interaction effects when testing
      lower level effects

    Problem with unbalanced models
    ------------------------------
    - The SS of Effects which do not include the between-subject factor
      are higher than in SPSS
    - The SS of effects which include the between-subject factor agree
      with SPSS
    """
    # prepare kwargs
    Y = asvar(Y)
    X = asmodel(X)
    if sub is not None:
        Y = Y[sub]
        X = X[sub]
    assert Y.N == X.N

    # save args
    self.Y = Y
    self.X = X
    self.title = title
    self.show_ems = ems
    self._log = []

    # decide which E(MS) model to use:
    # no error df -> mixed effects (rfx); positive error df -> fixed effects
    if X.df_error == 0:
        rfx = 1
        fx_desc = "Mixed"
    elif X.df_error > 0:
        rfx = 0
        fx_desc = "Fixed"
    else:
        raise ValueError("Model Overdetermined")
    self._log.append("%s effects model" % fx_desc)
    if lsq == 1:
        self._log.append("(my lsq)")
    elif lsq == 0:
        self._log.append("\n (np lsq)")

    # create testing table:
    # list of (effect, lm1, lm0, MS_e, df_e)
    test_table = []
    #
    # list of (name, SS, df, MS, F, p)
    results_table = []

    if len(X.effects) == 1:
        # a single effect is tested directly against the residuals
        self._log.append("single factor model")
        lm0 = lm(Y, X, lsq=lsq)
        SS = lm0.SS_model
        df = lm0.df_model
        MS = lm0.MS_model
        F, p = lm0.F_test()
        results_table.append((X.name, SS, df, MS, F, p))
        results_table.append(("Residuals", lm0.SS_res, lm0.df_res,
                              lm0.MS_res, None, None))
    else:
        if not rfx:
            # fixed effects: the residuals of the full model serve as the
            # error term for every effect (MS_e / df_e stay constant below)
            full_lm = lm(Y, X, lsq=lsq)
            SS_e = full_lm.SS_res
            MS_e = full_lm.MS_res
            df_e = full_lm.df_res

        for e_test in X.effects:
            skip = False  # becomes a reason string if the effect is skipped
            name = e_test.name

            # find model 0: all other effects except those that are of
            # higher order relative to e_test
            effects = []
            excluded_e = []
            for e in X.effects:
                # determine whether e_test
                if e is e_test:
                    pass
                else:
                    if _is_higher_order(e, e_test):
                        excluded_e.append(e)
                    else:
                        effects.append(e)

            model0 = model(*effects)
            if e_test.df > model0.df_error:
                skip = "overspecified"
            else:
                lm0 = lm(Y, model0, lsq=lsq)

                # find model 1: model 0 plus the tested effect; lm1 is None
                # when model 1 leaves no error df
                effects.append(e_test)
                model1 = model(*effects)
                if model1.df_error > 0:
                    lm1 = lm(Y, model1, lsq=lsq)
                else:
                    lm1 = None

                if rfx:
                    # find E(MS): effects whose factors are all either part
                    # of e_test or random, and which contain (or are nested
                    # in) every factor of e_test
                    EMS_effects = []
                    for e in X.effects:
                        if e is e_test:
                            pass
                        elif all([(f in e_test or f.random) for f in e.factors]):
                            if all([(f in e or e.nestedin(f)) for f in e_test.factors]):
                                EMS_effects.append(e)

                    if len(EMS_effects) > 0:
                        # the model formed by the E(MS) effects provides the
                        # error term for this effect
                        lm_EMS = lm(Y, model(*EMS_effects), lsq=lsq)
                        MS_e = lm_EMS.MS_model
                        df_e = lm_EMS.df_model
                    else:
                        # no error term available: optionally report SS, df
                        # and MS without an F-test
                        if showall:
                            if lm1 is None:
                                SS = lm0.SS_res
                                df = lm0.df_res
                            else:
                                SS = lm0.SS_res - lm1.SS_res
                                df = lm0.df_res - lm1.df_res
                            MS = SS / df
                            results_table.append((name, SS, df, MS, None, None))
                        skip = "no Hopkins E(MS)"

            if skip:
                self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
            else:
                test_table.append((e_test, lm1, lm0, MS_e, df_e))
                SS, df, MS, F, p = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e)
                results_table.append((name, SS, df, MS, F, p))

        if not rfx:
            results_table.append(("Residuals", SS_e, df_e, MS_e, None, None))

    self._test_table = test_table
    self._results_table = results_table
def __init__(self, Y, X, t=.1, samples=1000, replacement=False,
             tstart=None, tstop=None, close_time=0, pmax=1, sub=None, ds=None,
             ):
    """
    ANOVA permutation cluster test.

    Arguments
    ---------
    Y : ndvar
        Measurements (dependent variable)
    X : categorial
        Model
    t : scalar
        Threshold (uncorrected p-value) to use for finding clusters; it is
        converted to one F-threshold per effect.
    samples : int
        Number of samples to estimate parameter distributions
    replacement : bool
        whether random samples should be drawn with replacement or
        without
    tstart, tstop : None | scalar
        Time window for clusters (passed on to ``cluster_dist``).
        **None**: use the whole epoch;
        **scalar**: use only a part of the epoch

        .. Note:: implementation is not optimal: F-values are still
            computed but ignored.

    close_time : scalar
        Close gaps in clusters that are smaller than this interval.
        Assumes that Y is a uniform time series.
    pmax : scalar <= 1
        Maximum cluster p-values to keep cluster. Not referenced in this
        constructor.
    sub : index
        Apply analysis to a subset of cases in Y, X
    ds :
        Passed on to ``asndvar`` and ``asmodel``.


    .. FIXME:: connectivity for >2 dimensional data. Currently, adjacent
        samples are connected.

    """
    Y = asndvar(Y, sub=sub, ds=ds)
    self.Y = Y
    X = asmodel(X, sub=sub, ds=ds)
    self.X = X
    fitter = _glm.lm_fitter(X)

    # translate the p-threshold into an F-threshold for each effect
    thresholds = {}
    if fitter.full_model:
        # error df come from the E(MS) effects; effects without any are
        # left out
        for effect in fitter.E_MS:
            denominators = fitter.E_MS[effect]
            if denominators:
                df_denom = sum(d.df for d in denominators)
                thresholds[effect] = scipy.stats.distributions.f.isf(
                    t, effect.df, df_denom)
    else:
        df_denom = X.df_error
        for effect in X.effects:
            thresholds[effect] = scipy.stats.distributions.f.isf(
                t, effect.df, df_denom)

    # one cluster distribution per thresholded effect, filled from
    # resampled Ys
    dist_kwargs = dict(tstart=tstart, tstop=tstop, close_time=close_time,
                       unit='F')
    dists = {}
    for effect in thresholds:
        dists[effect] = cluster_dist(Y, samples, thresholds[effect],
                                     name=effect.name, **dist_kwargs)
    self.cluster_dists = dists

    for _, Y_perm in _resample(Y, replacement=replacement, samples=samples):
        for effect, F in fitter.map(Y_perm.x, p=False):
            dists[effect].add_perm(F)

    # clusters in the actual data
    observed = fitter.map(Y.x, p=False)
    self.effects = []
    self.clusters = {}
    self.F_maps = {}
    for effect, F in observed:
        self.effects.append(effect)
        dist = dists[effect]
        dist.add_original(F)
        self.clusters[effect] = dist
        self.F_maps[effect] = dist.P

    self.name = "ANOVA Permutation Cluster Test"
    self.tF = thresholds

    # per effect (in model order): the F-map followed by its clusters
    per_effect = []
    for effect in self.X.effects:
        if effect in self.F_maps:
            per_effect.append([self.F_maps[effect]]
                              + self.clusters[effect].clusters)
    self.all = per_effect
def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, showall=False, ds=None):
    """
    Fits a univariate ANOVA model.

    Mixed effects models require full model specification so that E(MS)
    can be estimated according to Hopkins (1976)

    The F-tests are collected in ``self.F_tests`` with the matching effect
    names in ``self.names``; for fixed effects models the residuals are
    stored in ``self.residuals`` as ``(SS, df, MS)``.  Messages about
    skipped effects are collected in ``self._log``.

    Parameters
    ----------
    Y : var
        dependent variable
    X : model
        Model to fit to Y
    sub : None | index
        Passed on to ``asvar`` and ``asmodel``.
    title :
        Stored as ``self.title``.
    empty : bool
        include rows without F-Tests (True/False); not referenced in this
        constructor.
    ems : bool | None
        display source of E(MS) for F-Tests (True/False; None = use
        default); stored as ``self.show_ems``.
    showall : bool
        show SS, df and MS for effects without F test; not referenced in
        this constructor.
    ds :
        Passed on to ``asvar`` and ``asmodel``.

    Raises
    ------
    ValueError
        If Y and X differ in length, or if ``X.df_error`` is negative.
    NotImplementedError
        If the model has error df left but contains random effects.
    """
    # TODO:
    # - sort model
    # - reuse lms which are used repeatedly
    # - provide threshold for including interaction effects when testing lower
    #   level effects
    #
    # Problem with unbalanced models
    # ------------------------------
    # - The SS of Effects which do not include the between-subject factor are
    #   higher than in SPSS
    # - The SS of effects which include the between-subject factor agree with
    #   SPSS

    # prepare kwargs
    Y = asvar(Y, sub=sub, ds=ds)
    X = asmodel(X, sub=sub, ds=ds)

    if len(Y) != len(X):
        raise ValueError("Y and X must describe same number of cases")

    # save args
    self.Y = Y
    self.X = X
    self.title = title
    self.show_ems = ems
    self._log = []

    # decide which E(MS) model to use:
    # no error df -> mixed effects (rfx); positive error df -> fixed effects
    if X.df_error == 0:
        rfx = 1
        fx_desc = "Mixed"
    elif X.df_error > 0:
        if hasrandom(X):
            err = "Models containing random effects need to be fully " "specified."
            raise NotImplementedError(err)
        rfx = 0
        fx_desc = "Fixed"
    else:
        raise ValueError("Model Overdetermined")
    self._log.append("Using %s effects model" % fx_desc)

    # list of F-test results, with the effect names in parallel
    self.F_tests = []
    self.names = []

    if len(X.effects) == 1:
        # a single effect: the fitted lm itself is stored as the test
        self._log.append("single factor model")
        lm1 = lm(Y, X)
        self.F_tests.append(lm1)
        self.names.append(X.name)
        self.residuals = lm1.SS_res, lm1.df_res, lm1.MS_res
    else:
        if rfx:
            pass  # <- Hopkins
        else:
            # fixed effects: the residuals of the full model serve as the
            # error term for every effect (MS_e / df_e stay constant below)
            full_lm = lm(Y, X)
            SS_e = full_lm.SS_res
            MS_e = full_lm.MS_res
            df_e = full_lm.df_res

        for e_test in X.effects:
            skip = False  # becomes a reason string if the effect is skipped
            name = e_test.name

            # find model 0: all other effects except those that are of
            # higher order relative to e_test
            effects = []
            excluded_e = []
            for e in X.effects:
                # determine whether e_test
                if e is e_test:
                    pass
                else:
                    if is_higher_order(e, e_test):
                        excluded_e.append(e)
                    else:
                        effects.append(e)

            model0 = model(*effects)
            if e_test.df > model0.df_error:
                skip = "overspecified"
            else:
                lm0 = lm(Y, model0)

                # find model 1: model 0 plus the tested effect; lm1 is None
                # when model 1 leaves no error df
                effects.append(e_test)
                model1 = model(*effects)
                if model1.df_error > 0:
                    lm1 = lm(Y, model1)
                else:
                    lm1 = None

                if rfx:
                    # find E(MS)
                    EMS_effects = _find_hopkins_ems(e_test, X)

                    if len(EMS_effects) > 0:
                        # the model formed by the E(MS) effects provides the
                        # error term for this effect
                        lm_EMS = lm(Y, model(*EMS_effects))
                        MS_e = lm_EMS.MS_model
                        df_e = lm_EMS.df_model
                    else:
                        # no error term available: record SS, df and MS in
                        # the skip message instead of testing
                        if lm1 is None:
                            SS = lm0.SS_res
                            df = lm0.df_res
                        else:
                            SS = lm0.SS_res - lm1.SS_res
                            df = lm0.df_res - lm1.df_res
                        MS = SS / df
                        skip = "no Hopkins E(MS); SS=%.2f, df=%i, " "MS=%.2f" % (SS, df, MS)

            if skip:
                self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
            else:
                res = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e, name=name)
                self.F_tests.append(res)
                self.names.append(name)

        if not rfx:
            self.residuals = SS_e, df_e, MS_e
def __init__(self, Y, X, sub=None):
    """
    Fit the model X to the dependent variable Y.

    The fitter is selected by the module-level ``_lm_lsq`` switch
    (0: ``lstsq``; 1: ``X.fit``, "Fox"); any other value raises
    ``ValueError``.

    Parameters
    ----------
    Y : var
        Dependent variable.
    X : model
        Model.
    sub : None | index
        Only use part of the data

    """
    # input preparation
    Y = asvar(Y)
    X = asmodel(X)  # .sorted()
    if sub is not None:
        Y, X = Y[sub], X[sub]

    assert len(Y) == len(X)
    assert X.df_error > 0

    # fit
    if _lm_lsq == 0:
        # use scipy (faster)
        beta, SS_res, _, _ = lstsq(X.full, Y.x)
    elif _lm_lsq == 1:
        # Fox: least squares estimate, then explicit residuals
        beta = X.fit(Y)
        Y_est = (beta * X.full).sum(1)
        self._residuals = Y.x - Y_est
        SS_res = np.sum(self._residuals ** 2)
        if Y.mean() != Y_est.mean():
            logging.warning("Y.mean() != Y_est.mean() (%s vs "
                            "%s)" % (Y.mean(), Y_est.mean()))
    else:
        raise ValueError

    # total
    self.SS_total = np.sum((Y.x - Y.mean()) ** 2)
    self.df_total = X.df_total
    self.MS_total = self.SS_total / self.df_total

    # residuals
    self.SS_res = SS_res
    self.df_res = X.df_error
    self.MS_res = SS_res / self.df_res

    # explained (total minus residual); SS / df / MS are short aliases
    explained = self.SS_total - SS_res
    self.SS = self.SS_model = explained
    self.df = self.df_model = X.df
    self.MS_model = self.MS = explained / self.df_model

    # keep the inputs and the estimate
    self.Y = Y
    self.X = X
    self.sub = sub
    self.beta = beta
def data(Y, X=None, match=None, cov=[], sub=None, fmt=None, labels=True,
         showcase=True):
    """
    return a textab.table (printed as tsv table by default)

    parameters
    ----------
    Y :
        variable to display (can be model with several dependents)
    X :
        categories defining cells (factorial model)
    match :
        factor to match values on and return repeated-measures table
    cov :
        covariate to report (WARNING: only works with match, where each
        value on the matching variable corresponds with one value in the
        covariate). A single factor or var is wrapped in a list.
    sub :
        boolean array specifying which values to include (generate e.g.
        with 'sub=T==[1,2]')
    fmt :
        Format string
    labels :
        display labels for nominal variables (otherwise display codes)
    showcase :
        in the repeated-measures table, add a leading column holding the
        case labels (headed by ``match.name``)

    raises
    ------
    ValueError
        if a covariate takes more than one value for a single case.
    """
    if hasattr(Y, '_items'):  # dataframe
        Y = Y._items
    Y = _data.asmodel(Y)
    if _data.isfactor(cov) or _data.isvar(cov):
        cov = [cov]

    data = []
    ynames = []  # names of Yi for independent measures table headers
    within_list = []
    for Yi in Y.effects:
        # The per-dependent result must not be bound to the name ``_data``:
        # assigning to it would make ``_data`` local to this function and
        # every use of the ``_data`` module here (including the calls
        # above) would raise UnboundLocalError.
        yi_data, datalabels, names, yi_within = _data._split_Y(
            Yi, X, match=match, sub=sub, datalabels=match)
        data += yi_data
        ynames.append(Yi.name)
        within_list.append(yi_within)
    within = within_list[0]
    assert all([w == within for w in within_list])

    # table
    n_dependents = len(Y.effects)
    n_cells = len(data) // n_dependents
    if within:
        n, k = len(data[0]), len(data)
        table = textab.Table('l' * (k + showcase + len(cov)))

        # the case labels are needed for the body even without a case
        # column, so they are determined independently of ``showcase``
        case_labels = datalabels[0]
        assert all([np.all(case_labels == l) for l in datalabels[1:]])

        # header line 1
        if showcase:
            table.cell(match.name)
        for _ in range(n_dependents):
            for name in names:
                table.cell(name.replace(' ', '_'))
        for c in cov:
            table.cell(c.name)

        # header line 2
        if n_dependents > 1:
            if showcase:
                table.cell()
            for name in ynames:
                for _ in range(n_cells):
                    table.cell('(%s)' % name)
            for c in cov:
                table.cell()

        # body
        table.midrule()
        for i in range(n):
            case = case_labels[i]
            if showcase:
                table.cell(case)
            for j in range(k):
                table.cell(data[j][i], fmt=fmt)

            # covariates
            indexes = match == case
            for c in cov:
                # test it's all the same values
                case_cov = c[indexes]
                if len(np.unique(case_cov.x)) != 1:
                    msg = 'covariate for case "%s" has several values' % case
                    raise ValueError(msg)
                # get value
                first_i = np.nonzero(indexes)[0][0]
                cov_value = c[first_i]
                if _data.isfactor(c) and labels:
                    cov_value = c.cells[cov_value]
                table.cell(cov_value, fmt=fmt)
    else:
        table = textab.Table('l' * (1 + n_dependents))
        table.cell(X.name)
        for y in ynames:
            table.cell(y)
        table.midrule()

        # data is now sorted: (cell_i within dependent_i)
        # sort data as (X-cell, dependent_i)
        data_sorted = [
            [data[i_dep * n_cells + i_cell] for i_dep in range(n_dependents)]
            for i_cell in range(n_cells)
        ]

        # table
        for name, cell_data in zip(names, data_sorted):
            for i in range(len(cell_data[0])):
                table.cell(name)
                for dep_data in cell_data:
                    table.cell(dep_data[i], fmt=fmt)

    return table