def ancova(Y, factorial_model, covariate, interaction=None, sub=None, v=True,
           empty=True, ems=None):
    """
    OBSOLETE

    Fit nested linear models with and without a covariate and print the
    model comparisons.

    args
    ----

    Y: dependent variable
    factorial_model: model to which the covariate is added (converted with
        ``asmodel``)
    covariate: covariate; must satisfy ``isvar``

    kwargs
    ------

    interaction: term from the factorial model to check for interaction with
        the covariate
    sub: index applied to ``Y``, ``factorial_model``, ``covariate`` and
        ``interaction`` before the models are fitted
    v=True: display more information (the anova table of each model)
    **anova_kwargs: ems, empty

    Nothing is returned; all results are printed.

    Based on
    --------

    Exercise to STATISTICS: AN INTRODUCTION USING R
    http://www.bio.ic.ac.uk/research/crawley/statistics/exercises/R6Ancova.pdf

    """
    assert isvar(covariate)
    anova_kwargs = {"empty": empty, "ems": ems}

    # Identity tests instead of ``!= None``: ``!=`` on array-like data objects
    # can compare element-wise, and such a result has no single truth value.
    if sub is not None:
        Y = Y[sub]
        factorial_model = factorial_model[sub]
        covariate = covariate[sub]
        if interaction is not None:
            interaction = interaction[sub]

    factorial_model = asmodel(factorial_model)

    # The single-argument ``print(...)`` form used below prints the same text
    # as the Python 2 print statement.

    # model 1: factorial model only
    a1 = lm(Y, factorial_model)
    if v:
        print(a1.anova(title="MODEL 1", **anova_kwargs))
        print("\n")

    # model 2: covariate added as main effect
    a2 = lm(Y, factorial_model + covariate)
    if v:
        print(a2.anova(title="MODEL 2: Main Effect Covariate", **anova_kwargs))
        print("\n")
    print('Model with "%s" Covariate > without Covariate' % covariate.name)
    print(comparelm(a1, a2))

    # model 3: covariate nested in the interaction term
    if interaction is not None:
        logging.debug("%s / %s", covariate.name, interaction.name)
        logging.debug("%s", covariate.__div__)
        # explicit ``__div__`` call kept as in the original implementation
        i_effect = covariate.__div__(interaction)
        a3 = lm(Y, factorial_model + i_effect)
        if v:
            print("\n")
            # NOTE(review): unlike models 1 and 2, anova_kwargs are not
            # forwarded here -- confirm whether that is intentional.
            print(a3.anova(title="MODEL 3: Interaction"))

        # compare
        print('\n"%s"x"%s" Interaction > No Covariate:' % (covariate.name, interaction.name))
        print(comparelm(a1, a3))
        print('\n"%s"x"%s" Interaction > Main Effect:' % (covariate.name, interaction.name))
        print(comparelm(a2, a3))
def __str__(self):
    """
    Return a text summary listing the sorted names of the variables and
    of the factors stored in ``self._factors``; if the instance has a
    ``_stats`` attribute, the names of its values are appended under a
    SEGMENTS heading.
    """
    members = self._factors.values()
    factor_names = sorted(m.name for m in members if _vsl.isfactor(m))
    var_names = sorted(m.name for m in members if _vsl.isvar(m))

    text = '\n'.join([
        'Variables:',
        ', '.join(var_names),
        'Factors:',
        ', '.join(factor_names),
    ])

    if hasattr(self, '_stats'):
        segment_names = [s.name for s in self._stats.values()]
        text = text + '\n\nSEGMENTS:\n' + ', '.join(segment_names)
    return text
def _resample(Y, unit=None, replacement=True, samples=1000):
    """
    Generator function to resample a dependent variable (Y) multiple times

    Y: variable to resample; if ``isvar(Y)`` is false it is converted with
        ``var(Y)`` first
    unit: factor specifying unit of measurement (e.g. subject). If unit is
        specified, resampling proceeds by first resampling the categories of
        unit (with or without replacement) and then shuffling the values
        within units (no replacement).
    replacement: whether random samples should be drawn with replacement or
        without
    samples: number of samples to yield

    The same output variable is yielded on every iteration with its data
    replaced by the current resample, so a sample that has to be kept must
    be copied by the consumer.

    """
    if isvar(Y):
        Yout = Y.copy('_resampled')
    else:
        Y = var(Y)
        Yout = var(Y.copy(), name="Y resampled")

    # explicit None test: "unit was specified" should not depend on the
    # truth value of a data object
    if unit is not None:
        ct = celltable(Y, unit)
        unit_data = ct.get_data(out=list)
        unit_indexes = ct.data_indexes.values()
        x_out = Yout.x
        if replacement:
            n = len(ct.indexes)
            for _ in xrange(samples):
                # draw n source units with replacement
                source_ids = np.random.randint(n, size=n)
                for index, source_index in zip(unit_indexes, source_ids):
                    data = unit_data[source_index]
                    np.random.shuffle(data)
                    # presumably all units hold the same number of values,
                    # otherwise this assignment cannot fit -- TODO confirm
                    x_out[index] = data
                yield Yout
        else:
            for _ in xrange(samples):
                # permute the units, then shuffle within each unit
                random.shuffle(unit_data)
                for index, data in zip(unit_indexes, unit_data):
                    np.random.shuffle(data)
                    x_out[index] = data
                yield Yout
    elif replacement:
        N = Y.N
        for _ in xrange(samples):
            # one random source index per case; a bare ``randint(N)`` would
            # return a single scalar and collapse the sample to one value
            index = np.random.randint(N, size=N)
            Yout.x = Y.x[index]
            yield Yout
    else:
        for _ in xrange(samples):
            np.random.shuffle(Yout.x)
            yield Yout
def _resample(Y, unit=None, replacement=True, samples=1000):
    """
    Generator function to resample a dependent variable (Y) multiple times

    Y : var | ndvar
        Variable which is to be resampled. One copy of ``Y`` is created up
        front and that same object is yielded in every iteration with its
        data replaced by the current resample.
    unit : categorial
        factor specifying unit of measurement (e.g. subject). If unit is
        specified, resampling proceeds by first resampling the categories of
        unit (with or without replacement) and then shuffling the values
        within units (no replacement).
    replacement : bool
        whether random samples should be drawn with replacement or without
    samples : int
        number of samples to yield

    Yields ``(i, Yout)`` tuples, where ``i`` is the sample index.

    Raises ValueError for an ndvar without cases and TypeError if ``Y`` is
    neither var nor ndvar.

    """
    if isvar(Y):
        pass
    elif isndvar(Y):
        if not Y.has_case:
            raise ValueError("Need ndvar with cases")
    else:
        raise TypeError("need var or ndvar")

    Yout = Y.copy('{name}_resampled')

    # explicit None test: "unit was specified" should not depend on the
    # truth value of a data object
    if unit is not None:
        # NOTE(review): this branch was marked "not implemented" by the
        # original author -- treat it as untested.
        ct = celltable(Y, unit)
        unit_data = ct.get_data(out=list)
        unit_indexes = ct.data_indexes.values()
        x_out = Yout.x
        if replacement:
            n = len(ct.indexes)
            for i in xrange(samples):
                # draw n source units with replacement
                source_ids = np.random.randint(n, size=n)
                for index, source_index in zip(unit_indexes, source_ids):
                    data = unit_data[source_index]
                    np.random.shuffle(data)
                    x_out[index] = data
                yield i, Yout
        else:
            for i in xrange(samples):
                # permute the units, then shuffle within each unit
                random.shuffle(unit_data)
                for index, data in zip(unit_indexes, unit_data):
                    np.random.shuffle(data)
                    x_out[index] = data
                yield i, Yout
    elif replacement:
        N = len(Y)
        for i in xrange(samples):
            # N indexes drawn from [0, N); ``randint(N, N)`` would pass N as
            # both ``low`` and ``high``, which numpy rejects
            index = np.random.randint(N, size=N)
            Yout.x = Y.x[index]
            yield i, Yout
    else:
        for i in xrange(samples):
            np.random.shuffle(Yout.x)
            yield i, Yout
def correlations(Y, Xs, cat=None, levels=[.05, .01, .001], diff=None, sub=None,
                 pmax=None, nan=True):  # , match=None):
    """
    Build a table of the correlations of ``Y`` with one or several variables.

    :arg var Y: first variable
    :arg var Xs: second variable (or list of variables)
    :arg cat: show correlations separately for different groups in the
        data. Can be a ``factor`` (the correlation for each level is shown
        separately) or an array of ``bool`` values (e.g. from a comparison
        like ``Stim==1``)
    :arg list levels: significance levels to mark
    :arg diff: (factor, cat_1, cat_2); any value other than ``None`` raises
        ``NotImplementedError``
    :arg sub: use only a subset of the data
    :arg pmax: (None) don't show correlations with p>pmax
    :arg nan: ``True``: display correlation which yield NAN; ``False``: hide
        NANs but mention occurrence in summary (not implemented); ``None``:
        don't mention NANs

    :rtype: Table

    """
    levels = np.array(levels)
    if isvar(Xs):
        Xs = [Xs]

    # restrict all data objects to the requested subset
    if sub is not None:
        Y = Y[sub]
        Xs = [X[sub] for X in Xs]
        if ismodel(cat) or isfactor(cat):
            cat = cat[sub]

    if diff is not None:
        raise NotImplementedError

    # table head; one additional column when results are split by category
    by_category = cat is not None
    if by_category:
        assert iscategorial(cat)
        table = fmtxt.Table('l' * 5)
        table.cells('Variable', 'Category', 'r', 'p', 'n')
    else:
        table = fmtxt.Table('l' * 4)
        table.cells('Variable', 'r', 'p', 'n')
    table.midrule()
    table.title("Correlations with %s" % (Y.name))
    table._my_nan_count = 0

    for X in Xs:
        if not by_category:
            _corr_to_table(table, Y, X, cat, levels, pmax=pmax, nan=nan)
            continue

        # the name of X is requested only until a row has been added for it
        show_name = True
        for cell in cat.cells:
            len_before = len(table)
            cell_index = (cat == cell)
            _corr_to_table(table, Y, X, cell_index, levels, pmax=pmax,
                           nan=nan, printXname=show_name,
                           label=cell_label(cell))
            if len(table) > len_before:
                show_name = False

    # last row: remarks about omitted correlations
    remarks = []
    if pmax is not None:
        remarks.append('all other p>{p}'.format(p=pmax))
    if nan is False and table._my_nan_count > 0:
        remarks.append('%s NANs' % table._my_nan_count)
    if remarks:
        table.cell("(%s)" % ', '.join(remarks))
    return table
def data(Y, X=None, match=None, cov=[], sub=None, fmt=None, labels=True,
         showcase=True):
    """
    return a textab.table (printed as tsv table by default)

    parameters
    ----------
    Y: variable to display (can be model with several dependents)
    X: categories defining cells (factorial model)
    match: factor to match values on and return repeated-measures table
    cov: covariate to report (WARNING: only works with match, where each value
         on the matching variable corresponds with one value in the covariate)
    sub: boolean array specifying which values to include (generate e.g.
         with 'sub=T==[1,2]')
    fmt: Format string
    labels: display labels for nominal variables (otherwise display codes)
    showcase: in the repeated-measures table, add a leading column with the
         case labels (headed by ``match.name``)

    Raises ValueError if a covariate has more than one distinct value within
    a case.

    """
    if hasattr(Y, '_items'):  # dataframe
        Y = Y._items
    Y = _data.asmodel(Y)
    # ``cov`` is only rebound, never mutated, so the shared default is safe
    if _data.isfactor(cov) or _data.isvar(cov):
        cov = [cov]

    cell_values = []  # one entry per (dependent, cell), dependents outermost
    ynames = []  # names of Yi for table headers
    within_list = []
    for Yi in Y.effects:
        # The result must not be bound to the name ``_data``: assigning to
        # that name anywhere in this function makes it local, so the module
        # references above would fail with UnboundLocalError.
        yi_values, datalabels, names, yi_within = _data._split_Y(
            Yi, X, match=match, sub=sub, datalabels=match)
        cell_values += yi_values
        ynames.append(Yi.name)
        within_list.append(yi_within)
    within = within_list[0]
    assert all(w == within for w in within_list)

    # table
    n_dependents = len(Y.effects)
    n_cells = len(cell_values) // n_dependents
    if within:
        n_cases = len(cell_values[0])
        n_columns = len(cell_values)
        table = textab.Table('l' * (n_columns + showcase + len(cov)))

        # header line 1
        if showcase:
            table.cell(match.name)
        # case labels are needed for the covariate lookup below even when the
        # case column itself is not displayed
        case_labels = datalabels[0]
        assert all(np.all(case_labels == l) for l in datalabels[1:])
        for _ in range(n_dependents):
            for name in names:
                table.cell(name.replace(' ', '_'))
        for c in cov:
            table.cell(c.name)

        # header line 2
        if n_dependents > 1:
            if showcase:
                table.cell()
            for name in ynames:
                for _ in range(n_cells):
                    table.cell('(%s)' % name)
            for c in cov:
                table.cell()

        # body
        table.midrule()
        for i in range(n_cases):
            case = case_labels[i]
            if showcase:
                table.cell(case)
            for column in cell_values:
                table.cell(column[i], fmt=fmt)

            # covariates
            indexes = match == case
            for c in cov:
                # test it's all the same values
                case_cov = c[indexes]
                if len(np.unique(case_cov.x)) != 1:
                    msg = 'covariate for case "%s" has several values' % case
                    raise ValueError(msg)
                # get value
                first_i = np.nonzero(indexes)[0][0]
                cov_value = c[first_i]
                if _data.isfactor(c) and labels:
                    cov_value = c.cells[cov_value]
                table.cell(cov_value, fmt=fmt)
    else:
        table = textab.Table('l' * (1 + n_dependents))
        table.cell(X.name)
        for yname in ynames:
            table.cell(yname)
        table.midrule()

        # cell_values is sorted as (cell_i within dependent_i);
        # regroup as (X-cell, dependent_i)
        data_sorted = [
            [cell_values[i_dep * n_cells + i_cell]
             for i_dep in range(n_dependents)]
            for i_cell in range(n_cells)
        ]

        # one row per observation: cell name followed by each dependent
        for name, cell_data in zip(names, data_sorted):
            for i in range(len(cell_data[0])):
                table.cell(name)
                for dep_data in cell_data:
                    table.cell(dep_data[i], fmt=fmt)

    return table