def box_plot(df, val, factors=None, where=None,
             fname=None, output_dir='', quality='medium'):
    """
    Makes a box plot of ``val`` and saves it to an image file.

       args:
          df: a pyvttbl.DataFrame object

          val: the label of the dependent variable

       kwds:
          factors: an iterable of column labels to split the data by
                   (one box per combination). None or empty plots a
                   single box for ``val``.

          where: a string, list of strings, or list of tuples
                 applied to the DataFrame before plotting

          fname: output file name; must end with .png or .svg
                 (case-insensitive). When None a name of the form
                 'box(val~f1_X_f2).png' is generated.

          output_dir: directory that is joined in front of fname

          quality: {'low' | 'medium' | 'high'} specifies image file dpi
                   ('medium' -> 200, 'high' -> 300, anything else or an
                   .svg target -> the backend default)

       returns:
          a dict describing what was plotted when df.TESTMODE is true,
          otherwise None

       raises:
          KeyError: val or one of the factors is not a column of df
          TypeError: factors is not iterable, or fname is neither None
                     nor a string
          Exception: empty table, unequal column lengths, duplicate
                     labels, or an unsupported file extension
    """
    if factors is None:
        factors = []

    if where is None:
        where = []

    # check to see if there is any data in the table
    if df == {}:
        raise Exception('Table must have data to print data')

    # check to see if data columns have equal lengths
    if not df._are_col_lengths_equal():
        raise Exception('columns have unequal lengths')

    # check the supplied arguments
    if val not in df.keys():
        raise KeyError(val)

    if not hasattr(factors, '__iter__'):
        raise TypeError("'%s' object is not iterable"
                        % type(factors).__name__)

    # Work on a real list from here on: tuples (or other iterables) would
    # otherwise break the list concatenation below and would never compare
    # equal to [] when empty.
    factors = list(factors)

    for k in factors:
        if k not in df.keys():
            raise KeyError(k)

    # check for duplicate names
    dup = Counter([val] + factors)
    del dup[None]  # Counter deletion is a no-op when the key is absent
    if not all(count == 1 for count in dup.values()):
        raise Exception('duplicate labels specified as plot parameters')

    # check fname
    if fname is not None and not isinstance(fname, _strobj):
        raise TypeError('fname must be None or string')

    if fname is not None and \
       not fname.lower().endswith(('.png', '.svg')):
        raise Exception('fname must end with .png or .svg')

    # Gather the data before a figure exists so that a failing query does
    # not leave a figure behind.
    test = {}
    if factors:
        D = df.pivot(val, rows=factors, where=where, aggregate='tolist')
        groups = [np.array(_flatten(d)) for d in D]
        xlabels = ['\n'.join('%s = %s' % fc for fc in c) for c in D.rnames]
        test['d'] = groups
        test['xlabels'] = xlabels
        fig = pylab.figure(figsize=(6 * len(factors), 6))
    else:
        d = df.select_col(val, where=where)
        test['d'] = d
        test['val'] = val
        fig = pylab.figure()

    try:
        if factors:
            fig.subplots_adjust(left=.05, right=.97, bottom=0.24)
            pylab.boxplot(groups)
            pylab.xticks(pylab.xticks()[0], xlabels,
                         rotation=35, verticalalignment='top')
        else:
            pylab.boxplot(np.array(d))
            pylab.xticks(pylab.xticks()[0], [val])

        maintitle = '%s' % val

        if factors:
            maintitle += ' by '
            maintitle += ' * '.join(factors)

        fig.text(0.5, 0.95, maintitle,
                 horizontalalignment='center',
                 verticalalignment='top')

        test['maintitle'] = maintitle

        if fname is None:
            fname = 'box(%s' % val
            if factors:
                fname += '~' + '_X_'.join([str(f) for f in factors])

            fname += ').png'

        fname = os.path.join(output_dir, fname)

        test['fname'] = fname

        # save figure; the extension test is case-insensitive to agree with
        # the validation of fname above
        if quality == 'low' or fname.lower().endswith('.svg'):
            pylab.savefig(fname)

        elif quality == 'medium':
            pylab.savefig(fname, dpi=200)

        elif quality == 'high':
            pylab.savefig(fname, dpi=300)

        else:
            pylab.savefig(fname)
    finally:
        # always release the figure created above, even if plotting or
        # saving raised
        pylab.close(fig)

    if df.TESTMODE:
        return test
def box_plot(df, val, factors=None, where=None,
             fname=None, output_dir='', quality='medium'):
    """
    Makes a box plot of ``val`` and saves it to an image file.

       args:
          df: a pyvttbl.DataFrame object

          val: the label of the dependent variable

       kwds:
          factors: an iterable of column labels to split the data by
                   (one box per combination). None or empty plots a
                   single box for ``val``.

          where: a string, list of strings, or list of tuples
                 applied to the DataFrame before plotting

          fname: output file name; must end with .png or .svg
                 (case-insensitive). When None a name of the form
                 'box(val~f1_X_f2).png' is generated.

          output_dir: directory that is joined in front of fname

          quality: {'low' | 'medium' | 'high'} specifies image file dpi
                   ('medium' -> 200, 'high' -> 300, anything else or an
                   .svg target -> the backend default)

       returns:
          a dict describing what was plotted when df.TESTMODE is true,
          otherwise None

       raises:
          KeyError: val or one of the factors is not a column of df
          TypeError: factors is not iterable, or fname is neither None
                     nor a string
          Exception: empty table, unequal column lengths, duplicate
                     labels, or an unsupported file extension
    """
    if factors is None:
        factors = []

    if where is None:
        where = []

    # check to see if there is any data in the table
    if df == {}:
        raise Exception('Table must have data to print data')

    # check to see if data columns have equal lengths
    if not df._are_col_lengths_equal():
        raise Exception('columns have unequal lengths')

    # check the supplied arguments
    if val not in df.keys():
        raise KeyError(val)

    if not hasattr(factors, '__iter__'):
        raise TypeError("'%s' object is not iterable"
                        % type(factors).__name__)

    # Work on a real list from here on: tuples (or other iterables) would
    # otherwise break the list concatenation below and would never compare
    # equal to [] when empty.
    factors = list(factors)

    for k in factors:
        if k not in df.keys():
            raise KeyError(k)

    # check for duplicate names
    dup = Counter([val] + factors)
    del dup[None]  # Counter deletion is a no-op when the key is absent
    if not all(count == 1 for count in dup.values()):
        raise Exception('duplicate labels specified as plot parameters')

    # check fname
    if fname is not None and not isinstance(fname, _strobj):
        raise TypeError('fname must be None or string')

    if fname is not None and \
       not fname.lower().endswith(('.png', '.svg')):
        raise Exception('fname must end with .png or .svg')

    # Gather the data before a figure exists so that a failing query does
    # not leave a figure behind.
    test = {}
    if factors:
        D = df.pivot(val, rows=factors, where=where, aggregate='tolist')
        groups = [np.array(_flatten(d)) for d in D]
        xlabels = ['\n'.join('%s = %s' % fc for fc in c) for c in D.rnames]
        test['d'] = groups
        test['xlabels'] = xlabels
        fig = pylab.figure(figsize=(6 * len(factors), 6))
    else:
        d = df.select_col(val, where=where)
        test['d'] = d
        test['val'] = val
        fig = pylab.figure()

    try:
        if factors:
            fig.subplots_adjust(left=.05, right=.97, bottom=0.24)
            pylab.boxplot(groups)
            pylab.xticks(pylab.xticks()[0], xlabels,
                         rotation=35, verticalalignment='top')
        else:
            pylab.boxplot(np.array(d))
            pylab.xticks(pylab.xticks()[0], [val])

        maintitle = '%s' % val

        if factors:
            maintitle += ' by '
            maintitle += ' * '.join(factors)

        fig.text(0.5, 0.95, maintitle,
                 horizontalalignment='center',
                 verticalalignment='top')

        test['maintitle'] = maintitle

        if fname is None:
            fname = 'box(%s' % val
            if factors:
                fname += '~' + '_X_'.join([str(f) for f in factors])

            fname += ').png'

        fname = os.path.join(output_dir, fname)

        test['fname'] = fname

        # save figure; the extension test is case-insensitive to agree with
        # the validation of fname above
        if quality == 'low' or fname.lower().endswith('.svg'):
            pylab.savefig(fname)

        elif quality == 'medium':
            pylab.savefig(fname, dpi=200)

        elif quality == 'high':
            pylab.savefig(fname, dpi=300)

        else:
            pylab.savefig(fname)
    finally:
        # always release the figure created above, even if plotting or
        # saving raised
        pylab.close(fig)

    if df.TESTMODE:
        return test
def run(self, A, B=None, pop_mean=None, paired=False, equal_variance=True,
        alpha=0.05, aname=None, bname=None):
    r"""
    Compares the data in A to the data in B. If A or B are
    multidimensional they are flattened before testing.

    When B is None a one-sample test of A against pop_mean is run.

    When paired is True, the equal_variance parameter has
    no effect, an exception is raised if A and B are not
    of equal length.
      t = \frac{\overline{X}_D - \mu_0}{s_D/\sqrt{n}}
      where:
        \overline{X}_D is the difference of the averages
        s_D is the standard deviation of the differences

      \mathrm{d.f.} = n_1 - 1

    When paired is False and equal_variance is True.
      t = \frac{\bar {X}_1 - \bar{X}_2}{S_{X_1X_2} \cdot \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}
      where:
        S_{X_1X_2} is the pooled standard deviation
        computed as:
          S_{X_1X_2} = \sqrt{\frac{(n_1-1)S_{X_1}^2+(n_2-1)S_{X_2}^2}{n_1+n_2-2}}

      \mathrm{d.f.} = n_1 + n_2 - 2

    When paired is False and equal_variance is False.
      t = {\overline{X}_1 - \overline{X}_2 \over s_{\overline{X}_1 - \overline{X}_2}}
      where:
        s_{\overline{X}_1 - \overline{X}_2} = \sqrt{{s_1^2 \over n_1} + {s_2^2 \over n_2}}
        where:
          s_1^2 and s_2^2 are the unbiased variance estimates

      \mathrm{d.f.} = \frac{(s_1^2/n_1 + s_2^2/n_2)^2}{(s_1^2/n_1)^2/(n_1-1) + (s_2^2/n_2)^2/(n_2-1)}

    args:
      A: list-like sample data
      B: optional second list-like sample
      pop_mean: hypothesized mean, only used by the one-sample test
      paired: run the paired two-sample test
      equal_variance: pool the variances of the independent samples
      alpha: significance level used for critical values and power
      aname, bname: labels stored on the object (default 'A' and 'B')

    Results are stored on self (attributes A, B, paired, equal_variance,
    alpha, aname, bname, type and the keys 't', 'p2tail', 'p1tail', 'df',
    'tc2tail', 'tc1tail', 'cohen_d', 'delta', 'power1tail', 'power2tail'
    plus the sample statistics of the selected test). Nothing is returned.

    raises:
      TypeError: B cannot be converted to a flat list
      Exception: paired is True and A and B differ in length
    """
    A = _flatten(list(copy(A)))

    # Identity test: `B != None` is elementwise for numpy arrays and would
    # make a perfectly valid array look like a non list-like object.
    if B is not None:
        try:
            B = _flatten(list(copy(B)))
        except Exception:
            raise TypeError('B must be a list-like object')

    self.aname = 'A' if aname is None else aname
    self.bname = 'B' if bname is None else bname

    self.A = A
    self.B = B
    self.paired = paired
    self.equal_variance = equal_variance
    self.alpha = alpha

    if B is None:
        t, prob2, n, df, mu, v = _stats.lttest_1samp(A, pop_mean)

        self.type = 't-Test: One Sample for means'
        self['t'] = t
        self['p2tail'] = prob2
        self['p1tail'] = prob2 / 2.
        self['n'] = n
        self['df'] = df
        self['mu'] = mu
        self['pop_mean'] = pop_mean
        self['var'] = v

        # post-hoc power analysis inputs
        cohen_d = abs((pop_mean - mu) / math.sqrt(v))
        scale = math.sqrt(n)

    # explicit comparison kept on purpose: only True (or a value equal to
    # it) selects the paired test, exactly as before
    elif paired == True:
        if len(A) != len(B):
            raise Exception('A and B must have equal lengths '
                            'for paired comparisons')

        t, prob2, n, df, mu1, mu2, v1, v2 = _stats.ttest_rel(A, B)
        r, rprob2 = _stats.pearsonr(A, B)

        self.type = 't-Test: Paired Two Sample for means'
        self['t'] = t
        self['p2tail'] = prob2
        self['p1tail'] = prob2 / 2.
        self['n1'] = n
        self['n2'] = n
        self['r'] = r
        self['df'] = df
        self['mu1'] = mu1
        self['mu2'] = mu2
        self['var1'] = v1
        self['var2'] = v2

        # post-hoc power analysis inputs
        # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
        sd1, sd2 = math.sqrt(v1), math.sqrt(v2)
        cohen_d = abs(mu1 - mu2) / math.sqrt(v1 + v2 - 2 * r * sd1 * sd2)
        scale = math.sqrt(n)

    else:
        if equal_variance:
            (t, prob2, n1, n2, df,
             mu1, mu2, v1, v2, svar) = _stats.ttest_ind(A, B)
            self.type = 't-Test: Two-Sample Assuming Equal Variances'
        else:
            (t, prob2, n1, n2, df,
             mu1, mu2, v1, v2) = _stats.ttest_ind_uneq(A, B)
            self.type = 't-Test: Two-Sample Assuming Unequal Variances'

        self['t'] = t
        self['p2tail'] = prob2
        self['p1tail'] = prob2 / 2.
        self['n1'] = n1
        self['n2'] = n2
        self['df'] = df
        self['mu1'] = mu1
        self['mu2'] = mu2
        self['var1'] = v1
        self['var2'] = v2
        if equal_variance:
            self['vpooled'] = svar

        # post-hoc power analysis inputs
        # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
        #
        # the pooled standard deviation is calculated as:
        #     sqrt((v1+v2)/2.)
        # although wikipedia suggests a more sophisticated estimate might
        # be preferred:
        #     sqrt(((n1-1)*v1 + (n2-1)*v2)/(n1+n2))
        #
        # the biased estimate is used so that the results agree with G*power
        s = math.sqrt((v1 + v2) / 2.)
        cohen_d = abs(mu1 - mu2) / s
        # float() guards against integer (floor) division of the counts
        scale = math.sqrt((n1 * n2) / float(n1 + n2))

    # Critical values: a two-tailed test splits alpha across both tails, a
    # one-tailed test puts all of alpha into a single tail.
    self['tc2tail'] = scipy.stats.t.ppf((1. - alpha / 2.), df)
    self['tc1tail'] = scipy.stats.t.ppf((1. - alpha), df)

    # post-hoc power from the noncentral t distribution
    self['cohen_d'] = cohen_d
    self['delta'] = scale * self['cohen_d']
    self['power1tail'] = 1. - nctcdf(self['tc1tail'], df, self['delta'])
    self['power2tail'] = 1. - nctcdf(self['tc2tail'], df, self['delta'])
def run(self, A, B=None, pop_mean=None, paired=False, equal_variance=True,
        alpha=0.05, aname=None, bname=None):
    r"""
    Compares the data in A to the data in B. If A or B are
    multidimensional they are flattened before testing.

    When B is None a one-sample test of A against pop_mean is run.

    When paired is True, the equal_variance parameter has
    no effect, an exception is raised if A and B are not
    of equal length.
      t = \frac{\overline{X}_D - \mu_0}{s_D/\sqrt{n}}
      where:
        \overline{X}_D is the difference of the averages
        s_D is the standard deviation of the differences

      \mathrm{d.f.} = n_1 - 1

    When paired is False and equal_variance is True.
      t = \frac{\bar {X}_1 - \bar{X}_2}{S_{X_1X_2} \cdot \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}
      where:
        S_{X_1X_2} is the pooled standard deviation
        computed as:
          S_{X_1X_2} = \sqrt{\frac{(n_1-1)S_{X_1}^2+(n_2-1)S_{X_2}^2}{n_1+n_2-2}}

      \mathrm{d.f.} = n_1 + n_2 - 2

    When paired is False and equal_variance is False.
      t = {\overline{X}_1 - \overline{X}_2 \over s_{\overline{X}_1 - \overline{X}_2}}
      where:
        s_{\overline{X}_1 - \overline{X}_2} = \sqrt{{s_1^2 \over n_1} + {s_2^2 \over n_2}}
        where:
          s_1^2 and s_2^2 are the unbiased variance estimates

      \mathrm{d.f.} = \frac{(s_1^2/n_1 + s_2^2/n_2)^2}{(s_1^2/n_1)^2/(n_1-1) + (s_2^2/n_2)^2/(n_2-1)}

    args:
      A: list-like sample data
      B: optional second list-like sample
      pop_mean: hypothesized mean, only used by the one-sample test
      paired: run the paired two-sample test
      equal_variance: pool the variances of the independent samples
      alpha: significance level used for critical values and power
      aname, bname: labels stored on the object (default 'A' and 'B')

    Results are stored on self (attributes A, B, paired, equal_variance,
    alpha, aname, bname, type and the keys 't', 'p2tail', 'p1tail', 'df',
    'tc2tail', 'tc1tail', 'cohen_d', 'delta', 'power1tail', 'power2tail'
    plus the sample statistics of the selected test). Nothing is returned.

    raises:
      TypeError: B cannot be converted to a flat list
      Exception: paired is True and A and B differ in length
    """
    A = _flatten(list(copy(A)))

    # Identity test: `B != None` is elementwise for numpy arrays and would
    # make a perfectly valid array look like a non list-like object.
    if B is not None:
        try:
            B = _flatten(list(copy(B)))
        except Exception:
            raise TypeError('B must be a list-like object')

    self.aname = 'A' if aname is None else aname
    self.bname = 'B' if bname is None else bname

    self.A = A
    self.B = B
    self.paired = paired
    self.equal_variance = equal_variance
    self.alpha = alpha

    if B is None:
        t, prob2, n, df, mu, v = _stats.lttest_1samp(A, pop_mean)

        self.type = 't-Test: One Sample for means'
        self['t'] = t
        self['p2tail'] = prob2
        self['p1tail'] = prob2 / 2.
        self['n'] = n
        self['df'] = df
        self['mu'] = mu
        self['pop_mean'] = pop_mean
        self['var'] = v

        # post-hoc power analysis inputs
        cohen_d = abs((pop_mean - mu) / math.sqrt(v))
        scale = math.sqrt(n)

    # explicit comparison kept on purpose: only True (or a value equal to
    # it) selects the paired test, exactly as before
    elif paired == True:
        if len(A) != len(B):
            raise Exception('A and B must have equal lengths '
                            'for paired comparisons')

        t, prob2, n, df, mu1, mu2, v1, v2 = _stats.ttest_rel(A, B)
        r, rprob2 = _stats.pearsonr(A, B)

        self.type = 't-Test: Paired Two Sample for means'
        self['t'] = t
        self['p2tail'] = prob2
        self['p1tail'] = prob2 / 2.
        self['n1'] = n
        self['n2'] = n
        self['r'] = r
        self['df'] = df
        self['mu1'] = mu1
        self['mu2'] = mu2
        self['var1'] = v1
        self['var2'] = v2

        # post-hoc power analysis inputs
        # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
        sd1, sd2 = math.sqrt(v1), math.sqrt(v2)
        cohen_d = abs(mu1 - mu2) / math.sqrt(v1 + v2 - 2 * r * sd1 * sd2)
        scale = math.sqrt(n)

    else:
        if equal_variance:
            (t, prob2, n1, n2, df,
             mu1, mu2, v1, v2, svar) = _stats.ttest_ind(A, B)
            self.type = 't-Test: Two-Sample Assuming Equal Variances'
        else:
            (t, prob2, n1, n2, df,
             mu1, mu2, v1, v2) = _stats.ttest_ind_uneq(A, B)
            self.type = 't-Test: Two-Sample Assuming Unequal Variances'

        self['t'] = t
        self['p2tail'] = prob2
        self['p1tail'] = prob2 / 2.
        self['n1'] = n1
        self['n2'] = n2
        self['df'] = df
        self['mu1'] = mu1
        self['mu2'] = mu2
        self['var1'] = v1
        self['var2'] = v2
        if equal_variance:
            self['vpooled'] = svar

        # post-hoc power analysis inputs
        # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
        #
        # the pooled standard deviation is calculated as:
        #     sqrt((v1+v2)/2.)
        # although wikipedia suggests a more sophisticated estimate might
        # be preferred:
        #     sqrt(((n1-1)*v1 + (n2-1)*v2)/(n1+n2))
        #
        # the biased estimate is used so that the results agree with G*power
        s = math.sqrt((v1 + v2) / 2.)
        cohen_d = abs(mu1 - mu2) / s
        # float() guards against integer (floor) division of the counts
        scale = math.sqrt((n1 * n2) / float(n1 + n2))

    # Critical values: a two-tailed test splits alpha across both tails, a
    # one-tailed test puts all of alpha into a single tail.
    self['tc2tail'] = scipy.stats.t.ppf((1. - alpha / 2.), df)
    self['tc1tail'] = scipy.stats.t.ppf((1. - alpha), df)

    # post-hoc power from the noncentral t distribution
    self['cohen_d'] = cohen_d
    self['delta'] = scale * self['cohen_d']
    self['power1tail'] = 1. - nctcdf(self['tc1tail'], df, self['delta'])
    self['power2tail'] = 1. - nctcdf(self['tc2tail'], df, self['delta'])