def types_diff(data):
    """Keep the features whose group difference survives Bonferroni correction.

    Every column except the last one is compared between the rows with
    ``controls_ms == 0`` and the rows with ``controls_ms == 1``. A feature
    that passes the Shapiro-Wilk normality check is compared with a t-test
    (Welch's variant when Bartlett's test rejects equal variances); any
    other feature is compared with a two-sided Mann-Whitney U test. The
    uncorrected p-values at or below 0.05 are then Bonferroni-corrected.

    Arguments:
    ---------
    data: neuro_data + clinical_data. Must contain the columns "age",
        "sex", "dd", "edss" and "controls_ms".

    Returns:
    -------
    pandas dataframe with the features that passed the bonferroni test,
    merged on the index with the clinical columns listed above
    """
    alpha = 0.05
    # NOTE(review): the last column is skipped; presumably it is
    # "controls_ms" -- confirm against the caller's column order.
    feats = data.iloc[:, :-1].columns.to_list()
    first_group = data["controls_ms"] == 0
    second_group = data["controls_ms"] == 1

    # Rows are collected in a plain list because DataFrame.append no longer
    # exists in pandas >= 2.0 (and was quadratic when used inside a loop).
    rows = []
    for connections in feats:
        _, p = shapiro(data[connections])
        sample1 = data.loc[first_group, connections]
        sample2 = data.loc[second_group, connections]
        if p > alpha:
            _, p = bartlett(sample1, sample2)
            # Equal variances are assumed unless Bartlett's test rejects them.
            homovar = not (p <= 0.05)
            _, p = ttest_ind(sample1, sample2, equal_var=homovar)
        else:
            _, p = mannwhitneyu(sample1, sample2, alternative='two-sided')
        if p <= 0.05:
            rows.append({'ROI': connections, 'pvalue': p})
    conn_stat = pd.DataFrame(rows, columns=['ROI', 'pvalue'])
    print('Statistically diferences in %d of connections' % len(conn_stat))

    diff = conn_stat.copy()
    if diff.empty:
        # Nothing to correct; a zero-length input would make the correction
        # divide by the number of tests (zero).
        diff["p_corr"] = pd.Series(dtype=float)
    else:
        p_corr = multipletests(diff["pvalue"],
                               alpha=0.05,
                               method="bonferroni",
                               is_sorted=False)
        diff["p_corr"] = p_corr[1]  # Bonferroni-corrected p-values

    # The message below says "FDR" but the correction applied is Bonferroni;
    # the text is left untouched so the printed output does not change.
    diff_fdr = diff[diff["p_corr"] < 0.05]
    print('Statistically diferences in %d of connections with FDR' %
          len(diff_fdr))

    diff_fa = diff_fdr["ROI"].tolist()
    fa_clinic = data.loc[:, ["age", "sex", "dd", "edss", "controls_ms"]]
    fa_har_bonferroni = data[diff_fa]
    fa_har_corr = pd.merge(fa_har_bonferroni,
                           fa_clinic,
                           left_index=True,
                           right_index=True)
    return fa_har_corr
def anova_by_group(data_df, resp_var, group_var):
    """One way anova.

    Fits ``resp_var ~ group_var`` with ``ols``, then displays a per-group
    description of the response, the type-2 ANOVA table (with an added
    ``mean_sq`` column) and Bartlett's test across the groups.

    Parameters
    ----------
    data_df : DataFrame holding both columns.
    resp_var : name of the response column.
    group_var : name of the grouping column.

    Returns
    -------
    None. All results are shown through ``markdown`` / ``display``.
    """
    model = ols(resp_var + ' ~ ' + group_var, data=data_df).fit()
    anova_df = sm.stats.anova_lm(model, typ=2)
    anova_df['mean_sq'] = anova_df['sum_sq'] / anova_df['df']

    args = []
    group_rows = []
    for group in data_df[group_var].unique():
        grouped_data_df = data_df.loc[data_df[group_var] == group]
        # Relabel the response row with the group value so that the combined
        # table has one row per group.
        group_describe_df = grouped_data_df.describe().T.rename(
            {resp_var: group})
        group_rows.append(
            group_describe_df.loc[group_describe_df.index == group])
        args.append(grouped_data_df[resp_var])
    # A single concat avoids growing the frame from an empty DataFrame.
    describe_df = pd.concat(group_rows) if group_rows else pd.DataFrame()

    markdown('#### Groups description')
    display(describe_df)
    markdown('#### ANOVA')
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    display(anova_df[['sum_sq', 'df', 'mean_sq', 'F',
                      'PR(>F)']].replace({np.nan: ''}))
    markdown('#### Bartlett\'s test of same variance')
    display(stats.bartlett(*args))
def tTestEqlVar(tValue1, tValue2, tTitle, rConclusion, frConclusion, yLabel,
                figName):
    """Run Bartlett's test on two samples, report it and save a boxplot.

    Prints ``tTitle``, the Bartlett statistic and p-value, and either
    ``rConclusion`` (p-value below 0.05) or ``frConclusion``. A boxplot of
    both samples is then saved to ``figName`` and shown; its super-title
    names the t-test variant that matches the variance result. No t-test is
    actually computed here.
    """
    alpha = .05
    tvar, p_valvar = stats.bartlett(tValue1, tValue2)
    print(tTitle)
    print(
        f"The t test statistic is {round(tvar,3)} and the p-value is {round(p_valvar,4)}"
    )

    variances_differ = p_valvar < alpha
    print(rConclusion if variances_differ else frConclusion)
    ttype = ('Welch (unequal variances) Two-Sample t test'
             if variances_differ else
             'Two-Sample t test (assuming equal variances)')

    # Boxplot of the two samples, annotated with the Bartlett result.
    plt.boxplot([tValue1, tValue2])
    plt.title(f't: {round(tvar,3)}, p-val: {round(p_valvar,4)}', size=10)
    plt.suptitle(ttype, size=10)
    plt.xticks(range(1, 3), [
        f"4 Bed rooms: {round(tValue1.mean(),2)}",
        f"5 Bed rooms: {round(tValue2.mean(),2)}"
    ])
    plt.ylabel(yLabel)
    plt.savefig(figName, bbox_inches='tight')
    plt.show()
def _homogeneity_tests(self):
    """Return Levene and Bartlett results for the first two data columns.

    The result is a DataFrame indexed by "Levene" and "Bartlett" with the
    columns "Test Statistic" and "P-value", rounded to three decimals.
    """
    first = self.__df.iloc[:, 0]
    second = self.__df.iloc[:, 1]
    outcomes = [stats.levene(first, second), stats.bartlett(first, second)]
    table = pd.DataFrame(
        {
            "Test Statistic": [outcome[0] for outcome in outcomes],
            "P-value": [outcome[1] for outcome in outcomes],
        },
        index=["Levene", "Bartlett"])
    return round(table, 3)
def bartlett(tamannoMuestras, poblacion):
    """Print Bartlett's statistic and p-value for four samples.

    Each sample is produced by ``muestra(poblacion, tamannoMuestras)``.
    """
    muestras = [muestra(poblacion, tamannoMuestras) for _ in range(4)]
    estadistico, valor_p = st.bartlett(*muestras)
    print("Bartlett Valor Estadistico %f" % estadistico)
    print("Bartlett Valor p %f" % valor_p)
def bartlett(data):
    """Run Bartlett's test for equal variances over the groups in ``data``.

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bartlett.html

    Parameters
    ----------
    data : sequence of array-likes, one per group. Any number of groups
        from two upwards is accepted.

    Returns
    -------
    bool
        True when the p-value is above 0.05 (equal variances are not
        rejected), False otherwise. With fewer than two groups the problem
        is reported through ``utils.print_error`` and False is returned.
    """
    if len(data) < 2:
        # Previously this path fell through to an unbound ``statistic``.
        utils.print_error("bartlett needs at least two groups of values")
        return False
    statistic, pvalue = stats.bartlett(*data)
    print("Bartlett Statistic " + str(statistic) + " and p-value " +
          str(pvalue))
    return bool(pvalue > 0.05)
def test_data(self):
    """Check Bartlett's statistic and p-value on the samples g1..g10."""
    # The samples are looked up by name, exactly as before.
    samples = [eval('g%d' % idx) for idx in range(1, 11)]
    statistic, p_value = stats.bartlett(*samples)
    assert_almost_equal(statistic, 20.78587342806484, 7)
    assert_almost_equal(p_value, 0.0136358632781, 7)
def kmo_Bartlett(x):
    """Compute a KMO measure and a Bartlett statistic for a data frame.

    Parameters
    ----------
    x : DataFrame whose columns are the variables; it is cast to float.

    Returns
    -------
    dict
        ``title`` and ``col`` hold the report labels; ``data`` holds one row
        with the KMO value, the Bartlett statistic and its p-value, each
        formatted with four decimals.

    Notes
    -----
    NOTE(review): the Bartlett figures come from scipy's equal-variance test
    applied to the columns of the correlation matrix, which is not the
    determinant-based sphericity test the labels suggest -- confirm this is
    intended.
    """
    dataset_corr = x.astype(float).corr()
    corr_columns = [
        dataset_corr.iloc[:, i] for i in range(dataset_corr.shape[1])
    ]
    statistic, pvalue = bartlett(*corr_columns)

    # Work on a plain ndarray so the [i, j] indexing below is positional
    # regardless of how numpy wraps results for DataFrame inputs.
    corr_values = np.asarray(dataset_corr)
    corr_inv = np.linalg.inv(corr_values)
    nrow_inv_corr, ncol_inv_corr = corr_values.shape

    # A is filled from the upper triangle of the inverse and mirrored.
    A = np.ones((nrow_inv_corr, ncol_inv_corr))
    for i in range(nrow_inv_corr):
        for j in range(i, ncol_inv_corr):
            A[i, j] = -(corr_inv[i, j]) / (math.sqrt(
                corr_inv[i, i] * corr_inv[j, j]))
            A[j, i] = A[i, j]

    # The squared diagonal of A is subtracted from both sums.
    diagonal_sq = np.sum(np.square(np.diagonal(A)))
    kmo_num = np.sum(np.square(corr_values)) - diagonal_sq
    kmo_denom = kmo_num + np.sum(np.square(A)) - diagonal_sq
    kmo_value = kmo_num / kmo_denom

    res = [[
        "{:.4f}".format(kmo_value), "{:.4f}".format(statistic),
        "{:.4f}".format(pvalue)
    ]]
    col = ["KMO检验统计量", "Bartlett's球状检验统计量", "Bartlett's球状检验显著性"]
    title = "KMO检验和Bartlett's球状检验"
    return {'title': title, 'col': col, 'data': res}
def fifth(data, data1, data2, data3):
    """Print ANOVA, pairwise t-tests and Bartlett tests for three samples.

    ``data`` is accepted but not used. The last line printed is the 0.975
    quantile returned by ``f.ppf`` with ``N - 1`` degrees of freedom on both
    sides; ``f`` and ``N`` are module-level names (presumably
    ``scipy.stats.f`` and the sample size -- confirm).
    """
    pairs = ((data1, data2), (data1, data3), (data2, data3))

    print(f_oneway(data1, data2, data3))
    for left, right in pairs:
        print(ttest_ind(left, right))
    for left, right in pairs:
        print(ttest_ind(left, right, equal_var=False))

    print(bartlett(data1, data2, data3))
    for left, right in pairs:
        print(bartlett(left, right))

    z = f.ppf(0.975, N - 1, N - 1)
    print(z)
def bartletts_test(table, response_cols, factor_col):
    """Run Bartlett's test for each response column, grouped by a factor.

    Parameters
    ----------
    table : DataFrame holding the response and factor columns.
    response_cols : iterable of response column names.
    factor_col : name of the column whose unique values define the groups.

    Returns
    -------
    dict
        ``{'result': {'result_table': DataFrame, 'report': ...}}`` where the
        table has the columns ``data``, ``estimate`` and ``p_value`` and the
        report is whatever ``ReportBuilder.get()`` returns.
    """
    groups = table[factor_col].unique()

    data_list = []
    stat_list = []
    p_list = []
    for response_col in response_cols:
        response = table[response_col]
        stat_bart, p_bart = bartlett(
            *[response[table[factor_col] == group] for group in groups])
        data_list.append('{response_col} by {factor_col}'.format(
            response_col=response_col, factor_col=factor_col))
        stat_list.append(stat_bart)
        p_list.append(p_bart)

    # DataFrame.from_items was removed in pandas 1.0; the explicit column
    # list keeps the same column order.
    result_table = pd.DataFrame(
        {
            'data': data_list,
            'estimate': stat_list,
            'p_value': p_list,
        },
        columns=['data', 'estimate', 'p_value'])

    result = dict()
    result['result_table'] = result_table

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## Bartlett's Test Result
    | - H0: k population variances are equal.
    | - H1: at least two variances are different.
    |
    | {result_table}
    """.format(result_table=pandasDF2MD(result_table))))
    result['report'] = rb.get()

    return {'result': result}
def _anova_assumptions(self, cl):
    """Build ``self.anova_assump_df`` with the ANOVA assumption checks.

    The first column (named after ``self.dep_var``) holds the Shapiro-Wilk
    statistic and p-value of the model residuals. Each further column, one
    per entry of ``self.indep_var``, holds an equal-variance test over the
    levels of that factor: Bartlett's test when the Shapiro p-value is above
    ``cl``, Levene's test otherwise.

    Parameters
    ----------
    cl : threshold compared against the Shapiro-Wilk p-value.
    """
    arrays = [['Normality (Shapiro-Wilk)', 'Normality (Shapiro-Wilk)',
               'Variance', 'Variance'],
              ['test stats', 'p-value', 'test stats', 'p-value']]
    temp = np.zeros((4, 1 + len(self.indep_var)))
    index = [self.dep_var]

    # Experimental errors are normally distributed
    temp[0, 0], temp[1, 0] = ss.shapiro(self.ols_model.resid)

    if temp[1, 0] > cl:
        variance_test, label = ss.bartlett, ' (Bartlett)'
    else:
        variance_test, label = ss.levene, ' (Levene)'

    for i, factor in enumerate(self.indep_var):
        # The column label is recorded for both tests; the Levene path used
        # to skip it, which left fewer labels than columns.
        index.append(factor)
        list_unique = self.df[factor].unique()
        # NOTE(review): the tested column is hard-coded as ``accuracy``
        # rather than taken from ``self.dep_var`` -- confirm.
        args = [self.df.loc[self.df[factor] == x].accuracy
                for x in list_unique]
        temp[2, i + 1], temp[3, i + 1] = variance_test(*args)

    arrays[0][2] = arrays[0][2] + label
    arrays[0][3] = arrays[0][3] + label

    self.anova_assump_df = pd.DataFrame(temp, index=arrays, columns=index)
    if self.print_output == True:
        print(' ------------------\n', 'ANOVA assumptions',
              '\n ------------------')
        print(self.anova_assump_df, '\n')
    return
def check_homogene_variances(self,
                             value_col,
                             group_col,
                             condition=False,
                             display_result=True):
    """Run Bartlett's test on the value sets selected for the given columns.

    The samples come from the private ``__get_condition_sets`` helper. When
    ``display_result`` is truthy a short report is printed, ending with a
    verdict based on ``self.alpha``.

    Returns
    -------
    tuple
        ``(stat, p)`` as returned by ``bartlett``.
    """
    samples = self.__get_condition_sets(self.df, value_col, group_col,
                                        condition)
    stat, p = bartlett(*samples)
    if not display_result:
        return stat, p

    print("### Homogeneity of Variances ###")
    if condition is False:
        summary = "{0:} between {1:}: stat={2:.5}, p={3:.5}".format(
            value_col, group_col, stat, p)
    else:
        summary = "{0:} in {1:} between {2:}: stat={3:.5}, p={4:.5}".format(
            value_col, condition, group_col, stat, p)
    print(summary)
    verdict = ('--> Homogene Variances'
               if p > self.alpha else '--> Non-Homogene Variances')
    print(verdict)
    print("")
    return stat, p
def test_data(self):
    """Check Bartlett's statistic and p-value on the samples g1..g10."""
    # The samples are looked up by name, exactly as before.
    samples = [eval('g%d' % idx) for idx in range(1, 11)]
    statistic, p_value = stats.bartlett(*samples)
    assert_almost_equal(statistic, 20.78587342806484, 7)
    assert_almost_equal(p_value, 0.0136358632781, 7)
def check_homegenity(col1,col2,verbose=False):
    """
    Check whether distances computed for 2 models are from the same distribution.

    When both columns pass ``check_normality`` a t-test is used, with equal
    or unequal variances according to Bartlett's test. Otherwise Levene's
    test is run (its outcome only affects the verbose output) followed by a
    Mann-Whitney U test. Returns True when the final p-value is above 0.05,
    False otherwise. Progress messages are printed only when ``verbose`` is
    exactly ``True``.
    """
    def say(message):
        if verbose is True:
            print(message)

    def verdict(p_value):
        if p_value > 0.05:
            say('Distributions are homogenous')
            return True
        say('Distributions are not homogenous')
        return False

    both_normal = (check_normality(col1) == True
                   and check_normality(col2) == True)
    if both_normal:
        # Check homogeneity of variances -- bartlett
        say('Performing bartlett test for equal variances')
        _, p = bartlett(col1, col2)
        if p > 0.05:
            say('T-test with equal variances')
            _, p = ttest_ind(col1, col2, equal_var=True)
        else:
            say('T-test with unequal variances')
            _, p = ttest_ind(col1, col2, equal_var=False)
        return verdict(p)

    # Check homogeneity of variances -- levene
    say('Performing levene test for equal variances')
    _, p = levene(col1, col2)
    if p > 0.05:
        say('Performing Mann-Whitney U test for equality of medians')
    else:
        say('Variances for non-normal data are not equal')
    _, p = mannwhitneyu(col1, col2)
    return verdict(p)
def test_rankings(ranks_df,num_times):
    '''Average the p-value of repeated t-tests between two rank columns.

    args:
        ranks_df: DataFrame with the columns 'apple_rank' and 'spotify_rank'
        num_times: int, number of resampled t-tests to run

    Each round draws 50 rows with replacement and compares the two columns
    with a t-test that assumes equal variances only when Bartlett's p-value
    is above .05. The mean p-value is printed as a verdict and returned.'''
    def one_round():
        ranks_sample = ranks_df.sample(50, replace=True)
        apple = ranks_sample['apple_rank']
        spotify = ranks_sample['spotify_rank']
        _, bart_p = stats.bartlett(apple, spotify)
        _, t_p = stats.ttest_ind(apple, spotify, equal_var=bart_p > .05)
        return t_p

    p_values = [one_round() for _ in range(num_times)]

    # average p-value over all rounds
    final_p = np.mean(p_values)
    if final_p > 0.05:
        print('No significant difference in rank across platforms!')
    else:
        print('Significant difference in rank across platforms!')
    return final_p
def get_stats_Emmanuelle(stats_type, groups, *data):
    '''
    author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args:
        stats_type: name of the statistic or test to compute
        groups: sized collection of group labels (only its length is read)
        data: the first extra positional argument is the list of samples
    Return:
        (result, labels) for a recognised ``stats_type``, otherwise None.
        Descriptive statistics give one value per sample; 'TTest', 'Welch'
        and 'MannWhitneyu' compare the first two samples; 'Bartlett',
        'Kruskal' and 'ANOVA' use every sample.
    '''
    samples = data[0]
    len(groups)  # kept: the original read the group count without using it

    per_sample = (
        ('mean', stats.tmean),
        ('std', stats.tstd),
        ('kurtosis', stats.kurtosis),
        ('skewness', stats.skew),
    )
    for label, measure in per_sample:
        if stats_type == label:
            return tuple(measure(sample) for sample in samples), label

    if stats_type == 'TTest':
        return stats.ttest_ind(samples[0], samples[1],
                               equal_var=True), ('t', 'p')
    if stats_type == 'Welch':
        return stats.ttest_ind(samples[0], samples[1],
                               equal_var=False), ('t', 'p')
    if stats_type == 'MannWhitneyu':
        try:
            return stats.mannwhitneyu(samples[0], samples[1]), ('u', 'p')
        except ValueError:
            return (0, 0), ('h', 'p')

    # Original note (translated): it remained to work out how to pass every
    # group to the Bartlett, Kruskal and ANOVA tests.
    if stats_type == 'Bartlett':
        return stats.bartlett(*samples), ('t', 'p')
    if stats_type == 'Kruskal':
        try:
            return stats.kruskal(*samples), ('h', 'p')
        except ValueError:
            return (0, 0), ('h', 'p')
    if stats_type == 'ANOVA':
        return stats.f_oneway(*samples), ('t', 'p')
    return None
def homo_variance(data, val_col, group_col, result=False):
    """Test the groups of ``val_col`` for equal variance.

    Bartlett's test is used when ``normal_dist`` returns a value above 0.05;
    otherwise Levene's test is used.

    data : pd.DataFrame
    val_col : str
        The name of columns which you test.
    group_col : strings or list
        The name of columns which you want to devide data.
    result : bool
        result = True -> Show p-value and result

    Returns
    -------
    tuple
        ``(pw_normal_dist, p_value)``: the value returned by ``normal_dist``
        and the p-value of the variance test.
    """
    one_d_data = [
        data.loc[ids, val_col].values
        for ids in data.groupby(group_col).groups.values()
    ]
    pw_normal_dist = normal_dist(data, val_col, group_col, result=result)

    # An exact 0.05 used to match neither branch, so nothing was returned;
    # it now takes the Levene path.
    if pw_normal_dist > 0.05:
        test_name = "bartlett"
        _, p_value = ss.bartlett(*one_d_data)
    else:
        test_name = "levene"
        _, p_value = ss.levene(*one_d_data)

    # Likewise a p-value of exactly 0.05 now counts as unequal variance.
    if p_value > 0.05:
        headline = "Equal variance"
        decision = "Null hypothesis is not rejected."
        meaning = "These data's variance are not different."
    else:
        headline = "Unequal variance"
        decision = "Null hypothesis is rejected."
        meaning = "These data's variance are not the same."

    print(headline)
    if result == True:
        # The report names the test that actually ran and states the
        # decision that matches the p-value.
        print("""
        ===========================================================
        p-value is {}.
        {}
        {}
        / from {} test
        ===========================================================""".format(
            p_value, decision, meaning, test_name))
    return pw_normal_dist, p_value
def levene_test(data):
    """Return Levene and Bartlett results for the first three columns.

    The result is ``[levene_stat, levene_p, bartlett_stat, bartlett_p]``,
    each rounded to two decimals.
    """
    columns = [data.iloc[:, position] for position in range(3)]
    outcome = (*levene(*columns), *bartlett(*columns))
    return [round(value, 2) for value in outcome]
def test_barlettTest_xResult(self):
    """Compare the statistic from ``bartlett_test`` with ``bartlett``."""
    samples = [
        randint(0, 100, 10),
        randint(500, 550, 10),
        randint(0, 10, 10),
        randint(0, 50, 10),
    ]
    x1, _ = bartlett_test(*samples)
    x2, _ = bartlett(*samples)
    assert pytest.approx(x2) == x1
def full_test(self, groups, alpha):
    """Run Bartlett's test on the samples of ``groups``.

    Returns
    -------
    tuple
        ``(statistic, (left, right), p_value, passed)`` where ``left`` and
        ``right`` come from ``self.critical_values(alpha)`` and ``passed``
        is ``statistic < right``.
    """
    samples = [sample.data() for sample in groups.groups()]
    statistic, p_value = bartlett(*samples)
    lower, upper = self.critical_values(alpha)
    return statistic, (lower, upper), p_value, statistic < upper
def homo_cal(self, x, norm_res, skews):
    '''Calculate and return the homoscedasticity test result.

    Parameters:
    ----------
    x : list of numpy.ndarray
        The variables to test on
    norm_res : dict
        Results of normality test
    skews : list
        Skewness values of variables of x

    Returns:
    -------
    homo_res : dict
        'Variables': number of variables tested
        'Statistic': statistic value calculated by the test
        'Pvalue': p-value calculated by the test
        'Test': name of the test used
        'Result': True if homogeneous, False otherwise

    Notes:
    ------
    Bartlett's test is used when every entry of norm_res is true and every
    absolute skewness is below .5; otherwise Levene's test is used with the
    center chosen by self.get_center(skews).'''
    every_variable_normal = sum(norm_res.values()) == len(x)
    if every_variable_normal and all(abs(np.array(skews)) < .5):
        outcome = ss.bartlett(*x)
        test_name = 'Bartlett Test'
    else:
        outcome = ss.levene(*x, center=self.get_center(skews))
        test_name = 'Levene Test'

    return {
        'Variables': len(x),
        'Statistic': outcome[0],
        'Pvalue': outcome[1],
        'Test': test_name,
        'Result': bool(outcome[1] >= .05),
    }
def equal_variance_test(df):
    """Test for heteroskedasticity with Bartlett's test.

    To test the entire data set, pass pd.concat([train, test]).

    NOTE(review): each ROW of the selected columns is passed to ``bartlett``
    as one sample; if the columns were meant to be the samples the array
    would need transposing -- confirm the intent.
    """
    selected = [
        'ProductCategory', 'MasterSKU', 'month', 'new_product',
        'price_change', 'cluster', 'monthly_sum_order_qty'
    ]
    rows = df[selected].values
    return bartlett(*rows)
def is_homoscedastic(residuals, y, ha_threshold=0.05, verbose=False):
    """Return True when Bartlett's test does not reject equal variances.

    ``residuals`` and ``y`` are the two samples given to ``bartlett``. The
    null hypothesis is rejected (False is returned) when the p-value is at
    or below ``ha_threshold``. Progress text goes through ``print_verbose``.
    """
    print_verbose(f"Testing for homoscedasticity with an alpha of: {str(ha_threshold)}. The null hypothesis is that the errors are homoscedastic.", verbose=verbose)
    result = bartlett(residuals, y)
    rejected = ha_threshold >= result[1]
    if not rejected:
        print_verbose(f"P-value for Bartlett test is {str(result[1])} which is greater than the threshold. We therefore do not reject the null hypothesis and accept the errors are homoscedastic.", verbose=verbose)
        return True
    print_verbose(f"P-value for Bartlett test is {str(result[1])} which is at or below the threshold. We therefore reject the null hypothesis and accept the errors are heteroscedastic.", verbose=verbose)
    return False
def fit(self, *args):
    """Perform Bartlett's test for equal variances.

    Parameters
    -------
    sample1, sample2,... : array_like
        arrays of sample data. May be different lengths.

    The statistic and p-value are stored on ``self._statistic`` and
    ``self._p``.
    """
    statistic, p_value = bartlett(*args)
    self._statistic = statistic
    self._p = p_value
def homoscedasticity_test(df, X, Y, test_type='levene'):
    """Return the p-value of an equal-variance test of ``Y`` grouped by ``X``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    X : name of the grouping column; each unique value forms one group.
    Y : name of the measured column.
    test_type : 'levene' (default) or 'bartlett'.

    Raises
    ------
    ValueError
        If ``test_type`` is neither 'levene' nor 'bartlett'.
    """
    groups = [df[df[X] == cls][Y] for cls in df[X].unique()]
    # Compared with ==: identity (``is``) only matched strings that the
    # interpreter happened to intern.
    if test_type == 'levene':
        _, p_value = st.levene(*groups)
    elif test_type == 'bartlett':
        _, p_value = st.bartlett(*groups)
    else:
        raise ValueError('{} not valid'.format(test_type))
    return p_value
def levene(self):
    """Test the variable ``self.var`` for equal variance between x and y.

    Prints the verdict and then calls ``self.Ttest`` with True for equal
    variances and False otherwise.

    NOTE(review): despite the method name and the printed header, the test
    that runs is ``stats.bartlett`` -- confirm which test is intended.
    """
    outcome = stats.bartlett(x[self.var], y[self.var])
    print('< 등분산성 검정 levene test >', end='\n ')
    variances_differ = outcome.pvalue < 0.05
    if variances_differ:
        print(str(self.var) + '변수는 두 집단 사이에서 분산이 같지 않다 => 이분산')
    else:
        print(str(self.var) + '변수는 두 집단 사이에서 분산이 같다 => 등분산')
    self.Ttest(not variances_differ)
def getSamples(df, val, survivaldict):
    """Compare positive survival values across the four classes of a labelling.

    ``val`` picks the label column: 0 -> 'subtypes', 1 -> 'novel', anything
    else -> 'sklearn'. For each class 0..3 the sample ids in
    ``df['samples']`` are looked up in ``survivaldict`` and only values
    greater than zero are kept. Bartlett, one-way ANOVA and Kruskal-Wallis
    p-values are printed and a boxplot is shown.

    Returns
    -------
    tuple
        The four group means followed by the four values returned by
        ``variance`` (the original local names called them "std").
    """
    col = 'subtypes' if val == 0 else 'novel' if val == 1 else 'sklearn'

    def positive_values(label):
        sample_ids = df.loc[df[col] == label, 'samples']
        converted = (float(survivaldict[i]) for i in sample_ids)
        return [value for value in converted if value > 0]

    groups = [positive_values(label) for label in range(4)]

    print("----Signficance tests----")
    # bartlett for equal variance
    _, bartp = stats.bartlett(*groups)
    print("Bartlett's test: p-value = ", bartp)
    # one-way anova for differences in mean -> ASSUMES equal variance
    _, anovap = stats.f_oneway(*groups)
    print("ANOVA test: p-value = ", anovap)
    # kruskal doesn't assume equal variance
    _, krusp = stats.kruskal(*groups)
    print("Kruskal test: p-value = ", krusp)

    plt.clf()
    plt.boxplot(groups)
    plt.show()

    means = [mean(group) for group in groups]
    spreads = [variance(group) for group in groups]
    return (*means, *spreads)
def bartlett(df, treatment_name_list, treatment_name, value_name):
    """
    Equal Variances (Bartlett's Test)

    One sample is built per entry of ``treatment_name_list`` from the rows
    of ``df`` whose ``treatment_name`` column equals that entry. The p-value
    is printed and the full test result returned.
    """
    samples = [
        df[value_name][df[treatment_name] == name]
        for name in treatment_name_list
    ]
    stat = stats.bartlett(*samples)
    print(f'p-value: {stat[1]}')
    return stat
def compare(data_old: pd.DataFrame, data_new: pd.DataFrame, t_test_var: str):
    """Print p-values comparing one column between two data frames.

    The mean is compared with a t-test run with ``equal_var=False`` and the
    variance with Bartlett's test. Nothing is returned.
    """
    old_values = data_old[t_test_var]
    new_values = data_new[t_test_var]

    # T test
    p = ttest_ind(old_values, new_values, equal_var=False)[1]
    print(f'p val for {t_test_var} mean: {p}')

    # Variance test
    p = bartlett(old_values, new_values)[1]
    print(f'p val {t_test_var} variance: {p}')
def check_homoscedasticy():
    """Print Levene and Bartlett results across the columns of the dataset.

    The feature frame is the first item returned by
    ``generate_year_06_dataset()``; each of its columns is one sample.
    """
    X, _, _, _ = generate_year_06_dataset()
    reports = (
        (levene, "levene: statistic = {}, p_value = {}"),
        (bartlett, "bartlett: statistic = {}, p_value = {}"),
    )
    for run_test, template in reports:
        statistic, p_value = run_test(*list(X.T.to_numpy()))
        print(template.format(statistic, p_value))
def bartlett(a, b, c):
    """Print Bartlett's test result for three samples.

    The statistic and p-value are printed between two divider lines,
    followed by a verdict: variances are reported as homogeneous when the
    p-value is above 0.05. Nothing is returned.
    """
    divider = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print(divider)
    print("Kiểm định Bartlett:")
    stat, pvalue = stats.bartlett(a, b, c)
    print("Statistic =", stat, "\n", "p value =", pvalue)
    verdict = ("Các features đồng nhất về phương sai" if pvalue > 0.05 else
               "Các features không đồng nhất về phương sai")
    print(verdict)
    print(divider)
def bartlett_pandas(group_vec, response_vec, min_size=5):
    '''
    Wrapper to run Bartlett's test on pandas Series
    ------------------------------------------------
    group_vec: Series of labels
    response_vec: Series of measurements
    min_size: smallest group size allowed; np.nan is returned when any
        label occurs fewer times than this

    Returns a Series indexed by 'T' and 'p'.
    '''
    smallest_group = group_vec.value_counts().min()
    if smallest_group < min_size:
        return np.nan

    group_vec, response_vec = _match_series(group_vec, response_vec)
    samples = [
        response_vec[group_vec == label] for label in group_vec.unique()
    ]
    return pd.Series(stats.bartlett(*samples), index=['T','p'])
def run(self):
    """Run an equal-variance test and store its outcome on the instance.

    Bartlett's test is used when ``NormTest`` reports a p-value above
    ``self._alpha``; otherwise Levene's test is used. The p-value, statistic
    and alpha go into ``self._results``; ``self._test`` and ``self._name``
    record which test ran.

    Raises
    ------
    NoDataError
        If fewer than two groups are available.
    """
    # The size comparison is evaluated but, as before, has no effect.
    if len(self._data) < self._min_size:
        pass
    if len(self._data.groups.values()) <= 1:
        raise NoDataError("Equal variance test requires at least two numeric vectors.")

    norm_p = NormTest(self._data, display=False, alpha=self._alpha).p_value
    if norm_p > self._alpha:
        variance_test, r = bartlett, 'Bartlett'
    else:
        variance_test, r = levene, 'Levene'

    statistic, p_value = variance_test(*self._data.groups.values())
    self._results.update({
        'p value': p_value,
        self._statistic_name[r]: statistic,
        'alpha': self._alpha,
    })
    self._test = r
    self._name = self._names[r]
def is_equal_variance(mesa1, mesa2):
    """
    Determine if two sets of values have equal variance.

    Bartlett's statistic is compared with the value returned by
    ``chisquare_critical(BARTLETT_CI, len(mesa1))``. This test only holds
    for a normal distribution; the caller should have checked that.

    TODO: Implement Levene's test for non-normally distributed data

    NOTE(review): ``len(mesa1)`` is passed as the second argument of
    ``chisquare_critical``; if that argument is the degrees of freedom,
    Bartlett's test with two groups would call for 1 -- confirm.
    """
    # scipy really doesn't like 0 variance
    first_variance = np.var(mesa1)
    if first_variance == 0 and first_variance == np.var(mesa2):
        return True

    # http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
    T, _ = stats.bartlett(mesa1, mesa2)
    return T <= chisquare_critical(BARTLETT_CI, len(mesa1))
def anova1(data, alpha):
    '''
    Returns a three-item list describing the group comparison.

    Inputs:
        data - list of numpy arrays with different length.
        alpha - significance level used for every decision below.

    Computational steps:
        1. Every array is checked with Shapiro-Wilk. If any p-value is below
           alpha the result is the Kruskal-Wallis outcome plus 'kr'.
        2. Otherwise Bartlett's test is run. If its p-value is above alpha
           the result is the one-way ANOVA outcome plus 'fs'.
        3. Otherwise the arrays are compared pairwise with welch_test and
           the result is [0, smallest p-value, 'wl'].
    '''
    # any() stops at the first non-normal array, like the original break.
    if any(alpha > st.shapiro(sample)[1] for sample in data):
        res = list(st.kruskal(*data))
        res.append('kr')
        return res

    if alpha < st.bartlett(*data)[1]:
        # All variances are equal: standard anova analysis.
        res = list(st.f_oneway(*data))
        res.append('fs')
        return res

    # All the data are normally distributed, but variances are different.
    # print() with a single argument behaves the same on Python 2 and 3.
    print('All data normal distributed')
    wta = []
    for k in data:
        # NOTE(review): an array is paired only with arrays that differ from
        # it in every element -- confirm this filter is the intended way to
        # skip the array itself.
        for j in [x for x in data if np.all(x != k)]:
            wta.append(welch_test(k, j)[1])
    return [0, min(wta), 'wl']
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots): #if plotPvalueCluster: #if pvalue cluster is needed: # from Bio.Cluster.cluster import * # from Bio.Cluster import * #endif #the real deal! plotData=[] xtickLabels=[] trendData={} annot={} minSize=-1 for inputFile,header,cols in zip(inputFiles,headers,valcols): fin=generic_istream(inputFile) startIdx=len(plotData) if firstColAnnot: colAnnot=cols[0] cols=cols[1:] annotThisFile=[] annot[startIdx]=annotThisFile else: colAnnot=-1 annotThisFile=None for col in cols: plotData.append([]) xtickLabels.append(header[col]) colIndices=range(startIdx,startIdx+len(cols)) if plotTrend: #print >> stderr,"plotTrend" trendDataThisFile=[] trendData[startIdx]=trendDataThisFile else: trendDataThisFile=None lino=0 for lin in fin: lino+=1 if lino<startRow: continue fields=lin.rstrip("\r\n").split(sep) if plotTrend: #print >> stderr,"a" trendDataThisLine=[] else: trendDataThisLine=None allDataOKThisLine=True if colAnnot>=0: annotThisFile.append(fields[colAnnot]) for idx,col in zip(colIndices,cols): try: value=float(fields[col]) if logb!=0: if value==0.0: raise ValueError value=log(value)/logb plotData[idx].append(value) if plotTrend: trendDataThisLine.append(value) #print >> stderr,"value:",value except: allDataOKThisLine=False if plotTrend: if allDataOKThisLine: trendDataThisFile.append(trendDataThisLine) else: trendDataThisFile.append(None) fin.close() if minSize==-1: minSize=len(plotData[idx]) #or startIDX? 
else: minSize=min([minSize,len(plotData[idx])]) if trimToMinSize: print >> stderr,"trimming to min size =",minSize trimData(plotData,minSize) if len(relabels)>0: #if len(relabels)!=len(xtickLabels): # print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels # exit() print >> stderr,xtickLabels print >> stderr,relabels for i,relabel in zip(range(0,len(relabels)),relabels): xtickLabels[i]=relabel for i in range(0,len(plotMedianForGroups)): plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i]) #drawing medians: medianToDraw=[] for mediangrouper in plotMedianForGroups: curD=[] for c in mediangrouper: curD.extend(plotData[c]) medianToDraw.append(median(curD)) for c in range(len(plotData)-1,-1,-1): if len(plotData[c])<minNDataToKeep: print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep del plotData[c] del xtickLabels[c] if not skipStat: print >> stdout,"student t-test (1 sample; mean=0)" print >> stdout,"sample","mean","p-val","median" if writeDataSummaryStat: fDSS=open(writeDataSummaryStat,"w") print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove" for x in range(0,len(plotData)): #print >> stderr, len(plotData[x]) try: print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x]) except: print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x]) if writeDataSummaryStat: sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1]) if NIN>1: #print >> stderr,"sumData=",sumData #print >> stderr,mean mea=mean2(sumData) DDOF=1 sd=std(sumData,ddof=DDOF) var=sd*sd mi=min(sumData) ma=max(sumData) else: mea="NA" sd="NA" var="NA" mi="NA" ma="NA" print >> 
fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N) pvalueM=[] if writeDataSummaryStat: fDSS.close() print >> stdout,"" print >> stdout,"student t-test (2 samples)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: try: pvalue=ttest_ind(plotData[x],plotData[y])[1] except: pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout, str(pvalue), pvalueRow.append(pvalue) print >> stdout,""; print >> stdout,"" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster) pvalueM=[] print >> stdout,"welch t-test" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: try: pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3] except: pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout, str(pvalue), pvalueRow.append(pvalue) print >> stdout,""; if outXYZPvalues: writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM) if plotPvalueCluster: 
makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster) print >> stdout,"" print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2 except: pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail) pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if outXYZPvalues: writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM) if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster) #####now the variance tests print >> stdout,"" print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: 
pvalue=ansari(plotData[x],plotData[y])[1] except: pvalue="NA" if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 #pvalue=1.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster) ##### #####now the variance tests print >> stdout,"" print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=fligner(plotData[x],plotData[y])[1] except: pvalue="NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster) ##### #####now the variance tests print >> stdout,"" print >> stdout,"Levene's Two-sample Test for equal variance" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for 
y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=levene(plotData[x],plotData[y])[1] except: pvalue="NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster) ##### #####now the variance tests print >> stdout,"" print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=bartlett(plotData[x],plotData[y])[1] except: pvalue="NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster) ##### figure(figsize=figsz) subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8) if 
len(titl)==0: titl=outputFile plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots) #ylim([0,200]) for m in medianToDraw: axhline(y=m,linestyle=':',color='gray') savefig(outputFile,bbox_inches="tight") if len(plotHistogramToFile)>0: drawHistogram(plotHistogramToFile,plotData,xtickLabels) drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
def create_stats_dict(df, group_var, continuous_measures=None, discrete_measures=None): ''' This function first groups the data frame (df) by the grouping variable (group_var) It then loops through the discrete and continuous measures and tests the two groups for equality of means and variances. Next, the continous measures and discrete measures are all compared for each group separately and the whole group together. All data is stored in the returned dictionary (stats_dict). For the discrete measures there are the following entries: * KEY: Number of participants, eg: Meds_male_n VALUE: Contingency table of the number of people in each combination of group_var and discrete measure * KEY: Output of fisher's exact test, eg: Meds_male_fisher VALUE: odds_ratio and p_value tuple For the continuous measures there are the following entries: * KEY: Number of observations, eg: Age_mean VALUE: Number of observations in each group of group_var * KEY: Mean value, eg: Age_mean VALUE: Mean value for the continuous measure for each group of group_var * KEY: Standard deviation, eg: Age_std VALUE: Sandard deviation for the continuous measure for each group of group_var * KEY: Percentile_values, eg: Age_perc25, Age_perc50, Age_perc75 VALUE: 25th, 50th and 75th percentile values for the continuous measure for each group of group_var * KEY: Median value, eg: Age_perc50 VALUE: Median value for the continuous measure for each group of group_var * KEY: Test of equal variance output, eg: Age_eqvar VALUE: t and p values for bartlett test of equal variance * KEY: Test of normality, eg: Age_normal VALUE: k2 and p values for omnibus test of normality for *ungrouped* data * KEY: Test of equal means given equal variance output, eg: Age_ttest_eqvar VALUE: t and p values for student's t-test * KEY: Test of equal means given unequal variance output, eg: Age_ttest_uneqvar VALUE: t and p values for Welsch's t-test * KEY: Test of equal medians, eg: Age_mannwhitneyu VALUE: U and p values for Mann 
Whitney U test ''' # Import what you need import numpy as np from scipy.stats import ttest_ind, bartlett, fisher_exact, pearsonr, mannwhitneyu, normaltest import itertools as it # Create the stats dictionary that we're going to fill in stats_dict = {} # Group the data frame by the grouping variable grouped = df.groupby(group_var) # Loop first through the discrete measures (if there are any) if discrete_measures: for measure in discrete_measures[0]: # Group the data frame by BOTH the grouping variable # and the discrete measure grouped_again = df.groupby([group_var, measure]) # Define the key for "n" entry to the stats dictionary key = '_'.join([group_var, measure, 'n']) # Add the number of members of each group into the stats_dict stats_dict[key] = grouped_again[measure].count().values[:] # If you have members in each of the four groups then # calculate the Fisher's exact test on this contingency table if len(np.array(grouped_again[measure].count())) == 4: # The n_array is the contingency table n_array = np.array(grouped_again[measure].count()).reshape([2,2]) # Define the key for the dictionary entry key = '_'.join([group_var, measure, 'fisher']) # And add the fisher's exact output to the stats_dict stats_dict[key] = fisher_exact(n_array) # Loop through the continuous measures if continuous_measures: for measure in continuous_measures: # Save some basic stats for each measure by group key = '_'.join([measure, 'n']) stats_dict[key] = grouped[measure].count().values[:] key = '_'.join([measure, 'mean']) stats_dict[key] = grouped[measure].mean().values[:] key = '_'.join([measure, 'std']) stats_dict[key] = grouped[measure].std().values[:] key = '_'.join([measure, 'perc25']) stats_dict[key] = grouped[measure].quantile(0.25).values[:] key = '_'.join([measure, 'perc50']) stats_dict[key] = grouped[measure].quantile(0.5).values[:] key = '_'.join([measure, 'median']) stats_dict[key] = grouped[measure].quantile(0.5).values[:] key = '_'.join([measure, 'perc75']) stats_dict[key] 
= grouped[measure].quantile(0.75).values[:] # Now save the output of tests of equal variance # and equal means # Use this snazzy little list manipulation to get the # group values values = [ g.values for n, g in grouped[measure] ] # If there are two groups if len(values) == 2: # Mask out the not a numbers values[0] = [ x for x in values[0] if not np.isnan(x) ] values[1] = [ x for x in values[1] if not np.isnan(x) ] # Conduct test for equal variance key = '_'.join([measure, 'eqvar']) stats_dict[key] = bartlett(values[0], values[1]) # Conduct test for normality key = '_'.join([measure, 'normal']) stats_dict[key] = normaltest(np.hstack([values[0], values[1]])) # When you test for equal means (ttest) you have different options # depending on if you have equal variances or not. You can also # run the non-parametric Mann Whitney U test # All three will be entered in the stats_dict # Conduct Welch's t-test (unequal variances) key = '_'.join([measure, 'ttest_uneqvar']) stats_dict[key] = ttest_ind(values[1], values[0], equal_var = False) # Conduct standard student's t-test (equal variances) key = '_'.join([measure, 'ttest_eqvar']) stats_dict[key] = ttest_ind(values[1], values[0], equal_var = True) # Conduct mann whitney U test (non-parametric test of medians) key = '_'.join([measure, 'mannwhitneyu']) u, p = mannwhitneyu(values[1], values[0]) stats_dict[key] = (u, p*2) # For two continuous measues we can calculate # PAIRWISE CORRELATIONS if continuous_measures: for a, b in it.combinations(continuous_measures,2): # First look at the whole group # mask out participants who don't have both measures mask = (df[a].notnull()) * (df[b].notnull()) # Enter the number of participants that were included for the # regression into your stats dict key = '_'.join([group_var, 'all', a, b, 'n']) stats_dict[key] = np.sum(mask) # Figure out the pairwise correlation for this pair of measures # and add it to your stats_dict a_values = df[a][mask].values b_values = df[b][mask].values key = 
'_'.join([group_var, 'all', a, b, 'pwcorr']) stats_dict[key] = pearsonr(a_values, b_values) # Then do the same thing for the groups individually for name, group in grouped: mask = (group[a].notnull()) * (group[b].notnull()) # Save the number of members of the group who were # included in the regression key = '_'.join([group_var, str(name), a, b, 'n']) stats_dict[key] = np.sum(mask) # And save the pairwise correlation a_values = group[a][mask].values b_values = group[b][mask].values key = '_'.join([group_var, str(name), a, b, 'pwcorr']) stats_dict[key] = pearsonr(a_values, b_values) # For a combination of a continous measure and a discrete measure # we can conduct TTESTS if discrete_measures: for discrete, a in it.product(discrete_measures[0], continuous_measures): # First look at the whole group grouped_discrete = df.groupby(discrete) values = [ g.values for n, g in grouped_discrete[a] ] if len(values) == 2: # Mask out the not a numbers values[0] = [ x for x in values[0] if not np.isnan(x) ] values[1] = [ x for x in values[1] if not np.isnan(x) ] # Conduct test for equal variance key = '_'.join([group_var, 'all', discrete, a, 'eqvar']) stats_dict[key] = bartlett(values[0], values[1]) # Conduct test for normality key = '_'.join([group_var, 'all', discrete, a, 'normal']) stats_dict[key] = normaltest(np.hstack([values[0], values[1]])) # When you test for equal means (ttest) you have different options # depending on if you have equal variances or not. 
You can also # run the non-parametric Mann Whitney U test # All three will be entered in the stats_dict # Conduct Welch's t-test (unequal variances) key = '_'.join([group_var, 'all', discrete, a, 'ttest_uneqvar']) stats_dict[key] = ttest_ind(values[1], values[0], equal_var = False) # Conduct standard student's t-test (equal variances) key = '_'.join([group_var, 'all', discrete, a, 'ttest_eqvar']) stats_dict[key] = ttest_ind(values[1], values[0], equal_var = True) # Conduct mann whitney U test (non-parametric test of medians) key = '_'.join([group_var, 'all', discrete, a, 'mannwhitneyu']) u, p = mannwhitneyu(values[1], values[0]) stats_dict[key] = (u, p*2) # Next look at the two groups separately: for name, group in grouped: grouped_discrete = group.groupby(discrete) values = [ g.values for n, g in grouped_discrete[a] ] if len(values) == 2: # Mask out the not a numbers values[0] = [ x for x in values[0] if not np.isnan(x) ] values[1] = [ x for x in values[1] if not np.isnan(x) ] # Conduct test for equal variance key = '_'.join([group_var, str(name), discrete, a, 'eqvar']) stats_dict[key] = bartlett(values[0], values[1]) # Conduct test for normality key = '_'.join([group_var, str(name), discrete, a, 'normal']) stats_dict[key] = normaltest(np.hstack([values[0], values[1]])) # When you test for equal means (ttest) you have different options # depending on if you have equal variances or not. 
You can also # run the non-parametric Mann Whitney U test # All three will be entered in the stats_dict # Conduct Welch's t-test (unequal variances) key = '_'.join([group_var, str(name), discrete, a, 'ttest_uneqvar']) stats_dict[key] = ttest_ind(values[1], values[0], equal_var = False) # Conduct standard student's t-test (equal variances) key = '_'.join([group_var, str(name), discrete, a, 'ttest_eqvar']) stats_dict[key] = ttest_ind(values[1], values[0], equal_var = True) # Conduct mann whitney U test (non-parametric test of medians) # NOTE that this returns a 1 tailed p value so we multiply it here key = '_'.join([group_var, str(name), discrete, a, 'mannwhitneyu']) u, p = mannwhitneyu(values[1], values[0]) stats_dict[key] = (u, p*2) # For combos of discrete measures then you can conduct # FISHER EXACT tests if discrete_measures: if len(discrete_measures) > 1: for a, b in it.combinations(discrete_measures[0], 2): # Look first at the whole group grouped_again = df.groupby([a,b]) if len(np.array(grouped_again[b].count())) == 4: key = '_'.join([group_var, 'all', a, b, 'n']) stats_dict[key] = grouped_again[b].count().values[:] # Now calculate the Fisher's exact test on this contingency table n_array = np.array(grouped_again[b].count()).reshape([2,2]) key = '_'.join([group_var, 'all', a, b, 'fisher']) stats_dict[key] = fisher_exact(n_array) # Now loop through the two groups separately for name, group in grouped: grouped_again = group.groupby([a,b]) if len(np.array(grouped_again[b].count())) == 4: key = '_'.join([group_var, 'all', a, b, 'n']) stats_dict[key] = grouped_again[b].count().values[:] # Now calculate the Fisher's exact test on this contingency table n_array = np.array(grouped_again[b].count()).reshape([2,2]) key = '_'.join([group_var, 'all', a, b, 'fisher']) stats_dict[key] = fisher_exact(n_array) return stats_dict
def bartlett(xy):
    """Run Bartlett's test for equal variances on a pair of samples.

    The two samples arrive packed in ONE argument so the function can be
    handed directly to helpers that pass a single item per call.
    The pair is unpacked in the body because tuple parameters
    (``def f((x, y))``) are a syntax error on Python 3.

    Arguments:
    ---------
    xy: sequence of exactly two array-likes, ``(x, y)``

    Returns:
    -------
    whatever ``stats.bartlett(x, y)`` returns (test statistic and p-value)
    """
    x, y = xy
    return stats.bartlett(x, y)
def main():
    """Apply one ``scipy.stats`` routine to every row of a tabular file.

    Command-line entry point. Each line of ``--infile`` is split on tabs;
    the 1-based column numbers in ``--sample_one_cols`` / ``--sample_two_cols``
    select the values of the first / second sample, and ``--sample_cols``
    (groups separated by ``;``) selects any number of samples for the
    k-sample tests. The routine named by ``--test_id`` is run on those
    samples and its results are appended to the row, which is then written
    tab-separated to ``--outfile``. A ``--test_id`` that matches no branch
    leaves the row unchanged.

    Conventions worth knowing:

    * ``--mf`` / ``--nf`` equal to 0 mean "no limits" for the trimmed
      statistics and histogram-like routines, and ``--imbda`` equal to 0
      means "estimate lambda" for ``boxcox``. These are compared with
      ``==``; an identity test (``is 0``) is never true for parsed floats.
    * Samples are converted to real lists of floats, because a lazy ``map``
      object (Python 3) cannot be consumed by SciPy.
    * NOTE(review): several routines called here (for example ``nanmean``,
      ``signaltonoise``, ``threshold``, ``histogram2``) and some call
      signatures (for example ``shapiro`` with ``reta``) only exist in old
      SciPy releases - confirm the SciPy version this tool is pinned to.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    # The default must be the boolean False: the string "False" is truthy
    # and would switch lexsort on even when the flag is absent.
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")
    args = parser.parse_args()

    infile = args.infile
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2

    def _floats(values):
        # A real list rather than a lazy ``map`` object (see docstring).
        return [float(value) for value in values]

    # Column selections; None means "option not given".
    barlett_samples = None
    if args.sample_cols is not None:
        barlett_samples = [
            [int(col) for col in sample.split(",")] for sample in args.sample_cols.split(";")
        ]
    sample_one_cols = None
    if args.sample_one_cols is not None:
        sample_one_cols = args.sample_one_cols.split(",")
    sample_two_cols = None
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
    has_sample_two = sample_two_cols is not None

    # 0 is the "unset" marker for the limits and for lambda.
    no_limits = nf == 0 and mf == 0

    # Both files are closed even if a statistic raises part-way through.
    with open(infile) as source, open(args.outfile, "w+") as outfile:
        for line in source:
            sample_one = []
            sample_two = []
            cols = line.strip().split("\t")
            if barlett_samples is not None:
                b_samples = columns_to_values(barlett_samples, line)
            if sample_one_cols is not None:
                for index in sample_one_cols:
                    sample_one.append(cols[int(index) - 1])
            if sample_two_cols is not None:
                for index in sample_two_cols:
                    sample_two.append(cols[int(index) - 1])

            test = test_id.strip()
            if test == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(_floats(sample_one))
                cols.append(size)
                cols.append(min_max)
                cols.append(mean)
                cols.append(uv)
                cols.append(bs)
                cols.append(bk)
            elif test == "mode":
                vals, counts = stats.mode(_floats(sample_one))
                cols.append(vals)
                cols.append(counts)
            elif test == "nanmean":
                m = stats.nanmean(_floats(sample_one))
                cols.append(m)
            elif test == "nanmedian":
                m = stats.nanmedian(_floats(sample_one))
                cols.append(m)
            elif test == "kurtosistest":
                z_value, p_value = stats.kurtosistest(_floats(sample_one))
                cols.append(z_value)
                cols.append(p_value)
            elif test == "variation":
                ra = stats.variation(_floats(sample_one))
                cols.append(ra)
            elif test == "itemfreq":
                freq = stats.itemfreq(_floats(sample_one))
                for row in freq:
                    elements = ",".join(map(str, row))
                    cols.append(elements)
            elif test == "boxcox_llf":
                IIf = stats.boxcox_llf(imbda, _floats(sample_one))
                cols.append(IIf)
            elif test == "tiecorrect":
                fa = stats.tiecorrect(_floats(sample_one))
                cols.append(fa)
            elif test == "rankdata":
                r = stats.rankdata(_floats(sample_one), method=args.md)
                cols.append(r)
            elif test == "nanstd":
                s = stats.nanstd(_floats(sample_one), bias=args.bias)
                cols.append(s)
            elif test == "anderson":
                A2, critical, sig = stats.anderson(_floats(sample_one), dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                # A literal "," column separates critical values from
                # significance levels.
                cols.append(",")
                cols.extend(sig)
            elif test == "binom_test":
                p_value = stats.binom_test(_floats(sample_one), n=args.n, p=args.p)
                cols.append(p_value)
            elif test == "gmean":
                gm = stats.gmean(_floats(sample_one), dtype=args.dtype)
                cols.append(gm)
            elif test == "hmean":
                hm = stats.hmean(_floats(sample_one), dtype=args.dtype)
                cols.append(hm)
            elif test == "kurtosis":
                k = stats.kurtosis(_floats(sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
                cols.append(k)
            elif test == "moment":
                # The order is passed positionally: SciPy's keyword for it
                # is not ``n`` and differs between releases.
                n_moment = stats.moment(_floats(sample_one), args.n)
                cols.append(n_moment)
            elif test == "normaltest":
                k2, p_value = stats.normaltest(_floats(sample_one))
                cols.append(k2)
                cols.append(p_value)
            elif test == "skew":
                skewness = stats.skew(_floats(sample_one), bias=args.bias)
                cols.append(skewness)
            elif test == "skewtest":
                z_value, p_value = stats.skewtest(_floats(sample_one))
                cols.append(z_value)
                cols.append(p_value)
            elif test == "sem":
                s = stats.sem(_floats(sample_one), ddof=args.ddof)
                cols.append(s)
            elif test == "zscore":
                z = stats.zscore(_floats(sample_one), ddof=args.ddof)
                cols.extend(z)
            elif test == "signaltonoise":
                s2n = stats.signaltonoise(_floats(sample_one), ddof=args.ddof)
                cols.append(s2n)
            elif test == "percentileofscore":
                p = stats.percentileofscore(_floats(sample_one), score=args.score, kind=args.kind)
                cols.append(p)
            elif test == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(_floats(sample_one), alpha=args.alpha)
                cols.append(c_mean)
                cols.append(c_var)
                cols.append(c_std)
            elif test == "sigmaclip":
                # NOTE(review): ``high`` is fed from --n (documented as the
                # number of trials) - confirm that this is intended.
                c, c_low, c_up = stats.sigmaclip(_floats(sample_one), low=args.m, high=args.n)
                cols.append(c)
                cols.append(c_low)
                cols.append(c_up)
            elif test == "kstest":
                d, p_value = stats.kstest(
                    _floats(sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
                )
                cols.append(d)
                cols.append(p_value)
            elif test == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    _floats(sample_one), correction=args.correction, lambda_=args.lambda_
                )
                cols.append(chi2)
                cols.append(p)
                cols.append(dof)
                cols.append(ex)
            elif test == "tmean":
                if no_limits:
                    mean = stats.tmean(_floats(sample_one))
                else:
                    mean = stats.tmean(_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(mean)
            elif test == "tmin":
                if mf == 0:
                    trimmed_min = stats.tmin(_floats(sample_one))
                else:
                    trimmed_min = stats.tmin(_floats(sample_one), lowerlimit=mf, inclusive=args.inclusive)
                cols.append(trimmed_min)
            elif test == "tmax":
                if nf == 0:
                    trimmed_max = stats.tmax(_floats(sample_one))
                else:
                    trimmed_max = stats.tmax(_floats(sample_one), upperlimit=nf, inclusive=args.inclusive)
                cols.append(trimmed_max)
            elif test == "tvar":
                if no_limits:
                    var = stats.tvar(_floats(sample_one))
                else:
                    var = stats.tvar(_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(var)
            elif test == "tstd":
                if no_limits:
                    std = stats.tstd(_floats(sample_one))
                else:
                    std = stats.tstd(_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(std)
            elif test == "tsem":
                if no_limits:
                    s = stats.tsem(_floats(sample_one))
                else:
                    s = stats.tsem(_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(s)
            elif test == "scoreatpercentile":
                if no_limits:
                    s = stats.scoreatpercentile(
                        _floats(sample_one), _floats(sample_two), interpolation_method=args.interpolation
                    )
                else:
                    s = stats.scoreatpercentile(
                        _floats(sample_one), _floats(sample_two), (mf, nf), interpolation_method=args.interpolation
                    )
                cols.extend(s)
            elif test == "relfreq":
                if no_limits:
                    rel, low_range, binsize, ex = stats.relfreq(_floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(_floats(sample_one), args.b, (mf, nf))
                cols.extend(rel)
                cols.append(low_range)
                cols.append(binsize)
                cols.append(ex)
            elif test == "binned_statistic":
                if no_limits:
                    st, b_edge, b_n = stats.binned_statistic(
                        _floats(sample_one), _floats(sample_two), statistic=args.statistic, bins=args.b
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        _floats(sample_one),
                        _floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=(mf, nf),
                    )
                cols.append(st)
                cols.append(b_edge)
                cols.append(b_n)
            elif test == "threshold":
                if no_limits:
                    o = stats.threshold(_floats(sample_one), newval=args.new)
                else:
                    o = stats.threshold(_floats(sample_one), mf, nf, newval=args.new)
                cols.extend(o)
            elif test == "trimboth":
                o = stats.trimboth(_floats(sample_one), proportiontocut=args.proportiontocut)
                cols.extend(o)
            elif test == "trim1":
                t1 = stats.trim1(_floats(sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
                cols.extend(t1)
            elif test == "histogram":
                if no_limits:
                    hi, low_range, binsize, ex = stats.histogram(_floats(sample_one), args.b)
                else:
                    hi, low_range, binsize, ex = stats.histogram(_floats(sample_one), args.b, (mf, nf))
                cols.append(hi)
                cols.append(low_range)
                cols.append(binsize)
                cols.append(ex)
            elif test == "cumfreq":
                if no_limits:
                    cum, low_range, binsize, ex = stats.cumfreq(_floats(sample_one), args.b)
                else:
                    cum, low_range, binsize, ex = stats.cumfreq(_floats(sample_one), args.b, (mf, nf))
                cols.append(cum)
                cols.append(low_range)
                cols.append(binsize)
                cols.append(ex)
            elif test == "boxcox_normmax":
                if no_limits:
                    ma = stats.boxcox_normmax(_floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(_floats(sample_one), (mf, nf), method=args.method)
                cols.append(ma)
            elif test == "boxcox":
                if imbda == 0:
                    # No lambda supplied: SciPy estimates it and also
                    # returns the confidence interval.
                    box, ma, ci = stats.boxcox(_floats(sample_one), alpha=args.alpha)
                    cols.append(box)
                    cols.append(ma)
                    cols.append(ci)
                else:
                    box = stats.boxcox(_floats(sample_one), imbda, alpha=args.alpha)
                    cols.append(box)
            elif test == "histogram2":
                h2 = stats.histogram2(_floats(sample_one), _floats(sample_two))
                cols.extend(h2)
            elif test == "ranksums":
                z_statistic, p_value = stats.ranksums(_floats(sample_one), _floats(sample_two))
                cols.append(z_statistic)
                cols.append(p_value)
            elif test == "ttest_1samp":
                t, prob = stats.ttest_1samp(_floats(sample_one), _floats(sample_two))
                cols.extend(t)
                cols.extend(prob)
            elif test == "ansari":
                AB, p_value = stats.ansari(_floats(sample_one), _floats(sample_two))
                cols.append(AB)
                cols.append(p_value)
            elif test == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    _floats(sample_one), _floats(sample_two)
                )
                cols.append(slope)
                cols.append(intercept)
                cols.append(r_value)
                cols.append(p_value)
                cols.append(stderr)
            elif test == "pearsonr":
                cor, p_value = stats.pearsonr(_floats(sample_one), _floats(sample_two))
                cols.append(cor)
                cols.append(p_value)
            elif test == "pointbiserialr":
                r, p_value = stats.pointbiserialr(_floats(sample_one), _floats(sample_two))
                cols.append(r)
                cols.append(p_value)
            elif test == "ks_2samp":
                d, p_value = stats.ks_2samp(_floats(sample_one), _floats(sample_two))
                cols.append(d)
                cols.append(p_value)
            elif test == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    _floats(sample_one), _floats(sample_two), use_continuity=args.mwu_use_continuity
                )
                cols.append(mw_stats_u)
                cols.append(p_value)
            elif test == "zmap":
                z = stats.zmap(_floats(sample_one), _floats(sample_two), ddof=args.ddof)
                cols.extend(z)
            elif test == "ttest_ind":
                mw_stats_u, p_value = stats.ttest_ind(
                    _floats(sample_one), _floats(sample_two), equal_var=args.equal_var
                )
                cols.append(mw_stats_u)
                cols.append(p_value)
            elif test == "ttest_rel":
                t, prob = stats.ttest_rel(_floats(sample_one), _floats(sample_two), axis=args.axis)
                cols.append(t)
                cols.append(prob)
            elif test == "mood":
                z, p_value = stats.mood(_floats(sample_one), _floats(sample_two), axis=args.axis)
                cols.append(z)
                cols.append(p_value)
            elif test == "shapiro":
                W, p_value, a = stats.shapiro(_floats(sample_one), _floats(sample_two), args.reta)
                cols.append(W)
                cols.append(p_value)
                cols.extend(a)
            elif test == "kendalltau":
                k, p_value = stats.kendalltau(
                    _floats(sample_one), _floats(sample_two), initial_lexsort=args.initial_lexsort
                )
                cols.append(k)
                cols.append(p_value)
            elif test == "entropy":
                s = stats.entropy(_floats(sample_one), _floats(sample_two), base=args.base)
                cols.append(s)
            elif test == "spearmanr":
                if has_sample_two:
                    rho, p_value = stats.spearmanr(_floats(sample_one), _floats(sample_two))
                else:
                    rho, p_value = stats.spearmanr(_floats(sample_one))
                cols.append(rho)
                cols.append(p_value)
            elif test == "wilcoxon":
                if has_sample_two:
                    T, p_value = stats.wilcoxon(
                        _floats(sample_one),
                        _floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        _floats(sample_one), zero_method=args.zero_method, correction=args.correction
                    )
                cols.append(T)
                cols.append(p_value)
            elif test == "chisquare":
                if has_sample_two:
                    rho, p_value = stats.chisquare(_floats(sample_one), _floats(sample_two), ddof=args.ddof)
                else:
                    rho, p_value = stats.chisquare(_floats(sample_one), ddof=args.ddof)
                cols.append(rho)
                cols.append(p_value)
            elif test == "power_divergence":
                if has_sample_two:
                    stat, p_value = stats.power_divergence(
                        _floats(sample_one), _floats(sample_two), ddof=args.ddof, lambda_=args.lambda_
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        _floats(sample_one), ddof=args.ddof, lambda_=args.lambda_
                    )
                cols.append(stat)
                cols.append(p_value)
            elif test == "theilslopes":
                if has_sample_two:
                    mpe, met, lo, up = stats.theilslopes(
                        _floats(sample_one), _floats(sample_two), alpha=args.alpha
                    )
                else:
                    mpe, met, lo, up = stats.theilslopes(_floats(sample_one), alpha=args.alpha)
                cols.append(mpe)
                cols.append(met)
                cols.append(lo)
                cols.append(up)
            elif test == "combine_pvalues":
                if has_sample_two:
                    stat, p_value = stats.combine_pvalues(
                        _floats(sample_one), method=args.med, weights=_floats(sample_two)
                    )
                else:
                    stat, p_value = stats.combine_pvalues(_floats(sample_one), method=args.med)
                cols.append(stat)
                cols.append(p_value)
            elif test == "obrientransform":
                ob = stats.obrientransform(*b_samples)
                for row in ob:
                    elements = ",".join(map(str, row))
                    cols.append(elements)
            elif test == "f_oneway":
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.append(f_value)
                cols.append(p_value)
            elif test == "kruskal":
                h, p_value = stats.kruskal(*b_samples)
                cols.append(h)
                cols.append(p_value)
            elif test == "friedmanchisquare":
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.append(fr)
                cols.append(p_value)
            elif test == "fligner":
                xsq, p_value = stats.fligner(
                    center=args.center, proportiontocut=args.proportiontocut, *b_samples
                )
                cols.append(xsq)
                cols.append(p_value)
            elif test == "bartlett":
                T, p_value = stats.bartlett(*b_samples)
                cols.append(T)
                cols.append(p_value)
            elif test == "levene":
                w, p_value = stats.levene(
                    center=args.center, proportiontocut=args.proportiontocut, *b_samples
                )
                cols.append(w)
                cols.append(p_value)
            elif test == "median_test":
                stat, p_value, m, table = stats.median_test(
                    ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
                )
                cols.append(stat)
                cols.append(p_value)
                cols.append(m)
                cols.append(table)
                for row in table:
                    elements = ",".join(map(str, row))
                    cols.append(elements)
            outfile.write("%s\n" % "\t".join(map(str, cols)))
# bartlett(*args): This package is used to perform Bartlett’s test for equal variances # levene(args, *kwds): This package is used to perform Levene test for equal variances # shapiro(x[, a, reta]): This package is used to perform the Shapiro-Wilk test for normality # anderson(x[, dist]): This package is used to perform the Anderson-Darling test for data coming from a particular distribution # anderson_ksamp(samples[, midrank]): This package is used to perform the Anderson-Darling test for k-samples ## one sample t_test t_statistic, p_value = stats.ttest_1samp(a= engineering_breaks, popmean= breaks.mean()) ## Bartlett for equal variance and then t_test fig, axes = plt.subplot(1,2, figsize=(14, 4)) sns.boxplot(x='acl', y='Grade', data=student, ax=axes[0]) sns.pointplot(x='acl', y='Grade', data=student, ax=axes[1]) grades_low = student['Grade']['acl'=='low'] grades_high = student['Grade']['acl'=='High'] stats.bartlett(grades_low, grades_high) stats.ttest_ind(grades_low, grades_high, equal_var=True) A = np.random.normal(25.0, 5.0, 100000) B = np.random.normal(25.0, 5.0, 100000) stats.ttest_ind(A, B) %%R confint(lm(booking_successful ~ assignment-1, data=tot_success)) ## chi-square test for goodness of fit observed = [102, 178, 186, 34] expected = [156, 165.5, 147, 31.5] chi_squared, p_value = stats.chisquare(f_obs= observed, f_exp= expected) chi_squared, p_value
nonathelete = data[data['Athlete'] == 0]['MileMinDur'] # Converting dataset from hh:mm:ss format to a numerical number: running time in minutes athelete=athelete.astype(str).reshape(athelete.size,1) nonathelete=nonathelete.astype(str).reshape(nonathelete.size,1) athelete=athelete[numpy.where(athelete!=[' '])] nonathelete=nonathelete[numpy.where(nonathelete!=[' '])] for i in range(numpy.shape(athelete)[0]) : h,m,s=athelete[i].split(':') athelete[i]=int(h)*60+int(m)+(int(s)/60.) for j in range(numpy.shape(nonathelete)[0]) : h,m,s=nonathelete[j].split(':') nonathelete[j]=int(h)*60+int(m)+(int(s)/60.) #Defining significance level alpha=0.05 #Perfoming Barlett's Test. This tests whether the populations variances are equal t, p = stats.bartlett(athelete,nonathelete) print 'barlett test statistic is', t, 'and p-value', p # Perfoming decision rule test rejects or fails to reject null hypothesis based on p-value and significance level if p <= alpha: print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal' else: print 'P-value is greater than significance level',alpha,', Barlett test fails to reject the null hypothesis. Variances are similar' #Performing Levene's Test. This tests whether the populations are equal t, p = stats.levene(athelete,nonathelete) print 'Levene test statistic is', t, 'and p-value', p # Perfoming decision rule test rejects or fails to reject null hypothesis based on p-value and significance level if p <= alpha: print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal' else: print 'P-value is greater than significance level',alpha,', Levene test fails to reject the null hypothesis. Variances are similar' #Performing Welch's Test. This test whether population means are equal
def analysis_3(df_Coredata, setumei, mokuteki):
    """Run a one-way ANOVA and Bartlett's equal-variance test, printing the results.

    The rows of *df_Coredata* are grouped by the distinct values of the
    explanatory column.  Every group is truncated to the size of the smallest
    group, the within/between sums of squares are derived by hand, and then
    ``f_oneway`` and ``stats.bartlett`` are applied to all groups.

    Parameters
    ----------
    df_Coredata : pandas.DataFrame
        Source data.
    setumei : column label
        Explanatory (grouping) column.
    mokuteki : column label
        Response column whose values are compared between groups.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the grouping column has fewer than two distinct values.
    """
    # References:
    #   http://lang.sist.chukyo-u.ac.jp/classes/PythonProbStat/Python-statistics6.html
    #   https://py3math.hatenablog.com/entry/oneway-anovatests1
    # The example data came from the book "すぐできる生物統計".
    #
    # One-way ANOVA:
    #   Overview: are the groups significantly different from each other?
    #   Null hypothesis: the groups share the same ***.

    # Distinct levels of the explanatory variable.
    u = df_Coredata[setumei].unique()
    print( "水準数 : " + str(len(u)) )
    if len(u) < 2:
        raise ValueError("analysis_3 needs at least two groups to compare")

    # Truncate every group to the smallest group size; padding to the largest
    # would introduce NaN and break the np.sum based computations below.
    valList = [len(df_Coredata[df_Coredata[setumei] == level]) for level in u]
    minVal = min(valList)
    print( "最小サンプル数 : " + str(minVal) )

    # One column per group, re-indexed 0..minVal-1 so the columns align.
    df_temp = pd.DataFrame(index=range(minVal))
    for level in u:
        group_rows = df_Coredata[df_Coredata[setumei] == level]
        group_rows.index = range(len(group_rows))
        df_temp[level] = group_rows[mokuteki]
    print(df_temp)

    # Manual derivation of the sums of squares.
    group_means = df_temp.mean()
    n_groups = len(df_temp.columns)
    # Broadcasting repeats each group mean down its column.
    GroupAverageMatrix = np.ones(df_temp.shape) * group_means.to_numpy()

    InGroup = np.array(df_temp - GroupAverageMatrix)
    InGroupSquareSum = np.sum(InGroup**2)

    OverallMean = np.sum(group_means) / n_groups
    InterGroup = GroupAverageMatrix - np.ones(df_temp.shape) * OverallMean
    InterGroupSquareSum = np.sum(InterGroup**2)

    # Between-group and within-group mean squares; only the latter is printed.
    Dividend = InterGroupSquareSum / (n_groups - 1.0)
    Divider = InGroupSquareSum / ((len(df_temp.index) - 1.0) * n_groups)
    print(Divider)

    #### Derivation of the F value
    print( df_temp[ u[0:len(u)] ] )
    # Pass every group, however many there are, instead of exactly four.
    samples = [df_temp[level] for level in u]
    res = f_oneway(*samples)
    print("f_oneway : " + str( res ))
    F_value, p_value = res

    # Is the null hypothesis rejected?
    if p_value < 0.05:
        print('p 値: {} < 0.05'.format(p_value))
        print('帰無仮説は棄却される。')
    else:
        print('p 値: {} > 0.05'.format(p_value))
        print('帰無仮説は棄却されない。')

    # Bartlett's test for equal variances across the same groups.
    bt_results = stats.bartlett(*samples)
    print(bt_results)
def calc_ttest_dict(a, b, paired=False):
    '''
    Compare two sets of data and collect the results in a dictionary.

    Student's t, Welch's t and Mann Whitney U tests are always run; the paired
    t-test and Wilcoxon signed rank test are added when ``paired`` is True.

    Parameters
    ----------
    a, b : iterable of numbers
        The two samples.  NaN values are removed before testing.  When
        ``paired`` is True a pair is dropped if either member is NaN, so the
        remaining observations stay aligned.
    paired : bool, optional
        Treat ``a[i]`` and ``b[i]`` as repeated measures of the same unit.

    Returns
    -------
    dict
        Keys: 'n', 'eqvar', 'normal', 'ttest_uneqvar', 'ttest_eqvar',
        'mannwhitneyu', 'medians', 'percentile25', 'percentile75', 'means',
        'stds', 'dfs', 'pooled_std', 'cohens_d' and, when ``paired``,
        'ttest_paired', 'wilcoxon', 'mean_difference', 'std_difference',
        'median_difference', 'percentile25_difference',
        'percentile75_difference', 'cohens_d_paired'.

    Raises
    ------
    ValueError
        If ``paired`` is True and the samples have different lengths.
    '''
    # Import what you need
    import numpy as np
    from scipy.stats import ttest_ind, ttest_rel, bartlett, mannwhitneyu, normaltest, wilcoxon

    stats_dict = {}

    # Mask out the not a numbers
    a = list(a)
    b = list(b)
    if paired:
        if len(a) != len(b):
            raise ValueError('paired samples must have the same length')
        # Drop whole pairs: masking each sample on its own would shift the
        # remaining values against each other and pair unrelated observations.
        kept = [(x, y) for x, y in zip(a, b)
                if not (np.isnan(x) or np.isnan(y))]
        a = [x for x, _ in kept]
        b = [y for _, y in kept]
    else:
        a = [x for x in a if not np.isnan(x)]
        b = [x for x in b if not np.isnan(x)]

    # Save number of people in each group
    stats_dict['n'] = (len(a), len(b))

    # Conduct test for equal variance
    stats_dict['eqvar'] = bartlett(a, b)

    # Conduct test for normality
    stats_dict['normal'] = normaltest(np.hstack([a, b]))

    # When you test for equal means (ttest) you have different options
    # depending on if you have equal variances or not. You can also
    # run the non-parametric Mann Whitney U test.
    # Alternatively these data may be paired so there's also the
    # paired t-test and the Wilcoxon signed rank test.

    # Conduct Welch's t-test (unequal variances)
    stats_dict['ttest_uneqvar'] = ttest_ind(a, b, equal_var=False)

    # Conduct standard student's t-test (equal variances)
    stats_dict['ttest_eqvar'] = ttest_ind(a, b, equal_var=True)

    # Conduct mann whitney U test (non-parametric test of medians)
    stats_dict['mannwhitneyu'] = mannwhitneyu(a, b)

    if paired:
        # Conduct the paired student's t-test
        stats_dict['ttest_paired'] = ttest_rel(a, b)

        # Conduct Wilcoxon signed rank test (non-parametric *paired* test of medians)
        stats_dict['wilcoxon'] = wilcoxon(a, b)

    # Save in the stats dict the various other measures you might
    # want to report
    stats_dict['medians'] = [np.percentile(a, 50), np.percentile(b, 50)]
    stats_dict['percentile25'] = [np.percentile(a, 25), np.percentile(b, 25)]
    stats_dict['percentile75'] = [np.percentile(a, 75), np.percentile(b, 75)]
    stats_dict['means'] = [np.mean(a), np.mean(b)]
    stats_dict['stds'] = [np.std(a), np.std(b)]

    # Plain float() replaces np.float, which newer NumPy no longer provides.
    df_a = float(stats_dict['n'][0]) - 1
    df_b = float(stats_dict['n'][1]) - 1
    stats_dict['dfs'] = [df_a, df_b]

    std_a = float(stats_dict['stds'][0])
    std_b = float(stats_dict['stds'][1])
    # Each group's variance is weighted by its own degrees of freedom
    # (the second term previously reused the first group's std).
    stats_dict['pooled_std'] = np.sqrt((df_a * std_a**2 + df_b * std_b**2)
                                       / (df_a + df_b))

    if paired:
        diff = np.array(b) - np.array(a)
        stats_dict['mean_difference'] = np.mean(diff)
        stats_dict['std_difference'] = np.std(diff)
        stats_dict['median_difference'] = np.percentile(diff, 50)
        stats_dict['percentile25_difference'] = np.percentile(diff, 25)
        stats_dict['percentile75_difference'] = np.percentile(diff, 75)
        mean_gap = float(stats_dict['mean_difference'])
        stats_dict['cohens_d_paired'] = mean_gap / float(stats_dict['std_difference'])
    else:
        # 'mean_difference' only exists for paired data; the difference of the
        # group means is the same quantity for independent samples.
        mean_gap = float(stats_dict['means'][1]) - float(stats_dict['means'][0])

    stats_dict['cohens_d'] = mean_gap / float(stats_dict['pooled_std'])

    return stats_dict
cur.append(stats.mstats.normaltest(Y)[1]) #plt.hist(Y) #stats.probplot(Y, dist="norm", plot=pylab) # F-test, strong normality required F = np.var(Xr)/np.var(Y) df1 = len(Xr) - 1; df2 = len(Y) - 1 alpha = 0.05 #Or whatever you want your alpha to be. p_value = stats.f.sf(F, df1, df2) # p-value = 1-CDF cur.append(p_value) if p_value < alpha: print "Reject the null hypothesis that Var(X) == Var(Y)" else: print "equal variance !" cur.append(stats.bartlett(Xr,Y)[1]) # require normal cur.append(stats.levene(Xr,Y,center='median')[1]) # for non-normal samples # t-test, after equal variance, test for mean cur.append(stats.ttest_ind(Xr, Y)[1]) cur.append(stats.ttest_ind(Xr, Y, equal_var=False)[1]) cur.append(stats.mannwhitneyu(Xr, Y)[1]) out.append(cur) alpha = .5 outary = np.array(out) col = 0 nrej = sum(outary[:,col]<alpha) print 'Fraction of reject normaltest on Xr: %.2f' % (nrej*1./nbootstrapp) col = 1 nrej = sum(outary[:,col]<alpha)
lexperiments = ['e140515'] for expname in lexperiments: datainfo = experiments[expname] f = h5py.File(datainfo.dpath + datainfo.name + '.hdf5', 'r') rslt = 2400000 step = 2400000 d = f[datainfo.datafiles[0] + '/' + 'RawFiltered'] datainit = d[0:step, 5] # print datainfo.datafiles[0], np.mean(datainit), np.std(datainit) for dfile in range(1,len(datainfo.datafiles)): d = f[datainfo.datafiles[dfile] + '/' + 'RawFiltered'] datanext = d[0:step, 5] bval, pval = bartlett(datainit, datanext) print datainfo.datafiles[dfile], np.mean(datanext), np.std(datanext) print datainfo.datafiles[dfile], bval, pval, '*' datainit = datanext # for s in range(len(datainfo.sensors)): # print dfile, datainfo.sensors[s] # for pos in range(1, length): # pvar = np.mean(d[pos*step:(pos*step)+rslt, s]) # pmean = np.std(d[pos*step:(pos*step)+rslt, s]) # #print pmean, pvar # show_signal(d[(pos-1)*step:((pos-1)*step)+rslt, s]) # show_signal(d[pos*step:(pos*step)+rslt, s])
from scipy import stats

# First assumption: equal variance across the 48 angle features (Bartlett's test).
angles = np.load('/Users/cyrilrocke/Documents/c_elegans/data/test1/data/angles.npy')

# One 1-D column per feature, plus its variance for inspection.
features = [angles[:, column] for column in range(48)]
Vars = [np.var(feature) for feature in features]

bartlett_p = stats.bartlett(*features)[1]
if bartlett_p < 0.05:
    print('We cant use Kmeans')

# Second assumption: cluster sizes are approximately uniformly distributed.
# This check matters less than the first one.
all_postures = np.load('/Users/cyrilrocke/Documents/c_elegans/data/arrays/all_postures.npy')

# Split each of the 39 space-separated posture strings into tokens.
ALL = [token for row in range(39) for token in all_postures[row].split(' ')]
# Remove the first empty token, then convert the tokens to integers.
ALL.remove('')
ALL = [int(token) for token in ALL]

chisquare_p = stats.chisquare(ALL)[1]
if chisquare_p < 0.05:
    print('chisquare fail')
def test_data(self):
    """Bartlett's statistic and p-value for g1..g10 match the expected values to 7 decimals."""
    samples = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10)
    statistic, p_value = stats.bartlett(*samples)
    assert_almost_equal(statistic, 20.78587342806484, 7)
    assert_almost_equal(p_value, 0.0136358632781, 7)
# 2samplepaired_braindata.py: import pandas from scipy import stats import numpy # reading data file data = pandas.read_csv('http://www.scipy-lectures.org/_downloads/brain_size.csv', sep=';', na_values=".") #Defining significance level alpha=0.05 #Perfoming Barlett's Test. This tests whether the populations variances are equal t, p = stats.bartlett(data['FSIQ'], data['PIQ']) print 'barlett test statistic is', t, 'and p-value', p # Perfoming decision rule test rejects or fails to reject null hypothesis based on p-value and significance level if p <= alpha: print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal' else: print 'P-value is greater than significance level',alpha,', Barlett test fails to reject the null hypothesis. Variances are similar' #Performing Levene's Test. This tests whether the populations are equal t, p = stats.levene(data['FSIQ'], data['PIQ']) print 'Levene test statistic is', t, 'and p-value', p # Perfoming decision rule test rejects or fails to reject null hypothesis based on p-value and significance level if p <= alpha: print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal' else: print 'P-value is greater than significance level',alpha,', Levene test fails to reject the null hypothesis. Variances are similar' #Performing Paired Samples t-test. # Null Hypothesis: Mean Full Scale IQ (FSIQ) and Mean Performance IQ (PIQ), measured on the same individuals, are equal. " t, p=stats.ttest_rel(data['FSIQ'], data['PIQ']) print 'Paired t- test statistic is', t, 'and p-value', p #Calculation of difference between mean point estimates: diffmean=abs(numpy.mean(data['FSIQ']) - numpy.mean(data['PIQ'])) # Perfoming decision rule test rejects or fails to reject null hypothesis based on p-value and significance level
def statdesc(data, missing='NaN', labels=None, alpha=.05, show=2):
    """
    Descriptive statistics of data.

    This function calculates the following statistics for each column
    (variable) of the input: mean and unbiased standard deviation [1]_,
    95% confidence interval (confidence limits for the mean) with unknown
    population STD [2]_, minimum and maximum, median, 25th and 75th
    percentiles [3]_, test for normality (Shapiro-Wilk's test) [4]_, and a
    test for equality of variances for all columns (Levene's or Bartlett's
    test) [5]_.

    This function also generates plots (if matplotlib is available) to
    visualize the data and shows the calculated statistics on screen.

    The input is copied before missing values are replaced, and the labels
    list is copied before default labels are filled in, so neither argument
    is modified.

    Parameters
    ----------
    data : array_like
        1D or 2D (column oriented) numerical data with possible missing values
    missing : string ('nan') or number (int or float), optional
        option to enter a number representing missing values (default = 'nan')
    labels : list of strings, optional
        labels for each column (variable) in data; missing or empty entries
        are replaced by the 1-based column number
    alpha : float, optional
        statistical significance level (to decide which test for equality of
        variances to use)
    show : integer (0 or 1 or 2), optional
        option to show plots with some descritive statistics (0: don't show
        any plot; 1: show plots only for the grouped data; 2: show plots for
        individual data as well as for the grouped data (default))

    Returns
    -------
    m_sd : array
        mean and unbiased standard deviation of each column (variable) in data
    ci : array
        95% confidence interval (confidence limits for the mean) with unknown
        population STD for each column (variable) in data
    min_max : array
        minimum and maximum of each column (variable) in data
    quartiles : array
        median, 25th and 75th percentiles of each column (variable) in data
    normality : array
        test for normality of each column (variable) in data (Shapiro-Wilk's
        test)
    eq_var : array
        test for equality of variances for all columns (variables) in data
        (Levene's or Bartlett's test)

    References
    ----------
    .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda356.htm
    .. [2] http://www.itl.nist.gov/div898/handbook/prc/section1/prc14.htm.
    .. [3] http://www.itl.nist.gov/div898/handbook/prc/section2/prc252.htm.
    .. [4] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm.
    .. [5] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm.

    Examples
    --------
    >>> import numpy as np
    >>> from statdesc import statdesc
    >>> y = np.random.randn(20,3)
    >>> statdesc(y)                # use the default options
    >>> y[8:12,1] = np.nan         # add a missing value
    >>> y[12,1] = 2                # add another missing value
    >>> statdesc(y, missing=2, labels=['A','B'], alpha=.01)  # set arguments
    >>> m_sd,ci,minmax,quartiles,normality,eq_var = statdesc(y)

    See Also
    --------
    scipy.stats.describe : Computes several descriptive statistics of the
        passed array
    """

    # Float copy: NaN can be stored whatever the input dtype, and the caller's
    # array is not overwritten when missing values are replaced below.
    data = np.array(data, dtype=float)
    if len(data.shape) == 1:
        data = data.reshape(data.shape[0], 1)
    # missing data: don't use masked arrray, some functions don't handle that
    if isinstance(missing, (int, float)) and not np.isnan(missing):
        # if missing option is string, must be 'NaN', then data has already NaN
        data[data == missing] = np.nan

    # Private copy of the labels: they are completed in place below, which
    # must not leak into the caller's list or into a shared default.
    labels = [] if labels is None else list(labels)

    n_cols = data.shape[1]
    m_sd = np.full((n_cols, 2), np.nan)
    ci = np.full((n_cols, 2), np.nan)
    min_max = np.full((n_cols, 2), np.nan)
    quartiles = np.full((n_cols, 3), np.nan)
    normality = np.full((n_cols, 2), np.nan)
    eq_var = np.full((1, 2), np.nan)
    x = []
    nmiss = 0
    min_len = 0
    for i in range(n_cols):
        # due to missing data, each column can have different length;
        # use list of arrays
        x.append(data[~np.isnan(data[:, i]), i])
        nmiss += data.shape[0] - x[i].shape[0]  # total number of missing value
        # skip empty array (data column with missing data only)
        if x[i].shape[0] == 0:
            print('Skipping column %d, only missing data' % (i + 1))
            continue
        # at least 2 sets with 3 points to test for equality of variances
        if x[i].shape[0] > 2:
            min_len += 1
        # handle labels: fill in the column number where no label was given
        # NOTE(review): after a skipped column a newly appended label lands at
        # a lower index than i - confirm how plot1var/statprint expect labels
        # to line up.
        if len(labels) <= i:
            labels.append(str(i+1))
        elif not labels[i]:
            labels[i] = str(i+1)
        # summary statistics
        m_sd[i], ci[i], min_max[i], quartiles[i], normality[i] = summary(x[i])
        if show > 1 and plt:  # PLOT
            #plot for each variable
            plot1var(data[:, i], x[i], m_sd[i], min_max[i], normality[i],
                     labels[i], alpha, data.shape[1])

    # remove empty arrays (data columns with missing data only)
    x = [column for column in x if column.size]

    # test for equality of variances
    if len(x) > 1 and min_len > 1:
        # at least 2 sets with 3 points to run this function
        # Levene's test is an alternative to the Bartlett test. The Levene test
        # is less sensitive than the Bartlett test to departures from normality
        # For data with nornal distribution, Bartlett's test has better
        # performance.
        # The documented role of `alpha` is to pick the test, so it replaces
        # the previously hard-coded .05 (identical for the default alpha).
        if np.all(normality[:, 1] > alpha):
            eq_var[0] = stats.bartlett(*x)
        else:
            eq_var[0] = stats.levene(*x, center='median')

    if show and plt:  # PLOT
        if data.shape[1] > 1:
            #summary plot
            plotallvar(data, x, min_max, eq_var, min_len, alpha, labels)
            #scatterplot matrix
            scatterplot(data, x, label=labels)

    #print results on screen
    statprint(m_sd, ci, min_max, quartiles, normality, eq_var, labels, alpha,
              data.shape[0], data.shape[1], nmiss, len(x))

    return m_sd, ci, min_max, quartiles, normality, eq_var
# Linear model y = a*x + b. It does not depend on the fiber, so it is built
# once and reused for every fit.
mod = Model(lambda x, a, b: a * x + b)
for fiber in fiber_list:
    # Slope of static firing rate against displacement and against force.
    slope_displ = mod.fit(fiber.binned_exp['static_fr_mean'],
                          x=fiber.binned_exp['displ_mean'],
                          a=1, b=1).best_values['a']
    slope_force = mod.fit(fiber.binned_exp['static_fr_mean'],
                          x=fiber.binned_exp['force_mean'],
                          a=1, b=1).best_values['a']
    slope_displ_list.append(slope_displ)
    slope_force_list.append(slope_force)
slope_displ_arr = np.array(slope_displ_list)
slope_force_arr = np.array(slope_force_list)

# One row per fiber ('#1', '#2', ...), one column per sensitivity.
sensitivity_df = pd.DataFrame(
    np.c_[slope_displ_arr, slope_force_arr],
    index=['#' + str(i+1) for i in range(slope_displ_arr.size)],
    columns=['Displacement sensitivity (Hz/mm)', 'Force sensitivity (Hz/mN)'])
# Normalise each sensitivity by its median; the new column is named after the
# first five characters of the source column ('Displ' / 'Force').
for column in sensitivity_df.columns:
    sensitivity_df[column[:5] + '_normalized'] = sensitivity_df[column] /\
        sensitivity_df[column].median()
sensitivity_df.transpose().to_excel('./csvs/sensitivity.xlsx')
print(sensitivity_df.var())

from scipy.stats import f, bartlett, levene
# Variance-ratio F test. A sample variance computed from n observations has
# n - 1 degrees of freedom, so both F parameters are n - 1 rather than n.
variance_dof = sensitivity_df.shape[0] - 1
print(f.cdf(sensitivity_df['Displ_normalized'].var() /
            sensitivity_df['Force_normalized'].var(),
            variance_dof, variance_dof))
print(bartlett(sensitivity_df['Displ_normalized'],
               sensitivity_df['Force_normalized']))
print(levene(sensitivity_df['Displ_normalized'],
             sensitivity_df['Force_normalized']))
def test_result_attributes(self):
    """The Bartlett result passes check_named_results for 'statistic' and 'pvalue'."""
    samples = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10)
    expected_fields = ('statistic', 'pvalue')
    check_named_results(stats.bartlett(*samples), expected_fields)
def test_empty_arg(self):
    """With an empty sample among the inputs, Bartlett's test yields (nan, nan)."""
    samples = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, [])
    outcome = stats.bartlett(*samples)
    assert_equal((np.nan, np.nan), outcome)
def main():
    """Analyse the validation results of one instance and write a LaTeX table.

    For every variant in ``VARIANTS`` the per-seed minimum treewidths are read
    from the ``validationresults`` table, summarised (best, best runtime,
    mean, standard deviation) and checked for normality.  Equal-variance,
    equal-mean and pairwise tests across the variants are printed to stderr,
    and the summary table is written to
    ``validation-validationset-<instance>-results.tex``.
    """
    def n_digits(num):
        """Return the number of digits in the integer part of *num* (at least 1)."""
        if num <= 1:
            return 1
        # Counting characters is exact; ceil(log10(num)) reports one digit too
        # few for exact powers of ten (10 -> 1, 100 -> 2, ...).
        return len(str(int(num)))

    db = sqlite.connect(db_fn)
    rows = []
    integer_digits = {'best': 0, 'best_time': 0, 'mean': 0, 'stddev': 0}
    allvals = []
    allvals_dict = {}
    try:
        dbc = db.cursor()
        # NOTE(review): variant/instance are interpolated straight into the SQL
        # text; that is only safe while they remain internal constants.
        for variant in VARIANTS:
            query = ("select tw from (select min(treewidth) as tw from validationresults where variant='%(variant)s' and instance='%(instance)s' group by seed)")
            result = dbc.execute(query % {'variant': variant, 'instance': instance})
            vals = NP.array([row[0] for row in result])
            # `best` instead of `min`, so the builtin is not shadowed.
            best, mean, stddev = vals.min(), vals.mean(), vals.std()
            # print('%s: vals=%r' % (variant, vals), file=sys.stderr)
            W, p = STATS.shapiro(vals)
            print('%s: normal distribution? shapiro-wilk: W=%s (p=%s) %s@5%% %s@2%%' % (variant, W, p, 'no' if W <= .905 else 'yes', 'no' if W <= .884 else 'yes'), file=sys.stderr)
            z, p = STATS.skewtest(vals)
            # NOTE(review): the .5 threshold is unusually high for a p-value -
            # confirm whether .05 was intended.
            print('%s: normal distribution? skew test: (z=%s) p=%s => %s' % (variant, z, p, 'no' if p < .5 else 'yes'), file=sys.stderr)
            allvals.append(vals)
            allvals_dict[variant] = vals

            # Fastest run that reached the best treewidth.
            query = ("select min(runtime_s)"
                     " from validationresults"
                     " where variant='%(variant)s' and instance='%(instance)s' and treewidth='%(treewidth)s'")
            result = dbc.execute(query % {'variant': variant, 'instance': instance, 'treewidth': best})
            best_time = [row[0] for row in result][0]
            # print("%s: best=%s @ %ss, avg=%s +- %s" % (variant, min, best_time, mean, stddev), file=sys.stderr)
            row = {'variant': variant, 'best': best, 'best_time': round(best_time, 1), 'mean': round(mean, 1), 'stddev': round(stddev, 1)}
            rows.append(row)
            # Widest integer part per column, used for the table-format specs.
            for key in ('best', 'best_time', 'mean', 'stddev'):
                integer_digits[key] = max(integer_digits[key], n_digits(row[key]))
    finally:
        # Close the connection even when a query or a test above raises.
        db.close()

    T, p = STATS.bartlett(*allvals)
    print('equal variances? bartlett: T=%s (p=%s) [vs Chi-Quadrat_{k-1=%s, alpha=.5}]' % (T, p, len(allvals) - 1), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='mean')
    print('equal variances? levene (mean): (W=%s) p=%s' % (W, p), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='median')
    print('equal variances? levene (median): (W=%s) p=%s' % (W, p), file=sys.stderr)
    F, p = STATS.f_oneway(*allvals)
    print('equal means? one-way ANOVA: F=%s, p=%s [vs F_{k-1=%s,n-k=%s}]' % (F, p, len(allvals) - 1, sum([len(x) for x in allvals]) - len(allvals)), file=sys.stderr)
    try:
        W, p = STATS.kruskal(*allvals)
        print('equal means? kruskal wallis: W=%s, p=%s' % (W, p), file=sys.stderr)
    except Exception as e:
        print(e)
    lsd = LSD.LSD(allvals, .05)
    print('LSD: %r' % lsd, file=sys.stderr)
    # NOTE(review): the group labels assume exactly 20 samples per variant.
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.10), file=sys.stderr)
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.05), file=sys.stderr)

    def welch(var1, var2):
        """Print Welch's t-test for two variants against the fixed corrected thresholds."""
        res = STATS.ttest_ind(allvals_dict[var1], allvals_dict[var2], equal_var=False)
        print('%4s vs %s t,p=%r => \t%s @a=10%%, %s @a=5%%' % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)

    print('pairwise Welch\'s t-test with Bonferroni correction:', file=sys.stderr)
    welch('IHA', 'MA1')
    welch('IHA', 'MA2')
    welch('IHA', 'MA3')
    welch('GAtw', 'MA1')
    welch('GAtw', 'MA2')
    welch('GAtw', 'MA3')
    welch('MA1', 'MA2')
    welch('MA1', 'MA3')
    welch('MA2', 'MA3')

    def mannwhitneyu(var1, var2):
        """Print the Mann-Whitney U test for two variants; failures are reported, not raised."""
        try:
            res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
            print('%4s vs %s u,p=%r => \t%s @a=10%%, %s @a=5%%' % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)
        except Exception as e:
            print('%4s vs %s failed: %r' % (var1, var2, e))

    print('pairwise Mann-Whitney U test with Bonferroni correction:', file=sys.stderr)
    mannwhitneyu('IHA', 'MA1')
    mannwhitneyu('IHA', 'MA2')
    mannwhitneyu('IHA', 'MA3')
    mannwhitneyu('GAtw', 'MA1')
    mannwhitneyu('GAtw', 'MA2')
    mannwhitneyu('GAtw', 'MA3')
    mannwhitneyu('MA1', 'MA2')
    mannwhitneyu('MA1', 'MA3')
    mannwhitneyu('MA2', 'MA3')

    #latex = [r'\begin{sidefigure}{caption={Results for instance \Instance{%(instanceTexEsc)s}},label={fig:%(instanceFileEsc)s-results},place={htbp}}''\n'
    #r' \begin{center}''\n'
    # Table header; the column widths come from the digit counts gathered above.
    latex = [r'\begin{table}[hbtp]''\n'
             r' \caption{Results for instance \Instance{%(instanceTexEsc)s}}''\n'
             r' \label{fig:%(instanceFileEsc)s-results}''\n'
             r' \centering\small''\n'
             r' \begin{tabular}{l S[table-format=%(best)s] S[table-format=%(best_time)s.1]%%''\n'
             r' S[table-format=%(mean)s.1,table-number-alignment=right] @{$\,\pm\,$} S[table-format=%(stddev)s.1,table-number-alignment=left]''\n'
             r' S[table-format=2]} \toprule''\n'
             r' & \multicolumn{2}{c}{\header{Best}} & \multicolumn{2}{c}{\header{Average}} & \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}''\n'
             r' & \header{treewidth} & \header{seconds} & \multicolumn{2}{c}{\header{treewidth}} & \header{samples} \\ \midrule'
             % dict(integer_digits.items() | dict(instanceTexEsc=instance.replace('_', r'\textunderscore{}'), instanceFileEsc=instance.replace('_', '-')).items())]
    for row in rows:
        latex.append(' ' * (3 * 3) + ' & '.join([row['variant'], str(row['best']), str(row['best_time']), str(row['mean']), str(row['stddev']), "20"]) + r'\\')
    latex.append(r' \bottomrule''\n'
                 r' \end{tabular}''\n'
                 r'\end{table}')
    #r' \end{center}''\n'
    #r'\end{sidefigure}')
    with open('validation-validationset-%s-results.tex' % instance.replace('_', '-'), 'w') as f:
        print('\n'.join(latex), file=f)
def ttest(data, dataLabel=None, paired=False, decimals=4):
    """
    Perform a t-test using Scipy.stats

    Both groups are first checked for normality (Shapiro-Wilk) and for equal
    variances (Bartlett).  The unpaired test uses Student's t-test when
    Bartlett's test does not reject equal variances (p > 0.05) and Welch's
    t-test otherwise; the paired test is a related-samples t-test.

    Parameters
    ----------
    data : Dictionary (default: None)
        Data format {'group1': [dataset], 'group2': [dataset]}.
        Must hold exactly two groups.
    dataLabel : string (default: None)
        title to use for print out of data; nothing is printed when None
    paired : Boolean (default: False)
        Set true to do "paired" t-test, false to do unpaired (independent
        samples) test.
    decimals : int (default 4)
        decimals in formatted printout

    Returns
    -------
    (df, t, p) : tuple
        df : degrees of freedom calculated assuming unequal variance
        t : t statistic for the difference
        p : p value

    Raises
    ------
    ValueError
        If data is not a dictionary with exactly two keys.
    """
    # test calling values
    if data is None or not isinstance(data, dict) or len(data.keys()) != 2:
        raise ValueError('RStats.ttest: data must be a dictionary with at exactly 2 keys' +
                         '\nUse KW (anova) for more than 2 groups')

    # dict views cannot be indexed on Python 3; materialise the keys first.
    k = list(data.keys())
    g1 = data[k[0]]
    g2 = data[k[1]]
    n1 = len(g1)
    n2 = len(g2)

    # The a/reta keywords were dropped from shapiro(); the default call returns
    # the same (W, p) pair.
    (w1, p1) = Stats.shapiro(g1)
    (w2, p2) = Stats.shapiro(g2)

    # do bartletss for equal variance
    Tb, pb = Stats.bartlett(g1, g2)
    equalVar = pb > 0.05

    if paired:
        # ttest_rel has no equal_var argument.
        (t, p) = Stats.ttest_rel(g1, g2)
    else:
        # equal_var belongs to the independent-samples test.
        (t, p) = Stats.ttest_ind(g1, g2, equal_var=equalVar)

    g1mean = np.mean(g1)
    g1std = np.std(g1)
    g2mean = np.mean(g2)
    g2std = np.std(g2)

    # Welch-Satterthwaite degrees of freedom; the second group's term is
    # divided by its own n2 - 1 (it previously reused n1 - 1).
    # NOTE(review): np.std defaults to the population SD (ddof=0) - confirm
    # whether the sample SD was intended for df and the printed SD.
    df = (g1std**2/n1 + g2std**2/n2)**2 / (((g1std**2 / n1)**2 / (n1 - 1) +
                                           ((g2std**2 / n2)**2 / (n2 - 1))))

    if dataLabel is not None:
        testtype = 'Paired' if paired else 'Independent'
        n = max(len(l) for l in k)
        print('\n%s T-test, data set = %s' % (testtype, dataLabel))
        # Shapiro-Wilk rejects normality for small p, so both groups look
        # normal only when neither p-value falls below 0.05.
        if p1 > 0.05 and p2 > 0.05:
            print(u' Both data sets appear normally distributed: Shapiro-Wilk Group 1 p = {:6.3f}, Group2 p = {:6.3f}'.format(p1, p2))
        else:
            print(u' ****At least one Data set is NOT normally distributed****\n Shapiro-Wilk Group 1 p = {:6.3f}, Group2 p = {:6.3f}'.format(p1, p2))
            print (u' (performing test anyway, as requested)')
        if equalVar:
            print(u' Variances are equivalent (Bartletts test, p = {:.3f})'.format(pb))
        else:
            print(u' Variances are unequal (Bartletts test, p = {:.3f}); not assuming equal variances'.format(pb))
        print(u' {:s}={:8.{pc}f}\u00B1{:.{pc}f} (mean, SD)'.format(k[0].rjust(n), g1mean, g1std, pc=decimals))
        print(u' {:s}={:8.{pc}f}\u00B1{:.{pc}f} (mean, SD)'.format(k[1].rjust(n), g2mean, g2std, pc=decimals))
        print(u' t({:6.2f})={:8.4f} p={:8.6f}\n'.format(df, float(t), float(p)))

    return(df, float(t), float(p))