def test_trimmed1(self):
    # Test that center='trimmed' gives the same result as center='mean'
    # when proportiontocut=0.
    Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean')
    Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed',
                                proportiontocut=0.0)
    assert_almost_equal(Xsq1, Xsq2)
    assert_almost_equal(pval1, pval2)
def test_trimmed2(self):
    x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
    y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
    # Use center='trimmed'
    Xsq1, pval1 = stats.fligner(x, y, center='trimmed',
                                proportiontocut=0.125)
    # Trim the data here, and use center='mean'
    Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean')
    # Result should be the same.
    assert_almost_equal(Xsq1, Xsq2)
    assert_almost_equal(pval1, pval2)
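# Editor's sketch (not from the scipy test suite): test_trimmed1 above relies on
# module-level fixtures g1, g2, g3 defined elsewhere in the test class. A
# standalone equivalent of the same check, with made-up groups:
from numpy.testing import assert_almost_equal
from scipy import stats

g1 = [1.0, 2.0, 3.0, 4.0, 5.0]
g2 = [2.0, 4.0, 6.0, 8.0, 10.0]
g3 = [0.5, 1.5, 2.5, 3.5, 4.5]
Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean')
Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0)
assert_almost_equal(Xsq1, Xsq2)
assert_almost_equal(pval1, pval2)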
def _scipy_fk_test(self, mode='median', alpha=0.01):
    """
    Fligner-Killeen test for differences in data variances.

    The scipy implementation uses the chi-squared approximation to
    calculate the FK statistic's p-value.

    Parameters
    ----------
    mode : str, optional
        Measure of central tendency passed to ``scipy.stats.fligner`` as ``center``.
    alpha : float, optional
        Significance level for rejecting the null hypothesis of equal variances.

    Returns
    -------
    h : int
        0 if no break was found, 1 if a break was found.
    stats_fk : dict
        Fligner test statistic and p-value.
    """
    q0 = self.get_group_data(0, self.df_test_resampled, ['Q'])
    q1 = self.get_group_data(1, self.df_test_resampled, ['Q'])

    with warnings.catch_warnings():  # suppress scipy warnings
        warnings.filterwarnings('ignore')
        fstats, pval = fligner(q0, q1, center=mode)

    stats_fk = {'z': fstats, 'pval': pval}
    if stats_fk['pval'] <= alpha:  # with the chi-squared approximation
        h = 1
    else:
        h = 0

    return h, stats_fk
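# Editor's sketch (made-up data, not the class attributes used above): the
# docstring mentions scipy's chi-squared approximation; the p-value returned by
# fligner() is the chi-squared survival function of the statistic with
# k - 1 degrees of freedom (here k = 2 groups, so 1 dof).
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
q0 = rng.normal(size=50)             # group with unit variance
q1 = rng.normal(scale=2.0, size=50)  # group with larger variance
xsq, pval = stats.fligner(q0, q1, center='median')
assert np.isclose(pval, stats.chi2.sf(xsq, df=1))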
def test_significance(y1, y2):
    # Test the hypothesis of normality
    y1_shapiro = stats.shapiro(y1)
    print(y1_shapiro)
    y2_shapiro = stats.shapiro(y2)
    print(y2_shapiro)
    if y1_shapiro[1] >= 0.05 and y2_shapiro[1] >= 0.05:
        print('Distributions of quantities are normal')
        # Test the hypothesis of equal variances
        fligner_test = stats.fligner(y1, y2)
        print(fligner_test)
        # t-test (only if the distributions are normal)
        if fligner_test[1] < 0.05:
            print('Variances are not equal')
            ttest_result = stats.ttest_ind(y1, y2, equal_var=False)
        else:
            print('Variances are equal')
            ttest_result = stats.ttest_ind(y1, y2, equal_var=True)
        print(ttest_result)
        if ttest_result[1] >= 0.05:
            print('Differences in predictions are not significant.')
        else:
            print('Differences in predictions are significant.')
    else:
        print('Distributions of quantities are not normal')
        # Wilcoxon test (if the distributions are not normal)
        wilcoxon_result = stats.wilcoxon(y1, y2)
        print(wilcoxon_result)
        if wilcoxon_result[1] >= 0.05:
            print('Differences in predictions are not significant.')
        else:
            print('Differences in predictions are significant.')
def test_flignerKileenTest_xResult(self):
    data_1 = [51, 87, 50, 48, 79, 61, 53, 54]
    data_2 = [82, 91, 92, 80, 52, 79, 73, 74]
    data_4 = [85, 80, 65, 71, 67, 51, 63, 93]
    data_3 = [79, 84, 74, 98, 63, 83, 85, 58]
    x1, p1 = fligner_kileen_test(data_1, data_2, data_3, data_4, center='median')
    x2, p2 = fligner(data_1, data_2, data_3, data_4, center='median')
    assert pytest.approx(x2) == x1
def stats_tests():
    global errors
    tests = ['Brown-Forsythe', 'Bartlett', 'Levene', 'Fligner-Killeen']
    securities = list(container.index)
    indicators = list(container.columns)
    output = pd.DataFrame(index=pd.MultiIndex.from_product([securities, indicators]),
                          columns=tests)
    for security in securities:
        for indicator in indicators:
            all = pd.Series(container.loc[security][indicator]['all'])
            signal = pd.Series(container.loc[security][indicator]['signal'])
            all = pd.to_numeric(all, errors='coerce')
            signal = pd.to_numeric(signal, errors='coerce')
            # Brown-Forsythe test: Levene's test with the median as center
            try:
                output.loc[security, indicator][tests[0]] = stats.levene(
                    all, signal, center='median'
                )
            except Exception:
                errors.append([security, indicator, tests[0]])
            try:
                output.loc[security, indicator][tests[1]] = stats.bartlett(
                    all, signal
                )
            except Exception:
                errors.append([security, indicator, tests[1]])
            # Classic Levene's test with the mean as center
            try:
                output.loc[security, indicator][tests[2]] = stats.levene(
                    all, signal, center='mean'
                )
            except Exception:
                errors.append([security, indicator, tests[2]])
            try:
                output.loc[security, indicator][tests[3]] = stats.fligner(
                    all, signal
                )
            except Exception:
                errors.append([security, indicator, tests[3]])
    p_values = output.dropna().applymap(lambda x: x.pvalue).unstack()
    p_values_container = output.dropna().applymap(lambda x: x.pvalue).unstack().melt()
    p_values.to_pickle('p_values_full')
    p_values_container.to_pickle('p_values_container_full')
def print_parametric_info(dfs, df_valid, key):
    values = []
    for df in dfs:
        print(df.conditionType.iloc[0])
        print(stats.shapiro(df[key]))
        plt.figure(len(values))
        plt.hist(df[key])
        # stats.probplot(df[key], plot=plt)
        values.append(df[key])
        print('')
    print('general')
    plt.figure(len(values))
    plt.hist(df_valid[key])
    print(stats.shapiro(df_valid[key]))
    print(stats.fligner(*values))
def vector_hypotheses(a, b):
    dict_stat = {}
    dict_pval = {}
    pea = pearsonr(a, b)
    dict_stat["pearsonr"], dict_pval["pearsonr"] = pea[0], pea[1]
    ran = ranksums(a, b)
    dict_stat["ranksums"], dict_pval["ranksums"] = ran[0], ran[1]
    moo = mood(a, b)
    dict_stat["mood"], dict_pval["mood"] = moo[0], moo[1]
    fli = fligner(a, b)
    dict_stat["fligner"], dict_pval["fligner"] = fli[0], fli[1]
    ans = ansari(a, b)
    dict_stat["ansari"], dict_pval["ansari"] = ans[0], ans[1]
    bar = bartlett(a, b)
    dict_stat["bartlett"], dict_pval["bartlett"] = bar[0], bar[1]
    lev = levene(a, b)
    dict_stat["levene"], dict_pval["levene"] = lev[0], lev[1]
    man = mannwhitneyu(a, b)
    dict_stat["mannwhitneyu"], dict_pval["mannwhitneyu"] = man[0], man[1]
    return dict_stat, dict_pval
def isHomogeneous(df, alpha, levene=True):
    print("\nChecking if all the columns are homogeneous by Levene or Fligner-Killeen test...\n")
    # column names to list (drop the last column)
    h = list(df.columns.values)[:-1]
    # column values to lists
    col1 = df[h[0]].tolist()
    col2 = df[h[1]].tolist()
    col3 = df[h[2]].tolist()
    col4 = df[h[3]].tolist()
    col5 = df[h[4]].tolist()
    col6 = df[h[5]].tolist()
    col7 = df[h[6]].tolist()
    col8 = df[h[7]].tolist()
    col9 = df[h[8]].tolist()
    col10 = df[h[9]].tolist()
    col11 = df[h[10]].tolist()
    L, p_val = ss.levene(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11)
    F, p = ss.fligner(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11)
    if levene:
        if p_val < alpha:
            print("\n It is not a homogeneous dataset (Levene)\n")
        else:
            print("\n It is a homogeneous dataset (Levene)\n")
    else:
        if p < alpha:
            print("\n It is not a homogeneous dataset (Fligner-Killeen)\n")
        else:
            print("\n It is a homogeneous dataset (Fligner-Killeen)\n")
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--infile", required=True, help="Tabular file.") parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.") parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument( "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;", ) parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help= "Whether a continuity correction (1/2.) should be taken into account.", ) parser.add_argument( "--equal_var", action="store_true", default=False, help= "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.", ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values.", ) parser.add_argument( "--fisher", action="store_true", default=False, help="if true then Fisher definition is used", ) parser.add_argument( "--bias", action="store_true", default=False, help= "if false,then the calculations are corrected for statistical bias", ) parser.add_argument( "--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored", ) parser.add_argument( "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored", ) parser.add_argument( "--inclusive", action="store_true", default=False, help="if false,limit will be ignored", ) parser.add_argument( "--printextras", action="store_true", default=False, help= "If True, if there are extra points a warning is raised saying how many of those points there are", ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help= "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.", ) parser.add_argument( "--correction", action="store_true", default=False, help="continuity correction ", ) parser.add_argument( "--axis", type=int, default=0, help= "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)", ) parser.add_argument( "--n", type=int, default=0, help= "the number of trials. This is ignored if x gives both the number of successes and failures", ) parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram") parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction") parser.add_argument( "--score", type=int, default=0, help="Score that is compared to the elements in a.", ) parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help= "The hypothesized probability of success. 0 <= p <= 1. 
The default value is p = 0.5", ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument( "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds", ) parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end.", ) parser.add_argument( "--lambda_", type=float, default=1.0, help= "lambda_ gives the power in the Cressie-Read power divergence statistic", ) parser.add_argument( "--imbda", type=float, default=0, help= "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.", ) parser.add_argument( "--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e", ) parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, "w+") test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols is not None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(";"): barlett_samples.append(map(int, sample.split(","))) if args.sample_one_cols is not None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(",") if args.sample_two_cols is not None: sample_two_cols = args.sample_two_cols.split(",") sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split("\t") if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(cols[int(index) - 1]) if sample2 == 1: for index in sample_two_cols: sample_two.append(cols[int(index) - 1]) if test_id.strip() == "describe": size, min_max, mean, uv, bs, bk = stats.describe( map(float, sample_one)) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == "mode": vals, counts = stats.mode(map(float, sample_one)) cols.append(vals) cols.append(counts) elif test_id.strip() == "nanmean": m = stats.nanmean(map(float, sample_one)) cols.append(m) elif test_id.strip() == "nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "kurtosistest": z_value, p_value = stats.kurtosistest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "itemfreq": freq = stats.itemfreq(map(float, sample_one)) for list in freq: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == 
"nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "boxcox_llf": IIf = stats.boxcox_llf(imbda, map(float, sample_one)) cols.append(IIf) elif test_id.strip() == "tiecorrect": fa = stats.tiecorrect(map(float, sample_one)) cols.append(fa) elif test_id.strip() == "rankdata": r = stats.rankdata(map(float, sample_one), method=args.md) cols.append(r) elif test_id.strip() == "nanstd": s = stats.nanstd(map(float, sample_one), bias=args.bias) cols.append(s) elif test_id.strip() == "anderson": A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist) cols.append(A2) for list in critical: cols.append(list) cols.append(",") for list in sig: cols.append(list) elif test_id.strip() == "binom_test": p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p) cols.append(p_value) elif test_id.strip() == "gmean": gm = stats.gmean(map(float, sample_one), dtype=args.dtype) cols.append(gm) elif test_id.strip() == "hmean": hm = stats.hmean(map(float, sample_one), dtype=args.dtype) cols.append(hm) elif test_id.strip() == "kurtosis": k = stats.kurtosis( map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias, ) cols.append(k) elif test_id.strip() == "moment": n_moment = stats.moment(map(float, sample_one), n=args.n) cols.append(n_moment) elif test_id.strip() == "normaltest": k2, p_value = stats.normaltest(map(float, sample_one)) cols.append(k2) cols.append(p_value) elif test_id.strip() == "skew": skewness = stats.skew(map(float, sample_one), bias=args.bias) cols.append(skewness) elif test_id.strip() == "skewtest": z_value, p_value = stats.skewtest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "sem": s = stats.sem(map(float, sample_one), ddof=args.ddof) cols.append(s) elif test_id.strip() == "zscore": z = stats.zscore(map(float, sample_one), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "signaltonoise": s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof) cols.append(s2n) elif test_id.strip() == "percentileofscore": p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind) cols.append(p) elif test_id.strip() == "bayes_mvs": c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha) cols.append(c_mean) cols.append(c_var) cols.append(c_std) elif test_id.strip() == "sigmaclip": c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n) cols.append(c) cols.append(c_low) cols.append(c_up) elif test_id.strip() == "kstest": d, p_value = stats.kstest( map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode, ) cols.append(d) cols.append(p_value) elif test_id.strip() == "chi2_contingency": chi2, p, dof, ex = stats.chi2_contingency( map(float, sample_one), correction=args.correction, lambda_=args.lambda_) cols.append(chi2) cols.append(p) cols.append(dof) cols.append(ex) elif test_id.strip() == "tmean": if nf == 0 and mf == 0: mean = stats.tmean(map(float, sample_one)) else: mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(mean) elif test_id.strip() == "tmin": if mf == 0: min = stats.tmin(map(float, sample_one)) else: min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive) cols.append(min) elif test_id.strip() == "tmax": if nf == 0: max = stats.tmax(map(float, sample_one)) else: max 
= stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive) cols.append(max) elif test_id.strip() == "tvar": if nf == 0 and mf == 0: var = stats.tvar(map(float, sample_one)) else: var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(var) elif test_id.strip() == "tstd": if nf == 0 and mf == 0: std = stats.tstd(map(float, sample_one)) else: std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(std) elif test_id.strip() == "tsem": if nf == 0 and mf == 0: s = stats.tsem(map(float, sample_one)) else: s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(s) elif test_id.strip() == "scoreatpercentile": if nf == 0 and mf == 0: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation, ) else: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation, ) for list in s: cols.append(list) elif test_id.strip() == "relfreq": if nf == 0 and mf == 0: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b) else: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b, (mf, nf)) for list in rel: cols.append(list) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "binned_statistic": if nf == 0 and mf == 0: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, ) else: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, range=(mf, nf), ) cols.append(st) cols.append(b_edge) cols.append(b_n) elif test_id.strip() == "threshold": if nf == 0 and mf == 0: o = stats.threshold(map(float, sample_one), newval=args.new) else: o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new) for list in o: cols.append(list) elif test_id.strip() == "trimboth": o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut) for list in o: cols.append(list) elif test_id.strip() == "trim1": t1 = stats.trim1( map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail, ) for list in t1: cols.append(list) elif test_id.strip() == "histogram": if nf == 0 and mf == 0: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b) else: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b, (mf, nf)) cols.append(hi) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "cumfreq": if nf == 0 and mf == 0: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b) else: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b, (mf, nf)) cols.append(cum) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "boxcox_normmax": if nf == 0 and mf == 0: ma = stats.boxcox_normmax(map(float, sample_one)) else: ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method) cols.append(ma) elif test_id.strip() == "boxcox": if imbda == 0: box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha) cols.append(box) cols.append(ma) cols.append(ci) else: box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha) cols.append(box) elif test_id.strip() == "histogram2": h2 = stats.histogram2(map(float, sample_one), map(float, sample_two)) for list in h2: cols.append(list) elif 
test_id.strip() == "ranksums": z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two)) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for list in t: cols.append(list) for list in prob: cols.append(list) elif test_id.strip() == "ansari": AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two)) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two)) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two)) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two)) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two)) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity, ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two), equal_var=args.equal_var) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta) cols.append(W) cols.append(p_value) for list in a: cols.append(list) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort, ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two)) else: rho, p_value = stats.spearmanr(map(float, sample_one)) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( map(float, sample_one), zero_method=args.zero_method, correction=args.correction, ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof) else: rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( map(float, 
sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_, ) else: stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha) else: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( map(float, sample_one), method=args.med, weights=map(float, sample_two), ) else: stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for list in ob: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for list in table: elements = ",".join(map(str, list)) cols.append(elements) outfile.write("%s\n" % "\t".join(map(str, cols))) outfile.close()
print('Anderson-Darling test results')
AD_test(df_high_quality)
AD_test(df_normal_quality)

######################
##  Variance Test   ##
######################
print('\n-- Variance Test section --')

# The variance homogeneity test will be done using the Fligner-Killeen test
matrix_fligner = [['Dataset', 'Statistic', 'p-value']]

statistic, p_value = stats.fligner(
    df_high_quality.iloc[:, 0], df_high_quality.iloc[:, 1],
    df_high_quality.iloc[:, 2], df_high_quality.iloc[:, 3],
    df_high_quality.iloc[:, 4], df_high_quality.iloc[:, 5],
    df_high_quality.iloc[:, 6], df_high_quality.iloc[:, 7],
    df_high_quality.iloc[:, 8], df_high_quality.iloc[:, 9],
    df_high_quality.iloc[:, 10], df_high_quality.iloc[:, 11])
matrix_fligner.append(['df_high_quality', statistic, p_value])

statistic, p_value = stats.fligner(
    df_normal_quality.iloc[:, 0], df_normal_quality.iloc[:, 1],
    df_normal_quality.iloc[:, 2], df_normal_quality.iloc[:, 3],
    df_normal_quality.iloc[:, 4], df_normal_quality.iloc[:, 5],
    df_normal_quality.iloc[:, 6], df_normal_quality.iloc[:, 7],
    df_normal_quality.iloc[:, 8], df_normal_quality.iloc[:, 9],
    df_normal_quality.iloc[:, 10], df_normal_quality.iloc[:, 11])
matrix_fligner.append(['df_normal_quality', statistic, p_value])
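# Editor's sketch (not part of the original script): stats.fligner() accepts any
# number of samples, so the twelve per-column arguments above can also be passed
# by unpacking a list comprehension. A toy DataFrame is used here purely for
# illustration.
import pandas as pd
from scipy import stats

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                    'b': [2.0, 4.0, 6.0, 8.0],
                    'c': [1.5, 1.6, 1.4, 1.7]})
statistic, p_value = stats.fligner(*[toy[col] for col in toy.columns])
print(statistic, p_value)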
def test_data(self):
    # numbers from R: fligner.test in package stats
    x1 = np.arange(5)
    assert_array_almost_equal(stats.fligner(x1, x1**2),
                              (3.2282229927203536, 0.072379187848207877), 11)
import matplotlib.pyplot as plt
import seaborn as sns

sns.distplot(sco1, kde=False, fit=stats.norm)
sns.distplot(sco2, kde=False, fit=stats.norm)
plt.show()

# Normality checks
print(stats.shapiro(sco1), '\n')  # 0.36799 > 0.05, so sco1 is normally distributed
print(stats.shapiro(sco2), '\n')  # 0.67141 > 0.05, so sco2 is normally distributed

# Homogeneity of variance
print(stats.levene(sco1, sco2).pvalue, '\n')   # most common test; 0.45684 > 0.05, so equal variances hold
print(stats.fligner(sco1, sco2).pvalue, '\n')
print(stats.bartlett(sco1, sco2).pvalue, '\n')

print(stats.ttest_ind(sco1, sco2), '\n')  # when equal variances hold (the default)
# Ttest_indResult(statistic=-0.19649386929539883, pvalue=0.8450532207209545)
# Interpretation: p-value (0.8450) > 0.05, so the null hypothesis is retained

print(stats.ttest_ind(sco1, sco2, equal_var=False))  # when equal variances do not hold

# If normality is not satisfied:
# stats.wilcoxon(sco1, sco2)
# stats.kruskal()
# stats.mannwhitneyu()
# hfmt = dates.DateFormatter('%H:%M')
# ax.xaxis.set_major_formatter(hfmt)
# y_formatter = mpl.ticker.ScalarFormatter(useOffset=False)
# ax.yaxis.set_major_formatter(y_formatter)
# ax.grid(True)
f.suptitle("Density of the power gradients")
f.autofmt_xdate()
plt.savefig("images/sonnenfinsternis-dichte-gradienten.png")  # , bbox_inches='tight'
plt.clf()

friday_series, friday_vals = ecdf.get_ecdf(friday_momentum_df.momentum)
ecdf.plot_ecdf_curve(friday_series, friday_vals, color="b", label="Typical Friday")
eclipse_series, eclipse_vals = ecdf.get_ecdf(eclipse_momentum_df.momentum)
ecdf.plot_ecdf_curve(eclipse_series, eclipse_vals, color="r", label="Solar eclipse")

print("Median of all Fridays: %f" % np.median(friday_momentum_df.momentum))
print("Median during the solar eclipse: %f" % np.median(eclipse_momentum_df.momentum))

# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html#scipy.stats.levene
W, p_val_levene = stats.levene(friday_momentum_df.momentum, eclipse_momentum_df.momentum, center='median')
print("Levene's test for equality of variances: P=%s (variances differ for p<=0.05)" % p_val_levene)
W, p_val = stats.fligner(friday_momentum_df.momentum, eclipse_momentum_df.momentum)
print("Fligner's test for equality of variances: P=%s" % p_val)

# Report the Levene p-value in the figure title, as the label says
f.suptitle("ECDF of the power gradients: unequal variances (Levene, p=%f)" % p_val_levene)
plt.savefig("images/sonnenfinsternis-ecdf-gradienten.png")  # , bbox_inches='tight'
# =============================================================================
# ~ Analysis - Homoscedasticity
# =============================================================================
sep_("Homoscedasticity Analysis")

# We use a significance level of 0.05 to evaluate homoscedasticity between the
# two groups with the Fligner-Killeen test, since the data are not normal.
# The only variable showing homoscedasticity turns out to be SibSp.
print('\n--Fligner-Killeen test:',
      '\tH0: the variance is equal in both groups (homoscedasticity)',
      '\tH1: the variance differs between the groups (heteroscedasticity)',
      sep='\n')

alpha = 0.05
res = pd.DataFrame(columns=['Variable', 'Estadistic', 'p-valor', 'H0'])
for var in ['Age', 'Fare', 'SibSp', 'Parch']:
    stat, p = fligner(X0[var], X1[var])
    res = res.append(
        {
            'Variable': var,
            'Estadistic': stat,
            'p-valor': round(p, 6),
            'H0': p >= alpha
        },
        ignore_index=True)
print(res)
res_homo = res.copy()

# =============================================================================
# ~ Analysis - Central tendency
# =============================================================================
sep_("Central Tendency Analysis")
def fligner(pair):
    # Unpack the (x, y) pair explicitly: tuple parameter unpacking in the
    # function signature is Python 2 syntax and was removed in Python 3.
    x, y = pair
    return stats.fligner(x, y)
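# Hypothetical usage (an assumption, not shown in the original source): the
# single-pair signature suggests the wrapper is meant to be mapped over a list
# of (x, y) sample pairs, e.g. with map() or multiprocessing.Pool.map:
pairs = [([1.0, 2.0, 3.0, 4.0, 5.0], [2.0, 4.0, 6.0, 8.0, 10.0])]
results = list(map(fligner, pairs))  # each result holds (statistic, pvalue)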
print("test mean: %s std: %s" % (test_mean, sqrt(test_var))) # percent uplifts in test mean over control mean percent_uplift_mean = ((test_mean - control_mean)/control_mean)*100 print("percent uplift in %s mean over %s mean: %s" % (test_group, control_group, percent_uplift_mean)) ######################### Hypothesis Testing # compute pearsonr test for h_A:r_test != r_control pearsonr_obj = stats.pearsonr(control_metric, test_metric) #print("correlation coef: %s p-value: %s" % (pearsonr_obj[0], pearsonr_obj[1])) p_rtest = pearsonr_obj[1] is_correlated = p_rtest <= alpha print("are groups correlated? %s" % is_correlated) # compute flinger's test for h_A: sig^2_test != sig^2_control fligner_obj = stats.fligner(control_metric, test_metric, center="mean") p_fligner = fligner_obj[1] is_var_equal = p_fligner > alpha print("is variance of groups equal? %s" % is_var_equal) # compute student t test for h_A: mu_test != mu_control if is_correlated: p_ttest = stats.ttest_rel(control_metric, test_metric)[1] else: p_ttest = stats.ttest_ind(control_metric, test_metric, equal_var=is_var_equal)[1] print("t test p value: %s" % p_ttest) # output test results if p_ttest <= alpha: print("reject null hypothesis, means are not equal")
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow, showIndPoints, mark, markMean, showMean, notch, whisker, outliers, plotPvalueCluster, outputClusterPrefix, methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl, showSampleSizes, trimToMinSize, relabels, logb, plotHistogramToFile, plotMedianForGroups, botta, showViolin, showBox, firstColAnnot, plotTrend, showLegend, makePzfxFile, makeBinMatrix, writeDataSummaryStat, summaryStatRange, minuslog10pvalue, minNDataToKeep, vfacecolor, valpha, outXYZPvalues, dividePlots): #if plotPvalueCluster: #if pvalue cluster is needed: # from Bio.Cluster.cluster import * # from Bio.Cluster import * #endif #the real deal! plotData = [] xtickLabels = [] trendData = {} annot = {} minSize = -1 for inputFile, header, cols in zip(inputFiles, headers, valcols): fin = generic_istream(inputFile) startIdx = len(plotData) if firstColAnnot: colAnnot = cols[0] cols = cols[1:] annotThisFile = [] annot[startIdx] = annotThisFile else: colAnnot = -1 annotThisFile = None for col in cols: plotData.append([]) xtickLabels.append(header[col]) colIndices = range(startIdx, startIdx + len(cols)) if plotTrend: #print >> stderr,"plotTrend" trendDataThisFile = [] trendData[startIdx] = trendDataThisFile else: trendDataThisFile = None lino = 0 for lin in fin: lino += 1 if lino < startRow: continue fields = lin.rstrip("\r\n").split(sep) if plotTrend: #print >> stderr,"a" trendDataThisLine = [] else: trendDataThisLine = None allDataOKThisLine = True if colAnnot >= 0: annotThisFile.append(fields[colAnnot]) for idx, col in zip(colIndices, cols): try: value = float(fields[col]) if logb != 0: if value == 0.0: raise ValueError value = log(value) / logb plotData[idx].append(value) if plotTrend: trendDataThisLine.append(value) #print >> stderr,"value:",value except: allDataOKThisLine = False if plotTrend: if allDataOKThisLine: trendDataThisFile.append(trendDataThisLine) else: trendDataThisFile.append(None) fin.close() if minSize == -1: minSize = len(plotData[idx]) #or startIDX? 
else: minSize = min([minSize, len(plotData[idx])]) if trimToMinSize: print >> stderr, "trimming to min size =", minSize trimData(plotData, minSize) if len(relabels) > 0: #if len(relabels)!=len(xtickLabels): # print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels # exit() print >> stderr, xtickLabels print >> stderr, relabels for i, relabel in zip(range(0, len(relabels)), relabels): xtickLabels[i] = relabel for i in range(0, len(plotMedianForGroups)): plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv( xtickLabels, plotMedianForGroups[i]) #drawing medians: medianToDraw = [] for mediangrouper in plotMedianForGroups: curD = [] for c in mediangrouper: curD.extend(plotData[c]) medianToDraw.append(median(curD)) for c in range(len(plotData) - 1, -1, -1): if len(plotData[c]) < minNDataToKeep: print >> stderr, xtickLabels[c], "discarded because has only", len( plotData[c]), "data points <", minNDataToKeep del plotData[c] del xtickLabels[c] if not skipStat: print >> stdout, "student t-test (1 sample; mean=0)" print >> stdout, "sample", "mean", "p-val", "median" if writeDataSummaryStat: fDSS = open(writeDataSummaryStat, "w") print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str( summaryStatRange[0]) + "," + str( summaryStatRange[1] ) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove" for x in range(0, len(plotData)): #print >> stderr, len(plotData[x]) try: print >> stdout, xtickLabels[x], mean( plotData[x]), ttest_1samp(plotData[x], 0)[1], median(plotData[x]) except: print >> stdout, xtickLabels[x], mean( plotData[x]), "NA", median(plotData[x]) if writeDataSummaryStat: sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive( plotData[x], summaryStatRange[0], summaryStatRange[1]) if NIN > 1: #print >> stderr,"sumData=",sumData #print >> stderr,mean mea = mean2(sumData) DDOF = 1 sd = std(sumData, ddof=DDOF) var = sd * sd mi = min(sumData) ma = max(sumData) else: mea = "NA" sd = "NA" var = "NA" mi = "NA" ma = "NA" print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str( var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str( ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str( float(NIN) * 100 / N) + "\t" + str(NBelow) + "\t" + str( float(NBelow) * 100 / N) + "\t" + str(NAbove) + "\t" + str( float(NAbove) * 100 / N) pvalueM = [] if writeDataSummaryStat: fDSS.close() print >> stdout, "" print >> stdout, "student t-test (2 samples)" print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: try: pvalue = ttest_ind(plotData[x], plotData[y])[1] except: pvalue = 1.0 if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 print >> stdout, str(pvalue), pvalueRow.append(pvalue) print >> stdout, "" print >> stdout, "" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels, pvalueM, methodCluster) pvalueM = [] print >> stdout, "welch t-test" print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] pvalueM.append(pvalueRow) 
print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: try: pvalue = welchs_approximate_ttest_arr( plotData[x], plotData[y])[3] except: pvalue = 1.0 if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 print >> stdout, str(pvalue), pvalueRow.append(pvalue) print >> stdout, "" if outXYZPvalues: writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM) if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels, pvalueM, methodCluster) print >> stdout, "" print >> stdout, "non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)" print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], pvalueM = [] print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2 except: pvalue = 1.0 if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 print >> stdout, pvalue, #mann-whiteney need to mul by 2 (one tail to two tail) pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout, "" if outXYZPvalues: writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM) if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels, pvalueM, methodCluster) #####now the variance tests print >> stdout, "" print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters " print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], pvalueM = [] print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue = ansari(plotData[x], plotData[y])[1] except: pvalue = "NA" if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 #pvalue=1.0 print >> stdout, pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout, "" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels, pvalueM, methodCluster) ##### #####now the variance tests print >> stdout, "" print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)" print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], pvalueM = [] print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] 
pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue = fligner(plotData[x], plotData[y])[1] except: pvalue = "NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 print >> stdout, pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout, "" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_fligner_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_fligner", xtickLabels, pvalueM, methodCluster) ##### #####now the variance tests print >> stdout, "" print >> stdout, "Levene's Two-sample Test for equal variance" print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], pvalueM = [] print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue = levene(plotData[x], plotData[y])[1] except: pvalue = "NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 print >> stdout, pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout, "" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels, pvalueM, methodCluster) ##### #####now the variance tests print >> stdout, "" print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)" print >> stdout, "p-val", for x in range(0, len(plotData)): print >> stdout, xtickLabels[x], pvalueM = [] print >> stdout, "" for x in range(0, len(plotData)): pvalueRow = [] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0, len(plotData)): if y <= x: print >> stdout, "", if x == y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue = bartlett(plotData[x], plotData[y])[1] except: pvalue = "NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue) != "NA": try: pvalue = -1 * log(pvalue, 10) except: pvalue = -1000.0 print >> stdout, pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout, "" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix + "_bartlett_raw", xtickLabels, pvalueM) makePValueClusterPlot(outputClusterPrefix + "_bartlett", xtickLabels, pvalueM, methodCluster) ##### figure(figsize=figsz) subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8) if len(titl) == 0: titl = outputFile plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean, notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl, showSampleSizes, showViolin, showBox, annot, trendData, showLegend, makePzfxFile, makeBinMatrix, 
dividePlots) #ylim([0,200]) for m in medianToDraw: axhline(y=m, linestyle=':', color='gray') savefig(outputFile, bbox_inches="tight") if len(plotHistogramToFile) > 0: drawHistogram(plotHistogramToFile, plotData, xtickLabels) drawDensigram(plotHistogramToFile + ".density.png", plotData, xtickLabels)
# When the data do not satisfy the normality condition, as we saw in the previous section, we use the Fligner-Killeen test.
#
# The null hypothesis assumes equal variances across the groups of data, so p-values below the significance level (0.05) indicate heteroscedasticity.
#
# In this case we apply it to all the variables, comparing each attribute (data_wine) with the target variable (quality).

# In[25]:

# Apply the Fligner-Killeen test
from scipy.stats import fligner

names = list(data_wine.columns)
# Take each variable/attribute and compare it with quality
for i in range(len(names)):
    stat, p = fligner(data_wine.iloc[:, i], wine['quality'])
    print(names[i], ': p = %.3f' % (p))

# We therefore find heteroscedasticity

# ## 4.3. Applying statistical tests to compare groups of data.
# #### Depending on the data and the goal of the study, apply hypothesis tests, correlations, regressions, etc. Apply at least three different analysis methods.

# ### ***Hypothesis testing***

# * #### **Mann-Whitney test**
# We can either consider two levels for the 'rating' variable or use the 'quality' variable directly

# In[26]:
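# Editor's aside (not a cell of the original notebook): the loop above compares
# each attribute's spread against the quality column itself. The more
# conventional use of the test splits one attribute into groups defined by the
# quality levels and compares those groups; 'alcohol' below is a placeholder
# column name, and data_wine / wine are assumed to be loaded as in the notebook.
from scipy.stats import fligner

groups = [data_wine.loc[wine['quality'] == q, 'alcohol']
          for q in sorted(wine['quality'].unique())]
stat, p = fligner(*groups)
print('alcohol variance across quality levels: p = %.3f' % p)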
plotting.plot_stat_map(roi_score_img, title='Linear model', cut_coords=cut_coords)
plt.savefig('lm.png')

plotting.plot_roi(atlas, title="Harvard Oxford atlas", cut_coords=cut_coords)

# print labels
from scipy.stats import fligner

X = roi_masker.transform(func_filename)
y, session = np.loadtxt(haxby_dataset.session_target).astype('int').T
conditions = np.recfromtxt(haxby_dataset.conditions_target)['f0']
non_rest = conditions != b'rest'
conditions = conditions[non_rest]
y, session = y[non_rest], session[non_rest]
y = y[session < 4]

# Fligner-Killeen p-value per voxel across the eight non-rest conditions
var_stat = np.zeros(X.shape[1])
for j, x in enumerate(X.T):
    _, var_stat[j] = fligner(x[y == 8], x[y == 1], x[y == 2], x[y == 3],
                             x[y == 4], x[y == 5], x[y == 6], x[y == 7])

var_img = roi_masker.inverse_transform(
    -np.hstack((0, np.log10(var_stat)))[np.newaxis])
plotting.plot_stat_map(var_img, cut_coords=cut_coords, title='Fligner test', vmax=4)
plt.savefig('var_stat.png')
plt.show()
def starplot(df=[], x='', y='', data=[], index=[], columns=[], fold=False, foldcol=0, mode=3, errorbar=True, plottype='barplot', stats='independent t test', test_var=False, stats_var='f test', crit_var=0.05, equal_var=True, rotate=0, elinewidth=0.5, fontsize=14, capsize=4, noffset_ylim=35, noffset_fst=10, noffset_diff=10, star_size=3, linewidth=1, crit=[0.05, 0.01, 0.001, 0.0001]): # data: list of data matrixs(or DataFrames) for comparison (row: obs, columns: var) # index: var, columns: obs # adjacent: annotate star for adjacent bar # control: annotate star between all other bars to selctive control bar # mix: mix mode # 3: annotate star for all combination of bar (only 3 bars available) crit = np.array(crit) plt.rcParams['font.family'] = 'Times New Roman' fig, ax = plt.subplots() star = ['*', '**', '***', '****'] n = len(data) m = data[0].shape[1] test = pd.DataFrame() for i, j in enumerate(data): if type(test) == type(j): data[i] = j.values.reshape(len(j.index), len(j.columns)) if plottype == 'barplot': error = pd.DataFrame() mean = pd.DataFrame() for i in range(m): error[i] = [data[j][:, i].std() for j in range(n)] mean[i] = [data[j][:, i].mean() for j in range(n)] error = error.transpose() mean = mean.transpose() if len(index) != 0: error.index = index mean.index = index if len(columns) != 0: error.columns = columns mean.columns = columns if fold == True: oldmean = mean.copy() olderror = error.copy() for i in range(len(mean.columns)): mean.iloc[:, i] = oldmean.iloc[:, i] / oldmean.iloc[:, foldcol] error.iloc[:, i] = olderror.iloc[:, i] / oldmean.iloc[:, foldcol] if errorbar == True: plot = plot = mean.plot.bar(yerr=error, ax=ax, rot=rotate, capsize=capsize, error_kw=dict(elinewidth=elinewidth), fontsize=fontsize) max_bar = [[mean.iloc[j, i] + error.iloc[j, i] for i in range(n)] for j in range(m)] min_bar = [ mean.iloc[j, i] - error.iloc[j, i] for i in range(n) for j in range(m) ] else: plot = plot = mean.plot.bar(ax=ax, rot=rotate, capsize=capsize, error_kw=dict(elinewidth=elinewidth), fontsize=fontsize) max_bar = [[mean.iloc[j, i] for i in range(n)] for j in range(m)] min_bar = [mean.iloc[j, i] for i in range(n) for j in range(m)] elif plottype == 'boxplot': print("under buiding") ylim = 0 offset = max([max_bar[i][j] for i in range(m) for j in range(n)]) / 100 blank = [] if mode == 3: for j in range(m): level = np.zeros(n) for i in range(n): if i < n - 1: k = i + 1 else: k = 0 if test_var == True: if stats_var == 'f test': f = 0.5 - abs(0.5 - ftest.sf( data[i][:, j].var() / data[k][:, j].var(), len(data[i][:, j]) - 1, len(data[k][:, j]) - 1)) if crit_var / 2 > f: equal_var = False else: equal_var = True else: if stats_var == 'bartlett': f = bartlett(data[i][:, j], data[k][:, j])[1] elif stats_var == 'levene': f = bartlett(data[i][:, j], data[k][:, j])[1] elif stats_var == 'fligner': f = fligner(data[i][:, j], data[k][:, j])[1] if crit_var > f: equal_var = False else: equal_var = True if stats == 'independent t test': p = ttest_ind(data[i][:, j], data[k][:, j], equal_var=equal_var)[1] elif stats == 'paired t test': if equal_var == True: p = ttest_rel(data[i][:, j], data[k][:, j])[1] else: p = 0 elif stats == 'median test': p = median_test(data[i][:, j], data[k][:, j])[1] elif stats == 'mannwhitneyu': if equal_var == True: p = mannwhitneyu(data[i][:, j], data[k][:, j])[1] else: p = 0 elif stats == 'wilcoxon': if equal_var == True: p = wilcoxon(data[i][:, j], data[k][:, j])[1] else: p = 0 level[i] = len(crit) - len(crit.compress(p > crit)) for k in range(n): height = 0 if level[k] != 0 
and k != n - 1: center = [ plot.patches[k * m + j].get_x(), plot.patches[k * m + m + j].get_x() ] height = max([max_bar[j][k], max_bar[j][k + 1]]) h1 = max_bar[j][k] h2 = max_bar[j][k + 1] width = plot.patches[k * m + j].get_width() blank.append( (center[0] + width / 2, height + noffset_fst * offset + (-1)**k * 2 * offset)) blank.append( (center[1] + width / 2, height + noffset_fst * offset + (-1)**k * 2 * offset)) ax.vlines(x=center[0] + width / 2, ymin=h1 + offset * 2, ymax=height + noffset_fst * offset + (-1)**k * 2 * offset, lw=linewidth) ax.vlines(x=center[1] + width / 2, ymin=h2 + offset * 2, ymax=height + noffset_fst * offset + (-1)**k * 2 * offset, lw=linewidth) ax.annotate(star[int(level[k] - 1)], xy=((center[0] + center[1]) / 2 + width / 2, height + (noffset_fst + 1) * offset + (-1)**k * 2 * offset), ha='center', size=star_size) elif level[k] != 0 and k == n - 1: center = [ plot.patches[j].get_x(), plot.patches[k * m + j].get_x() ] height = max(max_bar[j]) h1 = max_bar[j][0] h2 = max_bar[j][k] blank.append( (center[0] + width / 2, height + (noffset_fst + noffset_diff) * offset)) blank.append((center[1] + width / 2, height + 20 * offset)) ax.vlines(x=center[0] + width / 2, ymin=h1 + offset * 2, ymax=height + (noffset_fst + noffset_diff) * offset, lw=linewidth) ax.vlines(x=center[1] + width / 2, ymin=h2 + offset * 2, ymax=height + (noffset_fst + noffset_diff) * offset, lw=linewidth) ax.annotate(star[int(level[k] - 1)], xy=((center[0] + center[1]) / 2 + width / 2, height + (noffset_fst + noffset_diff + 1) * offset), ha='center', size=star_size) if height > ylim: ylim = height if mode == 'adjacent': for j in range(m): level = np.zeros(n - 1) for i in range(n - 1): k = i + 1 if test_var == True: if stats_var == 'f test': f = 0.5 - abs(0.5 - ftest.sf( data[i][:, j].var() / data[k][:, j].var(), len(data[i][:, j]) - 1, len(data[k][:, j]) - 1)) if crit_var / 2 > f: equal_var = False else: equal_var = True else: if stats_var == 'bartlett': f = bartlett(data[i][:, j], data[k][:, j])[1] elif stats_var == 'levene': f = bartlett(data[i][:, j], data[k][:, j])[1] elif stats_var == 'fligner': f = fligner(data[i][:, j], data[k][:, j])[1] if crit_var > f: equal_var = False else: equal_var = True if stats == 'independent t test': p = ttest_ind(data[i][:, j], data[k][:, j], equal_var=equal_var)[1] elif stats == 'paired t test': if equal_var == True: p = ttest_rel(data[i][:, j], data[k][:, j])[1] else: p = 0 elif stats == 'median test': p = median_test(data[i][:, j], data[k][:, j])[1] elif stats == 'mannwhitneyu': if equal_var == True: p = mannwhitneyu(data[i][:, j], data[k][:, j])[1] else: p = 0 elif stats == 'wilcoxon': if equal_var == True: p = wilcoxon(data[i][:, j], data[k][:, j])[1] else: p = 0 level[i] = len(crit) - len(crit.compress(p > crit)) for k in range(n - 1): height = 0 if level[k] != 0: center = [ plot.patches[k * m + j].get_x(), plot.patches[k * m + m + j].get_x() ] height = max([max_bar[j][k], max_bar[j][k + 1]]) h1 = max_bar[j][k] h2 = max_bar[j][k + 1] width = plot.patches[k * m + j].get_width() blank.append( (center[0] + width / 2, height + noffset_fst * offset + (-1)**k * 2 * offset)) blank.append( (center[1] + width / 2, height + noffset_fst * offset + (-1)**k * 2 * offset)) ax.vlines(x=center[0] + width / 2, ymin=h1 + offset * 2, ymax=height + noffset_fst * offset + (-1)**k * 2 * offset, lw=linewidth) ax.vlines(x=center[1] + width / 2, ymin=h2 + offset * 2, ymax=height + noffset_fst * offset + (-1)**k * 2 * offset, lw=linewidth) ax.annotate(star[int(level[k] - 1)], 
xy=((center[0] + center[1]) / 2 + width / 2, height + (noffset_fst + 1) * offset + (-1)**k * 2 * offset), ha='center', size=star_size) if height > ylim: ylim = height ax.set_ylim(min(0, min(min_bar) - 10 * offset), ylim + noffset_ylim * offset) for j, i in enumerate(blank): ax.vlines(x=i[0], ymin=i[1], ymax=i[1] + offset * 2, color='white', lw=1.2 * linewidth) if j % 2 == 1: ax.hlines(y=i[1], xmin=blank[j - 1], xmax=blank[j], lw=linewidth)
stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive) cols.append(max) elif test_id.strip() == "tvar": if nf is 0 and mf is 0: var = stats.tvar(map(float, sample_one)) else: var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(var) elif test_id.strip() == "tstd": if nf is 0 and mf is 0: std = stats.tstd(map(float, sample_one)) else: std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(std) elif test_id.strip() == "tsem": if nf is 0 and mf is 0: s = stats.tsem(map(float, sample_one)) else: s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(s) elif test_id.strip() == "scoreatpercentile": if nf is 0 and mf is 0: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation ) else: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation ) for list in s: cols.append(list) elif test_id.strip() == "relfreq": if nf is 0 and mf is 0: rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b) else: rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf)) for list in rel: cols.append(list) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "binned_statistic": if nf is 0 and mf is 0: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b ) else: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, range=(mf, nf), ) cols.append(st) cols.append(b_edge) cols.append(b_n) elif test_id.strip() == "threshold": if nf is 0 and mf is 0: o = stats.threshold(map(float, sample_one), newval=args.new) else: o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new) for list in o: cols.append(list) elif test_id.strip() == "trimboth": o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut) for list in o: cols.append(list) elif test_id.strip() == "trim1": t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail) for list in t1: cols.append(list) elif test_id.strip() == "histogram": if nf is 0 and mf is 0: hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b) else: hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf)) cols.append(hi) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "cumfreq": if nf is 0 and mf is 0: cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b) else: cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf)) cols.append(cum) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "boxcox_normmax": if nf is 0 and mf is 0: ma = stats.boxcox_normmax(map(float, sample_one)) else: ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method) cols.append(ma) elif test_id.strip() == "boxcox": if imbda is 0: box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha) cols.append(box) cols.append(ma) cols.append(ci) else: box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha) cols.append(box) elif test_id.strip() == "histogram2": h2 = stats.histogram2(map(float, sample_one), map(float, sample_two)) for list in h2: cols.append(list) elif test_id.strip() == 
"ranksums": z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two)) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for list in t: cols.append(list) for list in prob: cols.append(list) elif test_id.strip() == "ansari": AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two)) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two) ) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two)) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two)) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two)) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind( map(float, sample_one), map(float, sample_two), equal_var=args.equal_var ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta) cols.append(W) cols.append(p_value) for list in a: cols.append(list) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two)) else: rho, p_value = stats.spearmanr(map(float, sample_one)) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( map(float, sample_one), zero_method=args.zero_method, correction=args.correction ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof) else: rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( map(float, sample_one), map(float, 
sample_two), ddof=args.ddof, lambda_=args.lambda_ ) else: stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha) else: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( map(float, sample_one), method=args.med, weights=map(float, sample_two) ) else: stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for list in ob: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples ) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for list in table: elements = ",".join(map(str, list)) cols.append(elements) outfile.write("%s\n" % "\t".join(map(str, cols))) outfile.close()
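The long dispatcher above forwards the grouped samples parsed from --sample_cols to the variance tests via argument unpacking. A minimal standalone sketch of that call pattern (the groups and keyword values below are illustrative, not taken from any input file):

from scipy import stats

groups = [[1.0, 2.0, 3.5, 4.2], [2.1, 2.9, 3.3, 5.0], [0.5, 1.8, 2.2, 3.9]]  # placeholder samples

# Same unpacking used by the "fligner" and "levene" branches above
xsq, p_fligner = stats.fligner(*groups, center="median", proportiontocut=0.05)
w, p_levene = stats.levene(*groups, center="median", proportiontocut=0.05)
print(xsq, p_fligner, w, p_levene)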
def fligner_test(ts): values = pd.Series(index=np.arange(10, 60, 10)) for sample_size in values.index: values[sample_size] = stats.fligner(*chunks(ts, sample_size)).pvalue return values.mean()
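fligner_test above depends on a chunks helper that is not shown. A plausible sketch, assuming it splits the series into consecutive, equally sized pieces (the name and behaviour are guesses, not the original helper):

import numpy as np

def chunks(ts, sample_size):
    # Hypothetical helper assumed by fligner_test: consecutive blocks of
    # length sample_size, dropping any incomplete tail.
    values = np.asarray(ts, dtype=float)
    n_full = len(values) // sample_size
    return [values[i * sample_size:(i + 1) * sample_size] for i in range(n_full)]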
# For each column of data we apply the Anderson-Darling normality test for column in medium_quality_df: print(column) print(anderson(medium_quality_df[column], dist="norm")) # For each column of data we apply the Anderson-Darling normality test for column in high_quality_df: print(column) print(anderson(high_quality_df[column], dist="norm")) # Apply the Fligner-Killeen test across the three quality groups for column in high_quality_df: print(column) print( fligner(low_quality_df[column], medium_quality_df[column], high_quality_df[column])) # CORRELATION # Correlations between variables plt.figure(figsize=(10, 6)).subplots_adjust(bottom=0.25) sns.heatmap(df.corr(), annot=True, fmt='.0%') plt.show() # Calculate and order correlations with quality plt.figure(figsize=(10, 6)).subplots_adjust(bottom=0.25) df.corr()['quality'].sort_values(ascending=False).plot(kind='bar') plt.show() # Strip plot of alcohol by quality plt.figure(figsize=(10, 6)) sns.stripplot(data=df, x="quality", y="alcohol", jitter=True)
# f, axes = plt.subplots(1, 3, figsize=(100, 100)) # axes[0].hist(A, bins = 10) # axes[1].hist(B, bins = 10) # axes[2].hist(C, bins = 5) # plt.axis("equal") # plt.show() # # stat_a, p_a = stats.shapiro(A) # stat_b, p_b = stats.shapiro(B) # stat_c, p_c = stats.shapiro(C) # # print("p-values for Shapiro-Wilk test are\nA: %f\nB: %f\nC: %f"%(p_a,p_b,p_c)) # print("p-value for homogeneity of variance test is" ": %f"%(stats.fligner(B,C)[1])) # # k = [] # # k.append((np.std(A)**2)/np.mean(A)**2) # k.append(((np.std(A))**2)/np.mean(A)) # # k.append(((np.std(B))**2)/(np.mean(B))**2) # k.append(((np.std(B))**2)/np.mean(B)) # # k.append(((np.std(C))**2)/(np.mean(C))**2) # k.append(((np.std(C))**2)/np.mean(C)) # print(k[:]) # # _, p_anova = stats.f_oneway(A, B, C) # print(p_anova) # # print("p-value for Kruskal-Wallis test between B, C: %f"%stats.kruskal(B,C)[1])
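If the commented-out block above is revived, its dispersion measures read more clearly as a small helper. A sketch (not the original code) computing the variance-to-mean ratio and the squared coefficient of variation per sample:

import numpy as np

def dispersion(sample):
    # vmr: variance-to-mean ratio; cv2: squared coefficient of variation
    sample = np.asarray(sample, dtype=float)
    var, mean = np.var(sample), np.mean(sample)
    return {"vmr": var / mean, "cv2": var / mean ** 2}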
# Normality check print('Normality (Shapiro p-value):', stats.shapiro(data.score)[1]) # Normality: 0.2986918091773987 > 0.05, ok # Homoscedasticity check result = data[['method', 'score']] m1 = result[result['method'] == 1] m2 = result[result['method'] == 2] m3 = result[result['method'] == 3] score1 = m1['score'] score2 = m2['score'] score3 = m3['score'] print('Equal variances (Levene):', stats.levene(score1, score2, score3).pvalue) # robust to non-normality print('Equal variances (Fligner):', stats.fligner(score1, score2, score3).pvalue) # non-parametric test print('Equal variances (Bartlett):', stats.bartlett(score1, score2, score3).pvalue) # parametric test (assumes normality) # Equal variances: 0.11322850654055751 > 0.05, so homoscedasticity holds and ANOVA can be used; otherwise use Welch's ANOVA # Equal variances: 0.10847180733221087 # Equal variances: 0.15251432724222921 print('\n ANOVA with the processed data --------') # Cross table: counts per teaching method data2 = pd.crosstab(index=data['method'], columns='count') data2.index = ['Method 1', 'Method 2', 'Method 3'] print(data2) # Cross table: satisfaction per teaching method data3 = pd.crosstab(data['method'], data['survey']) data3.index = ['Method 1', 'Method 2', 'Method 3']
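Since the equal-variance checks above do not reject homoscedasticity, the comment's suggestion to run a one-way ANOVA can be followed directly; a short sketch reusing score1, score2 and score3 from the snippet above:

# One-way ANOVA on the three method groups (illustrative follow-up)
f_stat, p_anova = stats.f_oneway(score1, score2, score3)
print('one-way ANOVA p-value:', p_anova)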
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots): #if plotPvalueCluster: #if pvalue cluster is needed: # from Bio.Cluster.cluster import * # from Bio.Cluster import * #endif #the real deal! plotData=[] xtickLabels=[] trendData={} annot={} minSize=-1 for inputFile,header,cols in zip(inputFiles,headers,valcols): fin=generic_istream(inputFile) startIdx=len(plotData) if firstColAnnot: colAnnot=cols[0] cols=cols[1:] annotThisFile=[] annot[startIdx]=annotThisFile else: colAnnot=-1 annotThisFile=None for col in cols: plotData.append([]) xtickLabels.append(header[col]) colIndices=range(startIdx,startIdx+len(cols)) if plotTrend: #print >> stderr,"plotTrend" trendDataThisFile=[] trendData[startIdx]=trendDataThisFile else: trendDataThisFile=None lino=0 for lin in fin: lino+=1 if lino<startRow: continue fields=lin.rstrip("\r\n").split(sep) if plotTrend: #print >> stderr,"a" trendDataThisLine=[] else: trendDataThisLine=None allDataOKThisLine=True if colAnnot>=0: annotThisFile.append(fields[colAnnot]) for idx,col in zip(colIndices,cols): try: value=float(fields[col]) if logb!=0: if value==0.0: raise ValueError value=log(value)/logb plotData[idx].append(value) if plotTrend: trendDataThisLine.append(value) #print >> stderr,"value:",value except: allDataOKThisLine=False if plotTrend: if allDataOKThisLine: trendDataThisFile.append(trendDataThisLine) else: trendDataThisFile.append(None) fin.close() if minSize==-1: minSize=len(plotData[idx]) #or startIDX? 
else: minSize=min([minSize,len(plotData[idx])]) if trimToMinSize: print >> stderr,"trimming to min size =",minSize trimData(plotData,minSize) if len(relabels)>0: #if len(relabels)!=len(xtickLabels): # print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels # exit() print >> stderr,xtickLabels print >> stderr,relabels for i,relabel in zip(range(0,len(relabels)),relabels): xtickLabels[i]=relabel for i in range(0,len(plotMedianForGroups)): plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i]) #drawing medians: medianToDraw=[] for mediangrouper in plotMedianForGroups: curD=[] for c in mediangrouper: curD.extend(plotData[c]) medianToDraw.append(median(curD)) for c in range(len(plotData)-1,-1,-1): if len(plotData[c])<minNDataToKeep: print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep del plotData[c] del xtickLabels[c] if not skipStat: print >> stdout,"student t-test (1 sample; mean=0)" print >> stdout,"sample","mean","p-val","median" if writeDataSummaryStat: fDSS=open(writeDataSummaryStat,"w") print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove" for x in range(0,len(plotData)): #print >> stderr, len(plotData[x]) try: print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x]) except: print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x]) if writeDataSummaryStat: sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1]) if NIN>1: #print >> stderr,"sumData=",sumData #print >> stderr,mean mea=mean2(sumData) DDOF=1 sd=std(sumData,ddof=DDOF) var=sd*sd mi=min(sumData) ma=max(sumData) else: mea="NA" sd="NA" var="NA" mi="NA" ma="NA" print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N) pvalueM=[] if writeDataSummaryStat: fDSS.close() print >> stdout,"" print >> stdout,"student t-test (2 samples)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: try: pvalue=ttest_ind(plotData[x],plotData[y])[1] except: pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout, str(pvalue), pvalueRow.append(pvalue) print >> stdout,""; print >> stdout,"" if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster) pvalueM=[] print >> stdout,"welch t-test" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: 
try: pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3] except: pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout, str(pvalue), pvalueRow.append(pvalue) print >> stdout,""; if outXYZPvalues: writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM) if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster) print >> stdout,"" print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2 except: pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail) pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if outXYZPvalues: writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM) if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster) #####now the variance tests print >> stdout,"" print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=ansari(plotData[x],plotData[y])[1] except: pvalue="NA" if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 #pvalue=1.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster) ##### #####now the variance tests print >> stdout,"" print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=fligner(plotData[x],plotData[y])[1] except: pvalue="NA" 
#pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster) ##### #####now the variance tests print >> stdout,"" print >> stdout,"Levene's Two-sample Test for equal variance" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=levene(plotData[x],plotData[y])[1] except: pvalue="NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster) ##### #####now the variance tests print >> stdout,"" print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" print >> stdout,"p-val", for x in range(0,len(plotData)): print >> stdout,xtickLabels[x], pvalueM=[] print >> stdout,"" for x in range(0,len(plotData)): pvalueRow=[] pvalueM.append(pvalueRow) print >> stdout, xtickLabels[x], for y in range(0,len(plotData)): if y<=x: print >> stdout, "", if x==y: if minuslog10pvalue: pvalueRow.append(0.0) else: pvalueRow.append(1.0) else: pvalueRow.append(pvalueM[y][x]) else: #if max(len(plotData[x]),len(plotData[y]))<=20: try: pvalue=bartlett(plotData[x],plotData[y])[1] except: pvalue="NA" #pvalue=1.0 if minuslog10pvalue and str(pvalue)!="NA": try: pvalue=-1*log(pvalue,10) except: pvalue=-1000.0 print >> stdout,pvalue, pvalueRow.append(pvalue) #else: # print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", # print >> stdout,""; if plotPvalueCluster: makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM) makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster) ##### figure(figsize=figsz) subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8) if len(titl)==0: titl=outputFile plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots) #ylim([0,200]) for m in medianToDraw: axhline(y=m,linestyle=':',color='gray') savefig(outputFile,bbox_inches="tight") if len(plotHistogramToFile)>0: drawHistogram(plotHistogramToFile,plotData,xtickLabels) drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
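The nested loops above fill symmetric matrices of pairwise p-values for each variance test. The same pattern in a compact Python 3 sketch (not the original helper), shown here for the Fligner-Killeen case:

import numpy as np
from scipy.stats import fligner

def pairwise_fligner_pvalues(groups):
    # groups: list of 1-D samples; returns an n x n matrix of pairwise p-values
    n = len(groups)
    pvals = np.ones((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            p = fligner(groups[i], groups[j]).pvalue
            pvals[i, j] = pvals[j, i] = p
    return pvals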
#print(normParams) normQual = stats.kstest( newdata, stats.norm( loc=normParams[0], scale=normParams[1] ).cdf ).statistic normScores.append( normQual) elif( r == 1 ): v += 1 (_, _, MFEscores) = convertLineFields(data) newdata = pd.DataFrame(MFEscores) testData = testData.append(newdata) # Test for equal variances assert(len(trainData.get_values()[0]) == 50) print(stats.fligner( *trainData.get_values() )) fig, ax = plt.subplots() quants = np.arange(0.0, 1.0, 0.002) dfgs = pd.DataFrame(gammaScores) qdfgs = dfgs.quantile(q = quants) dfgs2 = pd.DataFrame(gammaScores2) qdfgs2 = dfgs2.quantile(q = quants) dfns = pd.DataFrame(normScores) qdfns = dfns.quantile(q = quants) #dflo = dfgs / dfns
def test_empty_arg(self): x = np.arange(5) assert_equal((np.nan, np.nan), stats.fligner(x, x**2, []))
x = df[df['COR'] == 1]['PRICE'] y = df[df['COR'] == 0]['PRICE'] x.name, y.name = 'corner', 'not corner' two_histograms(x, y) res = stats.mannwhitneyu(x, y) print('p-value: ', res[1]) df = pd.read_csv("./Shad_Python_06_2/agedeath.dat.txt", sep='\s+', header=None, names=['group', 'age', 'index']) print(df.head()) x = df[df['group'] == 'sovr']['age'] y = df[df['group'] == 'aris']['age'] two_histograms(x, y) res = stats.fligner(x, y) print('p-value: ', res[1]) res = stats.ttest_ind(x, y, equal_var=False) print('p-value: ', res[1]) df = pd.read_csv("./Shad_Python_06_2/interference.csv") print(df.head()) x = df['DiffCol'] y = df['Black'] x.name, y.name = 'DiffCol', 'Black' two_histograms(x, y) res = stats.fligner(x, y) print('p-value: ', res[1]) res = stats.ttest_ind(x, y, equal_var=False) print('p-value: ', res[1])
def get_statistic_and_pvalue(self, y): return fligner(*y, center="mean")
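A hedged usage example for the method above: y is assumed to be an iterable of samples that gets unpacked into scipy.stats.fligner (the instance name and the data are placeholders):

samples = [[9.8, 10.1, 10.4, 9.7], [9.5, 10.7, 11.2, 10.9], [10.0, 10.2, 9.9, 10.3]]
statistic, pvalue = checker.get_statistic_and_pvalue(samples)  # checker: instance of the class defining the method above (hypothetical)
print(statistic, pvalue)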
def pval(grp): if grp.size < min_count: return np.nan return fligner(grp.values, y.values)[1]
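pval above closes over a reference series y and a min_count threshold, and is presumably applied per group with pandas. A hedged sketch of that usage (df, the column names, y and min_count are illustrative placeholders; pval and fligner come from the snippet above):

import numpy as np
import pandas as pd

min_count = 10
rng = np.random.default_rng(0)
df = pd.DataFrame({"group": np.repeat(["a", "b"], 50), "value": rng.normal(size=100)})
y = pd.Series(rng.normal(size=100))

# One Fligner-Killeen p-value per group, comparing each group's values against y
pvalues = df.groupby("group")["value"].apply(pval)
print(pvalues)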
sns.distplot(other_prices, fit = norm); fig = plt.figure() res = stats.probplot(other_prices, plot = plt) # Shapiro-Wilk test stat, p = shapiro(other_prices) print(f"Stat: {round(stat,3)}") print(f"p-value: {round(p,3)}") """Based on these results, normality cannot be guaranteed for the price samples of either Apple or the other brands, since the Shapiro test rejects the null hypothesis of normality. Moreover, for the Apple subset the central limit theorem cannot be invoked either, as it has fewer than 30 samples. Therefore, to check whether we can assume homoscedasticity (equal variances across samples), we apply the Fligner-Killeen test (non-parametric), since normality cannot be assumed: """ # Fligner-Killeen test fligner_test = stats.fligner(apple_prices, other_prices, center='median') fligner_test """Given these results (p-value >> 0.05) we cannot reject the null hypothesis, which is consistent with homoscedasticity. However, since we could not establish that the samples follow normal distributions, a parametric two-sample test (Student's t) is not appropriate; we apply a non-parametric one instead (Mann-Whitney):""" # Mann-Whitney test mannwhitneyu_test = stats.mannwhitneyu(apple_prices, other_prices, alternative="greater") mannwhitneyu_test """Based on the Mann-Whitney test results (p-value << 0.05), we can reject the null hypothesis in favour of the alternative, which here states that Apple products are on average more expensive than the other brands. ## **Numerical variables** We first build a correlation matrix, which indicates which variables are most strongly related to price: """
print("ansari") data['ansari'] = [ ansari(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] #============================================================================== # print("mannwhitneyu") # data['mannwhitneyu'] = [mannwhitneyu(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors), # np.nan_to_num(question2_vectors))] #============================================================================== print("fligner") data['fligner'] = [ fligner(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] print("mood") data['mood'] = [ mood(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ] print("ks_2samp") data['ks_2samp'] = [ ks_2samp(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors)) ]
def test_data(self): # numbers from R: fligner.test in package stats x1 = np.arange(5) assert_array_almost_equal(stats.fligner(x1, x1**2), (3.2282229927203536, 0.072379187848207877), 11)
# Visualize the distributions to check normality import matplotlib.pyplot as plt import seaborn as sns sns.distplot(sco1, kde=False, fit=stats.norm) sns.distplot(sco2, kde=False, fit=stats.norm) plt.show() # Normality check - shapiro: if p > 0.05, the data can be treated as normal. print(stats.shapiro(sco1)) # (0.965552806854248, 0.3679903745651245) 0.3679 > 0.05, normality holds print(stats.shapiro(sco2)) # (0.9621098637580872, 0.6714189648628235) 0.6714 > 0.05, normality holds # Homoscedasticity: if p > 0.05, equal variances can be assumed. print(stats.levene(sco1, sco2)) # most commonly used equal-variance test; 0.4568 > 0.05, so equal variances hold # print(stats.levene(sco1, sco2).pvalue) print(stats.fligner(sco1, sco2)) print(stats.bartlett(sco1, sco2)) print(stats.ttest_ind(sco1, sco2)) print(stats.ttest_ind(sco1, sco2, equal_var=True)) # equal_var=True: equal variances assumed (the default). # Ttest_indResult(statistic=-0.19649386929539883, pvalue=0.8450532207209545) # Interpretation: p-value (0.8450) > 0.05, so we fail to reject the null; there is no difference in mean test scores between the two teaching methods. print(stats.ttest_ind(sco1, sco2, equal_var=False)) # equal_var=False: equal variances not assumed. # If normality is not satisfied # stats.wilcoxon() # use this instead; both samples must have the same length. # stats.kruskal() # stats.mannwhitneyu()
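If the normality checks above had failed, the non-parametric alternatives listed in the comments would be used instead; a short illustrative follow-up on the same samples:

# Non-parametric alternatives mentioned above (sco1/sco2 from the snippet)
print(stats.wilcoxon(sco1, sco2))      # paired test; requires equal-length samples
print(stats.mannwhitneyu(sco1, sco2))  # independent two-sample test
print(stats.kruskal(sco1, sco2))       # rank-based test for 2+ independent samples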