def test_basic(self):
    # median_test calls chi2_contingency to compute the test statistic
    # and p-value.  Make sure it hasn't screwed up the call...
    x = [1, 2, 3, 4, 5]
    y = [2, 4, 6, 8]

    stat, p, m, tbl = stats.median_test(x, y)
    assert_equal(m, 4)
    assert_equal(tbl, [[1, 2], [4, 2]])
    exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl)
    assert_allclose(stat, exp_stat)
    assert_allclose(p, exp_p)

    stat, p, m, tbl = stats.median_test(x, y, lambda_=0)
    assert_equal(m, 4)
    assert_equal(tbl, [[1, 2], [4, 2]])
    exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0)
    assert_allclose(stat, exp_stat)
    assert_allclose(p, exp_p)

    stat, p, m, tbl = stats.median_test(x, y, correction=False)
    assert_equal(m, 4)
    assert_equal(tbl, [[1, 2], [4, 2]])
    exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False)
    assert_allclose(stat, exp_stat)
    assert_allclose(p, exp_p)
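# Standalone sketch (illustrative, not part of the test suite above): the 2x2
# table that median_test builds can be reproduced by counting values above and
# at-or-below the grand median, then handed to chi2_contingency directly.
import numpy as np
from scipy import stats

x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8]
grand_median = np.median(np.concatenate([x, y]))  # 4.0
# Row 0: counts above the grand median; row 1: counts at or below it
# (the default ties="below" groups ties with "below").
table = [[sum(v > grand_median for v in s) for s in (x, y)],
         [sum(v <= grand_median for v in s) for s in (x, y)]]
stat, p, dof, expected = stats.chi2_contingency(table)
print(table)    # [[1, 2], [4, 2]]
print(stat, p)  # matches stats.median_test(x, y)[:2]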
def test_ties_options(self):
    # Test the contingency table calculation.
    x = [1, 2, 3, 4]
    y = [5, 6]
    z = [7, 8, 9]
    # grand median is 5.

    # Default 'ties' option is "below".
    stat, p, m, tbl = stats.median_test(x, y, z)
    assert_equal(m, 5)
    assert_equal(tbl, [[0, 1, 3], [4, 1, 0]])

    stat, p, m, tbl = stats.median_test(x, y, z, ties="ignore")
    assert_equal(m, 5)
    assert_equal(tbl, [[0, 1, 3], [4, 0, 0]])

    stat, p, m, tbl = stats.median_test(x, y, z, ties="above")
    assert_equal(m, 5)
    assert_equal(tbl, [[0, 2, 3], [4, 0, 0]])
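# Reference sketch (same illustrative data as the test above): how the y
# sample's tie at the grand median moves between the table rows under each
# ties option.
from scipy import stats

x, y, z = [1, 2, 3, 4], [5, 6], [7, 8, 9]
for ties in ("below", "ignore", "above"):
    res = stats.median_test(x, y, z, ties=ties)
    print(ties, res[3].tolist())   # table rows: [above, below]
# below  -> [[0, 1, 3], [4, 1, 0]]  (the 5 counted as "below")
# ignore -> [[0, 1, 3], [4, 0, 0]]  (the 5 dropped)
# above  -> [[0, 2, 3], [4, 0, 0]]  (the 5 counted as "above")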
def medianTest(df, alpha):
    original = df['Score_original']
    fixed = df['Score_fixed']
    stat, p, med, tbl = stats.median_test(original, fixed)
    # Flag whether the medians differ significantly at the given alpha.
    df['Median Test'] = p < alpha
    return df
def test_simple(self):
    x = [1, 2, 3]
    y = [1, 2, 3]
    stat, p, med, tbl = stats.median_test(x, y)

    # The median is floating point, but this equality test should be safe.
    assert_equal(med, 2.0)

    assert_array_equal(tbl, [[1, 1], [2, 2]])

    # The expected values of the contingency table equal the contingency
    # table, so the statistic should be 0 and the p-value should be 1.
    assert_equal(stat, 0)
    assert_equal(p, 1)
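# Quick numeric check (not from the original): for identical samples the
# observed contingency table equals chi2_contingency's expected table, so the
# statistic is 0 and the p-value is 1.
from scipy import stats

tbl = [[1, 1], [2, 2]]
stat, p, dof, expected = stats.chi2_contingency(tbl, correction=False)
print(expected)   # [[1. 1.] [2. 2.]] -- identical to the observed table
print(stat, p)    # 0.0 1.0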
def test_mood(a1, a2):
    """
    Runs Mood's median test on the two supplied arrays.

    Requires ndarrays with at least 2 distinct values each.

    :param np.ndarray a1: array 1
    :param np.ndarray a2: array 2
    :return: p-value (np.nan if the inputs do not qualify)
    :rtype: float
    """
    if isinstance(a1, np.ndarray) and isinstance(a2, np.ndarray):
        # Require at least 2 distinct values in each array.
        if len(set(a1)) > 1 and len(set(a2)) > 1:
            _, p, _, _ = stats.median_test(a1, a2)
            return p
    return np.nan
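# Usage sketch for the wrapper above (illustrative data, not from the source):
import numpy as np
from scipy import stats

a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
b = np.array([2.0, 4.0, 6.0, 8.0])
print(test_mood(a, b))                      # p-value from Mood's median test
print(test_mood(a, np.array([5.0, 5.0])))   # nan: b has only one distinct value
print(test_mood(list(a), b))                # nan: a is not an ndarray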
def plotResults(distH, distA, titleString, digits=5):
    boxplot([distH, distA], notch=True, widths=0.25,
            positions=[0.75, 1.25], labels=['Diagnosed', 'Typical'])
    title(titleString, fontsize=18)
    xticks(fontsize=14)
    yticks(fontsize=10)
    show()

    stat, pValue = ttest_ind(distH, distA)
    print(titleString)
    print('mean / med:\tdiagnosed:', round(mean(distH), digits), '/',
          round(median(distH), digits),
          '\ttypical:', round(mean(distA), digits), '/',
          round(median(distA), digits))
    print('t significance level p =', round(pValue, digits))

    stat, pValue = mannwhitneyu(distH, distA)
    print('U significance level p =', round(pValue, digits))

    stat, pValue, m, table = median_test(distH, distA, correction=False)
    print('median significance level p =', round(pValue, digits))
    print()
def compare_to_auto(vals, weights):
    # Mood's median test stat is chisq -- near 0 for similar median
    try:
        stat, _p, _med, cont = median_test(auto_l, vals, ties='ignore',
                                           lambda_='log-likelihood')
    except ValueError:
        # "All values are below the grand median (0.0)"
        stat = None
    else:
        if stat == 0 and 0 in cont:
            stat = None
    # In case Mood's test failed for either sex
    if use_weight:
        med_diff = abs(descriptives.weighted_median(auto_l, auto_w) -
                       descriptives.weighted_median(vals, weights))
    else:
        med_diff = abs(np.median(auto_l) - np.median(vals))
    return (stat, med_diff)
def test_significance_tests(normal_obs, normal_obs_control):
    treatment = ab.sample(normal_obs)
    control = ab.sample(normal_obs_control)

    res = treatment.t_test(control, equal_var=True)
    res_expected = ttest_ind(normal_obs, normal_obs_control, equal_var=True)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.t_test(control, equal_var=False)
    res_expected = ttest_ind(normal_obs, normal_obs_control, equal_var=False)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.t_test_1samp(101)
    res_expected = ttest_1samp(normal_obs, 101)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.mann_whitney_u_test(control)
    res_expected = mannwhitneyu(normal_obs_control, normal_obs,
                                alternative='two-sided')
    assert res.p_value == pytest.approx(res_expected.pvalue, 1e-6)
    assert res.u_statistic == res_expected.statistic

    res = treatment.shapiro_test()
    res_expected = shapiro(normal_obs)
    assert res.statistic == res_expected[0]
    assert res.p_value == res_expected[1]

    res = treatment.median_test(control)
    res_expected = median_test(normal_obs, normal_obs_control)
    assert res.p_value == res_expected[1]
    assert res.statistic == res_expected[0]
    assert res.grand_median == res_expected[2]

    res = treatment.levene_test(control)
    res_expected = levene(normal_obs, normal_obs_control)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.mood_test(control)
    res_expected = mood(normal_obs, normal_obs_control)
    assert res.p_value == res_expected[1]
    assert res.statistic == res_expected[0]
def fit(self, *args, **kwargs):
    """Perform Mood's median test.

    Parameters
    ----------
    sample1, sample2, ... : array_like
        The set of samples.  There must be at least two samples.  Each
        sample must be a one-dimensional sequence containing at least one
        value.  The samples are not required to have the same length.
    ties : str, optional
        Determines how values equal to the grand median are classified in
        the contingency table.  The string must be one of:

            "below":
                Values equal to the grand median are counted as "below".
            "above":
                Values equal to the grand median are counted as "above".
            "ignore":
                Values equal to the grand median are not counted.

        The default is "below".
    correction : bool, optional
        If True, and there are just two samples, apply Yates' correction
        for continuity when computing the test statistic associated with
        the contingency table.  Default is True.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family to be used instead.  See
        `power_divergence` for details.  Default is 1 (Pearson's
        chi-squared statistic).
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan.  'propagate'
        returns nan, 'raise' throws an error, 'omit' performs the
        calculations ignoring nan values.  Default is 'propagate'.
    """
    self._statistic, self._p, self._m, self._ctable = median_test(
        *args, **kwargs)
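# Usage sketch (not part of the original): a hypothetical host class that
# stores the four values returned by scipy.stats.median_test, mirroring the
# fit() method above.
from scipy.stats import median_test

class MoodsMedianTest:
    def fit(self, *args, **kwargs):
        self._statistic, self._p, self._m, self._ctable = median_test(
            *args, **kwargs)
        return self

mt = MoodsMedianTest().fit([1, 2, 3, 4, 5], [2, 4, 6, 8], ties="below")
print(mt._statistic, mt._p)  # chi-squared statistic and p-value
print(mt._m)                 # grand median of the pooled samples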
def median_test(*args):
    # Thin wrapper around scipy.stats.median_test.  Note that the first
    # return value is the test statistic, not a median; the grand median
    # is the third value, m.
    stat, pval, m, table = ss.median_test(*args)
    return stat, pval, m, table
# -*- coding: utf-8 -*-
import math
import random

from scipy import stats

# Mood's test for equality of medians:
stats.median_test(dane_1, dane_2)

# Mann-Whitney U test (nonparametric counterpart of Student's t-test for independent samples):
stats.mannwhitneyu(dane_1, dane_2)

# Wilcoxon test (counterpart of Student's t-test for dependent samples):
stats.wilcoxon(dane_1, dane_2)

# Kruskal-Wallis test (nonparametric counterpart of one-way ANOVA for independent samples):
stats.kruskal(dane_1, dane_2, dane_3)

# Friedman test (nonparametric counterpart of one-way ANOVA for dependent samples):
stats.friedmanchisquare(dane_1, dane_2, dane_3)
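# Self-contained sketch (not from the original): illustrative samples standing
# in for the dane_* placeholders above.
import random
from scipy import stats

random.seed(0)
dane_1 = [random.gauss(0.0, 1.0) for _ in range(30)]
dane_2 = [random.gauss(0.5, 1.0) for _ in range(30)]
dane_3 = [random.gauss(1.0, 1.0) for _ in range(30)]

print(stats.median_test(dane_1, dane_2)[:2])   # (chi-squared statistic, p-value)
print(stats.mannwhitneyu(dane_1, dane_2))
print(stats.wilcoxon(dane_1, dane_2))
print(stats.kruskal(dane_1, dane_2, dane_3))
print(stats.friedmanchisquare(dane_1, dane_2, dane_3))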
def median_test(list_of_samples):
    """Return the p-value of Mood's median test applied to a list of
    samples, e.g. [[...], [...], ...]."""
    stat, p, med, tbl = stats.median_test(*list_of_samples)
    return p
def expectation_index_hist(database, name):
    # -- Ascending --
    highFreqRespCellsA = []
    midFreqRespCellsA = []
    lowFreqRespCellsA = []
    for indRow, dbRow in database.iterrows():
        pValueHighA = database['pValHighResponseA'][indRow]
        pValueMidA = database['pValMidResponseA'][indRow]
        pValueLowA = database['pValLowResponseA'][indRow]
        pValuesA = dict(pValueHA=pValueHighA, pValueMA=pValueMidA,
                        pValueLA=pValueLowA)
        # -- The best frequency is the one with the lowest pValue in sound
        #    responsive cells. --
        minimumA = min(pValuesA, key=pValuesA.get)
        # -- Append to a list the cells that were most responsive to each of
        #    the three frequencies. --
        if minimumA == 'pValueHA':
            highFreqRespCellsA.append(database.iloc[indRow])
        elif minimumA == 'pValueMA':
            midFreqRespCellsA.append(database.iloc[indRow])
        else:
            lowFreqRespCellsA.append(database.iloc[indRow])
    # Databases of cells where the high/middle/low frequency tone in the
    # ascending sequence is the most responsive.
    respHighA = pd.DataFrame(highFreqRespCellsA)
    respMidA = pd.DataFrame(midFreqRespCellsA)
    respLowA = pd.DataFrame(lowFreqRespCellsA)
    # Cells that were most responsive for each frequency that also show a
    # significant difference in firing between the oddball and standard
    # (first oddball/std).
    signRespCellsHighA = respHighA.query('pValHighFRA < 0.05')
    signRespCellsMidA = respMidA.query('pValMidFRA < 0.05')
    signRespCellsLowA = respLowA.query('pValLowFRA < 0.05')

    # -- Descending --
    highFreqRespCellsD = []
    midFreqRespCellsD = []
    lowFreqRespCellsD = []
    for indRow, dbRow in database.iterrows():
        pValueHighD = database['pValHighResponseD'][indRow]
        pValueMidD = database['pValMidResponseD'][indRow]
        pValueLowD = database['pValLowResponseD'][indRow]
        pValuesD = dict(pValueHD=pValueHighD, pValueMD=pValueMidD,
                        pValueLD=pValueLowD)
        # -- The best frequency is the one with the lowest pValue in sound
        #    responsive cells. --
        minimumD = min(pValuesD, key=pValuesD.get)
        if minimumD == 'pValueHD':
            highFreqRespCellsD.append(database.iloc[indRow])
        elif minimumD == 'pValueMD':
            midFreqRespCellsD.append(database.iloc[indRow])
        else:
            lowFreqRespCellsD.append(database.iloc[indRow])
    respHighD = pd.DataFrame(highFreqRespCellsD)
    respMidD = pd.DataFrame(midFreqRespCellsD)
    respLowD = pd.DataFrame(lowFreqRespCellsD)
    signRespCellsHighD = respHighD.query('pValHighFRD < 0.05')
    signRespCellsMidD = respMidD.query('pValMidFRD < 0.05')
    signRespCellsLowD = respLowD.query('pValLowFRD < 0.05')

    bins = 30
    plt.figure(figsize=(10, 4.5)).suptitle(name, fontsize=9, y=1.01)

    # Subplots
    ax0 = plt.subplot2grid((2, 3), (0, 0))
    highIndA = respHighA['expIndHighA']
    plt.hist(highIndA[~np.isnan(highIndA)], bins, histtype='step',
             color='limegreen')
    signCellsHighA = signRespCellsHighA['expIndHighA']
    plt.hist(signCellsHighA[~np.isnan(signCellsHighA)], bins,
             color='limegreen')
    plt.title('High Frequency - Ascending', fontsize=8)
    plt.xlabel('Expectation Index', fontsize=8)
    plt.ylabel('Num of Cells', fontsize=8)
    plt.xlim(-1, 1)

    ax1 = plt.subplot2grid((2, 3), (0, 1))
    midIndA = respMidA['expIndMidA']
    plt.hist(midIndA[~np.isnan(midIndA)], bins, histtype='step',
             color='dodgerblue')
    signCellsMidA = signRespCellsMidA['expIndMidA']
    plt.hist(signCellsMidA[~np.isnan(signCellsMidA)], bins,
             color='dodgerblue')
    plt.title('Middle Frequency - Ascending', fontsize=8)
    plt.xlabel('Expectation Index', fontsize=8)
    plt.ylabel('Num of Cells', fontsize=8)
    plt.xlim(-1, 1)

    ax2 = plt.subplot2grid((2, 3), (0, 2))
    lowIndA = respLowA['expIndLowA']
    plt.hist(lowIndA[~np.isnan(lowIndA)], bins, histtype='step',
             color='darkorchid')
    signCellsLowA = signRespCellsLowA['expIndLowA']
    plt.hist(signCellsLowA[~np.isnan(signCellsLowA)], bins,
             color='darkorchid')
    plt.title('Low Frequency - Ascending', fontsize=8)
    plt.xlabel('Expectation Index', fontsize=8)
    plt.ylabel('Num of Cells', fontsize=8)
    plt.xlim(-1, 1)

    ax3 = plt.subplot2grid((2, 3), (1, 0))
    highIndD = respHighD['expIndHighD']
    plt.hist(highIndD[~np.isnan(highIndD)], bins, histtype='step',
             color='limegreen')
    signCellsHighD = signRespCellsHighD['expIndHighD']
    plt.hist(signCellsHighD[~np.isnan(signCellsHighD)], bins,
             color='limegreen')
    plt.title('High Frequency - Descending', fontsize=8)
    plt.xlabel('Expectation Index', fontsize=8)
    plt.ylabel('Num of Cells', fontsize=8)
    plt.xlim(-1, 1)

    ax4 = plt.subplot2grid((2, 3), (1, 1))
    midIndD = respMidD['expIndMidD']
    plt.hist(midIndD[~np.isnan(midIndD)], bins, histtype='step',
             color='dodgerblue')
    signCellsMidD = signRespCellsMidD['expIndMidD']
    plt.hist(signCellsMidD[~np.isnan(signCellsMidD)], bins,
             color='dodgerblue')
    plt.title('Middle Frequency - Descending', fontsize=8)
    plt.xlabel('Expectation Index', fontsize=8)
    plt.ylabel('Num of Cells', fontsize=8)
    plt.xlim(-1, 1)

    ax5 = plt.subplot2grid((2, 3), (1, 2))
    lowIndD = respLowD['expIndLowD']
    plt.hist(lowIndD[~np.isnan(lowIndD)], bins, histtype='step',
             color='darkorchid')
    signCellsLowD = signRespCellsLowD['expIndLowD']
    plt.hist(signCellsLowD[~np.isnan(signCellsLowD)], bins,
             color='darkorchid')
    plt.title('Low Frequency - Descending', fontsize=8)
    plt.xlabel('Expectation Index', fontsize=8)
    plt.ylabel('Num of Cells', fontsize=8)
    plt.xlim(-1, 1)

    plt.tight_layout()
    plt.gcf().set_size_inches([10, 4.5])

    figFormat = 'png'
    figFilename = 'expectation_index_hist_{}.{}'.format(name, figFormat)
    outputDir = os.path.join(settings.FIGURES_DATA_PATH,
                             studyparams.STUDY_NAME)
    figFullpath = os.path.join(outputDir, figFilename)
    plt.savefig(figFullpath, format=figFormat)
    plt.show()

    # -- Statistics --
    ## -- Ascending --
    print('Ascending:')
    ### -- High Frequency --
    highIndA = highIndA[~np.isnan(highIndA)]
    medianHighIndA = np.median(highIndA)
    signCellsHighA = signCellsHighA[~np.isnan(signCellsHighA)]
    medianSignCellsHighA = np.median(signCellsHighA)
    print('Median High Freq Responsive = {}'.format(medianHighIndA))
    print('Median High Freq Significant = {}'.format(medianSignCellsHighA))
    statHighA, pHighA, medHighA, tblHighA = stats.median_test(
        highIndA, signCellsHighA)
    print('Median_test = {}'.format(medHighA))

    ### -- Middle Frequency --
    midIndA = midIndA[~np.isnan(midIndA)]
    medianMidIndA = np.median(midIndA)
    signCellsMidA = signCellsMidA[~np.isnan(signCellsMidA)]
    medianSignCellsMidA = np.median(signCellsMidA)
    print('Median Mid Freq Responsive = {}'.format(medianMidIndA))
    print('Median Mid Freq Significant = {}'.format(medianSignCellsMidA))
    statMidA, pMidA, medMidA, tblMidA = stats.median_test(
        midIndA, signCellsMidA)
    print('Median_test = {}'.format(medMidA))

    ### -- Low Frequency --
    lowIndA = lowIndA[~np.isnan(lowIndA)]
    medianLowIndA = np.median(lowIndA)
    signCellsLowA = signCellsLowA[~np.isnan(signCellsLowA)]
    medianSignCellsLowA = np.median(signCellsLowA)
    print('Median Low Freq Responsive = {}'.format(medianLowIndA))
    print('Median Low Freq Significant = {}'.format(medianSignCellsLowA))
    statLowA, pLowA, medLowA, tblLowA = stats.median_test(
        lowIndA, signCellsLowA)
    print('Median_test = {}'.format(medLowA))

    ## -- Descending --
    print('Descending:')
    ### -- High Frequency --
    highIndD = highIndD[~np.isnan(highIndD)]
    medianHighIndD = np.median(highIndD)
    signCellsHighD = signCellsHighD[~np.isnan(signCellsHighD)]
    medianSignCellsHighD = np.median(signCellsHighD)
    print('Median High Freq Responsive = {}'.format(medianHighIndD))
    print('Median High Freq Significant = {}'.format(medianSignCellsHighD))
    statHighD, pHighD, medHighD, tblHighD = stats.median_test(
        highIndD, signCellsHighD)
    print('Median_test = {}'.format(medHighD))

    ### -- Middle Frequency --
    midIndD = midIndD[~np.isnan(midIndD)]
    medianMidIndD = np.median(midIndD)
    signCellsMidD = signCellsMidD[~np.isnan(signCellsMidD)]
    medianSignCellsMidD = np.median(signCellsMidD)
    print('Median Mid Freq Responsive = {}'.format(medianMidIndD))
    print('Median Mid Freq Significant = {}'.format(medianSignCellsMidD))
    statMidD, pMidD, medMidD, tblMidD = stats.median_test(
        midIndD, signCellsMidD)
    print('Median_test = {}'.format(medMidD))

    ### -- Low Frequency --
    lowIndD = lowIndD[~np.isnan(lowIndD)]
    medianLowIndD = np.median(lowIndD)
    signCellsLowD = signCellsLowD[~np.isnan(signCellsLowD)]
    medianSignCellsLowD = np.median(signCellsLowD)
    print('Median Low Freq Responsive = {}'.format(medianLowIndD))
    print('Median Low Freq Significant = {}'.format(medianSignCellsLowD))
    statLowD, pLowD, medLowD, tblLowD = stats.median_test(
        lowIndD, signCellsLowD)
    print('Median_test = {}'.format(medLowD))

    ### -- Responsive sessions --
    statA, pA, medA, tblA = stats.median_test(highIndA, midIndA, lowIndA)
    statD, pD, medD, tblD = stats.median_test(highIndD, midIndD, lowIndD)
    print('Ascending median test = {}'.format(medA))
    print('Descending median test = {}'.format(medD))

    ### -- Significantly responsive sessions --
    statSignA, pSignA, medSignA, tblSignA = stats.median_test(
        signCellsHighA, signCellsMidA, signCellsLowA)
    statSignD, pSignD, medSignD, tblSignD = stats.median_test(
        signCellsHighD, signCellsMidD, signCellsLowD)
    print('Ascending median test for significantly responsive cells = {}'
          .format(medSignA))
    print('Descending median test for significantly responsive cells = {}'
          .format(medSignD))

    ### -- Additional Stats --
    print(name)
    percentCellsShiftedRightHighA = sum(highIndA > 0.0) / len(respHighA) * 100
    percentSignCellsShiftedRightHighA = (sum(signCellsHighA > 0.0) /
                                         len(signRespCellsHighA) * 100)
    print('High frequency ascending - {:.2f}% of cells have an expectation '
          'index shifted to signify an increase in firing from an unexpected '
          'sound to an expected one and {:.2f}% of significantly responsive '
          'cells show the same shift.'
          .format(percentCellsShiftedRightHighA,
                  percentSignCellsShiftedRightHighA))
    percentCellsShiftedRightMidA = sum(midIndA > 0.0) / len(respMidA) * 100
    percentSignCellsShiftedRightMidA = (sum(signCellsMidA > 0.0) /
                                        len(signRespCellsMidA) * 100)
    print('Middle frequency ascending - {:.2f}% of cells have an expectation '
          'index shifted to signify an increase in firing from an unexpected '
          'sound to an expected one and {:.2f}% of significantly responsive '
          'cells show the same shift.'
          .format(percentCellsShiftedRightMidA,
                  percentSignCellsShiftedRightMidA))
    percentCellsShiftedRightLowA = sum(lowIndA > 0.0) / len(respLowA) * 100
    percentSignCellsShiftedRightLowA = (sum(signCellsLowA > 0.0) /
                                        len(signRespCellsLowA) * 100)
    print('Low frequency ascending - {:.2f}% of cells have an expectation '
          'index shifted to signify an increase in firing from an unexpected '
          'sound to an expected one and {:.2f}% of significantly responsive '
          'cells show the same shift.'
          .format(percentCellsShiftedRightLowA,
                  percentSignCellsShiftedRightLowA))
    percentCellsShiftedRightHighD = sum(highIndD > 0.0) / len(respHighD) * 100
    percentSignCellsShiftedRightHighD = (sum(signCellsHighD > 0.0) /
                                         len(signRespCellsHighD) * 100)
    print('High frequency descending - {:.2f}% of cells have an expectation '
          'index shifted to signify an increase in firing from an unexpected '
          'sound to an expected one and {:.2f}% of significantly responsive '
          'cells show the same shift.'
          .format(percentCellsShiftedRightHighD,
                  percentSignCellsShiftedRightHighD))
    percentCellsShiftedRightMidD = sum(midIndD > 0.0) / len(respMidD) * 100
    percentSignCellsShiftedRightMidD = (sum(signCellsMidD > 0.0) /
                                        len(signRespCellsMidD) * 100)
    print('Middle frequency descending - {:.2f}% of cells have an expectation '
          'index shifted to signify an increase in firing from an unexpected '
          'sound to an expected one and {:.2f}% of significantly responsive '
          'cells show the same shift.'
          .format(percentCellsShiftedRightMidD,
                  percentSignCellsShiftedRightMidD))
    percentCellsShiftedRightLowD = sum(lowIndD > 0.0) / len(respLowD) * 100
    percentSignCellsShiftedRightLowD = (sum(signCellsLowD > 0.0) /
                                        len(signRespCellsLowD) * 100)
    print('Low frequency descending - {:.2f}% of cells have an expectation '
          'index shifted to signify an increase in firing from an unexpected '
          'sound to an expected one and {:.2f}% of significantly responsive '
          'cells show the same shift.'
          .format(percentCellsShiftedRightLowD,
                  percentSignCellsShiftedRightLowD))
def custom(a, b):
    _, p, _, _ = stats.median_test(a, b)
    return p
def mood_median(L):
    score = median_test(*list(get_distances_per_class(L).values()))[0]
    if not pd.isnull(score):
        return (score, )
    else:
        return (float('-inf'), )
def plot_synapse_delays(ax, data, xlim=None, ylim=None, xscale='log',
                        density_scaling='count', naxes=3, report=sys.stdout):
    """
    Plot correlation and marginals.

    :param ax: axes handle
    :param data: pandas.DataFrame with columns pre, post, functional_delay,
        functional_strength, structural_delay, structural_strength,
        synaptic_delay, delayed, simultaneous
    :param xlim, ylim: limits for x and y axis of the scatter plot
    :param xscale: scaling for x axis of the scatter plot (default: 'log')
    :param density_scaling: scaling for density axis of the marginal plots
        (default: 'count')
    :param naxes: if 3 do plot the marginals (default: 3)
    :param report: file handle (default: sys.stdout just prints)
    """
    # TODO Check correct behaviour for naxes == 2!

    # New axes
    if naxes == 2:
        axScatter = ax
        fig = ax.get_figure()
        divider = make_axes_locatable(axScatter)
        axHisty = divider.new_horizontal(size="50%", pad=0.05)
        fig.add_axes(axHisty)
    if naxes == 3:
        rect_histx, rect_histy, rect_scatter = axes_to_3_axes(ax)
        axScatter = plt.axes(rect_scatter)
        axHistx = plt.axes(rect_histx)
        axHisty = plt.axes(rect_histy)

    # Subsetting the data
    n_total = len(data)
    delayed = data[data.delayed]
    n_delayed = len(delayed)
    simultaneous = data[data.simultaneous]
    n_simultanous = len(simultaneous)

    # scatter plot
    axScatter.scatter(simultaneous.functional_strength,
                      simultaneous.synaptic_delay, color='red',
                      label='<1 ms (%d%%)' % (100.0 * n_simultanous / n_total))
    axScatter.scatter(delayed.functional_strength, delayed.synaptic_delay,
                      color='green',
                      label='>1 ms (%d%%)' % (100.0 * n_delayed / n_total))
    axScatter.set_xscale(xscale)
    axScatter.legend(frameon=False, scatterpoints=1)
    axScatter.set_xlabel(r'$\mathsf{z_{max}}$', fontsize=14)
    axScatter.set_ylabel(
        r'$\mathsf{\tau_{synapse}=\tau_{spike}-\tau_{axon}\ [ms]}$',
        fontsize=14)

    # density plot
    kernel_density(axHisty, data.synaptic_delay, yscale=density_scaling,
                   style='k-', orientation='horizontal')

    if naxes == 3:
        # joint legend by proxies
        plt.sca(ax)
        plt.vlines(0, 0, 0, colors='green', linestyles='-', label='>1ms')
        plt.vlines(0, 0, 0, colors='red', linestyles='-', label='<1ms')
        plt.vlines(0, 0, 0, colors='black', linestyles='-', label='all')
        plt.legend(frameon=False, fontsize=12)
        kernel_density(axHistx, data.functional_strength, xscale=xscale,
                       yscale=density_scaling, style='k-',
                       orientation='vertical')
        kernel_density(axHistx, simultaneous.functional_strength,
                       xscale=xscale, yscale=density_scaling, style='r-',
                       orientation='vertical')
        kernel_density(axHistx, delayed.functional_strength, xscale=xscale,
                       yscale=density_scaling, style='g-',
                       orientation='vertical')
        axHistx.set_xscale(xscale)

    section = {}
    section['delayed'] = {
        'median': float(np.median(delayed.functional_strength)),
        'mean': float(np.mean(delayed.functional_strength)),
        'n': int(n_delayed),
        'p': float(n_delayed / n_total)
    }
    section['simultaneous'] = {
        'median': float(np.median(simultaneous.functional_strength)),
        'mean': float(np.mean(simultaneous.functional_strength)),
        'n': int(n_simultanous),
        'p': float(n_simultanous / n_total)
    }
    t, p = ttest_ind(np.log(simultaneous.functional_strength),
                     np.log(delayed.functional_strength))
    section['Students_t_test'] = {'p': float(p), 't': float(t)}
    # Note: the third value returned by median_test is the grand median of
    # the pooled data, not a difference of medians.
    xhi2, p, med, tbl = median_test(simultaneous.functional_strength,
                                    delayed.functional_strength)
    section['Moods_median_test'] = {
        'xhi2': float(xhi2),
        'p': float(p),
        'median_difference': float(med)
    }
    yaml.dump({'synapse_z_max': section}, report)

    # define limits
    max_functional_strength = max(data.functional_strength)
    if xlim is None:
        # leave some room on the left
        xlim = (min(data.functional_strength), max_functional_strength * 2)
    if ylim is None:
        ylim = (min(data.synaptic_delay), max(data.synaptic_delay))

    # set limits
    axScatter.set_xlim(xlim)
    axScatter.set_ylim(ylim)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    # add hlines to Scatter
    axScatter.hlines(0, 0, max_functional_strength * 2, linestyles='--')
    axScatter.hlines(-1, 0, max_functional_strength * 2, linestyles=':')
    axScatter.hlines(+1, 0, max_functional_strength * 2, linestyles=':')

    # no labels
    nullfmt = NullFormatter()
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHistx.yaxis.set_major_formatter(nullfmt)
    axHisty.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)
def my_median_test(df, metric='Yield',
                   descriptors=['Product group', 'Line', 'Shift'],
                   stat_cut_off=1e-2, continuous=False):
    """
    Parameters
    ----------
    metric: str, default 'Yield'
        'Yield', 'Rate', or 'Uptime' (any numeric column of df works)
    stat_cut_off: float, default 1e-2
        p-value cutoff (< 0.01 chance under the null hypothesis)

    Returns
    -------
    stat_df: DataFrame
        Mood's median test results for the metric
    """
    if continuous:
        moods = []
        for descriptor in descriptors:
            stat, p = stats.pearsonr(df[metric], df[descriptor])
            moods.append([descriptor, stat, p])
        stat_df = pd.DataFrame(moods)
        stat_df.columns = ['descriptor', 'stat', 'p']
        stat_df = stat_df.sort_values(by='stat',
                                      ascending=False).reset_index(drop=True)
        stat_df = stat_df.loc[stat_df['p'] < stat_cut_off].drop_duplicates(
            'stat').reset_index(drop=True)
        stat_df['score'] = stat_df['stat']
        stat_df = stat_df.reset_index(drop=True)
    else:
        moods = []
        for descriptor in descriptors:
            for item in df[descriptor].unique():
                try:
                    stat, p, m, table = stats.median_test(
                        df.loc[df[descriptor] == item][metric],
                        df.loc[~(df[descriptor] == item)][metric],
                        nan_policy='omit')
                    moods.append([descriptor, item, stat, p, m, table])
                except ValueError:
                    # e.g. all values fall on one side of the grand median
                    pass
        stat_df = pd.DataFrame(moods)
        stat_df.columns = ['descriptor', 'group', 'stat', 'p', 'm', 'table']
        stat_df = stat_df.sort_values(by='stat',
                                      ascending=False).reset_index(drop=True)
        stat_df = stat_df.loc[stat_df['p'] < stat_cut_off].drop_duplicates(
            'stat').reset_index(drop=True)
        scores = []
        for index in range(stat_df.shape[0]):
            if metric == 'Uptime':
                scores.append(stat_df['table'][index][1][0] /
                              stat_df['table'][index][0][0])
            else:
                scores.append(stat_df['table'][index][0][0] /
                              stat_df['table'][index][1][0])
        stat_df['score'] = scores
        stat_df = stat_df.sort_values('score', ascending=True)
        stat_df = stat_df.reset_index(drop=True)
    return stat_df
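# Usage sketch (invented data; the columns mirror the default descriptors but
# the DataFrame itself is illustrative, not from the original source):
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
line = rng.choice(['A', 'B'], size=200)
# Give Line B a higher median so the test has something to find.
yield_vals = rng.normal(90, 5, size=200) + np.where(line == 'B', 4.0, 0.0)
df = pd.DataFrame({'Product group': rng.choice(['P1', 'P2'], size=200),
                   'Line': line,
                   'Shift': rng.choice(['day', 'night'], size=200),
                   'Yield': yield_vals})
result = my_median_test(df, metric='Yield', stat_cut_off=0.05)
print(result[['descriptor', 'group', 'stat', 'p', 'score']])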
def starplot(df=[], x='', y='', data=[], index=[], columns=[], fold=False,
             foldcol=0, mode=3, errorbar=True, plottype='barplot',
             stats='independent t test', test_var=False, stats_var='f test',
             crit_var=0.05, equal_var=True, rotate=0, elinewidth=0.5,
             fontsize=14, capsize=4, noffset_ylim=35, noffset_fst=10,
             noffset_diff=10, star_size=3, linewidth=1,
             crit=[0.05, 0.01, 0.001, 0.0001]):
    # data: list of data matrices (or DataFrames) for comparison
    #       (rows: observations, columns: variables)
    # index: variables, columns: observations
    # mode:
    #   'adjacent': annotate stars between adjacent bars
    #   'control':  annotate stars between all other bars and a selective
    #               control bar
    #   'mix':      mixed mode
    #   3:          annotate stars for all bar combinations (only 3 bars
    #               available)
    crit = np.array(crit)
    plt.rcParams['font.family'] = 'Times New Roman'
    fig, ax = plt.subplots()
    star = ['*', '**', '***', '****']
    n = len(data)
    m = data[0].shape[1]
    # Convert any DataFrames in data to plain ndarrays.
    test = pd.DataFrame()
    for i, j in enumerate(data):
        if type(test) == type(j):
            data[i] = j.values.reshape(len(j.index), len(j.columns))
    if plottype == 'barplot':
        error = pd.DataFrame()
        mean = pd.DataFrame()
        for i in range(m):
            error[i] = [data[j][:, i].std() for j in range(n)]
            mean[i] = [data[j][:, i].mean() for j in range(n)]
        error = error.transpose()
        mean = mean.transpose()
        if len(index) != 0:
            error.index = index
            mean.index = index
        if len(columns) != 0:
            error.columns = columns
            mean.columns = columns
        if fold == True:
            oldmean = mean.copy()
            olderror = error.copy()
            for i in range(len(mean.columns)):
                mean.iloc[:, i] = oldmean.iloc[:, i] / oldmean.iloc[:, foldcol]
                error.iloc[:, i] = olderror.iloc[:, i] / oldmean.iloc[:, foldcol]
        if errorbar == True:
            plot = mean.plot.bar(yerr=error, ax=ax, rot=rotate, capsize=capsize,
                                 error_kw=dict(elinewidth=elinewidth),
                                 fontsize=fontsize)
            max_bar = [[mean.iloc[j, i] + error.iloc[j, i] for i in range(n)]
                       for j in range(m)]
            min_bar = [mean.iloc[j, i] - error.iloc[j, i]
                       for i in range(n) for j in range(m)]
        else:
            plot = mean.plot.bar(ax=ax, rot=rotate, capsize=capsize,
                                 error_kw=dict(elinewidth=elinewidth),
                                 fontsize=fontsize)
            max_bar = [[mean.iloc[j, i] for i in range(n)] for j in range(m)]
            min_bar = [mean.iloc[j, i] for i in range(n) for j in range(m)]
    elif plottype == 'boxplot':
        print("under building")
    ylim = 0
    offset = max([max_bar[i][j] for i in range(m) for j in range(n)]) / 100
    blank = []
    if mode == 3:
        for j in range(m):
            level = np.zeros(n)
            for i in range(n):
                if i < n - 1:
                    k = i + 1
                else:
                    k = 0
                if test_var == True:
                    if stats_var == 'f test':
                        f = 0.5 - abs(0.5 - ftest.sf(
                            data[i][:, j].var() / data[k][:, j].var(),
                            len(data[i][:, j]) - 1, len(data[k][:, j]) - 1))
                        if crit_var / 2 > f:
                            equal_var = False
                        else:
                            equal_var = True
                    else:
                        if stats_var == 'bartlett':
                            f = bartlett(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'levene':
                            f = levene(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'fligner':
                            f = fligner(data[i][:, j], data[k][:, j])[1]
                        if crit_var > f:
                            equal_var = False
                        else:
                            equal_var = True
                if stats == 'independent t test':
                    p = ttest_ind(data[i][:, j], data[k][:, j],
                                  equal_var=equal_var)[1]
                elif stats == 'paired t test':
                    if equal_var == True:
                        p = ttest_rel(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'median test':
                    p = median_test(data[i][:, j], data[k][:, j])[1]
                elif stats == 'mannwhitneyu':
                    if equal_var == True:
                        p = mannwhitneyu(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'wilcoxon':
                    if equal_var == True:
                        p = wilcoxon(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                level[i] = len(crit) - len(crit.compress(p > crit))
            for k in range(n):
                height = 0
                if level[k] != 0 and k != n - 1:
                    center = [plot.patches[k * m + j].get_x(),
                              plot.patches[k * m + m + j].get_x()]
                    height = max([max_bar[j][k], max_bar[j][k + 1]])
                    h1 = max_bar[j][k]
                    h2 = max_bar[j][k + 1]
                    width = plot.patches[k * m + j].get_width()
                    blank.append((center[0] + width / 2,
                                  height + noffset_fst * offset + (-1)**k * 2 * offset))
                    blank.append((center[1] + width / 2,
                                  height + noffset_fst * offset + (-1)**k * 2 * offset))
                    ax.vlines(x=center[0] + width / 2, ymin=h1 + offset * 2,
                              ymax=height + noffset_fst * offset + (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.vlines(x=center[1] + width / 2, ymin=h2 + offset * 2,
                              ymax=height + noffset_fst * offset + (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.annotate(star[int(level[k] - 1)],
                                xy=((center[0] + center[1]) / 2 + width / 2,
                                    height + (noffset_fst + 1) * offset + (-1)**k * 2 * offset),
                                ha='center', size=star_size)
                elif level[k] != 0 and k == n - 1:
                    center = [plot.patches[j].get_x(),
                              plot.patches[k * m + j].get_x()]
                    height = max(max_bar[j])
                    h1 = max_bar[j][0]
                    h2 = max_bar[j][k]
                    width = plot.patches[j].get_width()
                    blank.append((center[0] + width / 2,
                                  height + (noffset_fst + noffset_diff) * offset))
                    blank.append((center[1] + width / 2, height + 20 * offset))
                    ax.vlines(x=center[0] + width / 2, ymin=h1 + offset * 2,
                              ymax=height + (noffset_fst + noffset_diff) * offset,
                              lw=linewidth)
                    ax.vlines(x=center[1] + width / 2, ymin=h2 + offset * 2,
                              ymax=height + (noffset_fst + noffset_diff) * offset,
                              lw=linewidth)
                    ax.annotate(star[int(level[k] - 1)],
                                xy=((center[0] + center[1]) / 2 + width / 2,
                                    height + (noffset_fst + noffset_diff + 1) * offset),
                                ha='center', size=star_size)
                if height > ylim:
                    ylim = height
    if mode == 'adjacent':
        for j in range(m):
            level = np.zeros(n - 1)
            for i in range(n - 1):
                k = i + 1
                if test_var == True:
                    if stats_var == 'f test':
                        f = 0.5 - abs(0.5 - ftest.sf(
                            data[i][:, j].var() / data[k][:, j].var(),
                            len(data[i][:, j]) - 1, len(data[k][:, j]) - 1))
                        if crit_var / 2 > f:
                            equal_var = False
                        else:
                            equal_var = True
                    else:
                        if stats_var == 'bartlett':
                            f = bartlett(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'levene':
                            f = levene(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'fligner':
                            f = fligner(data[i][:, j], data[k][:, j])[1]
                        if crit_var > f:
                            equal_var = False
                        else:
                            equal_var = True
                if stats == 'independent t test':
                    p = ttest_ind(data[i][:, j], data[k][:, j],
                                  equal_var=equal_var)[1]
                elif stats == 'paired t test':
                    if equal_var == True:
                        p = ttest_rel(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'median test':
                    p = median_test(data[i][:, j], data[k][:, j])[1]
                elif stats == 'mannwhitneyu':
                    if equal_var == True:
                        p = mannwhitneyu(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'wilcoxon':
                    if equal_var == True:
                        p = wilcoxon(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                level[i] = len(crit) - len(crit.compress(p > crit))
            for k in range(n - 1):
                height = 0
                if level[k] != 0:
                    center = [plot.patches[k * m + j].get_x(),
                              plot.patches[k * m + m + j].get_x()]
                    height = max([max_bar[j][k], max_bar[j][k + 1]])
                    h1 = max_bar[j][k]
                    h2 = max_bar[j][k + 1]
                    width = plot.patches[k * m + j].get_width()
                    blank.append((center[0] + width / 2,
                                  height + noffset_fst * offset + (-1)**k * 2 * offset))
                    blank.append((center[1] + width / 2,
                                  height + noffset_fst * offset + (-1)**k * 2 * offset))
                    ax.vlines(x=center[0] + width / 2, ymin=h1 + offset * 2,
                              ymax=height + noffset_fst * offset + (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.vlines(x=center[1] + width / 2, ymin=h2 + offset * 2,
                              ymax=height + noffset_fst * offset + (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.annotate(star[int(level[k] - 1)],
                                xy=((center[0] + center[1]) / 2 + width / 2,
                                    height + (noffset_fst + 1) * offset + (-1)**k * 2 * offset),
                                ha='center', size=star_size)
                if height > ylim:
                    ylim = height
    ax.set_ylim(min(0, min(min_bar) - 10 * offset),
                ylim + noffset_ylim * offset)
    # Whiten the bases of the significance brackets, then draw the crossbars.
    for j, i in enumerate(blank):
        ax.vlines(x=i[0], ymin=i[1], ymax=i[1] + offset * 2, color='white',
                  lw=1.2 * linewidth)
        if j % 2 == 1:
            ax.hlines(y=i[1], xmin=blank[j - 1][0], xmax=blank[j][0],
                      lw=linewidth)
def main():
    # Note: this tool was written for Python 2, where map() returns a list
    # that the scipy calls below consume directly.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True,
                        help="Path to the output file.")
    parser.add_argument("--sample_one_cols",
                        help="Comma-separated column indices for sample one")
    parser.add_argument("--sample_two_cols",
                        help="Comma-separated column indices for sample two")
    parser.add_argument("--sample_cols",
                        help="Column indices for multiple samples; separate samples using ';'")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument("--mwu_use_continuity", action="store_true", default=False,
                        help="Whether a continuity correction (1/2.) should be taken into account.")
    parser.add_argument("--equal_var", action="store_true", default=False,
                        help="If set, perform a standard independent 2 sample test that assumes "
                             "equal population variances. If not set, perform Welch's t-test, "
                             "which does not assume equal population variance.")
    parser.add_argument("--reta", action="store_true", default=False,
                        help="Whether or not to return the internally computed a values.")
    parser.add_argument("--fisher", action="store_true", default=False,
                        help="if true then Fisher definition is used")
    parser.add_argument("--bias", action="store_true", default=False,
                        help="if false, then the calculations are corrected for statistical bias")
    parser.add_argument("--inclusive1", action="store_true", default=False,
                        help="if false, lower_limit will be ignored")
    parser.add_argument("--inclusive2", action="store_true", default=False,
                        help="if false, higher_limit will be ignored")
    parser.add_argument("--inclusive", action="store_true", default=False,
                        help="if false, limit will be ignored")
    parser.add_argument("--printextras", action="store_true", default=False,
                        help="If True, if there are extra points a warning is raised "
                             "saying how many of those points there are")
    parser.add_argument("--initial_lexsort", action="store_true", default=False,
                        help="Whether to use lexsort or quicksort as the sorting method "
                             "for the initial sort of the inputs.")
    parser.add_argument("--correction", action="store_true", default=False,
                        help="continuity correction")
    parser.add_argument("--axis", type=int, default=0,
                        help="Axis can equal None (ravel array first), or an integer "
                             "(the axis over which to operate on a and b)")
    parser.add_argument("--n", type=int, default=0,
                        help="the number of trials. This is ignored if x gives both "
                             "the number of successes and failures")
    parser.add_argument("--b", type=int, default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0,
                        help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument("--p", type=float, default=0.5,
                        help="The hypothesized probability of success. 0 <= p <= 1. "
                             "The default value is p = 0.5")
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0,
                        help="Value to put in place of values in a outside of bounds")
    parser.add_argument("--proportiontocut", type=float, default=0.0,
                        help="Proportion (in range 0-1) of total data set to trim of each end.")
    parser.add_argument("--lambda_", type=float, default=1.0,
                        help="lambda_ gives the power in the Cressie-Read power "
                             "divergence statistic")
    parser.add_argument("--imbda", type=float, default=0,
                        help="If lmbda is not None, do the transformation for that value. "
                             "If lmbda is None, find the lambda that maximizes the "
                             "log-likelihood function and return it as the second "
                             "output argument.")
    parser.add_argument("--base", type=float, default=1.6,
                        help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for item in freq:
                elements = ",".join(map(str, item))
                cols.append(elements)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for item in critical:
                cols.append(item)
            cols.append(",")
            for item in sig:
                cols.append(item)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis,
                               fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for item in z:
                cols.append(item)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(map(float, sample_one), cdf=args.cdf,
                                      N=args.N, alternative=args.alternative,
                                      mode=args.mode)
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min_val = stats.tmin(map(float, sample_one))
            else:
                min_val = stats.tmin(map(float, sample_one), lowerlimit=mf,
                                     inclusive=args.inclusive)
            cols.append(min_val)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max_val = stats.tmax(map(float, sample_one))
            else:
                max_val = stats.tmax(map(float, sample_one), upperlimit=nf,
                                     inclusive=args.inclusive)
            cols.append(max_val)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two),
                    interpolation_method=args.interpolation)
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf),
                    interpolation_method=args.interpolation)
            for item in s:
                cols.append(item)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for item in rel:
                cols.append(item)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two),
                    statistic=args.statistic, bins=args.b)
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two),
                    statistic=args.statistic, bins=args.b, range=(mf, nf))
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf,
                                    newval=args.new)
            for item in o:
                cols.append(item)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for item in o:
                cols.append(item)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one),
                             proportiontocut=args.proportiontocut,
                             tail=args.tail)
            for item in t1:
                cols.append(item)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for item in h2:
                cols.append(item)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for item in t:
                cols.append(item)
            for item in prob:
                cols.append(item)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two),
                use_continuity=args.mwu_use_continuity)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two),
                           ddof=args.ddof)
            for item in z:
                cols.append(item)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one),
                                                  map(float, sample_two),
                                                  equal_var=args.equal_var)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for item in a:
                cols.append(item)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(map(float, sample_one),
                                          map(float, sample_two),
                                          initial_lexsort=args.initial_lexsort)
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(map(float, sample_one),
                                            map(float, sample_two),
                                            zero_method=args.zero_method,
                                            correction=args.correction)
            else:
                T, p_value = stats.wilcoxon(map(float, sample_one),
                                            zero_method=args.zero_method,
                                            correction=args.correction)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two),
                    ddof=args.ddof, lambda_=args.lambda_)
            else:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), ddof=args.ddof,
                    lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med,
                    weights=map(float, sample_two))
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for item in ob:
                elements = ",".join(map(str, item))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(*b_samples, center=args.center,
                                         proportiontocut=args.proportiontocut)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(*b_samples, center=args.center,
                                      proportiontocut=args.proportiontocut)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                *b_samples, ties=args.ties, correction=args.correction,
                lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for item in table:
                elements = ",".join(map(str, item))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
def main(task_path, feature_path, ylabel, output_path,
         BINS=np.array(['-', 'N', '+', 'S']),
         colors=['#FF0000', '#FFFF00', '#00CC00', '#3d77ff'],
         star_colors=['#FF0000', 'orange', '#00CC00', '#3d77ff']):
    df = pd.read_csv(task_path)

    # loader = TruePairwiseFeatureLoader(feature_path)
    loader = SumPairwiseLoader(feature_path)

    bins = sorted(np.unique(df['bin']))
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    df['feature'] = loader.get_values(df)
    df['bin'] = BINS[df['bin'].astype(int)]
    ax = sns.violinplot(x="bin", y="feature", ax=ax, order=BINS, data=df,
                        palette=colors, saturation=1)
    ax.yaxis.set_tick_params(labelsize=plot_cfg['tick_label_size'])
    ax.xaxis.set_tick_params(labelsize=plot_cfg['tick_label_size'], pad=15)
    ax.set_ylabel(ylabel, fontsize=plot_cfg['ylabel_size'], weight='bold')
    ax.set_xlabel('')
    ax.yaxis.set_tick_params(length=10, width=1, which='both')
    ax.xaxis.set_tick_params(length=0)
    ax.grid(False)
    plt.setp(ax.spines.values(), linewidth=plot_cfg["border_size"],
             color='black')

    max_val = np.max(df['feature'])
    min_val, max_val = ax.get_ylim()
    ax.set_ylim([min_val, max_val * 1.3])
    ax.grid(False)
    plt.setp(ax.spines.values(), linewidth=plot_cfg["border_size"],
             color='black')

    bins = BINS

    # plot p-values (Bonferroni-adjusted alpha over all pairwise comparisons)
    ALPHA = 0.05
    num_comparisons = len(bins) * (len(bins) - 1) / 2
    adjusted_alpha = ALPHA / num_comparisons
    for i in range(len(bins)):
        a = df[df['bin'] == bins[i]]['feature']
        a_med = np.median(a)
        ax.plot([i, i], [a_med, a_med], 'o', color=plot_cfg['iqr_color'],
                markersize=15)
        iqr_lower = np.percentile(a, 25)
        iqr_upper = np.percentile(a, 75)
        ax.plot([i, i], [iqr_lower, iqr_upper], linewidth=5,
                color=plot_cfg['iqr_color'])
        yoffset = max_val
        for j in range(i + 1, len(bins)):
            b = df[df['bin'] == bins[j]]['feature']
            statistic, pvalue, _, _ = stats.median_test(a, b)
            print("%s (%0.2f) vs. %s (%0.2f): %0.6f [%0.6f]" %
                  (bins[i], np.median(a), bins[j], np.median(b), pvalue,
                   statistic))
            if pvalue < adjusted_alpha:
                stars = '*' * eval_funcs.compute_stars(pvalue, adjusted_alpha)
                target_color = star_colors[j]
                ax.text(i, yoffset, stars, color=target_color, ha="center",
                        va="center", weight='bold',
                        fontsize=plot_cfg['stars_label_size'])
            yoffset += 0.1 * max_val
    plt.savefig(output_path, bbox_inches='tight', dpi=100)
    plt.show()
    plt.close()
if p_value > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

stat, p_value = mood(dataset['Open'], dataset['Adj Close'])
print('Mood Test')
print('-' * 40)
print('Statistics=%.3f, p=%.3f' % (stat, p_value))
# interpret
alpha = 0.05
if p_value > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

stat, p_value, med, tbl = median_test(dataset['Open'], dataset['Adj Close'],
                                      dataset['Volume'])
print("Mood's median test")
print('-' * 40)
print('Statistics=%.3f, p=%.3f' % (stat, p_value))
# interpret
alpha = 0.05
if p_value > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

stat, p_value, med, tbl = median_test(dataset['Open'], dataset['Adj Close'],
                                      dataset['Volume'],
                                      lambda_="log-likelihood")
"markerfacecolor": "black", "markeredgecolor": "black", "markersize": "8" }).set(title='Heart Disease Status vs. Serum Cholesterol') sns.set(font_scale=1.7) plt.text(2 + 0.2, 4.5, "* = mean ", horizontalalignment='left', size='small', color='black') # stats pd.set_option('display.expand_frame_repr', False) df.groupby([x, hue])[y].describe() stat, p, med, tbl = median_test(hdserum, ndhserum) med p stat tbl # Part 3: Make a violin plot of part 2 x = 'Heart Disease Status' y = "Serum Cholesterol in mg/dl" sns.catplot( x, y, kind='violin', hue='Sex', data=df, palette='Blues').set(title='Heart Disease Status vs. Serum Cholesterol') plt.text(2 + 0.2, 4.5, "* = mean ", horizontalalignment='left',
    # print('ttest', i, tstat, ttpval)
    # KS test
    ksstat, ks_pval = stats.ks_2samp(fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i])
    # print('KS test', ksstat, ks_pval)
    # Mann-Whitney
    mwstat, mwpval = stats.mannwhitneyu(fl_by_time_clpXminus_cut[i],
                                        fl_by_time_clpXplus_cut[i],
                                        alternative='greater')
    # print('Mann-Whitney', mwstat, mwpval)
    # Mood's median test
    median_args = (fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i])
    mstat, mpval, _, _ = stats.median_test(*median_args)
    # print('Median test', mstat, mpval)
    # print('\n')
    # F test for variance
    var1 = np.var(fl_by_time_clpXplus_cut[i])
    df1 = len(fl_by_time_clpXplus_cut[i]) - 1
    var2 = np.var(fl_by_time_clpXminus_cut[i])
    df2 = len(fl_by_time_clpXminus_cut[i]) - 1
    if var1 > var2:
        F = var1 / var2
        fpval = stats.f.sf(F, df1, df2)
    else:
        F = var2 / var1
        fpval = stats.f.sf(F, df2, df1)
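SciPy offers no ready-made F-test for equality of variances, which is why the snippet above assembles one from the F distribution's survival function. A minimal sketch of the same idea as a reusable helper follows; the helper name is ours, and unlike the snippet it uses ddof=1 (sample variance) rather than NumPy's default ddof=0:

import numpy as np
from scipy import stats

def f_test_var(a, b):
    """One-sided F-test that the larger sample variance exceeds the smaller.

    Returns (F, p). The F-test for variances is quite sensitive to
    departures from normality, so interpret p with care.
    """
    var_a, var_b = np.var(a, ddof=1), np.var(b, ddof=1)
    df_a, df_b = len(a) - 1, len(b) - 1
    if var_a >= var_b:
        F = var_a / var_b
        p = stats.f.sf(F, df_a, df_b)   # upper-tail probability
    else:
        F = var_b / var_a
        p = stats.f.sf(F, df_b, df_a)
    return F, p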
sns.boxplot(x="weather", y="y", data=train, ax=ax[0][1])
sns.boxplot(x="remarks", y="y", data=train, ax=ax[1][0])
ax[1][0].set_xticklabels(ax[1][0].get_xticklabels(), rotation=30)
sns.boxplot(x="event", y="y", data=train, ax=ax[1][1])
plt.tight_layout()
plt.show()

train[train["remarks"] != "お楽しみメニュー"]["y"].plot(figsize=(15, 4), label="not Amuse")
train[train["remarks"] == "お楽しみメニュー"]["y"].plot(figsize=(15, 4), label="Amuse")
plt.legend()
plt.show()

# Flag the "fun menu" (お楽しみメニュー) days and compare the sales distributions.
train["fun"] = train["remarks"].apply(lambda x: 1 if x == "お楽しみメニュー" else 0)
sns.boxplot(x="fun", y="y", data=train)
plt.show()
stat, p, med, tbl = median_test(train[train["fun"] == 1]["y"], train[train["fun"] == 0]["y"])
print("p", p, "stat", stat)

train[train["remarks"] == "お楽しみメニュー"]

# Flag curry menu items (カレー) the same way.
train["curry"] = train["name"].apply(lambda x: 1 if x.find("カレー") >= 0 else 0)
sns.boxplot(x="curry", y="y", data=train)
plt.show()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Comma-separated column indices for sample one")
    parser.add_argument("--sample_two_cols", help="Comma-separated column indices for sample two")
    parser.add_argument("--sample_cols", help="Column indices for multiple samples; separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set, perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false, then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false, lower_limit will be ignored")
    parser.add_argument("--inclusive2", action="store_true", default=False, help="if false, higher_limit will be ignored")
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false, limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,  # was the string "False", which is always truthy
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value. If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    # NB: Python-2-era code; on Python 3 the bare map(...) calls below
    # would need to be wrapped in list(...) before being passed to scipy.
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])

        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for item in freq:
                elements = ",".join(map(str, item))
                cols.append(elements)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for item in critical:
                cols.append(item)
            cols.append(",")
            for item in sig:
                cols.append(item)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for item in z:
                cols.append(item)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(map(float, sample_one), cdf=args.cdf, N=args.N,
                                      alternative=args.alternative, mode=args.mode)
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(map(float, sample_one),
                                                      correction=args.correction, lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                vmin = stats.tmin(map(float, sample_one))
            else:
                vmin = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(vmin)
        elif test_id.strip() == "tmax":
            if nf == 0:
                vmax = stats.tmax(map(float, sample_one))
            else:
                vmax = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(vmax)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(map(float, sample_one), map(float, sample_two),
                                            interpolation_method=args.interpolation)
            else:
                s = stats.scoreatpercentile(map(float, sample_one), map(float, sample_two), (mf, nf),
                                            interpolation_method=args.interpolation)
            for item in s:
                cols.append(item)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for item in rel:
                cols.append(item)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(map(float, sample_one), map(float, sample_two),
                                                         statistic=args.statistic, bins=args.b)
            else:
                st, b_edge, b_n = stats.binned_statistic(map(float, sample_one), map(float, sample_two),
                                                         statistic=args.statistic, bins=args.b, range=(mf, nf))
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for item in o:
                cols.append(item)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for item in o:
                cols.append(item)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for item in t1:
                cols.append(item)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for item in h2:
                cols.append(item)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for item in t:
                cols.append(item)
            for item in prob:
                cols.append(item)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(map(float, sample_one),
                                                                          map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(map(float, sample_one), map(float, sample_two),
                                                     use_continuity=args.mwu_use_continuity)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for item in z:
                cols.append(item)
        elif test_id.strip() == "ttest_ind":
            t_stat, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two),
                                              equal_var=args.equal_var)
            cols.append(t_stat)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for item in a:
                cols.append(item)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(map(float, sample_one), map(float, sample_two),
                                          initial_lexsort=args.initial_lexsort)
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(map(float, sample_one), map(float, sample_two),
                                            zero_method=args.zero_method, correction=args.correction)
            else:
                T, p_value = stats.wilcoxon(map(float, sample_one),
                                            zero_method=args.zero_method, correction=args.correction)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                chisq, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                chisq, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(chisq)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(map(float, sample_one), map(float, sample_two),
                                                       ddof=args.ddof, lambda_=args.lambda_)
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med,
                                                      weights=map(float, sample_two))
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for item in ob:
                elements = ",".join(map(str, item))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(ties=args.ties, correction=args.correction,
                                                        lambda_=args.lambda_, *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for item in table:
                elements = ",".join(map(str, item))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
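To see the median_test branch of this tool in isolation, here is a minimal sketch with made-up samples; ties, correction, and lambda_ are the scipy.stats.median_test keyword arguments the tool forwards:

from scipy import stats

# Stand-ins for the parsed b_samples groups (made-up data).
g1 = [1.0, 2.0, 3.0, 4.0, 5.0]
g2 = [2.0, 4.0, 6.0, 8.0]

# ties selects how values equal to the grand median are tabulated
# ("below", "above", or "ignore"); correction toggles Yates' continuity
# correction for 2x2 tables; lambda_ picks the Cressie-Read
# power-divergence statistic (1.0 is the ordinary Pearson chi-squared).
stat, p_value, grand_median, table = stats.median_test(
    g1, g2, ties="below", correction=True, lambda_=1.0
)
print(stat, p_value, grand_median)
for row in table:
    print(",".join(map(str, row)))   # mirrors how the tool serializes the table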
            s1 = stats.ttest_ind(values, compare_values, equal_var=l)[0]
            s2 = stats.ttest_ind(values, compare_values, equal_var=l)[1]
        elif levene[agent - 2][action + 1] == "True":
            w = "U"
            s1 = stats.mannwhitneyu(values, compare_values, alternative='two-sided')[0]
            s2 = stats.mannwhitneyu(values, compare_values, alternative='two-sided')[1]
        else:
            w = "chi^2"
            try:
                s1 = stats.median_test(values, compare_values)[0]
                s2 = stats.median_test(values, compare_values)[1]
            except ValueError:  # e.g. all values fall on one side of the grand median
                s1 = 0
                s2 = 0
        s = [s1, s2]
        if s[1] < 0.0000000001:
            sec = "***"
        elif s[1] < 0.0000001:
            sec = "**"
        elif s[1] < 0.0001:
            sec = "*"
        else:
            sec = "%.4f" % np.around(s, decimals=4)[1]
        res = "%s (%.4f %s)" % (sec, np.around(s, decimals=4)[0], w)
        results[agent - 2][action + 1] = res
# mean-test results
meantest = []
# median-test results
mediantest = []

from scipy import stats as st

# Run the tests on each column ("年龄" = age, "储蓄" = savings).
cols = ["年龄", "储蓄"]
for col in cols:
    t, p = st.ttest_ind(datapass[col].dropna(), datafail[col].dropna())[0:2]
    meantest.append([col, t, p])
    t, p = st.median_test(datapass[col].dropna(), datafail[col].dropna())[0:2]
    mediantest.append([col, t, p])

# Display the results.
print(meantest)
print(mediantest)
# Mean test and median test in Python: a p-value below 0.05 is significant,
# indicating the two groups differ in mean or median.

# Export the results.
pd.DataFrame(meantest).to_csv("meantest.csv", encoding="gbk")
pd.DataFrame(mediantest).to_csv("mediantest.csv", encoding="gbk")