Example #1
    def twoway_anova(dv, iv1, iv2, dataset):
        res = ""

        formula = '{0} ~ {1} + {2} + {1}:{2}'.format(dv, iv1, iv2)
        regression = ols(formula, data=dataset).fit()

        anova_table = sm.stats.anova_lm(regression, typ=2)

        p_1 = anova_table['PR(>F)'][0]
        p_2 = anova_table['PR(>F)'][1]
        p_total = anova_table['PR(>F)'][2]
        res += str(anova_table)

        etaSquared_1 = anova_table['sum_sq'][0] / (anova_table['sum_sq'][0] +
                                                   anova_table['sum_sq'][3])
        etaSquared_2 = anova_table['sum_sq'][1] / (anova_table['sum_sq'][1] +
                                                   anova_table['sum_sq'][3])
        etaSquared_total = anova_table['sum_sq'][2] / (
            anova_table['sum_sq'][2] + anova_table['sum_sq'][3])
        res += "\nWe can explain {0:.3f}% of {1} difference due to difference in {2}\n".format(
            etaSquared_1 * 100, dv, iv1)
        res += "\nWe can explain {0:.3f}% of {1} difference due to difference in {2}\n".format(
            etaSquared_2 * 100, dv, iv2)
        res += "\nWe can explain {0:.3f}% of {1} difference due to difference in {2} and {3}\n\n".format(
            etaSquared_total * 100, dv, iv1, iv2)

        if p_1 >= 0.05:
            res += "\nThere is no significant difference in {0} between different categories of {1}".format(
                dv, iv1)
        else:
            res += "\nThere is significant difference in {0} between different categories of {1}".format(
                dv, iv1)

        if p_2 >= 0.05:
            res += "\nThere is no significant difference in {0} between different categories of {1}".format(
                dv, iv2)
        else:
            res += "\nThere is significant difference in {0} between different categories of {1}".format(
                dv, iv2)

        if p_total >= 0.05:
            res += "\nThere is no significant difference in {0} between different categories of {1} and {2}".format(
                dv, iv1, iv2)
        else:
            res += "\nThere is significant difference in {0} between different categories of {1} and {2}".format(
                dv, iv1, iv2)
            res += "\nAt least two groups differ. Which two?\n\n"
            res += "\nPost-Hoc Test:-----------------------\n"
            tukey_1 = pairwise_tukeyhsd(endog=dataset[dv],
                                        groups=dataset[iv1],
                                        alpha=0.05)
            tukey_2 = pairwise_tukeyhsd(endog=dataset[dv],
                                        groups=dataset[iv2],
                                        alpha=0.05)

            res += str(tukey_1.summary())
            res += "\n-------------------------------------\n"
            res += str(tukey_2.summary())
        return res
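A minimal usage sketch (not part of the original example): the method above relies on ols from statsmodels.formula.api, statsmodels.api as sm, and pairwise_tukeyhsd from statsmodels.stats.multicomp, and takes no self argument, so it can be called like a plain function. The DataFrame and its column names below are hypothetical.

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Hypothetical data: plant growth by fertilizer and watering level.
df = pd.DataFrame({
    'growth': [5.1, 5.7, 6.0, 6.8, 4.9, 5.5, 6.2, 7.1],
    'fertilizer': ['A', 'A', 'B', 'B', 'A', 'A', 'B', 'B'],
    'watering': ['low', 'high', 'low', 'high', 'low', 'high', 'low', 'high'],
})
print(twoway_anova('growth', 'fertilizer', 'watering', df))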
Example #2
def compare_sentiment_means(data, field, covar=None, standardization="min-max"):
    from statsmodels.stats import anova
    from statsmodels.formula.api import ols
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    from scipy.stats.mstats import zscore

    colnames = [name for name in data.columns if field+'_' in name and not '_err' in name]
    if standardization=="min-max":
        data_std = ((data[colnames] - data[colnames].min()) / (data[colnames].max() - data[colnames].min()))*2-1
    elif standardization == "z-score":
        data_std = data.copy()
        for col in colnames:
            data_std[col] = zscore(data[col])
    else:
        data_std = data
    if 'ID' in data.columns: # for normal data
        data_std['ID'] = data['ID']
    else: # for aggregated data, create placeholder IDs
        data_std['ID'] = [i for i in range(len(data_std))]
    data_std['%s_recessie' %field] = data['%s_recessie' %field]

    if covar: data_std[covar] = data[covar]

    if not covar:
        print(data_std.describe())
        melted = data_std.melt(id_vars=['ID'])
    else:
        print(data_std.groupby(covar).describe())
        melted = data_std.melt(id_vars=['ID',covar])

    melted = melted.dropna()

    if covar:
        formula = "value ~ variable*%s" %covar
    else:
        formula = "value ~ variable"
    aov   = anova.anova_single(ols(formula, melted).fit())
    print()
    print("## ANOVA results for differences in the mean of {field} sentiment".format(field=field))
    print()
    print(aov)

    if covar:
        melted['subgroups'] = ['%s_%s' %(col,covar) for col, covar in zip(melted['variable'],melted[covar])]
        posthoc = pairwise_tukeyhsd(melted['value'],melted['subgroups'])
    else:
        posthoc = pairwise_tukeyhsd(melted['value'], melted['variable'])

    print()
    print("## Tukey HSD post-hoc tests for mean differences in {field} sentiment".format(field=field))
    print()
    print(posthoc.summary())

    return data_std
Example #3
def pairwise_tukeyhsd5(list1, list2, list3, list4, list5, filename):
    list1 = np.array(list1).T[2]
    list2 = np.array(list2).T[2]
    list3 = np.array(list3).T[2]
    list4 = np.array(list4).T[2]
    list5 = np.array(list5).T[2]

    list_all = []
    for value in list1:
        list_all.append(['10', float(value)])
    for value in list2:
        list_all.append(['20', float(value)])
    for value in list3:
        list_all.append(['40', float(value)])
    for value in list4:
        list_all.append(['80', float(value)])
    for value in list5:
        list_all.append(['inf.', float(value)])

    ave1 = np.average(list1)
    ave2 = np.average(list2)
    ave3 = np.average(list3)
    ave4 = np.average(list4)
    ave5 = np.average(list5)
    std1 = np.std(list1)
    std2 = np.std(list2)
    std3 = np.std(list3)
    std4 = np.std(list4)
    std5 = np.std(list5)
    print "---------------------------------------------------------------"
    print "         data               average               std          "
    print "---------------------------------------------------------------"
    print "         list1           " + str(ave1) + "       " + str(
        std1) + "     "
    print "         list2           " + str(ave2) + "       " + str(
        std2) + "     "
    print "         list3           " + str(ave3) + "       " + str(
        std3) + "     "
    print "         list4           " + str(ave4) + "       " + str(
        std4) + "     "
    print "         list5           " + str(ave5) + "       " + str(
        std5) + "     "
    print "---------------------------------------------------------------\n"
    list_all = np.rec.array(list_all,
                            dtype=[('carv', '|S4'), ('value', float)])
    print pairwise_tukeyhsd(list_all['value'], list_all['carv'])

    with open(filename, 'w') as write_file:
        write_file.write("1 " + str(ave1) + " " + str(std1) + "\n")
        write_file.write("2 " + str(ave2) + " " + str(std2) + "\n")
        write_file.write("3 " + str(ave3) + " " + str(std3) + "\n")
        write_file.write("4 " + str(ave4) + " " + str(std4) + "\n")
        write_file.write("5 " + str(ave5) + " " + str(std5) + "\n")
Example #4
def doTukey(data: np.ndarray, multiComp: MultiComparison) -> None:    
    """Do a pairwise comparison, and show the confidence intervals

    Parameters
    ----------
    data : structured array, containing the input data
    multiComp : Result of the 'MultiComparison' test
    """
    
    # Show the results of the multicomparison test
    print((multiComp.tukeyhsd().summary()))
    
    # Calculate the p-values:
    res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
    df = pd.DataFrame(data)
    numData = len(df)
    numTreatments = len(df.Treatment.unique())
    dof = numData - numTreatments
    
    # Show the group names
    print((multiComp.groupsunique))
    
    # Generate a print -------------------
    
    # Get the data
    xvals = np.arange(3)
    res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
    errors = np.ravel(np.diff(res2.confint)/2)
    
    # Plot them
    plt.plot(xvals, res2.meandiffs, 'o')
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
    
    # Put on labels
    pair_labels = \
            multiComp.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    pairs = [':'.join(labels) for labels in pair_labels]
    plt.xticks(xvals, pairs)
    
    # Format the plot
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')          
    
    # Save to outfile, and show the data
    outFile = 'multComp.png'
    showData(outFile)
Example #5
def TukeyHelper(crime_data, crimes, feature, alpha=0.05):
    df = crime_data.loc[crime_data['TYPE'].isin(crimes)]
    # The distance data is all right skewed as most crimes are often somewhat
    # near city features, so we sqrt them to get normal looking data
    df['nearest_' + feature] = np.sqrt(df['nearest_' + feature])
    # Plot the histogram of the data to confirm that it is mostly normal
    groups = df.groupby('TYPE')
    for crime in crimes:
        plt.hist(groups.get_group(crime)['nearest_' + feature],
                 alpha=0.5,
                 label=crime)
    plt.legend(loc=1)
    plt.show()
    # Run Levene test on each crime distance distribution
    # https://stackoverflow.com/questions/26202930/pandas-how-to-apply-scipy-stats-test-on-a-groupby-object
    values_per_group = [
        np.sqrt(col) for col_name, col in groups['nearest_' + feature]
    ]
    print('P-value for Levene test:', levene(*values_per_group).pvalue)
    # Finally run the Tukey test, with the above data confirming / denying
    # the test's validity
    posthoc = pairwise_tukeyhsd(df['nearest_' + feature],
                                df['TYPE'],
                                alpha=alpha)
    print(posthoc)
Example #6
def tukey_hsd(df, val_col, group_col):
    # Reshape the data so it can be used by the test
    keys = df.groupby(group_col).groups.keys()
    val_data = []
    group_data = []
    for key in keys:
        d = df.loc[df.groupby(group_col).groups[key], val_col]
        val_data.append(d)
        group_data.append([key] * len(d))
    result = sm.pairwise_tukeyhsd(np.concatenate(val_data),
                                  np.concatenate(group_data))

    # Convert the result into a DataFrame
    groups = np.array(result.groupsunique, dtype=str)
    groups_len = len(groups)
    vs = pd.DataFrame(
        np.zeros(groups_len * groups_len).reshape(groups_len, groups_len))
    for a in result.summary()[1:]:
        a0 = str(a[0])
        a1 = str(a[1])
        a0i = np.where(groups == a0)[0][0]
        a1i = np.where(groups == a1)[0][0]
        vs[a0i].loc[a1i] = a[3].data
        vs[a1i].loc[a0i] = a[3].data
    vs.index, vs.columns = groups, groups
    return vs
Example #7
def multiple_test(data, X, Y, alpha=0.05):
    """
    :param data: input data as a pandas DataFrame
    :return: result as a list of rows; the first row is the header, the remaining rows are data
    """
    log.info('---------------------multiple test------------------')
    # list_total = data.iloc[:, -1]  # note: this needs attention
    # combination = Combination(list_levels)
    # for pair in combination:
    #     LSD(list_levels, list_total, pair[0], pair[1])
    alpha_range = (1 - alpha) * 100
    res = pairwise_tukeyhsd(data[Y[0]], data[X[0]], alpha=alpha)
    res_summary = res.summary().data
    for r in range(1, len(res_summary)):
        res_summary[r][-1] = str(res_summary[r][-1])
    col_map = {
        "group1": "Group 1",
        "group2": "Group 2",
        "meandiff": "Mean difference (2 - 1)",
        "p-adj": "Significance (adjusted p)",
        "lower": "{}% confidence interval lower bound".format(alpha_range),
        "upper": "{}% confidence interval upper bound".format(alpha_range),
        "reject": "Reject null hypothesis",
    }
    return {
        "col": [col_map[c] for c in res_summary[0]],
        "data": res_summary[1:],
        "title": "Multiple comparisons",
        "remarks": "Note: the multiple comparisons are based on Tukey HSD. For 'Reject null hypothesis', False means the null hypothesis is not rejected; True means it is rejected."
    }
Example #8
    def tukey_pairwise_test(self, feat, group):
        """
        Perform pairwise test on image roughness features

        Parameters:
        -----------
        feat: str
            A string that represents a colour feature
        group: str
            A string that specifies which rock type tier used in pairwise test;
            If group="Type", individual such as HEM will be used in test;
            If group="CombinedType", tier I rock type such as ORE will be used in test

        Returns:
        --------
        stat_tbl: pd.DataFrame
            A table that store the result of tukey pairwise test
        """
        if not isinstance(feat, str):
            raise TypeError("feat should be of string type")

        if feat not in COLOUR_FEATS:
            raise ValueError(
                "feat should be chosen from 'SkewnessBlue', 'KurtosisBlue', 'MeanPixelBlue', \
            'SkewnessGreen', 'KurtosisGreen', 'MeanPixelGreen', 'SkewnessRed', 'KurtosisRed', 'MeanPixelRed'"
            )
        tukey = pairwise_tukeyhsd(
            endog=self.rtbl[feat],  # Data
            groups=self.rtbl[group],  # Groups
            alpha=0.05)  # Significance level
        summary = tukey.summary()
        stat_tbl = pd.DataFrame(summary.data[1:], columns=summary.data[0])
        return stat_tbl
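A hedged illustration of the same call pattern outside the class: rtbl below is a hypothetical stand-in for the self.rtbl attribute used above, and the group labels other than HEM are made up.

import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Hypothetical stand-in for self.rtbl: one colour feature plus the rock-type column.
rtbl = pd.DataFrame({
    'MeanPixelBlue': [120, 130, 125, 90, 95, 88, 150, 160, 155],
    'Type': ['HEM', 'HEM', 'HEM', 'MAG', 'MAG', 'MAG', 'QTZ', 'QTZ', 'QTZ'],
})
tukey = pairwise_tukeyhsd(endog=rtbl['MeanPixelBlue'], groups=rtbl['Type'], alpha=0.05)
summary = tukey.summary()
stat_tbl = pd.DataFrame(summary.data[1:], columns=summary.data[0])
print(stat_tbl)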
Example #9
def oneway_anova(df, x, y, W, H, use_hsd=True, plot=True):
    # mean_compare_table
    mean_compare_table = df.groupby(x, as_index=False)[[y]].mean()
    print(mean_compare_table)
    if plot:
        # plot
        plt.figure(figsize=(W, H))
        sns.violinplot(x, y, data=df)
    # set group
    val_list = list(set(df[x]))
    groups = []
    for val in val_list:
        groups.append(df.loc[df[x] == val, y].tolist())
    # anova
    levene_test = levene(*groups)
    if levene_test.pvalue >= 0.05:
        print("equal variances")
        f_value, p_value = f_oneway(*groups)
    else:
        print("unequal variances")
        f_value, p_value = f_oneway(*groups)  # f_oneway is used in both cases in practice
        #h_value, p_value = kruskalwallis(*groups)
    # conclusion
    print(p_value)
    if use_hsd:
        hsd = pairwise_tukeyhsd(endog=df[y], groups=df[x], alpha=0.05)
        print(hsd.summary())
    return mean_compare_table
Example #10
    def TukeyHSD(self, listPops, confidence = 0.95):
        """
        Post-hoc Tukey HSD test.
        """

        if type(listPops) != type([]): listPops = [listPops]

        ## Gathering all data required.
        Paux = np.zeros((0,2))
        popid = 0
        for pop in listPops:
            p = np.array( self.getFluorescence(pop) )
            data1 = np.zeros( (p.shape[0]*p.shape[1],2) )
            data1[:,0] = np.reshape( p, (p.shape[0]*p.shape[1]) )
            data1[:,1] = popid
            popid += 1

            Paux = np.concatenate([Paux, data1], axis=0)

        ## Evaluating Tukey's HSD
        tukey_res = pairwise_tukeyhsd(Paux[:,0], Paux[:,1], alpha= 1 - confidence)

        ## Creating a dataframe with the results
        Groups = np.array( list( itertools.combinations(listPops, 2) ) )
        data = {'Group 1' : Groups[:,0], 'Group 2': Groups[:,1],
                'Mean diff': tukey_res.meandiffs, 'Reject H0?' : tukey_res.reject}
        df = pd.DataFrame(data=data)

        return df
Example #11
    def oneway_anova(dv, iv, dataset):
        res = ""

        formula = '{0} ~ {1}'.format(dv, iv)
        regression = ols(formula, data=dataset).fit()

        anova_table = sm.stats.anova_lm(regression, typ=2)

        p = anova_table['PR(>F)'][0]
        res += str(anova_table)

        etaSquared = anova_table['sum_sq'][0] / (anova_table['sum_sq'][0] +
                                                 anova_table['sum_sq'][1])
        res += "\nWe can explain {0:.3f}% of {1} difference due to difference in {2}\n\n".format(
            etaSquared * 100, dv, iv)

        if p >= 0.05:
            res += "\nThere is no significant difference in {0} between different categories of {1} ".format(
                dv, iv)
        else:
            res += "There is significant difference in {0} between different categories of {1}".format(
                dv, iv)
            res += "\nAt least two groups differ. Which two?\n\n"
            res += "\nPost-Hoc Test:-----------------------\n"
            tukey = pairwise_tukeyhsd(endog=dataset[dv],
                                      groups=dataset[iv],
                                      alpha=0.05)

            res += str(tukey.summary())

        return res
Example #12
    def get_dist_df(self, svddf):
        CONSTS = DistMetricCalculator.DataFrameCols.DistDF
        SIGMA = svddf[CONSTS.SIGMA][0]

        series = svddf.drop(
            [DistMetricCalculator.DataFrameCols.CHANNEL, CONSTS.SIGMA], axis=1)

        s = pairwise_tukeyhsd(
            series, svddf[DistMetricCalculator.DataFrameCols.CHANNEL])
        df = pd.DataFrame(
            s._results_table.data,
            columns=DistMetricCalculator.DataFrameCols.DistDF.ALL_COLS).iloc[
                1:, :]

        df[CONSTS.G1] = df[CONSTS.G1].astype("category")
        df[CONSTS.G2] = df[CONSTS.G2].astype("category")
        df[CONSTS.REJECT] = df[CONSTS.REJECT].astype(str).str.strip(
            " ") == "True"
        idx = df[CONSTS.REJECT] == False
        df.loc[idx, CONSTS.MEAN_DIFF] = 0
        distfinaldf = df.groupby([
            CONSTS.G1, CONSTS.G2
        ]).apply(lambda df: np.abs(df[CONSTS.MEAN_DIFF]).mean() * SIGMA)
        distfinaldf = distfinaldf.reset_index(name=CONSTS.DIST)
        return distfinaldf
Example #13
    def test_continuous_feat(self, cluster_dfs, categories):
        import statsmodels.api as sm
        from statsmodels.formula.api import ols
        from statsmodels.stats.multicomp import pairwise_tukeyhsd
        import pandas as pd
        for j in categories:
            for a, i in enumerate(cluster_dfs.keys()):
                if a == 0:
                    int_df = pd.DataFrame(cluster_dfs[i][j])
                    int_df.columns = [i]
                else:
                    temp = pd.DataFrame(cluster_dfs[i][j])
                    temp.columns = [i]
                    int_df = int_df.join(temp)

            int_df_unpiv = int_df.melt().dropna()
            int_df_unpiv.columns = ['cluster', 'value']
            mod = ols('value ~ cluster', data=int_df_unpiv).fit()
            aov_table = sm.stats.anova_lm(mod, typ=2)
            print('\n \n', j)
            print(aov_table, '\n')
            print(
                pairwise_tukeyhsd(int_df_unpiv['value'],
                                  int_df_unpiv['cluster']))
        self.stats_table = int_df
Example #14
 def test_tukeyRangeTest_pResult(self):
     x1, x2, x3 = [1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]
     results = tukey_range_test(x1, x2, x3)
     model = pairwise_tukeyhsd(x1 + x2 + x3, groups=[0] * 5 + [1] * 5 + [2] * 5)
     p_vals = psturng(np.abs(model.meandiffs / model.std_pairs), len(model.groupsunique), model.df_total)
     for i in range(3):
         assert pytest.approx(p_vals[i]) == results[i][2]
Example #15
def ANOVA_1Way(dataframe, cat_cols, distr_cols, savepath=None):
    corrs = pd.DataFrame()
    for distr_col in distr_cols:
        for cat_col in cat_cols:
            values = dataframe[cat_col].dropna().unique().tolist()
            distr = [
                np.array(
                    DCut(dataframe, [cat_col], ['eq'], [value])[distr_col])
                for value in values
            ]
            fstat, pval = f_oneway(*distr)
            if pval < 1e-6:
                corrs.at[cat_col, distr_col] = ', '.join(
                    ['F' + ': ' + '{:,.3f}'.format(fstat), r'p: <1e-6'])
            else:
                corrs.at[cat_col, distr_col] = ', '.join([
                    'F' + ': ' + '{:,.3f}'.format(fstat),
                    r'p: ' + '{:.2e}'.format(pval)
                ])
            posthoc = multi.pairwise_tukeyhsd(dataframe[distr_col],
                                              dataframe[cat_col],
                                              alpha=.05)
            posthoc = pd.DataFrame(posthoc._results_table.data[1:],
                                   columns=posthoc._results_table.data[0])
            if savepath != None:
                savename = os.path.dirname(
                    savepath
                ) + '/' + distr_col + '_' + cat_col + '_tukeyhsd.tex'
                posthoc.to_latex(savename.replace(' ', '_'),
                                 index=False,
                                 encoding='utf-8')
    if savepath != None:
        corrs.to_latex(savepath, encoding='utf-8')
    return corrs
Example #16
def posthocTukey(data, gene, sampleList):
    exp, group = setDataPosthoc(data, gene, sampleList)
    print('Summary post-hoc test Tukey of ' + gene)
    tukeyhsd = pairwise_tukeyhsd(endog=exp, groups=group, alpha=0.05)
    print(tukeyhsd)
    print()
    return tukeyhsd
Example #17
def tukey_hsd(ind, *args):
    data_arr = np.hstack( args ) 

    ind_arr = np.array([])
    for x in range(len(args)):
        ind_arr = np.append(ind_arr, np.repeat(ind[x], len(args[x]))) 
    print(pairwise_tukeyhsd(data_arr,ind_arr))
Example #18
    def stats_test_for_trend(self, mode, gain, param):
        lin10 = [
            self.lin10[param][j:j + 75]
            for j in range(0,
                           len(self.lin10[param]) - 75, 75)
        ]
        lin20 = [
            self.lin20[param][j:j + 75]
            for j in range(0,
                           len(self.lin20[param]) - 75, 75)
        ]

        sqrt10 = [
            self.sqrt10[param][j:j + 75]
            for j in range(0,
                           len(self.sqrt10[param]) - 75, 75)
        ]
        sqrt20 = [
            self.sqrt20[param][j:j + 75]
            for j in range(0,
                           len(self.sqrt20[param]) - 75, 75)
        ]

        quad10 = [
            self.quad10[param][j:j + 75]
            for j in range(0,
                           len(self.quad10[param]) - 75, 75)
        ]
        quad20 = [
            self.quad20[param][j:j + 75]
            for j in range(0,
                           len(self.quad20[param]) - 75, 75)
        ]
        # print(lin10)
        # print(st.shapiro((lin10, sqrt10, quad10)))
        # print(st.shapiro((lin20, sqrt20, quad20)))

        datalist = {
            "lin10": lin10,
            "lin20": lin20,
            "sqrt10": sqrt10,
            "sqrt20": sqrt20,
            "quad10": quad10,
            "quad20": quad20
        }

        tgt = datalist[mode + str(gain)]

        print("shapiro-wilk result, p-value: ", st.shapiro(tgt)[1])
        N = len(tgt)
        data = np.hstack(tgt)
        group = [[i] * 75 for i in range(9)]
        group = np.hstack(group)

        res = st.kruskal(*tgt)
        print("kruskal result, p-value: ", res)

        res = pairwise_tukeyhsd(data, group)
        print(res)
Example #19
def add_stats(ax, times, names, ylim):
    to_plot= []

    #Find y coordinate to plot at: 
    if ylim is None:
        y_ = np.max(np.max(np.array(times)))
        dy = .05*(y_ - np.min(np.min(np.array(times))))

    else:
        y_ = ylim[1]- (.025*(ylim[1]-ylim[0]))
        dy = .05*(ylim[1]-ylim[0])

    #Test all 3 distributions for normality: 
    abnorm_cnt = 0
    for tt, tm_arr in enumerate(times):
        print names[tt], len(tm_arr)
        zscore_tm_arr = ( np.array(tm_arr) - np.mean(np.array(tm_arr)) )/np.std(np.array(tm_arr))
        x, p = scipy.stats.kstest(zscore_tm_arr,'norm')
        if p < 0.05:
            #Not normal (two-sided test)
            abnorm_cnt += 1
            #Print 
            print 'non-normal: ', names[tt]

    #If all are normal, use ANOVA + Tukey's HSD
    if abnorm_cnt == 0:
        #All normal distributions:
        F, p = scipy.stats.f_oneway(*times)
        if p < 0.05:
            #Passed ANOVA, continue with Tukey's LSD test
            print 'passed anova!'
            for i, tm_i in enumerate(times):
                for j, tm_j in enumerate(times[i+1:]):
                    X = np.hstack(( np.array(tm_i), np.array(tm_j) ))
                    Y = np.hstack(( np.zeros((len(tm_i))), np.ones((len(tm_j))) ))
                    res2 = pairwise_tukeyhsd(X, Y)

                    if res2.reject:
                        to_plot.append([i, j+i+1, res2.reject, 'norm'])
                        print 'sig! ', [i, j+i+1, res2.reject, 'norm'], names[i], names[j+i+1]

    #If any is not normal, use KW + MW:
    else:
        #Use KW test: 
        H, p = scipy.stats.mstats.kruskalwallis(*times)
        if p < 0.05:
            #Passed group test, continue w/ Mann Whitney test:
            for i, tm_i in enumerate(times):
                for j, tm_j in enumerate(times[i+1:]):
                    u, p_onesided = scipy.stats.mannwhitneyu(tm_i, tm_j)
                    if (p_onesided*2.) < 0.05:
                        to_plot.append([i, j+i+1, p_onesided*2., 'nonnorm'])
                        print 'sig! ', [i, j+i+1, p_onesided*2., 'nonnorm']

    if len(to_plot) > 0:
        for plt_line in to_plot:
            # ax, x1, x2, y, dy, pvalue
            ax = plot_sig_line(ax, plt_line[0], plt_line[1], y_, dy, plt_line[2], plt_line[3])
    return ax, dy
Example #20
def getOWANOVAmultiComp(data, labels, verbose=False):
    tlabels = np.concatenate([[labels[j] for _, y in enumerate(x)]
                              for j, x in enumerate(data)])
    res = pairwise_tukeyhsd(np.concatenate(data), tlabels)
    if verbose:
        print(res.summary())
    return psturng(np.abs(res.meandiffs / res.std_pairs),
                   len(res.groupsunique), res.df_total)
Example #21
def sent_ttest(filepath1, filelist):
    sent = []

    file1 = open(filepath1, 'r')
    lines = csv.reader(file1)

    for l in lines:
        sentence = l[0] + l[1]
        blob = TextBlob(sentence)
        sent.append(('news', blob.sentiment.polarity))

    df = pd.DataFrame(sent, columns=['group', 'polarity'])
    include = ['object', 'float', 'int']
    #print(filepath1)
    #print(df.describe(include = include))

    sent2 = []
    sent3 = []
    sent4 = []
    sentarray = [sent2, sent3, sent4]
    group = ['academia', 'companies', 'defense']
    counter = 0

    for item in filelist:
        readfile = open(item, 'r')
        rows = csv.reader(readfile)

        for row in rows:
            sentence = row[0]
            blob = TextBlob(sentence)
            sentarray[counter].append(
                (group[counter], blob.sentiment.polarity))

        counter += 1

    df2 = pd.DataFrame(sent2, columns=['group', 'polarity'])
    #print(filelist[0])
    #print(df2.describe(include = include))

    df3 = pd.DataFrame(sent3, columns=['group', 'polarity'])
    #print(filelist[1])
    #print(df3.describe(include = include))

    df4 = pd.DataFrame(sent4, columns=['group', 'polarity'])
    #print(filelist[2])
    #print(df4.describe(include = include))

    # run anova test for difference of group of means
    print(scipy.stats.f_oneway(df['polarity'], df2['polarity'], df3['polarity'], df4['polarity']))

    frames = [df, df2, df3, df4]
    stacked = pd.concat(frames)

    MultiComp = pairwise_tukeyhsd(endog=stacked['polarity'],
                                  groups=stacked['group'],
                                  alpha=0.001)

    print(MultiComp)
Example #22
def main():
    data = pd.read_csv('data.csv')

    reshape_data = pd.melt(data)
    posthoc = pairwise_tukeyhsd(reshape_data['value'],
                                reshape_data['variable'])
    print(posthoc)
    posthoc.plot_simultaneous()
    plt.show()
Example #23
def createGroups(ds,fieldName):
    output = pairwise_tukeyhsd(ds[targetField],ds[fieldName])
    #output.plot_simultaneous()[0]
    d = output.summary()
    d = pd.DataFrame(d.data[1:],columns=d.data[0])
    o = getSimillarGroups(d)
    ds[fieldName]=ds[fieldName].map(o)
    ds=dummifyField(ds,fieldName)
    return ds
Example #24
def OnewayAnova(datas):
    df = pd.DataFrame(datas)
    cdir = os.path.dirname(os.path.realpath(__file__))

    job1 = df[df['job'] == '화이트칼라']['game_time']  # white collar
    job2 = df[df['job'] == '블루칼라']['game_time']   # blue collar
    job3 = df[df['job'] == '학생']['game_time']       # student
    job4 = df[df['job'] == '기타']['game_time']       # other

    # Check normality (Shapiro-Wilk)
    try:
        j1sp = stats.shapiro(job1)[1]
        j2sp = stats.shapiro(job2)[1]
        j3sp = stats.shapiro(job3)[1]
        j4sp = stats.shapiro(job4)[1]
    except Exception as e:
        print('error ', e)
        return 0

    # Visualization
    plt.rc('font', family='malgun gothic')  # prevent broken Hangul glyphs
    plt.rcParams['axes.unicode_minus'] = False  # prevent broken minus signs
    explode = (0, 0.1, 0, 0)
    fig2 = plt.figure()
    plt.title('Proportion of people by job', fontsize=14)
    plt.pie(df.groupby('job').size(),
            labels=df['job'].unique(),
            colors=["pink", "coral", "lightblue", "yellowgreen"],
            autopct='%0.1f%%',
            explode=explode)
    fig2 = plt.gcf()
    fig2.savefig('{}\\static\\images\\ftest.png'.format(cdir))
    plt.close(fig2)

    # Check homogeneity of variance (Levene)
    lev = stats.levene(job1, job2, job3, job4).pvalue
    print(lev)

    if j1sp > 0.05 and j2sp > 0.05 and j3sp > 0.05 and j4sp > 0.05:
        print('normality satisfied')
        if lev > 0.05:
            _, pv = stats.f_oneway(job1, job2, job3, job4)
            print('equal variances satisfied ', pv)

            turk = pairwise_tukeyhsd(df.game_time, df.job)
            print('post-hoc test\n', turk)
        else:
            _, pv = stats.f_oneway(job1, job2, job3, job4)
            pv = stats.levene(job1, job2, job3, job4, center='trimmed').pvalue
            print('equal variances not satisfied')

    else:
        _, pv = stats.kruskal(job1, job2, job3, job4)
        print('normality not satisfied  ', pv)

    return pv
Example #25
 def PH(self, data):
     df = data.drop(columns=["Div", "HomeTeam", "AwayTeam", "RESULT"])
     for i in df.columns:
         posthoc = pairwise_tukeyhsd(data[i],
                                     data["RESULT"],
                                     alpha=0.05)
         plt.figure(figsize=(10, 10))
         posthoc.plot_simultaneous()
         plt.title("{}".format(i))
         plt.show()
Example #26
def statistics(scores):
    melted_scores = pd.melt(scores)
    posthoc = pairwise_tukeyhsd(melted_scores['value'],
                                melted_scores['variable'],
                                alpha=0.05)

    # Save post hoc results in a text file
    file = open('posthoc.txt', 'w')
    file.write(posthoc.summary().as_text())
    file.close()
Example #27
def tukey_test(num, cat):
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    tukey = pairwise_tukeyhsd(
        endog=num,  # Numerical data
        groups=cat,  # Categorical data
        alpha=0.05)  # Significance level

    summary = tukey.summary()  # See test summary
    print("Tukey's Test Result between " + num.name, ' & ' + cat.name, ':')
    display(summary)
Example #28
def tukeyTest(data, groups, alpha=0.05):
    '''Perform pairwise Tukey test for data by groups
    '''
    # pairwise comparisons using Tukey's test, calculating p-values
    res = pairwise_tukeyhsd(data, groups, alpha)
    print('Summary of test:\n', res)
    # print(dir(results))# prints out all attributes of an object
    pVal = psturng(np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total)
    print('p values of all pair-wise tests:\n', pVal)

    return res
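A minimal usage sketch, assuming the imports the function relies on (numpy, pairwise_tukeyhsd, and psturng from statsmodels.stats.libqsturng); the data and group labels are synthetic.

import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.libqsturng import psturng

rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(0.0, 1.0, 20),
                       rng.normal(0.5, 1.0, 20),
                       rng.normal(2.0, 1.0, 20)])
groups = np.repeat(['a', 'b', 'c'], 20)
res = tukeyTest(data, groups, alpha=0.05)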
Example #29
def ANOVA(df, target_var_name, group_var_name):
    cmd = target_var_name + "~" + group_var_name
    work_df = pd.DataFrame()
    work_df[group_var_name] = df[group_var_name]
    work_df[target_var_name] = df[target_var_name].astype(float)
    work_df = work_df.dropna(axis=0)
    mod = ols(cmd, data=work_df).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    print(aov_table)
    res2 = pairwise_tukeyhsd(work_df[target_var_name], work_df[group_var_name])
    print(res2)
Example #30
def pairwise_tukey(reduced_dataframe, groups, p_value_threshold=0.05):
    """Perform Tukey Test (with multiple testing) on a dataframe with one column per group."""
    reduced_dataframe = reduced_dataframe[groups]
    reduced_df_unpivot = reduced_dataframe\
        .stack()\
        .reset_index()\
        .rename(columns={"level_1": 'Pop', 0: 'Prob'})
    tukey = pairwise_tukeyhsd(endog=reduced_df_unpivot["Prob"],   # Data
                              groups=reduced_df_unpivot["Pop"],   # Groups
                              alpha=p_value_threshold)            # Significance level
    return tukey
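A hedged usage sketch matching the docstring above: a hypothetical wide DataFrame with one probability column per group.

import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd

wide = pd.DataFrame({
    'pop_A': [0.10, 0.15, 0.12, 0.20],
    'pop_B': [0.30, 0.25, 0.35, 0.28],
    'pop_C': [0.55, 0.60, 0.52, 0.58],
})
tukey = pairwise_tukey(wide, ['pop_A', 'pop_B', 'pop_C'])
print(tukey.summary())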
Example #31
def ANOVA(df):
    target = df.columns[0]
    group_var_name = df.columns[1]
    cmd = target + "~" + group_var_name
    work_df = pd.DataFrame()
    work_df[group_var_name] = df[group_var_name]
    work_df[target] = df[target].astype(float)
    work_df = work_df.dropna(axis=0)
    mod = ols(cmd, data=work_df).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    print(aov_table)
    res2 = pairwise_tukeyhsd(work_df[target], work_df[group_var_name])
    print(res2)
Example #32
 def stats_test(self, gain, param):
     if gain == 10:
         N = len(self.lin10[param])
         data = np.hstack(
             (self.lin10[param], self.sqrt10[param], self.quad10[param]))
     if gain == 20:
         N = len(self.lin20[param])
         data = np.hstack(
             (self.lin20[param], self.sqrt20[param], self.quad20[param]))
     group = ["linear"] * N + ["sqrt"] * N + ["quad"] * N
     res = pairwise_tukeyhsd(data, group)
     print(res)
Example #33
def doTukey(data, multiComp):
    '''Do a pairwise comparison, and show the confidence intervals'''
    
    print((multiComp.tukeyhsd().summary()))
    
    # Calculate the p-values:
    res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
    df = pd.DataFrame(data)
    numData = len(df)
    numTreatments = len(df.Treatment.unique())
    dof = numData - numTreatments
    
    # Show the group names
    print((multiComp.groupsunique))
    
    # Generate a print -------------------
    
    # Get the data
    xvals = np.arange(3)
    res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])
    errors = np.ravel(np.diff(res2.confint)/2)
    
    # Plot them
    plt.plot(xvals, res2.meandiffs, 'o')
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
    
    # Put on labels
    pair_labels = multiComp.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    
    # Format the plot
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')          
    
    # Save to outfile, and show the data
    outFile = 'multComp.png'
    C2_8_mystyle.printout_plain(outFile)
Example #34
def multicomp(data, algos):
    groups = []
    lindata = []
    for i,a in enumerate(algos):
        groups.extend([i]*len(data[a]))
        lindata.extend(data[a])

    groups = np.array(groups)
    lindata = np.array(lindata)
    res = pairwise_tukeyhsd(lindata, groups, 0.05)

    # print(res)
    return res
Example #35
def get_pairwise_comparison_data(df, independent_variables_names, dependent_variables_names, significance_cutoff=0.05, NUM_GROUPS_CUTOFF=15):
    '''
        datasetId
        independentVariables - list names, must be categorical
        dependentVariables - list names, must be numerical
        numBins - number of bins for the independent quantitative variables (if they exist)
    '''
    considered_independent_variable_name = independent_variables_names[0]
    considered_dependent_variable_name = dependent_variables_names[0]

    # Only return pairwise comparison data if number of groups < THRESHOLD
    num_groups = len(get_unique(df[considered_independent_variable_name]))
    if num_groups > NUM_GROUPS_CUTOFF:
        return None

    hsd_result = pairwise_tukeyhsd(df[considered_dependent_variable_name], df[considered_independent_variable_name], alpha=significance_cutoff)
    hsd_raw_data = hsd_result.summary().data[1:]
    st_range = np.abs(hsd_result.meandiffs) / hsd_result.std_pairs
    p_values = psturng(st_range, len(hsd_result.groupsunique), hsd_result.df_total)

    hsd_headers = [
        'Group 1',
        'Group 2',
        'Group Mean Difference (2 - 1)',
        'Lower Bound',
        'Upper Bound',
        'p-value',
        'Distinct (p < %s)' % significance_cutoff
    ]
    hsd_data = []
    for i in range(0, len(hsd_raw_data)):
        if isinstance(p_values, float):
            p_value = p_values
        else:
            p_value = p_values[i] if i < len(p_values) else None
        hsd_data_row = [
            hsd_raw_data[i][0],
            hsd_raw_data[i][1],
            hsd_result.meandiffs[i],
            hsd_result.confint[i][0],
            hsd_result.confint[i][1],
            p_value,
            ( 'True' if (p_value <= significance_cutoff) else 'False' )
        ]
        hsd_data.append(hsd_data_row)

    return {
        'column_headers': hsd_headers,
        'rows': hsd_data
    }
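A hypothetical usage sketch. The helper get_unique is not defined in the snippet above, so a stand-in that simply wraps pandas' unique() is assumed here; the imports are the ones the function relies on, and the data are synthetic.

import numpy as np
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.libqsturng import psturng

def get_unique(series):
    # Assumed stand-in for the helper that is not shown in the snippet above.
    return series.unique()

rng = np.random.default_rng(1)
df = pd.DataFrame({
    'treatment': ['a'] * 10 + ['b'] * 10 + ['c'] * 10,
    'response': np.concatenate([rng.normal(m, 1.0, 10) for m in (0.0, 1.0, 3.0)]),
})
result = get_pairwise_comparison_data(df, ['treatment'], ['response'])
print(result['column_headers'])
for row in result['rows']:
    print(row)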
Example #36
def test_ANOVA_and_T_HSD(array_of_arrays):
    ANOVA_f, ANOVA_p = scipy.stats.f_oneway(*array_of_arrays)

    if ANOVA_p < 0.05:
        x = []
        label = []
        for grp_num in range(len(array_of_arrays)):
            x.extend(array_of_arrays[grp_num])
            label.extend([grp_num]*len(array_of_arrays[grp_num]))

        tukey_HSD_result = pairwise_tukeyhsd(np.array(x), np.array(label))
        return ANOVA_p, tukey_HSD_result
    else:
        return ANOVA_p, False
Example #37
F ~ F distribution with k - 1 and n - k degrees of freedom
'''
stat, pval = stats.f_oneway(sample1, sample2, sample3, ...)

'''
If we reject the null hypothesis, do all the pairwise comparisons to 
determine which means are different.
One option: apply a Bonferroni correction. If the overall alpha we want is 0.05,
use alpha* = alpha / (k choose 2) as the significance level for
each pairwise test.
'''
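# A minimal sketch (not from the original snippet) of the Bonferroni approach
# described above: run every pairwise two-sample t-test and compare each
# p-value against alpha* = alpha / (k choose 2). The sample lists are hypothetical.
from itertools import combinations
from scipy import stats

def bonferroni_pairwise(samples, alpha=0.05):
    n_pairs = len(samples) * (len(samples) - 1) // 2
    alpha_star = alpha / n_pairs  # per-comparison significance level
    results = []
    for (i, a), (j, b) in combinations(enumerate(samples), 2):
        t, p = stats.ttest_ind(a, b)
        results.append((i, j, p, p < alpha_star))  # (group i, group j, p-value, significant?)
    return results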


# another option: use statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd
print pairwise_tukeyhsd(Data, Group)

'''
Tukey: Create confidence intervals for all differences
of means, see if the confidence interval contains
0 or not.
It assumes independence of the observations being tested, 
as well as equal variation across observations (homoscedasticity).
Tukey's test is essentially a Student's t-test,
except that it corrects for the family-wise error rate.

This is a multiple-comparison procedure. The output looks like:

Multiple Comparison of Means - Tukey HSD,FWER=0.05
================================================
group1 group2 meandiff   lower    upper   reject
Example #38
# First, do an one-way ANOVA
df = pd.DataFrame(dta2)
model = ols('StressReduction ~ C(Treatment)',df).fit()

anovaResults =  anova_lm(model)
print anovaResults
if anovaResults['PR(>F)'][0] < 0.05:
    print('One of the groups is different.')

#Then, do the multiple testing
mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
print mod.tukeyhsd()[0]

# The following code produces the same printout
res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
#print res2[0]

# Show the group names
print mod.groupsunique

# Generate a print
import matplotlib.pyplot as plt
plt.plot([0,1,2], res2[1][2], 'o')
plt.errorbar([0,1,2], res2[1][2], yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
plt.xlim(*xlim)
pair_labels = mod.groupsunique[np.column_stack(res2[1][0])]
plt.xticks([0,1,2], pair_labels)
plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
Example #39
 def test_shortcut_function(self):
     #check wrapper function
     res = pairwise_tukeyhsd(self.endog, self.groups, alpha=self.alpha)
     assert_almost_equal(res[1][4], self.res[1][4], decimal=14)
Example #40
def anova(data_dict, language='en'):

    means_per_strat = {}
    join_vals = []
    join_group = []

    for key, v in data_dict.items():
        means_per_strat[key] = []
        for key2, v2 in v.items():
            mean = sum(v2)/float(len(v2))
            #print key, key2, mean
            means_per_strat[key].append(mean)
            join_vals.append(mean)
            join_group.append(key)
            #means_per_strat[key] = sum(v2)/float(len(v2))

    sorted_ci = sorted(means_per_strat.items(), key=operator.itemgetter(1))
    s_keys, s_values = zip(*sorted_ci)

    print 's_keys:', s_keys
    print 's_values:', s_values    

    res = scipy.stats.f_oneway(*s_values)
    statistic = res[0]
    pvalue = res[1]

    alpha = 0.05

    print 'ANOVA Analysis: F value:', statistic, 'P value:', pvalue, 'a:', alpha

    join_vals = np.asarray(join_vals)
    join_group = np.asarray(join_group)

    words = lang.get_vocabulary(language)

    for i in xrange(len(join_group)):
        v = join_group[i]

        join_group[i] = words[v]
        '''if v == 'E-greedy':
            join_group[i] = r'$\alpha$-greedy'
        elif v == 'E-Nash':
            join_group[i] = r'$\epsilon$-Nash'
        elif v == 'Reply-score':
            join_group[i] = 'Reply-last'
        elif v == 'Xelnaga':
            join_group[i] = 'Single choice'
        '''

    #mc = MultiComparison(np.asarray(s_values), np.asarray(s_keys))
    #result = mc.tukeyhsd()

    #print ' endog (len=%d): %s' % (len(join_vals), join_vals)
    #print 'groups (len=%d): %s' % (len(join_group), join_group)

    tukey = pairwise_tukeyhsd(endog=join_vals, groups=join_group, alpha=alpha)

    tukey.plot_simultaneous()    # Plot group confidence intervals
    # #plt.vlines(x=49.57,ymin=-0.5,ymax=4.5, color="red")

    print tukey.summary()              # See test summary
Example #41
print 'AUC: ' + str(auc)


from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats.mstats import kruskalwallis, friedmanchisquare

Array = ['NL','BPH','HGPIN','G3','G4','G5']
multiarea = magi_area['NL'].append(magi_area['BPH'])
multiarea = multiarea.append(magi_area['HGPIN'])
multiarea = multiarea.append(magi_area['G3'])
multiarea = multiarea.append(magi_area['G4'])
multiarea = multiarea.append(magi_area['G5'])
multiarea = multiarea.dropna()
multilesion = list()
a = 0
while a < 6:
    column = Array[a]
    coldata = magi_area[column]
    coldata = coldata.dropna()
    for deet in coldata:
        multilesion.append(a)
    a = a + 1

print pairwise_tukeyhsd(multiarea, multilesion) 
print kruskalwallis(magi_area['NL'].dropna(), magi_area['BPH'].dropna(), magi_area['HGPIN'].dropna(),
    magi_area['G3'].dropna(), magi_area['G4'].dropna(), magi_area['G5'].dropna())

print kruskalwallis(magi_stain['NL'].dropna(), magi_stain['BPH'].dropna(), magi_stain['HGPIN'].dropna(),
    magi_stain['G3'].dropna(), magi_stain['G4'].dropna(), magi_stain['G5'].dropna())

Example #42
            
    except KeyboardInterrupt:
        pool.close()
        print('stopping all simulations...')
    finally:
        pool.terminate()
        pool.join()

    c = dict(Counter(completed).most_common())
    for idx, sim in enumerate([s.__name__ for s in sim_list]):
        print('Test: {0}, Iterations {1}, Heuristic: {2}'.format(idx, c[sim], sim))
        
    for pair in pairs:
        print('')
        print(pair)
        f, p = f_oneway(*[[i[1] for i in container[pair] if i[0] == sim] for sim in [s.__name__ for s in sim_list]])
        print('F-stat: {0} at sig {1}: {2}'.format(str(round(f, 3)).ljust(7), 
                                                   str(round(p, 3)).ljust(7),
                                                   ['NULL','REJECT'][p <= .05]))
    
        if p <= .05:
            dta2 = numpy.rec.array(container[pair],  
                                dtype=[('test', '|S100'),
                                       ('wins', int)])
                                       
            print(pairwise_tukeyhsd(dta2['wins'], dta2['test']))

    

    
Example #43
def main():
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    
    # Set up the data, as a structured array.
    # The first and last fields are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
    (  1,   'mental',  2 ),
    (  2,   'mental',  2 ),
    (  3,   'mental',  3 ),
    (  4,   'mental',  4 ),
    (  5,   'mental',  4 ),
    (  6,   'mental',  5 ),
    (  7,   'mental',  3 ),
    (  8,   'mental',  4 ),
    (  9,   'mental',  4 ),
    ( 10,   'mental',  4 ),
    ( 11, 'physical',  4 ),
    ( 12, 'physical',  4 ),
    ( 13, 'physical',  3 ),
    ( 14, 'physical',  5 ),
    ( 15, 'physical',  4 ),
    ( 16, 'physical',  1 ),
    ( 17, 'physical',  1 ),
    ( 18, 'physical',  2 ),
    ( 19, 'physical',  3 ),
    ( 20, 'physical',  3 ),
    ( 21,  'medical',  1 ),
    ( 22,  'medical',  2 ),
    ( 23,  'medical',  2 ),
    ( 24,  'medical',  2 ),
    ( 25,  'medical',  3 ),
    ( 26,  'medical',  2 ),
    ( 27,  'medical',  3 ),
    ( 28,  'medical',  1 ),
    ( 29,  'medical',  3 ),
    ( 30,  'medical',  1 )], dtype=[('idx', '<i4'),
                                    ('Treatment', '|S8'),
                                    ('StressReduction', '<i4')])
    
    # First, do an one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)',df).fit()
    
    anovaResults =  anova_lm(model)
    print(anovaResults)
    if anovaResults['PR(>F)'][0] < 0.05:
        print('One of the groups is different.')
    
    #Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print((mod.tukeyhsd().summary()))
    
    # The following code produces the same printout
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    #print res2[0]
    
    # Show the group names
    print((mod.groupsunique))
    
    # Generate a print
    import matplotlib.pyplot as plt
    xvals = np.arange(3)
    plt.plot(xvals, res2.meandiffs, 'o')
    #plt.errorbar(xvals, res2.meandiffs, yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
    errors = np.ravel(np.diff(res2.confint)/2)
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')          
    
    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig('MultComp.png', dpi=200)
    print(('Figure written to {0}'.format(outFile)))
    
    plt.show()
    
    # Instead of the Tukey's test, we can do pairwise t-test
    # First, with the "Holm" correction
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print((rtp[0]))
    
    # and then with the Bonferroni correction
    print((mod.allpairtest(stats.ttest_rel, method='b')[0]))
    
    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to
    # use a few tricks (see http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    studentized_mean = res2.meandiffs
    studentized_variance = res2.variance
    
    t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided
    
    # Now with the Bonferroni correction
    from statsmodels.stats.multitest import multipletests
    res_b = multipletests(my_pvalues, method='b')
    
    return res2.variance
Example #44
        anova = pairwise_tukeyhsd(endog=df['diffs'], groups=df['qualification_performance'], alpha=0.05)

        print anova.summary()



        f = open("../data/anovaResults.txt", "w")
        f.write(anova.__str__())

################################################################################

pre_scores  = "data/pre_test_responses"
post_scores = "data/post_test_responses"
learners    = "data/class_data/class.json"

import numpy as np
import os
import pandas as pd
import scipy.stats as st
import scipy as sc
import json
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from copy import deepcopy

pre_json = []
file_list = os.listdir(pre_scores)
for f in file_list:
    of = open(os.path.join(pre_scores, f))
Example #45
#(a4, p4) = readData(p + 'sda/48-36-24,3-2-2,003-001,15-15,10/', 'sl3_')
#(a5, p5) = readData(p + 'sda-nolog/48-36-24,3-2-2,003-001,15-15,10/', 'snl3')
#(a6, p6) = readData(p + 'mlp/48-36-24,001,15,10/', 'mlp3')
(a7, p7) = readData(p + 'sda/36,3-2-2,003-001,15-15,10/', 'sl1_')
(a8, p8) = readData(p + 'sda-nolog/36,3-2-2,003-001,15-15,10/', 'snl1')
(a9, p9) = readData(p + 'mlp/36,001,15,10/', 'mlp1')
#(a1, p1) = readData(p + '36,001,15,10/', 'lay1')
#(a2, p2) = readData(p + '48-36-24,001,15,10/', 'lay3')
#(a3, p3) = readData(p + '48-36,001,15,10/', 'lay2')
a = np.vstack((p1, p2, p3, p7, p8, p9))

print 'Friedman chi-square test'
print scipy.stats.friedmanchisquare(a1, a2, a3, a7, a8, a9)

b = np.rec.array(a, dtype=[('val', '<f'), ('feature', '|S4')])
res2 = pairwise_tukeyhsd(b['val'], b['feature'])
mod = MultiComparison(b['val'], b['feature'])

print res2

import matplotlib.pyplot as plt
plt.plot([0,1,2, 3, 4, 5], res2[1][2], 'o')
plt.errorbar([0,1,2, 3, 4, 5], res2[1][2], yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
plt.xlim(*xlim)
pair_labels = mod.groupsunique[np.column_stack(res2[1][0])]
plt.xticks([0,1,2,3,4,5], pair_labels)
plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
          '\n Pairwise Mean Differences')
plt.show()