def sign(df):
    des, res = researchpy.ttest(df['diff'], df['poll'])
    p = res.loc[res['Independent t-test'] == 'Two side test p value = '].iat[0, 1]
    if p < 0.05:
        print('The p value obtained from the t-test is significant (p < 0.05), and therefore, we conclude that there is a difference between the two variables.')
    else:
        print('The p value obtained from the t-test is not significant, and therefore, we conclude that there is no difference between the two variables.')
    print(des)
    print(res)
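A minimal usage sketch for the helper above, with hypothetical toy data (assumes numpy, pandas, and researchpy are installed):

import numpy as np
import pandas as pd
import researchpy

# toy frame with the two columns sign() expects; values are illustrative only
rng = np.random.default_rng(0)
toy = pd.DataFrame({'diff': rng.normal(0.0, 1.0, 50),
                    'poll': rng.normal(0.5, 1.0, 50)})
sign(toy)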
def tTest(data, checking, group, group1, group2, nameGroup1, nameGroup2, x,
          output):
    output[x]['Variable'] = checking
    leveneResult = stats.levene(data[checking][data[group] == group1],
                                data[checking][data[group] == group2],
                                center='mean')

    summary, results = rp.ttest(group1=data[checking][data[group] == group1],
                                group1_name=nameGroup1,
                                group2=data[checking][data[group] == group2],
                                group2_name=nameGroup2)

    output[x][nameGroup1 + ' N'] = round(summary.iloc[0]['N'], 2)
    output[x][nameGroup2 + ' N'] = round(summary.iloc[1]['N'], 2)
    output[x][nameGroup1 + ' Mean'] = round(summary.iloc[0]['Mean'], 2)
    output[x][nameGroup2 + ' Mean'] = round(summary.iloc[1]['Mean'], 2)
    output[x][nameGroup1 + ' SD'] = round(summary.iloc[0]['SD'], 2)
    output[x][nameGroup2 + ' SD'] = round(summary.iloc[1]['SD'], 2)
    output[x][nameGroup1 + ' SE'] = round(summary.iloc[0]['SE'], 2)
    output[x][nameGroup2 + ' SE'] = round(summary.iloc[1]['SE'], 2)

    if leveneResult.pvalue < 0.05:
        output[x]['Levene Value'] = str(round(leveneResult.pvalue, 2)) + "****"
    else:
        output[x]['Levene Value'] = str(round(leveneResult.pvalue, 2))

    values = results.results
    output[x]["T-Test P Value"] = signifiant(float(values.loc[3]))
    output[x]["Cohen Effect Size"] = effectSize(float(values.loc[6]))
Example #3
def test_for_side_difference_one_var(df_side,
                                     lvl,
                                     var,
                                     hue='value',
                                     plot_dist=True):
    """performs a ttest on each condition for every variables lvl0 : data = players mean of all valid moves"""
    import seaborn as sns
    import researchpy as rp
    import pandas as pd
    import matplotlib.pyplot as plt

    df = get_data_group_by_player_mean(df_side, lvl, var, hue=hue)

    describe, results = rp.ttest(df['right'], df['left'], paired=True)

    if plot_dist:
        sns.distplot(df['right'],
                     norm_hist=True,
                     color='b',
                     label='right - {}'.format(var))
        sns.distplot(df['left'],
                     norm_hist=True,
                     color='g',
                     label='left - {}'.format(var))
        plt.legend(loc='best')

    return df, describe, results
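sns.distplot is deprecated in recent seaborn releases; a hedged equivalent of the two plotting calls above, assuming seaborn >= 0.11, would use histplot:

        # rough equivalent of the deprecated distplot calls above
        sns.histplot(df['right'], stat='density', kde=True, color='b',
                     label='right - {}'.format(var))
        sns.histplot(df['left'], stat='density', kde=True, color='g',
                     label='left - {}'.format(var))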
Example #4
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t12a")
    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)

    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)


    prop_values_ref = metric(df_resp_ref["min_offer"], df_prop_ref["offer_dss"])
    prop_value_ref = metrics.get_mean(prop_values_ref)

    # auto_dss_values = metric(df_resp["min_offer"], df_prop["ai_offer"])
    # auto_dss_value = metrics.get_mean(auto_dss_values)

    dof = 0
    diff = None
    if is_categorical:
        table, res = rp.crosstab(pd.Series(prop_values_ref), pd.Series(prop_dss_values), test='chi-square')
        s, p, r = res.results.values
        
        test_label = f"(pearson chi2)"
        
        test_label = f"chi2"
        print("Conclusion: ", generate_cat_stat_sentence(np.mean(resp_dss_values), np.std(resp_dss_values), np.mean(auto_dss_values), np.std(auto_dss_values), s, p, dof, diff=diff, label1=treatment+".dss", label2="t20.dss"))
    else:
        table, res = rp.ttest(pd.Series(prop_values_ref), pd.Series(prop_dss_values), paired=False)
        s = res.results[2]
        if alternative=="greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        r = res.results[9]
        
        diff = res.results[0] 
        dof = res.results[1]
        print("Conclusion: ", generate_stat_sentence(np.mean(resp_dss_values), np.std(resp_dss_values), np.mean(auto_dss_values), np.std(auto_dss_values), s, p, dof, diff=diff, label1=treatment+".dss", label2="t20.dss"))

        
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("RESUME: ", res)
    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            "T10": f'{100 * prop_value_ref:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T10": f'{prop_value_ref:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
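The positional lookups above (res.results[3], res.results[9], and so on) depend on the row order of researchpy's results frame. A hedged alternative, assuming the row labels sit in the frame's first column as in the earlier 'Two side test p value = ' lookup, is to match on the label text:

def rp_result(res, label):
    """Hypothetical helper: fetch a value from a researchpy results frame by its row label."""
    labels = res.iloc[:, 0].astype(str).str.strip()
    return res.loc[labels.str.startswith(label)].iloc[0, 1]

# e.g. p = rp_result(res, "Two side test p value")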
Example #5
def get_rel_responder_min_offer(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None):
    if SELECTION != "resp":
        return
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop[df_resp.columns] = df_resp

    _, df_resp_ref = get_prop_resp("t12a")
    resp_values = df_resp["min_offer_final"]
    resp_ref_values = df_resp["min_offer"]

    table, res = rp.ttest(pd.Series(resp_values),
                          pd.Series(resp_ref_values),
                          paired=True)
    diff = res.results[0]
    dof = res.results[1]
    s = res.results[2]
    p = res.results[3]
    r = res.results[9]

    print(res)

    print(
        "Conclusion: ",
        generate_stat_sentence(np.mean(resp_ref_values),
                               np.std(resp_ref_values),
                               np.mean(resp_values),
                               np.std(resp_values),
                               s,
                               p,
                               dof,
                               diff=diff,
                               label1=treatment,
                               label2=treatment + ".dss"))
    resp_stat = stats.ttest_rel(df_resp["min_offer"],
                                df_resp["min_offer_final"])

    resp_stat_t00 = stats.ttest_ind(df_resp["min_offer_final"],
                                    df_resp_ref["min_offer"])

    resp_wc_stat = stats.wilcoxon(df_resp["min_offer"],
                                  df_resp["min_offer_final"])
    res = {
        "mean T12": metrics.get_mean(df_resp["min_offer"]),
        "mean T13": metrics.get_mean(df_resp["min_offer_final"]),

        # "rejection_ratio": rejection_ratio(df_prop)
    }
    test_label = f"(ttest independent) H0: equal"
    res = {
        k: (f"{v:.3f}" if pd.notnull(v) and v != int(v) else v)
        for k, v in res.items()
    }
    res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
Example #6
def ttest(dataframe):
    male = dataframe[dataframe['Sex'] == 'Male'] 
    male.reset_index(inplace= True)
    male.rename(columns={'Value': 'ValueM'}, inplace=True)
    female = dataframe[dataframe['Sex'] == 'Female'] 
    female.reset_index(inplace= True)
    female.rename(columns={'Value': 'ValueF'}, inplace=True)
    descriptives, results = researchpy.ttest(male['ValueM'], female['ValueF'])
    print(descriptives)
    print(results)
Example #7
    def analyze_manipulation(self):
        """ performs all manipulation check analyses across batches (section Ra)
        1. prints manip_acutal mean, std
        2. prints manip_chancen mean, std
        3. prints plot with mean manip_chance bar + standard error, line for manip_actual
        4. prints paired t-test results for manip_actual and manip_chance
        returns nothing """
        # error checking
        if self.summary is None:
            print("You must run .summarize() before running this function")

        actual = self.summary['manip_actual']
        chance = self.summary['manip_chance']

        # 1. print manip_actual mean, std
        print("\n>>> manip_actual mean, standard deviation:")
        print(
            f"n: {actual.count()}, mean: {actual.mean()}, std: {actual.std()}")

        # 2. print manip_chance mean, std
        print("\n>>> manip_chance mean, standard deviation:")
        print(
            f"n: {chance.count()}, mean: {chance.mean()}, std: {chance.std()}")

        # 3. create barplot
        print("\n>>> barplot:")
        plt.figure(figsize=[7.13, 2])
        bar = plt.barh(np.arange(1),
                       actual.mean(),
                       align='center',
                       edgecolor="black",
                       height=0.5)
        plt.errorbar(actual.mean(),
                     np.arange(1),
                     xerr=actual.std(),
                     ecolor="black")
        plt.title('Manipulation Check Accuracy')
        plt.yticks(np.arange(1), '')
        plt.ylabel('Actual Accuracy')
        plt.xlabel('Accuracy')
        plt.xlim(left=0, right=1.0)
        plt.axvline(x=chance.mean(),
                    linewidth=4,
                    color="red",
                    label="Chance Accuracy")  #threshold line
        plt.legend()
        plt.tight_layout()
        plt.savefig("manip.pdf", bbox_inches="tight")
        plt.show()

        # 4. paired t-test
        print("\n>>> paired t-test between manip_acutal and manip_chance:")
        # print(stats.ttest_rel(actual, chance))
        tt1 = researchpy.ttest(actual, chance, paired=True)[1]
        print(tt1)
Example #8
def raw_pairttest_generate_output_df(mod_raw_data_df):
    effect_size_df_row_lookup = {
        "Cohen's d": 6,
        "Hedge's g": 7,
        "Glass's delta": 8
    }
    dictionaries_list = []

    for pair in global_vars.raw_pairttest_var_pairs:
        series_time1 = mod_raw_data_df[pair[0]][(
            mod_raw_data_df[pair[0]].notnull()
        ) & (
            mod_raw_data_df[pair[1]].notnull()
        )]  # this also seems to be done within the researchpy ttest func, but it's fine
        series_time2 = mod_raw_data_df[
            pair[1]][(mod_raw_data_df[pair[0]].notnull())
                     & (mod_raw_data_df[pair[1]].notnull())]

        result = rp.ttest(series_time1,
                          series_time2,
                          group1_name=pair[0],
                          group2_name=pair[1],
                          equal_variances=True,
                          paired=True)

        ttest_stats_df = result[1]

        current_dict = {}
        current_dict["Variable"] = "{var1} - {var2}".format(var1=pair[0],
                                                            var2=pair[1])
        current_dict["Time1_N"] = series_time1.count()
        current_dict["Time1_Mean"] = np.mean(series_time1)
        current_dict["Time1_SD"] = np.std(series_time1)
        current_dict["Time2_N"] = series_time2.count()
        current_dict["Time2_Mean"] = np.mean(series_time2)
        current_dict["Time2_SD"] = np.std(series_time2)
        current_dict["Degrees of Freedom"] = ttest_stats_df.iloc[1, 1]
        current_dict["t"] = ttest_stats_df.iloc[2, 1]
        current_dict[
            global_vars.
            effect_size_choice] = np.nan if global_vars.effect_size_choice == "None" else ttest_stats_df.iloc[
                effect_size_df_row_lookup[global_vars.effect_size_choice], 1]
        current_dict["pvalues"] = ttest_stats_df.iloc[3, 1]

        dictionaries_list.append(current_dict)

    output_df = pd.DataFrame(dictionaries_list)

    return output_df
Example #9
def get_rel_responder_abs_df(treatment, con, dfs=None, use_percentage=None, use_labels=None):
    if SELECTION != "resp":
        return
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop[df_resp.columns] = df_resp

    df_prop_full, df_resp_ref = get_prop_resp("t12a")
    resp_values = metrics.get_data(metrics.get_rel_responder_abs_df(df_prop))
    resp_ref_values = metrics.get_data(metrics.get_rel_responder_abs_df(df_prop_full))

    table, res = rp.ttest(pd.Series(resp_values), pd.Series(resp_ref_values), paired=False)
    diff = res.results[0]
    dof = res.results[1]
    s = res.results[2]
    p = res.results[3]
    r = res.results[9]

    print("Conclusion: ", generate_stat_sentence(np.mean(resp_ref_values), np.std(resp_ref_values), np.mean(resp_values), np.std(resp_values), s, p, dof, diff=diff, label1="t12.dss",  label2=treatment+".dss"))


    print("Table:", table)        
    print("Res:", res)

    res = {
        "rel. min_offer T12": metrics.get_mean(resp_ref_values),
        "rel. min_offer T13": metrics.get_mean(resp_values),

        # "rejection_ratio": rejection_ratio(df_prop)
        }
    test_label = f"(ttest independent) H0: equal"
    res = {k: (f"{v:.3f}" if pd.notnull(v) and v!= int(v) else v) for k,v in res.items()}
    res["min_offer" + test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"


    print()

    return res
Example #10
    def exp_lc_test(self):

        lcs = self.data[(self.data['type'] == 'lc')]
        lcs.reset_index(inplace=True)
        exps = self.data[(self.data['type'] == 'exp')]
        exps.reset_index(inplace=True)

        p_values = {}
        for param in lcs.columns[4:]:
            #print(param)
            descriptives, results = rp.ttest(lcs[param].dropna(how='all'),
                                             exps[param].dropna(how='all'))
            #print(descriptives)
            #print(results)
            p_values[param] = [
                results.iloc[3, 1], descriptives.loc[0, 'Mean'],
                descriptives.loc[1, 'Mean']
            ]

        statistics = pd.DataFrame(p_values, index=['p-value', 'LC', 'EXP'])
        statistics = statistics.reindex(columns=self.column_order)
        print(statistics)
        statistics.to_csv('statistics.csv')
Example #11
# make predictions for test dataset
y_pred = rf.predict(X_test)
# calculate accuracy score
print(accuracy_score(y_test, y_pred) * 100)

# create LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   feature_names=features,
                                                   class_names=class_names,
                                                   discretize_continuous=True)
# i is the record we explain
i = 0
exp = explainer.explain_instance(X_test.values[i],
                                 rf.predict_proba,
                                 num_features=8)
exp.show_in_notebook(show_table=True, show_all=True)

# separate positive and negative outcomes for independent t-test
positive_outcome = data[data['Outcome'] == 1]
negative_outcome = data[data['Outcome'] == 0]
# rename 'Glucose' field to clarify test results
positive_outcome = positive_outcome.rename({'Glucose': 'Positive_Glucose'},
                                           axis=1)
negative_outcome = negative_outcome.rename({'Glucose': 'Negative_Glucose'},
                                           axis=1)
# run t-test to determine difference of means in glucose for positive/negative outcomes
descriptive_stats, test_results = rp.ttest(
    negative_outcome['Negative_Glucose'], positive_outcome['Positive_Glucose'])
print(descriptive_stats)
print(test_results)
Example #12
    # fisher
    #     resultDf = resultDf[resultDf['participantsType'] != "machine"]
    #     crosstab, res = researchpy.crosstab(resultDf['participantsType'], resultDf['firstIntentionConsistFinalGoal'], test='fisher')
    #     print(crosstab)
    #     print(res)

    #
    df['trialType'] = [
        'Critical Disruption' if trial == "special" else 'Random Disruptions'
        for trial in df['noiseNumber']
    ]

    statDF = pd.DataFrame()
    statDF['commitmentRatio'] = df.groupby(
        ['name', 'trialType', 'participantsType'],
        sort=False)["firstIntentionConsistFinalGoal"].mean()
    statDF['commitmentRatio'] = statDF.apply(
        lambda x: int(x["commitmentRatio"] * 100), axis=1)

    statDF = statDF.reset_index()

    # t-test
    humanDF = statDF[(statDF.participantsType == "Humans")
                     & (statDF.trialType == 'Critical Disruption')]
    rLDF = statDF[(statDF.participantsType == "RL")
                  & (statDF.trialType == 'Critical Disruption')]
    des, res = researchpy.ttest(humanDF['commitmentRatio'],
                                rLDF['commitmentRatio'])
    print(des)
    print(res)
Example #13
def _get_prop_vs_prop_dss_score(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None,
                                metric=None,
                                as_percentage=None,
                                is_categorical=None,
                                alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t10, df_resp_t10 = get_prop_resp("t10a")

    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)

    metric_t10_values = metric(df_prop_t10)
    metric_value_t10 = metrics.get_mean(metric_t10_values)

    metric_values = metrics.get_data(metric_values)
    metric_t10_values = metrics.get_data(metric_t10_values)
    #print(stats.chisquare(metric_values[:103], metric_t10_values[:103]))

    dof = 0
    diff = None

    print(metric.__name__)
    if is_categorical:
        #table, res = rp.crosstab(pd.Series(metric_values), pd.Series(metric_t10_values), test='g-test')
        table, res = rp.crosstab(pd.Series(metric_values),
                                 pd.Series(metric_t10_values),
                                 test='fisher')
        #print(table, res)
        #s, p, r = res.results
        s = res.results[0]
        p = res.results[1]
        r = res.results[4]

        test_label = f"(g-test chi2)"

        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(metric_t10_values),
                                       np.std(metric_t10_values),
                                       np.mean(metric_values),
                                       np.std(metric_values),
                                       s,
                                       p,
                                       dof,
                                       diff=diff,
                                       label1="t10a.dss",
                                       label2=treatment + ".dss"))
        print(
            pd.crosstab(pd.Series(metric_t10_values),
                        pd.Series(metric_values)))
    else:

        table, res = rp.ttest(pd.Series(metric_t10_values),
                              pd.Series(metric_values),
                              paired=False)
        s = res.results[2]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        r = res.results[9]

        diff = res.results[0]
        dof = res.results[1]

        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(metric_t10_values),
                                   np.std(metric_t10_values),
                                   np.mean(metric_values),
                                   np.std(metric_values),
                                   s,
                                   p,
                                   dof,
                                   diff=diff,
                                   label1="t10a.dss",
                                   label2=treatment + ".dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("TABLE: ", table)
    print("TEST: ", res)

    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * metric_value:.2f} %',
            "T10": f'{100 * metric_value_t10:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{metric_value:.2f}',
            "T10": f'{metric_value_t10:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
Example #14
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)

    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)

    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    auto_dss_values = metric(df_resp["min_offer"], df_prop["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)

    dof = 0
    diff = None

    print(metric)
    if is_categorical:
        table = pd.crosstab(prop_values, prop_dss_values)
        # print("TABLE: ", table)
        # checked using: http://vassarstats.net/propcorr.html
        # s, p = sms2.mcnemar(prop_values, prop_dss_values, exact=False, correction=False)
        table, res = rp.crosstab(prop_values, prop_dss_values, test='mcnemar')
        #chi, p, s = (res.results.values)
        s, p, r = (res.results.values)
        
        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values), np.std(prop_values), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1=treatment, label2=treatment+".dss"))
        test_label = f"(mcnemar - chi2)"
    else:
        s, p =  stats.wilcoxon(prop_values, prop_dss_values, alternative=alternative or 'two-sided')


        table, res = rp.ttest(pd.Series(prop_values), pd.Series(prop_dss_values), paired=True)
        #res = rp.ttest(pd.Series(prop_values), pd.Series(prop_dss_values), paired=True)
        diff = res.results[0] 
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]

        test_label = f"(ttest dependent)"

        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values), np.std(prop_values), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1=treatment, label2=treatment+".dss"))

    
    print("TABLE:", table)
    print("RES:",  res)
    if as_percentage:
        res = {
            "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            # "prop:dss - prop": f'{100 * (prop_dss_value - prop_value):.2f} %',
        }
    else:
        res = {
            "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            # "prop:dss - prop": f'{(prop_dss_value - prop_value):.2f} %',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
Example #15
#collect data of control group
control_group_original = post_df[post_df['QUESTNNR'] == 'group1']
control_group = control_group_original.dropna(axis=1, how='all')
control_group = control_group.drop(['CASE','QUESTNNR','STARTED'], axis=1).reset_index(drop=True)
control_group.columns = share.CONTROL_COLUMN_NAME
share.export_to_csv(control_group, 'control_group.csv')

#collect data of experiment group
experiment_group_original = post_df[post_df['QUESTNNR'] == 'qnr2']
experiment_group = experiment_group_original.dropna(axis=1,how='all')
experiment_group = experiment_group.drop(['CASE', 'QUESTNNR','STARTED'], axis=1).reset_index(drop=True)
experiment_group.columns = share.EXPERIMENT_COLUMN_NAME
share.export_to_csv(experiment_group, 'experiment_group.csv')

#statistical analysis of scripts data by using t-test
script1_des, script1_res = researchpy.ttest(control_group['script1'].astype('int32'),experiment_group['script1'].astype('int32'),group1_name='control_script1',group2_name='exp_script1',equal_variances=False)
script2_des, script2_res = researchpy.ttest(control_group['script2'].astype('int32'),experiment_group['script2'].astype('int32'),group1_name='control_script2',group2_name='exp_script2',equal_variances=False)
script3_des, script3_res = researchpy.ttest(control_group['script3'].astype('int32'),experiment_group['script3'].astype('int32'),group1_name='control_script3',group2_name='exp_script3',equal_variances=False)
control_script = control_group.loc[:,['script1','script2','script3']].astype('int32').mean(axis=1)
experiment_script = experiment_group.loc[:,['script1','script2','script3']].astype('int32').mean(axis=1)
script_des, script_res = researchpy.ttest(control_script,experiment_script, group1_name ='control', group2_name='exp',equal_variances=False)

#%% export analysis results
share.export_to_csv(script1_des, 'script1_des.csv')
share.export_to_csv(script1_res, 'script1_res.csv')
share.export_to_csv(script2_des, 'script2_des.csv')
share.export_to_csv(script2_res, 'script2_res.csv')
share.export_to_csv(script3_des, 'script3_des.csv')
share.export_to_csv(script3_res, 'script3_res.csv')
share.export_to_csv(script_des, 'script_des.csv')
share.export_to_csv(script_res,'script_res.csv')
Example #16
    def analyze_viability(self, *diff_args: [int, int, str]):
        """ performs all raw viability score analyses across batches (section Rb split)
        1. prints refPair1[0] mean, std
        2. prints refPair1[1] mean, std
        3. prints refPair2[0] mean, std
        4. prints refPair2[1] mean, std
        5. prints bar plot of v2=R/B, v2=D/W means + std error
        6. prints box plot of v2=R/B, v2=D/W 
        7. prints paired t-test results for initial scores R/B vs D/W
        8. prints paired t-test results for later scores R/B vs D/W  
        returns nothing """
        # error checking
        if self.summary is None:
            print("You must run .summarize() before running this function")

        r1 = self.summary["initial_" + self.viability_labels[0]]
        r2 = self.summary["later_" + self.viability_labels[0]]
        d1 = self.summary["initial_" + self.viability_labels[1]]
        d2 = self.summary["later_" + self.viability_labels[1]]

        # 1. print r1 mean, std
        print(
            f"\n>>> initial_{self.viability_labels[0]} mean, standard deviation:"
        )
        print(f"n: {r1.count()}, mean: {r1.mean()}, std: {r1.std()}")

        # 2. print r2 mean, std
        print(
            f"\n>>> later_{self.viability_labels[0]} mean, standard deviation:"
        )
        print(f"n: {r2.count()}, mean: {r2.mean()}, std: {r2.std()}")

        # 3. print d1 mean, std
        print(
            f"\n>>> initial_{self.viability_labels[1]} mean, standard deviation:"
        )
        print(f"n: {d1.count()}, mean: {d1.mean()}, std: {d1.std()}")

        # 4. print d2 mean, std
        print(
            f"\n>>> later_{self.viability_labels[1]} mean, standard deviation:"
        )
        print(f"n: {d2.count()}, mean: {d2.mean()}, std: {d2.std()}")

        title = "Viability Scores of Rounds"
        labels=[textwrap.fill(text, 12) for text in \
            ["Initial " + self.viability_labels[0],"Later " + self.viability_labels[0], \
            "Initial " + self.viability_labels[1], "Later " + self.viability_labels[1]]]
        ylabel = "Team Mean Viability"

        order = [r1, r2, d1, d2]
        means = [x.mean() for x in order]
        stds = [x.std() for x in order]
        maxs = [x.max() for x in order]

        # 5. create barplot
        print("\n>>> barplot:")
        plt.figure(1)
        bar = plt.bar(np.arange(4), means, yerr=stds, align='center')
        plt.title(title)
        plt.xticks(np.arange(4), labels)
        plt.ylabel(ylabel)
        plt.savefig("raw_bar.pdf")
        plt.show()

        # 6. create boxplot
        print("\n>>> boxplot:")
        plt.figure(2)
        box = plt.boxplot(order, positions=np.arange(4))
        plt.title(title)
        plt.xticks(np.arange(4), labels)
        plt.ylabel(ylabel)
        plt.ylim(top=70, bottom=14)

        # label diffs
        for args in diff_args:
            label_diff(*args, maxs)

        plt.savefig("raw_box.pdf")
        plt.show()

        # 7. paired t-test initial
        print(
            f"\n>>> paired t-test between initial_{self.viability_labels[0]} and initial_{self.viability_labels[1]}:"
        )
        tt1 = researchpy.ttest(r1, d1, paired=True)[1]
        print(tt1)

        # 8. paired t-test later
        print(
            f"\n>>> paired t-test between later_{self.viability_labels[0]} and later_{self.viability_labels[1]}:"
        )
        tt1 = researchpy.ttest(r2, d2, paired=True)[1]
        print(tt1)
Example #17
#The first distribution is far from normal but the second one is normal


#t_test
def t_test(x, y):
    if stats.ttest_ind(x, y)[1] < 0.05:
        return print('Greece is flattening the curve!')


t_test(df2['Total Confirm of new cases'], df1['Total Confirm of new cases'])

#Based on the statistical test, we observe that there is a difference between the two curves, and the second one, which
#denotes the period after the measures, is flatter

descriptives, results = rp.ttest(df2['Total Confirm of new cases'],
                                 df1['Total Confirm of new cases'])

descriptives

# In[21]:

#fitting the initial dataset
plt.figure(figsize=(11, 7))

plt.plot(xdata, ydata, 'ko', label='data')  #the original datapoints

#model1
popt, pcov = curve_fit(f=func, xdata=xdata, ydata=ydata, p0=None, sigma=None)

print(popt)  # parameters
print(pcov)  # covariance
Example #18
import pandas
from scipy import stats
import researchpy

# %%
control_df = pandas.read_csv(share.CHAT_MSG_CONTROL)
experiment_df = pandas.read_csv(share.CHAT_MSG_EXP).drop(columns='marker')

# %%
T2_control_df = control_df.loc[control_df['chat_id'] == 'T2']['num_msg']
T2_experiment_df = experiment_df.loc[experiment_df['chat_id'] ==
                                     'T2']['num_msg']
T2_df = pandas.DataFrame({'con_T2': T2_control_df, 'exp_T2': T2_experiment_df})

# %%
T2_des, T2_res = researchpy.ttest(T2_df['exp_T2'], T2_df['con_T2'])

#%%
T3_control_df = control_df.loc[control_df['chat_id'] == 'T3']['num_msg']
T3_experiment_df = experiment_df.loc[experiment_df['chat_id'] ==
                                     'T3']['num_msg']
T3_df = pandas.DataFrame({'con_T3': T3_control_df, 'exp_T3': T3_experiment_df})

#%%
T3_des, T3_res = researchpy.ttest(T3_df['con_T3'], T3_df['exp_T3'])

# %%
control_group = control_df.groupby('user_name')
control_group = control_group.apply(
    lambda df: df.iloc[[1, 2]]['num_msg'].sum()).reset_index(drop=True)
# %%
Example #19
def raw_indttest_generate_output_df(mod_raw_data_df):
    group1_df = mod_raw_data_df[mod_raw_data_df[
        global_vars.raw_indttest_groupvar] ==
                                global_vars.raw_indttest_grouplevel1]
    group2_df = mod_raw_data_df[mod_raw_data_df[
        global_vars.raw_indttest_groupvar] ==
                                global_vars.raw_indttest_grouplevel2]

    effect_size_df_row_lookup = {
        "Cohen's d": 6,
        "Hedge's g": 7,
        "Glass's delta": 8
    }
    dictionaries_list = []
    for var in global_vars.raw_indttest_dv:
        result = rp.ttest(
            group1_df[var],
            group2_df[var],
            group1_name=global_vars.raw_indttest_grouplevel1,
            group2_name=global_vars.raw_indttest_grouplevel2,
            equal_variances=stats.levene(
                group1_df[var].dropna().reset_index(drop=True),
                group2_df[var].dropna().reset_index(drop=True))[1] > 0.05,
            paired=False)
        ttest_stats_df1 = result[0]
        ttest_stats_df2 = result[1]

        current_dict = {}
        current_dict["Variable"] = var
        current_dict["All_N"] = int(ttest_stats_df1[
            ttest_stats_df1["Variable"] == "combined"].iloc[0]["N"])
        current_dict["All_Mean"] = ttest_stats_df1[
            ttest_stats_df1["Variable"] == "combined"].iloc[0]["Mean"]
        current_dict["All_SD"] = ttest_stats_df1[ttest_stats_df1["Variable"] ==
                                                 "combined"].iloc[0]["SD"]
        current_dict[global_vars.raw_indttest_grouplevel1 + "_N"] = int(
            ttest_stats_df1[ttest_stats_df1["Variable"] ==
                            global_vars.raw_indttest_grouplevel1].iloc[0]["N"])
        current_dict[global_vars.raw_indttest_grouplevel1 +
                     "_Mean"] = ttest_stats_df1[
                         ttest_stats_df1["Variable"] ==
                         global_vars.raw_indttest_grouplevel1].iloc[0]["Mean"]
        current_dict[global_vars.raw_indttest_grouplevel1 +
                     "_SD"] = ttest_stats_df1[
                         ttest_stats_df1["Variable"] ==
                         global_vars.raw_indttest_grouplevel1].iloc[0]["SD"]
        current_dict[global_vars.raw_indttest_grouplevel2 + "_N"] = int(
            ttest_stats_df1[ttest_stats_df1["Variable"] ==
                            global_vars.raw_indttest_grouplevel2].iloc[0]["N"])
        current_dict[global_vars.raw_indttest_grouplevel2 +
                     "_Mean"] = ttest_stats_df1[
                         ttest_stats_df1["Variable"] ==
                         global_vars.raw_indttest_grouplevel2].iloc[0]["Mean"]
        current_dict[global_vars.raw_indttest_grouplevel2 +
                     "_SD"] = ttest_stats_df1[
                         ttest_stats_df1["Variable"] ==
                         global_vars.raw_indttest_grouplevel2].iloc[0]["SD"]
        current_dict["Degrees_of_Freedom"] = ttest_stats_df2.iloc[1, 1]
        current_dict["t"] = ttest_stats_df2.iloc[2, 1]
        current_dict[
            global_vars.
            effect_size_choice] = np.nan if global_vars.effect_size_choice == "None" else ttest_stats_df2.iloc[
                effect_size_df_row_lookup[global_vars.effect_size_choice], 1]
        current_dict["pvalues"] = ttest_stats_df2.iloc[3, 1]

        dictionaries_list.append(current_dict)

    output_df = pd.DataFrame(dictionaries_list)

    return output_df
Example #20
	def t_test_for_two_EX_with_unknown_but_equal_variance(self, p_group1, p_group2):
		# n = p_group1.count() * p_group2.count()
		# std_ = ( (p_group1.std()**2 * (p_group1.count()-1) + p_group2.std()**2 * (p_group2.count()-1)) / ( n - 2) )**0.5
		# test_statitik = ((p_group1.mean() - p_group2.mean()) / std_) * (p_group1.count() * p_group2.count() / n )**0.5 
		# t_quantil = scipy.stats.t.ppf(1 - alpha/2, n - 2)
		return rp.ttest(group1 = p_group1, group1_name = p_group1.name, group2 = p_group2, group2_name = p_group2.name)
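The commented-out lines above sketch the pooled-variance statistic by hand; as a cross-check, the same equal-variance test can be run through scipy (a minimal sketch, assuming p_group1 and p_group2 are numeric pandas Series and scipy is available):

import scipy.stats

def t_test_cross_check(p_group1, p_group2):
    # hypothetical cross-check: pooled (equal-variance) two-sample t-test, mirroring the commented formula above
    return scipy.stats.ttest_ind(p_group1.dropna(), p_group2.dropna(), equal_var=True)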
Example #21
    def analyze_viability_early(self, *diff_args: [int, int, str]):
        """ performs analyze_viability but with early viability score (average of initial)
        really just a study 1 figure generator
        1. prints refPair1[0] mean, std
        2. prints refPair1[1] mean, std
        3. prints median mean, std
        4. prints bar plot
        5. prints box plot 
        6. prints paired t-test results for refPair1[0] and refPair1[1] 
        7. prints paired t-test results for median and refPair1[1]
        returns order = [r1, r2, m] for external processing """
        # error checking
        if self.summary is None:
            print("You must run .summarize() before running this function")

        r1 = self.summary["initial_" + self.viability_labels[0]]
        r2 = self.summary["later_" + self.viability_labels[0]]
        m = self.summary["median"]

        # 1. print r1 mean, std
        print(
            f"\n>>> initial_{self.viability_labels[0]} mean, standard deviation:"
        )
        print(f"n: {r1.count()}, mean: {r1.mean()}, std: {r1.std()}")

        # 2. print r2 mean, std
        print(
            f"\n>>> later_{self.viability_labels[0]} mean, standard deviation:"
        )
        print(f"n: {r2.count()}, mean: {r2.mean()}, std: {r2.std()}")

        # 3. print m mean, std
        print(f"\n>>> median round mean, standard deviation:")
        print(f"n: {m.count()}, mean: {m.mean()}, std: {m.std()}")

        title = "Viability Scores of Rounds"
        labels=[textwrap.fill(text, 12) for text in \
            ["Best Initial Round","Reconvened Round","Median Round"]]
        ylabel = "Team Mean Viability"

        order = [r1, r2, m]
        means = [x.mean() for x in order]
        stds = [x.std() for x in order]
        maxs = [x.max() for x in order]

        # 4. create barplot
        print("\n>>> barplot:")
        plt.figure(1)
        bar = plt.bar(np.arange(3), means, yerr=stds, align='center')
        plt.title(title)
        plt.xticks(np.arange(3), labels)
        plt.ylabel(ylabel)
        plt.savefig("early_bar.pdf")
        plt.show()

        # 5. create boxplot
        print("\n>>> boxplot:")
        plt.figure(2)
        box = plt.boxplot(order, positions=np.arange(3))
        plt.title(title)
        plt.xticks(np.arange(3), labels)
        plt.ylabel(ylabel)
        plt.ylim(top=70, bottom=14)

        # label diffs
        for args in diff_args:
            label_diff(*args, maxs)

        plt.savefig("early_box.pdf")
        plt.show()

        # 6. paired t-test r1 and r2
        print(
            f"\n>>> paired t-test between initial and reconvened {self.viability_labels[0]}:"
        )
        # print(stats.ttest_rel(r2, e))
        tt1 = researchpy.ttest(r1, r2, paired=True)[1]
        print(tt1)

        # 7. paired t-test median and r2
        print(
            f"\n>>> paired t-test between median and reconvened {self.viability_labels[0]}:"
        )
        # print(stats.ttest_rel(r2, e))
        tt1 = researchpy.ttest(m, r2, paired=True)[1]
        print(tt1)

        # return data for exterior processing
        return order
Example #22
def _get_prop_vs_prop_dss_score(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None,
                                metric=None,
                                as_percentage=None,
                                is_categorical=None,
                                alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t20, df_resp_t20 = get_prop_resp("t20a")

    # prop_values = metric(df_resp["min_offer_dss"], df_prop["offer"])
    # prop_value = metrics.get_mean(prop_values)

    prop_dss_values = metric(df_resp["min_offer_dss"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    auto_dss_values = metric(df_resp_t20["min_offer_dss"],
                             df_prop_t20["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)

    dof = 0
    diff = None

    if is_categorical:
        # table = np.array([np.bincount(prop_values), np.bincount(prop_dss_values)])
        # print("TABLE: ", table)
        # checked using: http://vassarstats.net/propcorr.html
        # s, p = sms2.mcnemar(prop_values, prop_dss_values, exact=False, correction=False)
        table, res = rp.crosstab(prop_dss_values,
                                 auto_dss_values,
                                 test='mcnemar')
        s, p, r = res.results.values

        test_label = f"(mcnemar) H0: equal, Ha: {'two-sided'}"

        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(prop_dss_values),
                                       np.std(prop_dss_values),
                                       np.mean(auto_dss_values),
                                       np.std(auto_dss_values),
                                       s,
                                       p,
                                       dof,
                                       diff=diff,
                                       label1=treatment + ".dss",
                                       label2="t20.dss"))
    else:
        #s, p =  stats.wilcoxon(prop_values, auto_dss_values, alternative=alternative or 'two-sided')

        table, res = rp.ttest(pd.Series(prop_dss_values),
                              pd.Series(auto_dss_values),
                              paired=False)

        test_label = f"(wilcoxon) H0: equal, Ha: {alternative or 'two-sided'}"
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]

        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(prop_dss_values),
                                   np.std(prop_dss_values),
                                   np.mean(auto_dss_values),
                                   np.std(auto_dss_values),
                                   s,
                                   p,
                                   dof,
                                   diff=diff,
                                   label1=treatment + ".dss",
                                   label2="t20.dss"))

    if as_percentage:
        res = {
            # "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS":
            f'{100 * prop_dss_value:.2f} %',
            "T20 Auto DSS":
            f'{100 * auto_dss_value:.2f} %',
            "prop:dss - auto prop":
            f'{100 * (prop_dss_value - auto_dss_value):.2f} %',
        }
    else:
        res = {
            # "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T20 Auto DSS": f'{auto_dss_value:.2f}',
            "prop:dss - auto prop":
            f'{(prop_dss_value - auto_dss_value):.2f} %',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
Example #23
def _get_prop_vs_prop_dss_score(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None,
                                metric=None,
                                as_percentage=None,
                                is_categorical=None,
                                alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t11a")

    print(metric.__name__)

    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)

    metric_ref_values = metric(df_prop_ref)
    metric_value_ref = metrics.get_mean(metric_ref_values)

    metric_values = metrics.get_data(metric_values)
    metric_ref_values = metrics.get_data(metric_ref_values)

    dof = 0
    diff = None
    if is_categorical:
        table, res = rp.crosstab(pd.Series(metric_ref_values),
                                 pd.Series(metric_values),
                                 test='chi-square')
        s, p, r = res.results.values

        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(metric_ref_values),
                                       np.std(metric_ref_values),
                                       np.mean(metric_values),
                                       np.std(metric_values),
                                       s,
                                       p,
                                       dof,
                                       diff=diff,
                                       label1="t11a.dss",
                                       label2=treatment + ".dss"))
        test_label = f"(pearson chi2)"
    else:

        #print("Ranksums", stats.ranksums(metric_ref_values, metric_values))

        table, res = rp.ttest(pd.Series(metric_ref_values),
                              pd.Series(metric_values),
                              paired=False)
        s = res.results[2]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        r = res.results[9]
        diff = res.results[0]
        dof = res.results[1]

        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(metric_ref_values),
                                   np.std(metric_ref_values),
                                   np.mean(metric_values),
                                   np.std(metric_values),
                                   s,
                                   p,
                                   dof,
                                   diff=diff,
                                   label1="t11a.dss",
                                   label2=treatment + ".dss"))

        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("RESUME: ", res)
    print("TABLE: ", table)

    if as_percentage:
        res = {
            # "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS":
            f'{100 * metric_value:.2f} %',
            "T11A ":
            f'{100 * metric_value_ref:.2f} %',
            "prop:dss - auto prop":
            f'{100 * (metric_value - metric_value_ref):.2f} %',
        }
    else:
        res = {
            # "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{metric_value:.2f}',
            "T11A": f'{metric_value_ref:.2f}',
            "prop:dss - auto prop":
            f'{(metric_value - metric_value_ref):.2f} %',
        }
    res[test_label] = f"{s:.3f} ({p:.3f})"
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
Example #24
def find_info_for_all_tVst(key, count):
    df_tvst = df.loc[df.prev_treat_next == key]
    print(Color.BOLD, count,
          ". The following pre_treat_post combination is being searched :",
          Color.END, Color.PURPLE, key, "[treat-post-pre] :: ", df_tvst.shape,
          Color.END)

    df_tvst.drop(['prev_treat_next', 'Unnamed: 0', 'user_id'],
                 axis=1,
                 inplace=True)

    # listing the set of columns, just to be sure, after dropping the irrelevant ones
    # print(df_tvst.columns)

    # This function gets the unique instances in the following columns
    # 1. ts_owner_id        : id of the teacher who made the tutoring
    # 2. pl_p_problem_id    : id of the pre test
    # 3. plta_problem_id    : id of the treatment problem
    # 4. pl_n_problem_id    : id of the post test
    def get_Unique_info(column_name):
        unique_instances = df_tvst[column_name].unique()
        # print("--------more info on " + column_name + "------------")
        # print(df_tvst[column_name].value_counts())
        return unique_instances

    unique_teacher_ids = get_Unique_info('ts_owner_id')
    pre_problem_ids = get_Unique_info('pl_p_problem_id')
    treatment_problem_ids = get_Unique_info('plta_problem_id')
    post_problem_ids = get_Unique_info('pl_n_problem_id')

    # print("--------teacher-ids------------------", unique_teacher_ids)
    # print("--------previous-ids------------------", pre_problem_ids)
    # print("--------treatment-ids------------------", treatment_problem_ids)
    # print("--------post-ids------------------", post_problem_ids)
    # print("------------teacher counts---------------------")
    # print(df_tvst['ts_owner_id'].value_counts())

    df_tvst["avg_p_score_question"] = 0
    df_tvst["avg_a_score_question"] = 0
    df_tvst["avg_n_score_question"] = 0

    # This function finds the average question score for all pre, treatment and post
    # we did this because the average score is indicative of the difficulty of the problem
    def find_avg_question_score(unique_id, column_name, avg_col,
                                correctness_column):
        mean = df_tvst.loc[(
            df_tvst[column_name] == unique_id)][correctness_column].mean()
        df_tvst.loc[df_tvst[column_name] == unique_id, avg_col] = mean

    # this was done because initially there could have been multiple ids in a single treatment/pre/post condition
    # which is no longer the case
    # TODO: Hence, we might actually remove this whole code block during code refactoring
    def calculate_the_average(ids, column_name, avg_for_column,
                              correctness_column):
        for val in ids:
            find_avg_question_score(val, column_name, avg_for_column,
                                    correctness_column)

    calculate_the_average(pre_problem_ids, "pl_p_problem_id",
                          "avg_p_score_question", "pl_p_correct")
    calculate_the_average(treatment_problem_ids, "plta_problem_id",
                          "avg_a_score_question", "plta_correct")
    calculate_the_average(post_problem_ids, "pl_n_problem_id",
                          "avg_n_score_question", "pl_n_correct")

    # average score is indicative of the difficulty of the problem
    avg_p_score = df_tvst["avg_p_score_question"].unique()
    avg_a_score = df_tvst["avg_a_score_question"].unique()
    avg_n_score = df_tvst["avg_n_score_question"].unique()

    # print("average pre score", avg_p_score)
    # print("average treatment score", avg_a_score)
    # print("average post score", avg_n_score)

    df_teacher_sign_test = pd.DataFrame(columns=[
        "teacher_id", "total_treated_exposed", "pre_test", "post_test", "+/-"
    ])
    df_teacher_sign_test_treatment = pd.DataFrame(columns=[
        "teacher_id", "total_treated_used", "pre_test", "post_test", "+/-"
    ])

    for i in range(unique_teacher_ids.size):
        df_teacher_sign_test = df_teacher_sign_test.append(
            {
                "teacher_id": unique_teacher_ids[i],
                "post_test": 0,
                "post_score(avg)": 0,
                "pre_test": 0,
                "pre_score(avg)": 0,
                "+/-": 0,
                "total_treated_exposed": 0
            },
            ignore_index=True)
        df_teacher_sign_test_treatment = df_teacher_sign_test_treatment.append(
            {
                "teacher_id": unique_teacher_ids[i],
                "post_test": 0,
                "post_score(avg)": 0,
                "pre_test": 0,
                "pre_score(avg)": 0,
                "+/-": 0,
                "total_treated_used": 0
            },
            ignore_index=True)

    def extract_information_perTeacher_perTreatmentQuestion(teacher_id):
        df_teacher_specific = df_tvst.loc[df_tvst.ts_owner_id == teacher_id]

        pre_count = (df_teacher_specific.pl_p_correct == 1).sum()
        post_count = (df_teacher_specific.pl_n_correct == 1).sum()

        mean_p = df_teacher_specific.loc[
            df_teacher_specific.ts_owner_id ==
            teacher_id]["avg_p_score_question"].mean()
        mean_n = df_teacher_specific.loc[
            df_teacher_specific.ts_owner_id ==
            teacher_id]["avg_n_score_question"].mean()

        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "pre_test"] = pre_count
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "pre_score(avg)"] = mean_p

        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "post_test"] = post_count
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "post_score(avg)"] = mean_n
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "total_treated_exposed"] = len(
                                     df_teacher_specific.pl_n_correct)

        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "+/-"] = (post_count - pre_count)

        df_teacher_specific = df_teacher_specific.loc[
            (df_teacher_specific.plta_hint_count > 0) |
            (df_teacher_specific.plta_bottom_hint > 0)]

        pre_count = (df_teacher_specific.pl_p_correct == 1).sum()
        post_count = (df_teacher_specific.pl_n_correct == 1).sum()

        mean_p_t = df_teacher_specific.loc[
            df_teacher_specific.ts_owner_id ==
            teacher_id]["avg_p_score_question"].mean()
        mean_n_t = df_teacher_specific.loc[
            df_teacher_specific.ts_owner_id ==
            teacher_id]["avg_n_score_question"].mean()
        df_teacher_sign_test_treatment.loc[
            df_teacher_sign_test_treatment.teacher_id == teacher_id,
            "pre_test"] = pre_count
        df_teacher_sign_test_treatment.loc[
            df_teacher_sign_test_treatment.teacher_id == teacher_id,
            "pre_score(avg)"] = mean_p_t
        df_teacher_sign_test_treatment.loc[
            df_teacher_sign_test_treatment.teacher_id == teacher_id,
            "post_test"] = post_count
        df_teacher_sign_test_treatment.loc[
            df_teacher_sign_test_treatment.teacher_id == teacher_id,
            "post_score(avg)"] = mean_n_t
        df_teacher_sign_test_treatment.loc[
            df_teacher_sign_test_treatment.teacher_id == teacher_id,
            "+/-"] = (post_count - pre_count)
        df_teacher_sign_test_treatment.loc[
            df_teacher_sign_test_treatment.teacher_id == teacher_id,
            "total_treated_used"] = len(df_teacher_specific.pl_n_correct)

        df_teacher_sign_test.fillna(0, inplace=True)
        df_teacher_sign_test_treatment.fillna(0, inplace=True)

    for val in unique_teacher_ids:
        extract_information_perTeacher_perTreatmentQuestion(val)

    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    n_largest = df_teacher_sign_test_treatment['total_treated_used'].nlargest(
        2)
    # if (df_teacher_sign_test_treatment['total_treated_used'].max()) > 20:
    if (n_largest.min()) > 40:
        print(Color.RED, "\n",
              'This is a good Data more than 15 used instances', Color.END)
        print(n_largest.max(), " min : ", n_largest.min())
        # print("--------------------------------That were in the treatment condition-----------------------------------")
        # print(df_teacher_sign_test)
        print(
            "---------------------------------That used the treatment condition-------------------------------------"
        )
        print(df_teacher_sign_test_treatment)

        X = np.arange(len(df_teacher_sign_test_treatment.teacher_id))
        plt.bar(X + 0.0,
                df_teacher_sign_test_treatment.pre_test,
                color='b',
                width=0.3,
                label='pretest score')
        plt.bar(X + 0.3,
                df_teacher_sign_test_treatment.post_test,
                color='g',
                width=0.3,
                label='posttest score')
        plt.bar(X + 0.6,
                df_teacher_sign_test_treatment.total_treated_used,
                color='r',
                width=0.3,
                label='total_treatment_used')

        for id in df_teacher_sign_test_treatment.teacher_id:
            if id in id_name.keys():
                df_teacher_sign_test_treatment.loc[
                    df_teacher_sign_test_treatment.teacher_id == id,
                    "teacher_id"] = id_name[id]

        plt.xticks(X + 0.15, df_teacher_sign_test_treatment.teacher_id)
        plt.title(str(count) + ". figure")
        plt.legend()

        plt.show()

        for teacher_id in unique_teacher_ids:
            df_teacher_specific = df_tvst.loc[df_tvst.ts_owner_id ==
                                              teacher_id]
            df_teacher_specific = df_teacher_specific.loc[
                (df_teacher_specific.plta_hint_count > 0) |
                (df_teacher_specific.plta_bottom_hint > 0)]

            print(
                "-------------------------------------------------------------------------------------------------------"
            )
            print(Color.BOLD, Color.GREEN, id_name[teacher_id], Color.END,
                  Color.END)

            descriptives, results = rp.ttest(df_teacher_specific.pl_n_correct,
                                             df_teacher_specific.pl_p_correct)

            print(
                "-------------------------------------------------------------------------------------------------------"
            )
            print(descriptives)
            print(
                "-------------------------------------------------------------------------------------------------------"
            )
            print(results)
            print(
                "-------------------------------------------------------------------------------------------------------"
            )
        print("\n")
Example #25
		features corrected based on Bonferroni
			Documentation can be found at https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
		reject_t: true for hypotheses that can be rejected at the given alpha
		pvals_corrected_t: p-values corrected for multiple tests
		corrected_hypotesis: dict mapping each feature to its Bonferroni reject decision
		rejected_t_dic: dictionary filtered to reject_t values equal to True
    """

    features_pv_05 = dict()
    print(type(features_pv_05))
    exceptions = dict()

    for i in data_train_anatomy._get_numeric_data():
        try:  
            descriptives, results = rp.ttest(df_ASD[i], df_TC[i])     

            p_val=(results["results"][3])                
            if (p_val<=0.05) and (p_val!=0.0):   
                features_pv_05[i]=p_val

        except ZeroDivisionError:
            exceptions[i]=p_val
            
    features_dict_p_val_05 = dict(sorted(features_pv_05.items(), key=lambda x: x[1]))
    ## Correction Bonferroni
    reject_t, pvals_corrected_t, alphacSidak_t, alphacBonf_t = smm.multipletests(list(features_dict_p_val_05.values()), alpha=0.05, method='b', returnsorted=True)
    # zip against the sorted keys so they line up with the sorted, corrected p-values
    corrected_hypotesis = dict((key, value) for (key, value) in zip(features_dict_p_val_05.keys(), reject_t))
    rejected_t_dic = {k: v for k, v in corrected_hypotesis.items() if v == True}
    corrected_pvalues = dict((key, value) for (key, value) in zip(rejected_t_dic.keys(), pvals_corrected_t))
    corrected_p_dic = {k: v for k, v in corrected_pvalues.items()}
Example #26
    def stat_groups(self, group1, group2):
        """Returns a statistical analysis of two groups"""
        descriptive_table, result_table = researchpy.ttest(group1, group2)
        descriptive_table = descriptive_table.rename(index={0: 'ApoE3', 1: 'ApoE4', 2: 'ApoE3 + ApoE4'})
        return descriptive_table, result_table
Example #27
import pandas as pd
import researchpy as rp
import scipy.stats as stats

df = pd.read_csv("interfaceResultsT-Test.csv")
df.info()

summary, results = rp.ttest(group1= df['time_after'][df['sex'] == 'Male'], group1_name= "Male",
                            group2= df['time_after'][df['sex'] == 'Female'], group2_name= "Female")
print(summary)
print(results)