def test_fleiss_randolph():
    # reference numbers from online calculator
    # http://justusrandolph.net/kappa/#dInfo
    table = [[7, 0], [7, 0]]
    assert_equal(fleiss_kappa(table, method='unif'), 1)

    table = [[6.99, 0.01], [6.99, 0.01]]
    # % Overall Agreement 0.996671
    # Fixed Marginal Kappa: -0.166667
    # Free Marginal Kappa: 0.993343
    assert_allclose(fleiss_kappa(table), -0.166667, atol=6e-6)
    assert_allclose(fleiss_kappa(table, method='unif'), 0.993343, atol=6e-6)

    table = [[7, 1], [3, 5]]
    # % Overall Agreement 0.607143
    # Fixed Marginal Kappa: 0.161905
    # Free Marginal Kappa: 0.214286
    assert_allclose(fleiss_kappa(table, method='fleiss'), 0.161905, atol=6e-6)
    assert_allclose(fleiss_kappa(table, method='randolph'), 0.214286, atol=6e-6)

    table = [[7, 0], [0, 7]]
    # % Overall Agreement 1.000000
    # Fixed Marginal Kappa: 1.000000
    # Free Marginal Kappa: 1.000000
    assert_allclose(fleiss_kappa(table), 1)
    assert_allclose(fleiss_kappa(table, method='uniform'), 1)

    table = [[6, 1, 0], [0, 7, 0]]
    # % Overall Agreement 0.857143
    # Fixed Marginal Kappa: 0.708333
    # Free Marginal Kappa: 0.785714
    assert_allclose(fleiss_kappa(table), 0.708333, atol=6e-6)
    assert_allclose(fleiss_kappa(table, method='rand'), 0.785714, atol=6e-6)
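
# A minimal sketch (illustrative, not from the test file above), assuming
# statsmodels.stats.inter_rater.fleiss_kappa: `method` is matched by prefix,
# so 'unif'/'uniform' select Randolph's free-marginal kappa (as do
# 'rand'/'randolph'), while the default 'fleiss' uses fixed marginals.
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa

table_demo = np.asarray([[7, 1], [3, 5]])           # subjects x category counts
print(fleiss_kappa(table_demo, method='fleiss'))    # ~0.161905 (fixed marginal)
print(fleiss_kappa(table_demo, method='randolph'))  # ~0.214286 (free marginal)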


def test_fleiss_kappa_irr():
    fleiss = Holder()
    #> r = kappam.fleiss(diagnoses)
    #> cat_items(r, pref="fleiss.")
    fleiss.method = "Fleiss' Kappa for m Raters"
    fleiss.irr_name = 'Kappa'
    fleiss.value = 0.4302445
    fleiss.stat_name = 'z'
    fleiss.statistic = 17.65183
    fleiss.p_value = 0
    data_ = aggregate_raters(diagnoses)[0]
    res1_kappa = fleiss_kappa(data_)
    assert_almost_equal(res1_kappa, fleiss.value, decimal=7)
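
# A minimal sketch of aggregate_raters, assuming the statsmodels helper used
# above: it turns a (subjects x raters) array of labels into the
# (subjects x categories) count table that fleiss_kappa expects.
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa

ratings = np.array([[1, 1, 2],   # three raters label subject 1
                    [2, 2, 2],
                    [1, 3, 3]])
table_counts, categories = aggregate_raters(ratings)
print(categories)                # [1 2 3]
print(fleiss_kappa(table_counts))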
Example #3
def kappa(y_true, y_pred, type='cohens'):
    import numpy as np
    import statsmodels.stats.inter_rater as irater
    # Build the 2x2 confusion table from boolean arrays:
    yy = (y_true & y_pred).sum()          # true positives
    yn = (y_true & (~y_pred)).sum()       # false negatives
    nn = ((~y_true) & (~y_pred)).sum()    # true negatives
    ny = ((~y_true) & (y_pred)).sum()     # false positives
    result = np.array([[yy, yn], [ny, nn]])
    if type == 'cohens':
        stat = irater.cohens_kappa(result)
        score = stat['kappa']
    elif type == 'fleiss':
        score = irater.fleiss_kappa(result)
    return score, result
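
# Hypothetical usage of the wrapper above, assuming boolean numpy arrays
# (the data here is illustrative, not from the original source):
import numpy as np

y_true_demo = np.array([True, True, False, False, True])
y_pred_demo = np.array([True, False, False, True, True])
score, confusion = kappa(y_true_demo, y_pred_demo, type='cohens')
print(score)      # Cohen's kappa of the 2x2 table
print(confusion)  # [[yy, yn], [ny, nn]]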
Example #4
  END as verdict, annotation.user, verdict_line.page, verdict_line.line_number, annotation.id as aid, testing, isOracle, isReval, isTestMode, isOracleMaster, isDiscounted from annotation
inner join claim on annotation.claim_id = claim.id
left join annotation_verdict on annotation.id = annotation_verdict.annotation_id
left join verdict_line on annotation_verdict.id = verdict_line.verdict_id
where isForReportingOnly = 0 and isTestMode = 0 and testing = 0 and isReval = 1)
as a group by id, user

        """)


    def row_ct(row):
        # Count occurrences of each label id (0 = NOT ENOUGH INFO,
        # 1 = SUPPORTS, 2 = other verifiable verdict) for one claim.
        rowct = []
        for i in range(3):
            rowct.append(row.count(i))

        return rowct

    claims = cursor.fetchall()

    for claim in claims:
        if claim['verifiable'] == "NOT ENOUGH INFO":
            claims_dict[claim['id']].append(0)
        elif claim['verifiable'] == "VERIFIABLE":
            claims_dict[claim['id']].append(1 if claim["verdict"]=="SUPPORTS" else 2)


    fkt1 = [row_ct(claims_dict[key]) for key in claims_dict if len(claims_dict[key]) == 5]
    print(fkt1)
    print(len(fkt1))
    print(fleiss_kappa(fkt1))
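
# Toy check of the aggregation above (illustrative labels, not source data):
# five annotators per claim, labels encoded as in the loop, one count row per
# claim is what fleiss_kappa consumes.
#     row_ct([0, 1, 1, 2, 1])  # -> [1, 3, 1]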
Example #5
def reducer_(key, values):
    # key: string of intermediary key
    # load returns the dict corresponding to a mapper output; items need to be loaded.
    # DEBUG
    import glob, mapreduce
    BASE = "/neurospin/brainomics/2013_adni/ADAS11-MCIc-CTL/rndperm"
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../results/rndperm"
    keys = ["0.001_0.3335_0.3335_0.333_-1",  "0.001_0.5_0_0.5_-1",  "0.001_0.5_0.5_0_-1",  "0.001_1_0_0_-1"]
    for key in keys:
        #key = keys[0]
        paths_5cv_all = [INPUT % (perm, key) for perm in xrange(NFOLDS * NRNDPERMS)]
        idx_5cv_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
        cpt = 0
        qc = dict()
        r2_perms = np.zeros(NRNDPERMS)
        corr_perms = np.zeros(NRNDPERMS)
        r_bar_perms = np.zeros(NRNDPERMS)
        fleiss_kappa_stat_perms = np.zeros(NRNDPERMS)
        dice_bar_perms = np.zeros(NRNDPERMS)
        for perm_i in xrange(len(idx_5cv_blocks)-1):
            paths_5cv = paths_5cv_all[idx_5cv_blocks[perm_i]:idx_5cv_blocks[perm_i+1]]
            for p in paths_5cv:
                # QC: count each existing path exactly once
                if os.path.exists(p) and p not in qc:
                    qc[p] = 1
                    cpt += 1
            #
            values = [mapreduce.OutputCollector(p) for p in paths_5cv]
            values = [item.load() for item in values]
            y_true = [item["y_true"].ravel() for item in values]
            y_pred = [item["y_pred"].ravel() for item in values]
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            r2 = r2_score(y_true, y_pred)
            corr = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]
            betas = np.hstack([item["beta"] for item in values]).T
            #
            ## Compute beta similarity measures
            #
            # Correlation
            R = np.corrcoef(betas)
            R = R[np.triu_indices_from(R, 1)]
            # Fisher z-transformation / average
            z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
            # back-transform (inverse Fisher z)
            r_bar = (np.exp(2 * z_bar) - 1) /  (np.exp(2 * z_bar) + 1)
            #
            # threshold betas to compute fleiss_kappa and DICE
            try:
                betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in xrange(betas.shape[0])])
                print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
                print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
                                   rtol=0, atol=1e-02)
                #
                # Compute fleiss kappa statistics
                beta_signed = np.sign(betas_t)
                table = np.zeros((beta_signed.shape[1], 3))
                table[:, 0] = np.sum(beta_signed == 0, 0)
                table[:, 1] = np.sum(beta_signed == 1, 0)
                table[:, 2] = np.sum(beta_signed == -1, 0)
                fleiss_kappa_stat = fleiss_kappa(table)
                #
                # Pair-wise Dice coefficient
                beta_n0 = betas_t != 0
                ij = [[i, j] for i in xrange(5) for j in xrange(i+1, 5)]
                #print [[idx[0], idx[1]] for idx in ij]
                dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /\
                     (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                     for idx in ij])
            except:
                dice_bar = fleiss_kappa_stat = 0.
            #
            r2_perms[perm_i] = r2
            corr_perms[perm_i] = corr
            r_bar_perms[perm_i] = r_bar
            fleiss_kappa_stat_perms[perm_i] = fleiss_kappa_stat
            dice_bar_perms[perm_i] = dice_bar
        # END PERMS
        print "save", key
        np.savez_compressed(OUTPUT+"/perms_"+key+".npz",
                            r2=r2_perms, corr=corr_perms,
                            r_bar=r_bar_perms, fleiss_kappa=fleiss_kappa_stat_perms,
                            dice_bar=dice_bar_perms)
        #
        perms = dict()
        fig, axis = plt.subplots(len(keys), 4)#, sharex='col')
        for i, key in enumerate(keys):
            perms[key] = np.load(OUTPUT+"/perms_"+key+".npz")
            n, bins, patches = axis[i, 0].hist(perms[key]['r2'], 50, normed=1, histtype='stepfilled')
            axis[i, 0].set_title(key + "_r2")
            n, bins, patches = axis[i, 1].hist(perms[key]['r_bar'], 50, normed=1, histtype='stepfilled')
            axis[i, 1].set_title(key + "_r_bar")
            n, bins, patches = axis[i, 2].hist(perms[key]['fleiss_kappa'], 50, histtype='stepfilled')
            axis[i, 2].set_title(key + "_fleiss_kappa")
            n, bins, patches = axis[i, 3].hist(perms[key]['dice_bar'], 50)#, 50, normed=1, histtype='stepfilled')
            axis[i, 3].set_title(key + "_dice_bar")
        plt.show()

        l1l2tv, l1tv, l1l2, l1 = ["0.001_0.3335_0.3335_0.333_-1",  "0.001_0.5_0_0.5_-1",  
                             "0.001_0.5_0.5_0_-1",  "0.001_1_0_0_-1"]

        # Read true scores
        import pandas as pd
        true = pd.read_csv(os.path.join(BASE, "..", "ADAS11-MCIc-CTL.csv"))
        true = true[true.a == 0.001]
        true_l1l2tv = true[true.l1 == 0.3335].iloc[0]
        true_l1l2 = true[(true.l1 == 0.5) & (true.l2 == 0.5)].iloc[0]
        true_l1tv = true[(true.l1 == 0.5) & (true.tv == 0.5)].iloc[0]
        true_l1 = true[(true.l1 == 1.)].iloc[0]

        # pvals
        nperms = float(len(perms[l1]['r2']))
        from collections import OrderedDict
        pvals = OrderedDict()
        pvals["cond"] = ['l1', 'l1tv', 'l1l2', 'l1l2tv'] * 4 + \
                ['l1 vs l1tv'] * 4  + ['l1l2 vs l1l2tv'] * 4
        pvals["stat"] = ['r2'] * 4 + ['r_bar'] * 4 + ['fleiss_kappa'] * 4 + ['dice_bar'] * 4 +\
                ['r2', 'r_bar', 'fleiss_kappa', 'dice_bar'] * 2
        pvals["pval"] = [
            np.sum(perms[l1]['r2'] > true_l1["r2"]),
            np.sum(perms[l1tv]['r2'] > true_l1tv["r2"]),
            np.sum(perms[l1l2]['r2'] > true_l1l2["r2"]),
            np.sum(perms[l1l2tv]['r2'] > true_l1l2tv["r2"]),
    
            np.sum(perms[l1]['r_bar'] > true_l1["beta_r_bar"]),
            np.sum(perms[l1tv]['r_bar'] > true_l1tv["beta_r_bar"]),
            np.sum(perms[l1l2]['r_bar'] > true_l1l2["beta_r_bar"]),
            np.sum(perms[l1l2tv]['r_bar'] > true_l1l2tv["beta_r_bar"]),
    
            np.sum(perms[l1]['fleiss_kappa'] > true_l1["beta_fleiss_kappa"]),
            np.sum(perms[l1tv]['fleiss_kappa'] > true_l1tv["beta_fleiss_kappa"]),
            np.sum(perms[l1l2]['fleiss_kappa'] > true_l1l2["beta_fleiss_kappa"]),
            np.sum(perms[l1l2tv]['fleiss_kappa'] > true_l1l2tv["beta_fleiss_kappa"]),
    
            np.sum(perms[l1]['dice_bar'] > true_l1["beta_dice_bar"]),
            np.sum(perms[l1tv]['dice_bar'] > true_l1tv["beta_dice_bar"]),
            np.sum(perms[l1l2]['dice_bar'] > true_l1l2["beta_dice_bar"]),
            np.sum(perms[l1l2tv]['dice_bar'] > true_l1l2tv["beta_dice_bar"]),
    
            # l1 vs l1tv
            np.sum((perms[l1tv]['r2'] - perms[l1]['r2']) > (true_l1tv["r2"] - true_l1["r2"])),
            np.sum((perms[l1tv]['r_bar'] - perms[l1]['r_bar']) > (true_l1tv["beta_r_bar"] - true_l1["beta_r_bar"])),
            np.sum((perms[l1tv]['fleiss_kappa'] - perms[l1]['fleiss_kappa']) > (true_l1tv["beta_fleiss_kappa"] - true_l1["beta_fleiss_kappa"])),
            np.sum((perms[l1tv]['dice_bar'] - perms[l1]['dice_bar']) > (true_l1tv["beta_dice_bar"] - true_l1["beta_dice_bar"])),
    
            # l1l2 vs l1l2tv
            np.sum((perms[l1l2]['r2'] - perms[l1l2tv]['r2']) > (true_l1l2["r2"] - true_l1l2tv["r2"])),
            np.sum((perms[l1l2tv]['r_bar'] - perms[l1l2]['r_bar']) > (true_l1l2tv["beta_r_bar"] - true_l1l2["beta_r_bar"])),
            np.sum((perms[l1l2tv]['fleiss_kappa'] - perms[l1l2]['fleiss_kappa']) > (true_l1l2tv["beta_fleiss_kappa"] - true_l1l2["beta_fleiss_kappa"])),
            np.sum((perms[l1l2tv]['dice_bar'] - perms[l1l2]['dice_bar']) > (true_l1l2tv["beta_dice_bar"] - true_l1l2["beta_dice_bar"]))]

        pvals = pd.DataFrame(pvals)
        pvals["pval"] /= nperms
        pvals.to_csv(os.path.join(OUTPUT, "pvals_stats_permutations.csv"), index=False)
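
# The p-values above follow the usual one-sided permutation recipe,
# p = #{stat_perm > stat_observed} / n_permutations. A minimal standalone
# sketch of that step (names here are illustrative):
import numpy as np

def perm_pvalue(perm_stats, observed):
    perm_stats = np.asarray(perm_stats, dtype=float)
    return float(np.sum(perm_stats > observed)) / len(perm_stats)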
Example #6
if __name__ == '__main__':
    arguments = docopt(__doc__)

    batch_result_file1 = arguments['--f']
    workers, results = load_results(batch_result_file1)

    N = len(results)
    k = 7  # categories after label_index merges raw labels 5 and 6
    n = 3  # number of ratings per item
    label_index = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 4, 7: 5, 8: 6}
    mat = np.zeros((N, k))
    for ind, (_, v) in enumerate(results.items()):
        for a in v.values():
            mat[ind][label_index[a[0]]] += 1
    print(fleiss_kappa(mat))

    full_ag = 0
    part_ag = 0

    for row in mat:
        non_zero = len(np.nonzero(row)[0])
        if non_zero == 1:
            full_ag += 1
        elif non_zero == 2:
            part_ag += 1

    print('full agreement', full_ag)
    print('partial agreement', part_ag)
    print('no agreement', len(mat) - full_ag - part_ag)
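
    # fleiss_kappa assumes every item received the same number of ratings;
    # in the spirit of the counts built above, a quick sanity check is:
    #     assert (mat.sum(axis=1) == n).all()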
Example #7
def reducer(key, values):
    # key: string of intermediary key
    # load returns the dict corresponding to a mapper output; items need to be loaded.
    # DEBUG
    #import glob, mapreduce
    #values = [mapreduce.OutputCollector(p) for p in glob.glob("/neurospin/brainomics/2013_adni/AD-CTL/results/*/0.1_0.0_0.0_1.0_-1.0/")]
    #values = [mapreduce.OutputCollector(p) for p in glob.glob("/home/ed203246/tmp/MCIc-MCInc_cs/results/*/0.1_0.0_0.0_1.0_-1.0/")]
    # values = [mapreduce.OutputCollector(p) for p in glob.glob("/home/ed203246/tmp/MCIc-CTL_cs/results/*/0.1_0.0_1.0_0.0_-1.0/")]
    # values = [mapreduce.OutputCollector(p) for p in glob.glob("/home/ed203246/tmp/MCIc-CTL_cs/results/*/0.1_0.0_0.5_0.5_-1.0/")]
    # Compute the sd, i.e. compute results on each fold
    print key
    values = [item.load() for item in values[1:]]
    recall_mean_std = np.std([
        np.mean(
            precision_recall_fscore_support(
                item["y_true"].ravel(), item["y_pred"])[1]) for item in values
    ]) / np.sqrt(len(values))
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  #area under curve score.
    n_ite = None
    betas = np.hstack([item["beta"] for item in values]).T
    ## Compute beta similarity measures
    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    print R
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform (inverse Fisher z)
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in xrange(betas.shape[0])
        ])
        print "--", np.sqrt(np.sum(betas_t**2, 1)) / np.sqrt(
            np.sum(betas**2, 1))
        print np.allclose(np.sqrt(np.sum(betas_t**2, 1)) /
                          np.sqrt(np.sum(betas**2, 1)), [0.99] * 5,
                          rtol=0,
                          atol=1e-02)

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        beta_n0 = betas_t != 0
        ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
        #print [[idx[0], idx[1]] for idx in ij]
        dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /\
             (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
             for idx in ij])
    except:
        dice_bar = fleiss_kappa_stat = 0.

    a, l1, l2, tv, k = key  #[float(par) for par in key.split("_")]
    scores = OrderedDict()
    scores['a'] = a
    scores['l1'] = l1
    scores['l2'] = l2
    scores['tv'] = tv
    left = float(1 - tv)
    if left == 0: left = 1.
    scores['l1l2_ratio'] = float(l1) / left
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores['recall_mean_std'] = recall_mean_std
    scores['auc'] = auc
    #    scores['beta_cor_mean'] = beta_cor_mean
    scores['precision_0'] = p[0]
    scores['precision_1'] = p[1]
    scores['precision_mean'] = p.mean()
    scores['f1_0'] = f[0]
    scores['f1_1'] = f[1]
    scores['f1_mean'] = f.mean()
    scores['support_0'] = s[0]
    scores['support_1'] = s[1]
    #    scores['corr']= corr
    scores['beta_r'] = str(R)
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['n_ite'] = n_ite
    scores['k'] = k
    scores['key'] = key
    return scores
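
# The r_bar above is the Fisher-z average of the pairwise correlations in R:
# z = atanh(r) = 0.5*log((1+r)/(1-r)) is averaged, then mapped back with tanh.
# A minimal sketch of that step:
import numpy as np

def fisher_average(r):
    z = np.arctanh(np.asarray(r, dtype=float))  # Fisher z-transform
    return np.tanh(z.mean())                    # back-transform to r_bar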
Example #8
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa, cohens_kappa
# Fleiss' kappa example table from Wikipedia: subject id, five category counts, P_i
table0 = np.asarray('''1  0  0  0  0  14  1.000
2   0   2   6   4   2   0.253
3   0   0   3   5   6   0.308
4   0   3   9   2   0   0.440
5   2   2   8   1   1   0.330
6   7   7   0   0   0   0.462
7   3   2   6   3   0   0.242
8   2   5   3   2   2   0.176
9   6   5   2   1   0   0.286
10  0   2   2   3   7   0.286'''.split(), float).reshape(10,-1)


Total = np.asarray("20 	28 	39 	21 	32".split('\t'), int)
Pj = np.asarray("0.143 	0.200 	0.279 	0.150 	0.229".split('\t'), float)
kappa_wp = 0.210
table1 = table0[:, 1:-1]


print(fleiss_kappa(table1))
table4 = np.array([[20,5], [10, 15]])
print('res', cohens_kappa(table4), 0.4) #wikipedia

table5 = np.array([[45, 15], [25, 15]])
print('res', cohens_kappa(table5), 0.1304) #wikipedia

table6 = np.array([[25, 35], [5, 35]])
print('res', cohens_kappa(table6), 0.2593)  #wikipedia
print('res', cohens_kappa(table6, weights=np.arange(2)), 0.2593)  #wikipedia
t7 = np.array([[16, 18, 28],
               [10, 27, 13],
               [28, 20, 24]])
print(cohens_kappa(t7, weights=[0, 1, 2]))

table8 = np.array([[25, 35], [5, 35]])
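
# Note: statsmodels' cohens_kappa returns a results bunch; the scalar is
# available as cohens_kappa(table8).kappa, which is what the Wikipedia
# reference values after each print above correspond to.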
Example #9
def scores(key, paths, config):
    key_parts = key.split("_")
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1],
                             s[0] + s[1],
                             p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform (inverse Fisher z)
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros_like(betas)  # keep the support stats below defined

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    return scores
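
# Toy illustration of the sign-table construction inside the try-block above:
# CV folds act as "raters", features as "subjects", and the three categories
# are zero / positive / negative coefficients (the data here is made up).
import numpy as np

signed_demo = np.sign(np.array([[0.0, 1.2, -0.3],
                                [0.0, 0.8, -0.1]]))  # 2 folds x 3 features
table_demo = np.zeros((signed_demo.shape[1], 3))
table_demo[:, 0] = np.sum(signed_demo == 0, 0)
table_demo[:, 1] = np.sum(signed_demo == 1, 0)
table_demo[:, 2] = np.sum(signed_demo == -1, 0)
# each row sums to the number of folds, as fleiss_kappa requires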
Example #10
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa, cohens_kappa
# Fleiss' kappa example table from Wikipedia: subject id, five category counts, P_i
table0 = np.asarray('''1  0  0  0  0  14  1.000
2   0   2   6   4   2   0.253
3   0   0   3   5   6   0.308
4   0   3   9   2   0   0.440
5   2   2   8   1   1   0.330
6   7   7   0   0   0   0.462
7   3   2   6   3   0   0.242
8   2   5   3   2   2   0.176
9   6   5   2   1   0   0.286
10  0   2   2   3   7   0.286'''.split(), float).reshape(10, -1)

Total = np.asarray("20 	28 	39 	21 	32".split('\t'), int)
Pj = np.asarray("0.143 	0.200 	0.279 	0.150 	0.229".split('\t'), float)
kappa_wp = 0.210
table1 = table0[:, 1:-1]

print(fleiss_kappa(table1))
table4 = np.array([[20, 5], [10, 15]])
print('res', cohens_kappa(table4), 0.4)  #wikipedia

table5 = np.array([[45, 15], [25, 15]])
print('res', cohens_kappa(table5), 0.1304)  #wikipedia

table6 = np.array([[25, 35], [5, 35]])
print('res', cohens_kappa(table6), 0.2593)  #wikipedia
print('res', cohens_kappa(table6, weights=np.arange(2)), 0.2593)  #wikipedia
t7 = np.array([[16, 18, 28], [10, 27, 13], [28, 20, 24]])
print(cohens_kappa(t7, weights=[0, 1, 2]))

table8 = np.array([[25, 35], [5, 35]])
print('res', cohens_kappa(table8))
def scores(key, paths, config, ret_y=False):
    import glob, mapreduce
    print key
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    recall_mean_std = np.std([
        np.mean(
            precision_recall_fscore_support(
                item["y_true"].ravel(), item["y_pred"])[1]) for item in values
    ]) / np.sqrt(len(values))
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  #area under curve score.
    n_ite = None
    betas = np.hstack(
        [item["beta"][config['penalty_start']:, :] for item in values]).T
    ## Compute beta similarity measures
    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    print R
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform (inverse Fisher z)
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in xrange(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        print np.allclose(np.sqrt(np.sum(betas_t**2, 1)) /
                          np.sqrt(np.sum(betas**2, 1)), [0.99] * 5,
                          rtol=0,
                          atol=1e-02)

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0.
        dices = []  # referenced below in scores['beta_dice']

    scores = OrderedDict()
    try:
        a, l1, l2, tv, k = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
        scores['k'] = k
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores['recall_mean_std'] = recall_mean_std
    scores['auc'] = auc
    #    scores['beta_cor_mean'] = beta_cor_mean
    scores['precision_0'] = p[0]
    scores['precision_1'] = p[1]
    scores['precision_mean'] = p.mean()
    scores['f1_0'] = f[0]
    scores['f1_1'] = f[1]
    scores['f1_mean'] = f.mean()
    scores['support_0'] = s[0]
    scores['support_1'] = s[1]
    #    scores['corr']= corr
    scores['beta_r'] = str(R)
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice'] = str(dices)
    scores['beta_dice_bar'] = dice_bar
    scores['n_ite'] = n_ite
    scores['param_key'] = key
    if ret_y:
        scores["y_true"], scores["y_pred"], scores[
            "prob_pred"] = y_true, y_pred, prob_pred
    return scores
        question_arr = []
        for question, responses in question_responses.iteritems():
            if question not in ["humanlike_text", "correct_text", "strategic_text", "cooperative_text", "fluent_text"]:
                responses = np.array(responses[:5])
                # `bin` is presumably a project-local helper that converts the
                # responses into per-category counts for fleiss_kappa.
                question_arr.append(bin(responses))

                avg = responses.mean()
                median = np.median(responses)
                std = responses.std()

                dialogue_to_stats[dialogue_id][agent_id][question].append(avg)
                dialogue_to_stats[dialogue_id][agent_id][question].append(median)
                dialogue_to_stats[dialogue_id][agent_id][question].append(std)

        question_arr = np.array(question_arr)
        kappa = fleiss_kappa(question_arr)
        dialogue_to_stats[dialogue_id][agent_id]["kappa"].append(kappa)


dialogue_eval_info = []
dialogue_eval_info.append(dialogue_to_agent_mapping)
dialogue_eval_info.append(dialogue_to_responses)
dialogue_eval_info.append(dialogue_to_stats)

scenario_id_to_mappings = defaultdict(list)


# Name of eval results file
eval_results_file = None

# Dump dialogue to average
def main():
    input_file = sys.argv[1]
    fliess_table = generate_fliess_table(input_file)
    print(len(fliess_table))
    print(fleiss_kappa(fliess_table, method='fleiss'))
def test_fleiss_kappa():
    #currently only example from Wikipedia page
    kappa_wp = 0.210
    assert_almost_equal(fleiss_kappa(table1), kappa_wp, decimal=3)
def compute_fleiss_kappa(data):
    """
    Computes label agreement between crowd workers according to Fleiss' Kappa
    w.r.t. the relevance of a tweet to the topic and w.r.t. sentiment,
    separately. The matrix has M rows (tweets) and N columns (the labels from
    which one was selected).
    Fleiss' Kappa is known to be a conservative metric: it can yield low
    agreement even when the actual agreement is quite high; see
    https://link.springer.com/article/10.1007/s11135-014-0003-1#page-1

    Parameters
    ----------
    data: dict - {tid: [label1, label2, label3...]}.

    Returns
    -------
    float, float.
    Fleiss' Kappa between 0 (no agreement) - 1 (perfect agreement) over all
    labels.
    Fleiss' Kappa between 0 (no agreement) - 1 (perfect agreement) investigating
    annotator agreement w.r.t. relevance only (irrelevant vs. rest).

    """
    # http://www.tau.ac.il/~tsirel/dump/Static/knowino.org/wiki/Fleiss%27_kappa.html
    # print """Agreement levels:
    #         < 0 	No agreement
    #         0.0 - 0.19 	Poor agreement
    #         0.20 - 0.39 	Fair agreement
    #         0.40 - 0.59 	Moderate agreement
    #         0.60 - 0.79 	Substantial agreement
    #         0.80 - 1.00 	Almost perfect agreement"""
    ############################
    # 1. Overall Fleiss' Kappa #
    ############################
    mat = np.zeros((len(data), len(LABEL_MAPPING)))
    # For each tweet
    for idx, tid in enumerate(data):
        labels = Counter(data[tid])
        # Count which labels exist for a tweet
        for label in labels:
            # Get the column of the label (= column to update)
            label_col = NUMBER_MAPPING[label]
            # Update the column with the votes of the crowd workers
            mat[idx, label_col] += labels[label]
    kappa_total = fleiss_kappa(mat)
    print "Overall Fleiss kappa:", kappa_total

    ########################################
    # 2. Fleiss' Kappa for tweet relevance #
    ########################################
    # We compare relevant (i.e. assigning a sentiment label) vs. irrelevant
    mat = np.zeros((len(data), 2))
    # Indices of the columns in the matrix for the two labels
    rel_col = 0
    irrel_col = 1
    # For each tweet
    for idx, tid in enumerate(data):
        labels = Counter(data[tid])
        # Count which labels exist for a tweet
        for label in labels:
            # Update the column with the votes of the crowd workers
            # a) Relevant
            if label != "Irrelevant":
                mat[idx, rel_col] += labels[label]
            # b) Irrelevant
            else:
                mat[idx, irrel_col] += labels[label]
    kappa_relevance = fleiss_kappa(mat)
    print "Relevance Fleiss kappa:", kappa_relevance
    return kappa_total, kappa_relevance
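
# Hypothetical call (LABEL_MAPPING / NUMBER_MAPPING are module-level globals
# assumed by the function above; the labels here are illustrative):
#     data = {"t1": ["Positive", "Positive", "Irrelevant"],
#             "t2": ["Negative", "Negative", "Negative"]}
#     kappa_total, kappa_relevance = compute_fleiss_kappa(data)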
Example #16
def compute_overall_scores(coder_df, document_column, outcome_column,
                           coder_column):
    """
    Computes overall inter-rater reliability scores (Krippendorff's Alpha and Fleiss' Kappa). Allows for more than two \
    coders and code values. The input data must consist of a :py:class:`pandas.DataFrame` with the following columns:

        - A column with values that indicate the coder (like a name)
        - A column with values that indicate the document (like an ID)
        - A column with values that indicate the code value

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :return: A dictionary containing the scores
    :rtype: dict

    Usage::

        from pewanalytics.stats.irr import compute_overall_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_overall_scores(df, "document", "code", "coder")
        {'alpha': 0.5454545454545454, 'fleiss_kappa': 0.4545454545454544}

    """

    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values)
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None

    grouped = coder_df.groupby(document_column).count()
    complete_docs = grouped[grouped[coder_column] == len(
        coder_df[coder_column].unique())].index
    dataset = coder_df[coder_df[document_column].isin(complete_docs)]
    df = dataset.groupby([outcome_column,
                          document_column]).count()[[coder_column]]
    df = df.unstack(outcome_column).fillna(0)

    if len(df) > 0:
        kappa = fleiss_kappa(df)
    else:
        kappa = None

    return {"alpha": alpha, "fleiss_kappa": kappa}
Example #17
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1', '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1', '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    params = [None, None, None]
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except ValueError:
            params = [None, None, None]
    print(algo, params)
    # Commented out: this is a 4 x 5 cross-validation, so the strict
    # fold-count check below would always fail.
    #    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
    #        print("Failed for key %s" % key)
    #        return None

    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1],
                             s[0] + s[1],
                             p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform (inverse Fisher z)
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros_like(betas)  # keep the support stats below defined

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params

    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
Example #18
def calculate_inter_annotator_agreement(annotations):
    matrix = get_annotations_matrix(annotations)
    fleiss_kappa_score = fleiss_kappa(matrix)
    return fleiss_kappa_score
Example #19
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa, cohens_kappa
# Fleiss' kappa example table from Wikipedia: subject id, five category counts, P_i
table0 = np.asarray('''1  0  0  0  0  14  1.000
2   0   2   6   4   2   0.253
3   0   0   3   5   6   0.308
4   0   3   9   2   0   0.440
5   2   2   8   1   1   0.330
6   7   7   0   0   0   0.462
7   3   2   6   3   0   0.242
8   2   5   3   2   2   0.176
9   6   5   2   1   0   0.286
10  0   2   2   3   7   0.286'''.split(), float).reshape(10,-1)


Total = np.asarray("20 	28 	39 	21 	32".split('\t'), int)
Pj = np.asarray("0.143 	0.200 	0.279 	0.150 	0.229".split('\t'), float)
kappa_wp = 0.210
table1 = table0[:, 1:-1]


print fleiss_kappa(table1)
table4 = np.array([[20,5], [10, 15]])
print 'res', cohens_kappa(table4), 0.4 #wikipedia

table5 = np.array([[45, 15], [25, 15]])
print 'res', cohens_kappa(table5), 0.1304 #wikipedia

table6 = np.array([[25, 35], [5, 35]])
print 'res', cohens_kappa(table6), 0.2593  #wikipedia
print 'res', cohens_kappa(table6, weights=np.arange(2)), 0.2593  #wikipedia
t7 = np.array([[16, 18, 28],
               [10, 27, 13],
               [28, 20, 24]])
print cohens_kappa(t7, weights=[0, 1, 2])

table8 = np.array([[25, 35], [5, 35]])
Example #20
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    # Accept either the inner or the outer fold count; an `or` here would
    # always fail whenever NFOLDS_INNER != NFOLDS_OUTER.
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  #area under curve score.

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknown_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1],
                                    s[0] + s[1],
                                    p=0.5,
                                    alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform (inverse Fisher z)
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        #print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
        #                   rtol=0, atol=1e-02))

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros_like(betas)  # keep prop_non_zeros_mean defined below

    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknown_prob_one_sided'] = pvalue_recall0_unknown_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar

    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
Example #21
def test_fleiss_kappa():
    #currently only example from Wikipedia page
    kappa_wp = 0.210
    assert_almost_equal(fleiss_kappa(table1), kappa_wp, decimal=3)
    df_tmp = df_count_agreement[df_count_agreement.task_id == task].iloc[:, 1:3]
    df_merged = pd.merge_ordered(df_tmp,
                                 df_agreement_frame,
                                 fill_method='ffill',
                                 right_by="answer_csagreement",
                                 how="left")
    list_count_agreement.append(df_merged['size'].tolist())

df_kappaTable = pd.DataFrame(list_count_agreement)
df_disagree = df_kappaTable[0] + df_kappaTable[1]
df_agree = df_kappaTable[2] + df_kappaTable[3]

df_kappaTable = pd.concat([df_disagree, df_agree], axis=1)

ir.fleiss_kappa(df_kappaTable, method='fleiss')

#%%
df_kappaTable.columns = ['disagree', 'agree']

df_kappaTable

#%% [markdown]
# ## Evaluation effort

#%%
answer_no_dummy_dt

labels = ['DT', 'No DT']

answerDtMinutes = answer_no_dummy_dt.secondsToAnswer
Example #23
def analyze_interrater_reliability(phase, labels):
    results_csv = pd.DataFrame()
    labels_list = labels
    metadata_phase = pd.read_csv(
        f'{dataset_location}/metadata_phase_{phase}.csv')
    all_images = metadata_phase['image'].unique()

    # convert metadata table information to a more compact format to use for calculations
    full_array = np.full([5, len(all_images), len(labels)], True)
    for i, image in enumerate(all_images):
        j = 0
        for _, row in metadata_phase[metadata_phase['image'] ==
                                     image].iterrows():
            for k, label in enumerate(labels):
                if label.lower() in ['support devices', 'quality issue']:
                    full_array[j, i, k] = row[label]
                else:
                    # for labels that had a certainty chosen, only consider as present for Possibly or higher
                    full_array[j, i, k] = row[label] >= 3
            j += 1
        assert (j == 5)

    #calculate fleiss kappa for every label, as shown in part of Table 2
    for k in range(len(labels)):
        array_to_use = full_array

        # numpy.savetxt(f"rating_{labels[k].replace('/','_').replace(' ','_').lower()}_phase_{phase}.csv", convert_to_stats_table(array_to_use[:,:,k], 5), delimiter=",")
        # numpy.savetxt(f"rating_2_{labels[k].replace('/','_').replace(' ','_').lower()}_phase_{phase}.csv", array_to_use[:,:,k], delimiter=",")
        table_answers = convert_to_stats_table(array_to_use[:, :, k], 5)
        value = inter_rater.fleiss_kappa(table_answers, method='fleiss')

        #calculates the Fleiss Kappa standard error using the equation from the original paper (Fleiss, 1971)
        se = fleiss_kappa_standard_error(table_answers)

        new_row = {
            'label': labels_list[k],
            'title': 'Fleiss Kappa',
            'value': value
        }
        results_csv = results_csv.append(new_row, ignore_index=True)
        new_row = {
            'label': labels_list[k],
            'title': 'Fleiss Kappa standard error',
            'value': se
        }
        results_csv = results_csv.append(new_row, ignore_index=True)

        #calculate how much would the Fleiss Kappa be without the answers for each specific chest x-ray, for understanding what cases were the worst for that label
        for trial_index, _ in enumerate(all_images):
            value = inter_rater.fleiss_kappa(convert_to_stats_table(
                np.delete(array_to_use[:, :, k], trial_index, axis=1), 5),
                                             method='fleiss')
            new_row = {
                'label': labels_list[k],
                'trial': trial_index,
                'title': 'Fleiss Kappa (except trial)',
                'value': value
            }
            results_csv = results_csv.append(new_row, ignore_index=True)

    #get IoU for chest bounding boxes, used to calculate the numbers presented in Technical Validation > Validation Labels > Chest bounding boxes
    for image_index, image in enumerate(all_images):
        this_case = metadata_phase[metadata_phase['image'] == image]
        all_chest_boxes = []
        for id in this_case['id'].values:
            chest_box_table = pd.read_csv(
                f'{dataset_location}/{id}/chest_bounding_box.csv')
            chest_box_coordinates = chest_box_table.values[0]
            assert (len(chest_box_coordinates) == 4)
            all_chest_boxes.append(chest_box_coordinates)
        for index_1 in range(len(all_chest_boxes)):
            for index_2 in range(len(all_chest_boxes)):
                if index_1 != index_2:
                    value = get_iou([all_chest_boxes[index_1]],
                                    [all_chest_boxes[index_2]], create_box)
                    new_row = {
                        'trial': image_index,
                        'title': 'Chest Box IoU',
                        'value': value
                    }
                    results_csv = results_csv.append(new_row,
                                                     ignore_index=True)

    #get IoU for drawn ellipses, used to calculate part of Table 2
    for k in range(len(labels_list)):
        print(labels_list[k])
        for image_index, image in enumerate(all_images):
            ellipses_iou_k = []
            this_case = metadata_phase[metadata_phase['image'] == image]
            ellipses = []
            for id in this_case['id'].values:

                ellipse_table = pd.read_csv(
                    f'{dataset_location}/{id}/anomaly_location_ellipses.csv')

                #only use labels with certainty Possibly or higher
                ellipse_table = ellipse_table[ellipse_table['certainty'] > 2]
                #only use the currently selected label
                ellipse_table = ellipse_table[ellipse_table[labels_list[k]]]

                if len(ellipse_table) > 0:
                    ellipses.append(
                        ellipse_table[['xmin', 'ymin', 'xmax', 'ymax']].values)
                else:
                    ellipses.append([])
            for user_index in range(len(ellipses)):
                # do IoU for label BBox for every pairs of users who drew at least one ellipse for this label
                for user_index_2 in range(len(ellipses)):
                    if user_index_2 != user_index:
                        if len(ellipses[user_index]) > 0 and len(
                                ellipses[user_index_2]) > 0:
                            value = get_iou(ellipses[user_index],
                                            ellipses[user_index_2],
                                            create_ellipse)
                            ellipses_iou_k.append(value)
            # calculates the average IoU for all the readings of this specific chest x-ray and label
            if len(ellipses_iou_k) > 0:
                average_iou = np.mean(ellipses_iou_k)
                new_row = {
                    'label': labels_list[k],
                    'trial': image_index,
                    'title': 'Ellipse IoU',
                    'value': average_iou
                }
                results_csv = results_csv.append(new_row, ignore_index=True)

    results_csv.to_csv(f'interrater_phase_{phase}.csv', index=False)
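
# `fleiss_kappa_standard_error` is a project-local helper; a hypothetical
# sketch consistent with the chance-agreement variance in Fleiss (1971),
# where `table` is the subjects x categories count matrix:
import numpy as np

def fleiss_kappa_standard_error(table):
    table = np.asarray(table, dtype=float)
    N = table.shape[0]                  # number of subjects
    n = table[0].sum()                  # ratings per subject (assumed constant)
    p_j = table.sum(axis=0) / (N * n)   # overall category proportions
    q_j = 1.0 - p_j
    s = np.sum(p_j * q_j)
    var = 2.0 / (N * n * (n - 1)) * (s ** 2 - np.sum(p_j * q_j * (q_j - p_j))) / s ** 2
    return np.sqrt(var)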
Example #24
def reducer(key, values):
    global N_COMP, N_FOLDS
    # N_FOLDS is the number of true folds (not the number of resamplings)
    # key : string of intermediary key
    # load returns the dict corresponding to a mapper output; items need to be loaded.
    # Avoid taking fold 0 into account.
    values = [item.load() for item in values[1:]]

    # Load components: each file is a 4096 x N_COMP matrix.
    # We stack them on the third dimension (folds).
    components = np.dstack([item["components"] for item in values])
    # Thresholded components (list of tuples (comp, threshold))
    thresh_components = np.empty(components.shape)
    thresholds = np.empty((N_COMP, N_FOLDS))
    for l in range(N_FOLDS):
        for k in range(N_COMP):
            thresh_comp, t = array_utils.arr_threshold_from_norm2_ratio(
                components[:, k, l], .99)
            thresh_components[:, k, l] = thresh_comp
            thresholds[k, l] = t
    frobenius_train = np.vstack([item["frobenius_train"] for item in values])
    frobenius_test = np.vstack([item["frobenius_test"] for item in values])
    l0 = np.vstack([item["l0"] for item in values])
    l1 = np.vstack([item["l1"] for item in values])
    l2 = np.vstack([item["l2"] for item in values])
    tv = np.vstack([item["tv"] for item in values])
    evr_train = np.vstack([item["evr_train"] for item in values])
    evr_test = np.vstack([item["evr_test"] for item in values])
    times = [item["time"] for item in values]

    # Average precision/recall across folds for each component
    av_frobenius_train = frobenius_train.mean(axis=0)
    av_frobenius_test = frobenius_test.mean(axis=0)
    av_evr_train = evr_train.mean(axis=0)
    av_evr_test = evr_test.mean(axis=0)
    av_l0 = l0.mean(axis=0)
    av_l1 = l1.mean(axis=0)
    av_l2 = l2.mean(axis=0)
    av_tv = tv.mean(axis=0)

    # Compute correlations of components between all folds
    n_corr = N_FOLDS * (N_FOLDS - 1) // 2  # integer: used as an array dimension
    correlations = np.zeros((N_COMP, n_corr))
    for k in range(N_COMP):
        R = np.corrcoef(np.abs(components[:, k, :].T))
        # Extract interesting coefficients (upper-triangle)
        correlations[k] = R[np.triu_indices_from(R, 1)]

    # Transform to z-score
    Z = 1. / 2. * np.log((1 + correlations) / (1 - correlations))
    # Average for each component
    z_bar = np.mean(Z, axis=1)
    # Transform back to average correlation for each component
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # Compute fleiss_kappa and DICE on thresholded components
    fleiss_kappas = np.empty(N_COMP)
    dice_bars = np.empty(N_COMP)
    for k in range(N_COMP):
        # One component across folds
        thresh_comp = thresh_components[:, k, :]
        try:
            # Compute fleiss kappa statistics
            # The "raters" are the folds and we have 3 variables:
            #  - number of null coefficients
            #  - number of > 0 coefficients
            #  - number of < 0 coefficients
            # We build a (N_FOLDS, 3) table
            thresh_comp_signed = np.sign(thresh_comp)
            table = np.zeros((N_FOLDS, 3))
            table[:, 0] = np.sum(thresh_comp_signed == 0, 0)
            table[:, 1] = np.sum(thresh_comp_signed == 1, 0)
            table[:, 2] = np.sum(thresh_comp_signed == -1, 0)
            fleiss_kappa_stat = fleiss_kappa(table)
        except:
            fleiss_kappa_stat = 0.
        fleiss_kappas[k] = fleiss_kappa_stat
        try:
            # Pair-wise DICE coefficients (there are as many as the
            # pair-wise correlations)
            thresh_comp_n0 = thresh_comp != 0
            # Index of lines (folds) to use
            ij = [[i, j] for i in xrange(N_FOLDS)
                  for j in xrange(i + 1, N_FOLDS)]
            num = [
                np.sum(thresh_comp[idx[0], :] == thresh_comp[idx[1], :])
                for idx in ij
            ]
            denom = [(np.sum(thresh_comp_n0[idx[0], :]) + \
                      np.sum(thresh_comp_n0[idx[1], :]))
                     for idx in ij]
            dices = np.array([float(num[i]) / denom[i] for i in range(n_corr)])
            dice_bar = dices.mean()
        except:
            dice_bar = 0.
        dice_bars[k] = dice_bar

    scores = OrderedDict((
        ('model', key[0]),
        ('global_pen', key[1]),
        ('tv_ratio', key[2]),
        ('l1_ratio', key[3]),
        ('frobenius_train', av_frobenius_train[0]),
        ('frobenius_test', av_frobenius_test[0]),
        ('correlation_0', r_bar[0]),
        ('correlation_1', r_bar[1]),
        ('correlation_2', r_bar[2]),
        ('correlation_mean', np.mean(r_bar)),
        ('kappa_0', fleiss_kappas[0]),
        ('kappa_1', fleiss_kappas[1]),
        ('kappa_2', fleiss_kappas[2]),
        ('kappa_mean', np.mean(fleiss_kappas)),
        ('dice_bar_0', dice_bars[0]),
        ('dice_bar_1', dice_bars[1]),
        ('dice_bar_2', dice_bars[2]),
        ('dice_bar_mean', np.mean(dice_bars)),
        ('evr_train_0', av_evr_train[0]),
        ('evr_train_1', av_evr_train[1]),
        ('evr_train_2', av_evr_train[2]),
        ('evr_test_0', av_evr_test[0]),
        ('evr_test_1', av_evr_test[1]),
        ('evr_test_2', av_evr_test[2]),
        ('l0_0', av_l0[0]), ('l0_1', av_l0[1]), ('l0_2', av_l0[2]),
        ('l1_0', av_l1[0]), ('l1_1', av_l1[1]), ('l1_2', av_l1[2]),
        ('l2_0', av_l2[0]), ('l2_1', av_l2[1]), ('l2_2', av_l2[2]),
        ('tv_0', av_tv[0]), ('tv_1', av_tv[1]), ('tv_2', av_tv[2]),
        ('time', np.mean(times))))

    return scores