Example #1
def scores(key, paths, config):
    import mapreduce
    print(key)
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under the ROC curve
    #betas = np.hstack([item["beta"] for item in values]).T
    # threshold betas to compute fleiss_kappa and DICE
    #betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in range(betas.shape[0])])
    # Compute p-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1],
                                    s[0] + s[1],
                                    p=0.5,
                                    alternative='greater')
    scores = OrderedDict()
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    #scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
    #                               float(np.prod(betas.shape))
    scores['param_key'] = key
    return scores
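
The `scores` mappers in these examples rely on names defined at module level in the original scripts and not shown in the snippets (`mapreduce`, `array_utils`, `fleiss_kappa`, `penalty_start`, the `NFOLDS_*`/`NRNDPERMS` constants). A minimal sketch of the scientific-Python imports they appear to assume; the exact import block in the source may differ, and `binom_test` is the older SciPy name (recent releases expose `scipy.stats.binomtest` instead):

# Assumed module-level imports for the scores/reducer snippets (sketch only).
import os
import numpy as np
import pandas as pd
import scipy.stats
from collections import OrderedDict
from scipy.stats import binom_test  # newer SciPy: scipy.stats.binomtest
from sklearn.metrics import (precision_recall_fscore_support, recall_score,
                             roc_auc_score, r2_score)
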
Example #2
def scores(key, paths, config, ret_y=False):
    import mapreduce
    print(key)
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    #prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    #prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    # AUC computed from hard predictions, since proba_pred is commented out above
    auc = roc_auc_score(y_true, y_pred)
    betas = np.hstack([item["beta"] for item in values]).T
    # Compute p-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1],
                                    s[0] + s[1],
                                    p=0.5,
                                    alternative='greater')
    scores = OrderedDict()
    try:
        c = float(key[0])

        scores['c'] = c
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas)) / \
                                    float(np.prod(betas.shape))
    scores['param_key'] = key
    if ret_y:
        scores["y_true"], scores["y_pred"] = y_true, y_pred
    return scores
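
Example #2 feeds the hard 0/1 predictions to `roc_auc_score` because the probability outputs are commented out; for binary labels this collapses the AUC to balanced accuracy instead of a ranking-based score. A toy sketch with made-up data illustrating the difference:

# Made-up data: AUC from probabilities vs. AUC from thresholded labels.
import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1, 1, 0])
prob_pred = np.array([0.2, 0.55, 0.9, 0.6, 0.45, 0.4])  # scores preserve ranking
y_pred = (prob_pred >= 0.5).astype(int)                  # hard labels

print(roc_auc_score(y_true, prob_pred))  # ranking-based AUC (~0.89 here)
print(roc_auc_score(y_true, y_pred))     # equals balanced accuracy (~0.67 here)
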
Example #3
def scores(key, paths, config):
    key_parts = key.split("_")
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1],
                             s[0] + s[1],
                             p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        # keep the downstream code runnable if thresholding fails
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros(betas.shape)

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    return scores
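
Examples #3, #5, #7, #10 and #11 build a voxel-by-category count table over the signs {0, +1, -1} of the thresholded weights across folds and pass it to `fleiss_kappa`. The import is not shown in the snippets; the call matches `statsmodels.stats.inter_rater.fleiss_kappa`, which is assumed here. A toy sketch of the table construction on hypothetical sign maps:

# Toy sketch: agreement table over sign categories across CV folds, as fed to
# fleiss_kappa above. The statsmodels import is an assumption, not shown in the source.
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa

beta_signed = np.array([[1, 0, -1, 1],    # fold 1 (hypothetical sign map)
                        [1, 0, -1, 0],    # fold 2
                        [1, 1, -1, 1]])   # fold 3
table = np.zeros((beta_signed.shape[1], 3))
table[:, 0] = np.sum(beta_signed == 0, axis=0)   # folds voting "zero"
table[:, 1] = np.sum(beta_signed == 1, axis=0)   # folds voting "positive"
table[:, 2] = np.sum(beta_signed == -1, axis=0)  # folds voting "negative"
print(fleiss_kappa(table))  # each row sums to the number of folds (raters)
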
Example #4
def scores(key, paths, config):
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1],
                             s[0] + s[1],
                             p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    return scores
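
The correlation-averaging step shared by Examples #3 to #5 (Fisher z-transformation, mean, back-transform) is the standard way to average correlation coefficients; `np.arctanh`/`np.tanh` express the same computation more compactly. A small sketch with hypothetical values:

# Averaging correlations via Fisher's z; equivalent to the explicit log/exp
# formulas used in the examples.
import numpy as np

R = np.array([0.80, 0.65, 0.72])  # hypothetical pairwise beta correlations
z_bar = np.mean(0.5 * np.log((1 + R) / (1 - R)))
r_bar_explicit = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
r_bar_compact = np.tanh(np.mean(np.arctanh(R)))
assert np.isclose(r_bar_explicit, r_bar_compact)
print(r_bar_compact)
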
Example #5
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1', '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1', '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    params = [None, None, None]
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except ValueError:
            params = [None, None, None]
    print(algo, params)
    # Commented out: with a 4 x 5 cross-validation this check always fails
    #    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
    #        print("Failed for key %s" % key)
    #        return None

    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)

    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]

    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1],
                             s[0] + s[1],
                             p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        # keep the downstream code runnable if thresholding fails
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros(betas.shape)

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params

    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
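
Examples #5 and #8 parse keys such as 'enettv_0.1_0.5_0.1' into an algorithm name plus numeric parameters. A self-contained sketch of that parsing; the helper name `parse_key` is illustrative and not part of the original code:

# Illustrative helper (not in the original scripts): split a parameter key like
# 'enettv_0.1_0.5_0.1' into (algo, [a, l1_ratio, tv_ratio]).
def parse_key(key, algo_idx=0):
    parts = key.split("_")
    algo = parts.pop(algo_idx) if algo_idx is not None else None
    try:
        params = [float(p) for p in parts]
    except ValueError:
        params = [None] * len(parts)
    return algo, params

print(parse_key("enettv_0.1_0.5_0.1"))  # ('enettv', [0.1, 0.5, 0.1])
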
Example #6
def reducer(key, values):
    # key : string of intermediary key
    # load returns a dict corresponding to the mapper output; entries need to be loaded.
    # DEBUG
    import mapreduce as GLOBAL
    criteria = {'recall_mean': [np.argmax, np.max],
                'min_recall': [np.argmax, np.max],
                'accuracy': [np.argmax, np.max]}
    output_summary = GLOBAL.OUTPUT_SUMMARY
    output_path = GLOBAL.OUTPUT_PATH
    map_output = GLOBAL.MAP_OUTPUT
    roi = GLOBAL.ROI
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi,
                        map_output)
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../" + output_path
    if not os.path.exists(OUTPUT):
        os.makedirs(OUTPUT)
    params = GLOBAL.PARAMS
    keys = ['_'.join(str(e) for e in a) for a in params]
    compt = 0
    print "Model Construction, first cross-validation"
    for key in keys:
        if not os.path.isfile(OUTPUT + "/perms_selection_" + key + ".npz"):
            print "key: ", key
            paths_dCV_all = [INPUT % (perm, key) \
                    for perm in xrange(NFOLDS * NFOLDS * NRNDPERMS)]
            idx_dCV_blocks = range(0,
                             (NFOLDS * NFOLDS * NRNDPERMS) + NFOLDS * NFOLDS,
                             NFOLDS * NFOLDS)
            permutation_perms = np.zeros(NRNDPERMS * NFOLDS)
            n_fold_perms = np.zeros(NRNDPERMS * NFOLDS)
            parameters_perms = np.zeros(NRNDPERMS * NFOLDS, dtype='a50')
            recall_0_perms = np.zeros(NRNDPERMS * NFOLDS)
            recall_1_perms = np.zeros(NRNDPERMS * NFOLDS)
            min_recall_perms = np.zeros(NRNDPERMS * NFOLDS)
            recall_mean_perms = np.zeros(NRNDPERMS * NFOLDS)
            accuracy_perms = np.zeros(NRNDPERMS * NFOLDS)
            compt = 0
            for perm in xrange(NRNDPERMS):
                print "perm: ", perm
                paths_dCV_blocks = paths_dCV_all[idx_dCV_blocks[perm]:\
                                                idx_dCV_blocks[perm + 1]]
                idx_fold_blocks = range(0, NFOLDS * NFOLDS + NFOLDS, NFOLDS)
                # for each outer fold
                for fold in xrange(0, NFOLDS):
                    path_fold_blocks = paths_dCV_blocks[idx_fold_blocks[fold]:\
                                                    idx_fold_blocks[fold + 1]]
                    values = [GLOBAL.OutputCollector(p) \
                                    for p in path_fold_blocks]
                    values = [item.load() for item in values]
                    n_fold = [item["nfold"] for item in values]
                    assert n_fold == ([fold for i in xrange(NFOLDS)])
                    y_true = [item["y_true"].ravel() for item in values]
                    y_true = np.hstack(y_true)
                    y_pred = [item["y_pred"].ravel() for item in values]
                    y_pred = np.hstack(y_pred)
                    prob_pred = [item["proba_pred"].ravel() for item in values]
                    prob_pred = np.hstack(prob_pred)
                    p, r, f, s = precision_recall_fscore_support(y_true,
                                                                 y_pred,
                                                                 average=None)
                    accuracy = (r[0] * s[0] + r[1] * s[1])
                    accuracy = accuracy.astype('int')
                    permutation_perms[compt] = perm
                    n_fold_perms[compt] = n_fold[0]
                    parameters_perms[compt] = key
                    recall_0_perms[compt] = r[0]
                    recall_1_perms[compt] = r[1]
                    min_recall_perms[compt] = np.minimum(r[0], r[1])
                    recall_mean_perms[compt] = r.mean()
                    accuracy_perms[compt] = accuracy / float(s[0] + s[1])
                    compt += 1
            print "compt = ", compt
            print "save", key
            np.savez_compressed(OUTPUT + "/perms_selection_" + key + ".npz",
                                permutation=permutation_perms,
                                n_fold=n_fold_perms,
                                parameters=parameters_perms,
                                recall_0=recall_0_perms,
                                recall_1=recall_1_perms,
                                min_recall=min_recall_perms,
                                recall_mean=recall_mean_perms,
                                accuracy=accuracy_perms)

    if not os.path.isfile(os.path.join(OUTPUT, output_summary)):
        print "Model Selection"
        perms = dict()
        scores = OrderedDict()
        scores['permutation'] = []
        scores['n_fold'] = []
        scores['parameters'] = []
        scores['recall_0'] = []
        scores['recall_1'] = []
        scores['min_recall'] = []
        scores['recall_mean'] = []
        scores['accuracy'] = []
        for i, key in enumerate(keys):
            print "key: ", key
            perms = np.load(OUTPUT + "/perms_selection_" + key + ".npz")
            for s in perms:
                scores[s] += perms[s].tolist()
        compt = 0
        scores_tab = pd.DataFrame(scores)
        perm_groups = scores_tab.groupby('permutation')
        for perm_val, perm_group in perm_groups:
            fold_groups = perm_group.groupby('n_fold')
            for fold_val, fold_group in fold_groups:
                scores_dCV = OrderedDict()
                scores_dCV['permutation'] = perm_val
                scores_dCV['n_fold'] = fold_val
                n_crit = 0
                for item, val in criteria.items():
                    n_crit += 1
                    scores_dCV['criteria_' + item] = item
                    loc_opt = val[0](fold_group[item])
                    value_opt = val[1](fold_group[item])
                    scores_dCV['value_opt_' + item] = value_opt
                    param_opt = fold_group.parameters[loc_opt]
                    scores_dCV['param_opt_' + item] = param_opt
                if compt == 0:
                    scores_select_model = pd.DataFrame(
                                                   columns=scores_dCV.keys())
                scores_select_model.loc[compt, ] = scores_dCV.values()
                compt += 1
        scores_select_model.to_csv(os.path.join(OUTPUT, output_summary),
                                   index=False)
    return {}
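
The model-selection loop at the end of Example #6 groups the per-fold scores by permutation and fold and, for each criterion, keeps the parameter set that maximizes it. The same selection step can be sketched with pandas `groupby`/`idxmax` on toy data (column names follow the example, values are made up):

# Toy sketch of selecting, per fold, the parameters maximizing a criterion.
import pandas as pd

scores_tab = pd.DataFrame({
    "n_fold":      [0, 0, 1, 1],
    "parameters":  ["0.1_0.5_0.5_0", "0.1_0.1_0.1_0.8", "0.1_0.5_0.5_0", "0.1_0.1_0.1_0.8"],
    "recall_mean": [0.62, 0.71, 0.66, 0.59],
})
best = scores_tab.loc[scores_tab.groupby("n_fold")["recall_mean"].idxmax(),
                      ["n_fold", "parameters", "recall_mean"]]
print(best)
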
Example #7
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    # skip keys whose fold count matches neither the inner- nor the outer-CV layout
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under the ROC curve

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0],
                                          s[0],
                                          1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1],
                                          s[1],
                                          prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0],
                                             s[0],
                                             0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1],
                                             s[1],
                                             0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1],
                                    s[0] + s[1],
                                    p=0.5,
                                    alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T

    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        #print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
        #                   rtol=0, atol=1e-02))

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        # keep the downstream code runnable if thresholding fails
        dice_bar = fleiss_kappa_stat = 0
        dices = []
        betas_t = np.zeros(betas.shape)

    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores[
        'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores[
        'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar

    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)

    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
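
The Dice-like statistic in Examples #3, #5 and #7 compares the signed supports of two thresholded weight maps: the numerator counts coordinates where both maps are non-zero and agree in sign, the denominator is the sum of the two support sizes (note there is no factor of 2 in the numerator, unlike the textbook Dice coefficient). A toy sketch on hypothetical sign vectors:

# Pairwise agreement between two signed, thresholded beta maps (made-up data).
import numpy as np

A = np.array([1, 0, -1, 1, 0])   # sign map of fold i
B = np.array([1, 1, -1, 0, 0])   # sign map of fold j
dice = float(np.sum((A == B)[(A != 0) & (B != 0)])) / (np.sum(A != 0) + np.sum(B != 0))
print(dice)  # 2 sign agreements on the joint support, supports of size 3 and 3 -> 2/6
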
Example #8
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1', '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1', '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    params = [None, None, None]
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except ValueError:
            params = [None, None, None]
    print(algo, params)
    # skip keys whose fold count matches neither the inner- nor the outer-CV layout
    if (len(paths) != NFOLDS_INNER) and (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None

    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None

    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        y_true, y_pred)
    betas = np.hstack([item["beta"] for item in values]).T
    # threshold betas to compute fleiss_kappa and DICE
    betas_t = np.vstack([
        array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
        for i in range(betas.shape[0])
    ])
    # Assemble scores
    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['slope'] = slope
    scores['intercept'] = intercept
    scores['r_value'] = r_value
    scores['p_value'] = p_value
    scores['std_err'] = std_err
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
                                    float(np.prod(betas.shape))
    scores['param_key'] = key
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))

    return scores
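
Example #8 scores a regression run by fitting a straight line between true and predicted targets with `scipy.stats.linregress`. A minimal standalone sketch on made-up data:

# Minimal sketch of linregress-based scoring of regression outputs (made-up data).
import numpy as np
import scipy.stats

y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_pred = np.array([1.2, 1.9, 3.4, 3.8, 5.1])
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(y_true, y_pred)
print(slope, intercept, r_value ** 2)  # r_value**2 is the R^2 of the fit
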
Example #9
def reducer(key, values):
    import mapreduce as GLOBAL
    # key : string of intermediary key
    # load returns a dict corresponding to the mapper output; entries need to be loaded.
    # Compute sd, i.e. compute results on each fold
    roi = GLOBAL.ROI
    criteria = {
        'recall_mean': [np.argmax, np.max],
        'min_recall': [np.argmax, np.max],
        'accuracy': [np.argmax, np.max]
    }
    output_selection = GLOBAL.OUTPUT_SELECTION
    output_summary = GLOBAL.OUTPUT_SUMMARY
    map_output = GLOBAL.MAP_OUTPUT
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi, map_output)
    INPUT = BASE + "/%i/%s"
    penalty_start = GLOBAL.PENALTY_START
    prob_class1 = GLOBAL.PROB_CLASS1
    params = GLOBAL.PARAMS
    # load all keys (sets of parameters)
    keys = ['_'.join(str(e) for e in a) for a in params]
    compt = 0
    if not os.path.isfile(output_selection):
        print "Model Construction, first cross-validation"
        # loop for the selection of the model
        for fold in xrange(0, NFOLDS + 1):  # outer folds
            # inner folds (NFOLDS) associated to the outer fold
            idx_block = range(fold * (NFOLDS + 1),
                              (fold + 1) * (NFOLDS + 1) - 1)
            for key in keys:
                # paths of the map results of all inner folds associated to
                # a key and an outer fold
                paths_dCV = [INPUT % (idx, key) for idx in idx_block]
                scores_CV = OrderedDict()
                # get values
                values = [GLOBAL.OutputCollector(p) for p in paths_dCV]
                values = [item.load() for item in values]
                n_fold = [item["n_fold"] for item in values]
                assert n_fold == ([fold for i in xrange(NFOLDS)])
                recall_mean_std = np.std([np.mean(
                    precision_recall_fscore_support(
                    item["y_true"].ravel(), item["y_pred"])[1]) \
                    for item in values]) \
                    / np.sqrt(len(values))
                recall = [
                    precision_recall_fscore_support(item["y_true"].ravel(),
                                                    item["y_pred"].ravel(),
                                                    average=None)[1]
                    for item in values
                ]
                support = [
                    precision_recall_fscore_support(item["y_true"].ravel(),
                                                    item["y_pred"].ravel(),
                                                    average=None)[3]
                    for item in values
                ]
                accuracy_std = np.std([((recall[i][0] * support[i][0] + \
                         recall[i][1] * support[i][1]) \
                                 / (float(support[i][0] + support[i][1]))) \
                                 for i in xrange(len(values))]) \
                         / np.sqrt(len(values))
                y_true = [item["y_true"].ravel() for item in values]
                y_true = np.hstack(y_true)
                y_pred = [item["y_pred"].ravel() for item in values]
                y_pred = np.hstack(y_pred)
                prob_pred = [item["proba_pred"].ravel() for item in values]
                prob_pred = np.hstack(prob_pred)
                p, r, f, s = precision_recall_fscore_support(y_true,
                                                             y_pred,
                                                             average=None)
                auc = roc_auc_score(y_true, prob_pred)
                betas = [item["beta"][penalty_start:] for item in values]
                betas = np.hstack(betas).T
                n_ite = np.mean(np.array([item["n_iter"] for item in values]))
                R = np.corrcoef(betas)
                beta_cor_mean = np.mean(R[np.triu_indices_from(R, 1)])
                success = r * s
                success = success.astype('int')
                accuracy = (r[0] * s[0] + r[1] * s[1])
                accuracy = accuracy.astype('int')
                pvalue_class0 = binom_test(success[0], s[0], 1 - prob_class1)
                pvalue_class1 = binom_test(success[1], s[1], prob_class1)
                pvalue_accuracy = binom_test(accuracy, s[0] + s[1], p=0.5)
                k = key.split('_')
                a, l1 = float(k[0]), float(k[1])
                l2, tv = float(k[2]), float(k[3])
                left = float(1 - tv)
                if left == 0:
                    left = 1.
                scores_CV['n_fold'] = n_fold[0]
                scores_CV['parameters'] = key
                scores_CV['a'] = a
                scores_CV['l1'] = l1
                scores_CV['l2'] = l2
                scores_CV['tv'] = tv
                scores_CV['recall_0'] = r[0]
                scores_CV['pvalue_recall_0'] = pvalue_class0
                scores_CV['recall_1'] = r[1]
                scores_CV['pvalue_recall_1'] = pvalue_class1
                scores_CV['min_recall'] = np.minimum(r[0], r[1])
                scores_CV['max_pvalue_recall'] = np.maximum(
                    pvalue_class0, pvalue_class1)
                scores_CV['recall_mean'] = r.mean()
                scores_CV['recall_mean_std'] = recall_mean_std
                scores_CV['accuracy'] = accuracy / float(s[0] + s[1])
                scores_CV['pvalue_accuracy'] = pvalue_accuracy
                scores_CV['accuracy_std'] = accuracy_std
                scores_CV['precision_0'] = p[0]
                scores_CV['precision_1'] = p[1]
                scores_CV['precision_mean'] = p.mean()
                scores_CV['f1_0'] = f[0]
                scores_CV['f1_1'] = f[1]
                scores_CV['f1_mean'] = f.mean()
                scores_CV['support_0'] = s[0]
                scores_CV['support_1'] = s[1]
                scores_CV['n_ite_mean'] = n_ite
                scores_CV['auc'] = auc
                scores_CV['beta_cor_mean'] = beta_cor_mean
                scores_CV['prop_non_zeros_mean'] = float(np.count_nonzero(betas)) \
                                                / float(np.prod(betas.shape))
                # store results in dataframe scores_tab
                if compt == 0:
                    scores_tab = pd.DataFrame(columns=scores_CV.keys())
                scores_tab.loc[compt, ] = scores_CV.values()
                compt += 1
        print "save results of the inner cross-validation : ", output_selection
        scores_tab.to_csv(output_selection, index=False)

    if not os.path.isfile(output_summary):
        print "Model Selection"
        scores_tab = pd.read_csv(output_selection)
        fold_groups = scores_tab.groupby('n_fold')
        compt = 0
        for fold_val, fold_group in fold_groups:
            scores_dCV = OrderedDict()
            scores_dCV['n_fold'] = fold_val
            # for each outer fold and each criterion, select the set of
            # parameters that optimizes the criterion
            for item, val in criteria.items():
                scores_dCV['criteria_' + item] = item
                loc_opt = val[0](fold_group[item])
                value_opt = val[1](fold_group[item])
                scores_dCV['value_opt_' + item] = value_opt
                param_opt = fold_group.parameters[loc_opt]
                a_opt = fold_group.a[loc_opt]
                l1_opt = fold_group.l1[loc_opt]
                tv_opt = fold_group.tv[loc_opt]
                scores_dCV['param_opt_' + item] = param_opt
                scores_dCV['a_opt_' + item] = a_opt
                scores_dCV['l1_opt_' + item] = l1_opt
                scores_dCV['tv_opt_' + item] = tv_opt
            # store results in dataframe scores_select_model
            if compt == 0:
                scores_select_model = pd.DataFrame(columns=scores_dCV.keys())
            scores_select_model.loc[compt, ] = scores_dCV.values()
            compt += 1
        print "save results of the model selection : ", output_summary
        scores_select_model.to_csv(output_summary, index=False)
    return {}
Example #10
def reducer_(key, values):
    # key : string of intermediary key
    # load returns a dict corresponding to the mapper output; entries need to be loaded.
    # DEBUG
    import glob, mapreduce
    BASE = "/neurospin/brainomics/2013_adni/ADAS11-MCIc-CTL/rndperm"
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../results/rndperm"
    keys = ["0.001_0.3335_0.3335_0.333_-1",  "0.001_0.5_0_0.5_-1",  "0.001_0.5_0.5_0_-1",  "0.001_1_0_0_-1"]
    for key in keys:
        #key = keys[0]
        paths_5cv_all = [INPUT % (perm, key) for perm in xrange(NFOLDS * NRNDPERMS)]
        idx_5cv_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
        cpt = 0
        qc = dict()
        r2_perms = np.zeros(NRNDPERMS)
        corr_perms = np.zeros(NRNDPERMS)
        r_bar_perms = np.zeros(NRNDPERMS)
        fleiss_kappa_stat_perms = np.zeros(NRNDPERMS)
        dice_bar_perms = np.zeros(NRNDPERMS)
        for perm_i in xrange(len(idx_5cv_blocks)-1):
            paths_5cv = paths_5cv_all[idx_5cv_blocks[perm_i]:idx_5cv_blocks[perm_i+1]]
            for p in paths_5cv:
                if os.path.exists(p) and not(p in qc):
                    if p in qc:
                        qc[p] += 1
                    else:
                        qc[p] = 1
                    cpt += 1
            #
            values = [mapreduce.OutputCollector(p) for p in paths_5cv]
            values = [item.load() for item in values]
            y_true = [item["y_true"].ravel() for item in values]
            y_pred = [item["y_pred"].ravel() for item in values]
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            r2 = r2_score(y_true, y_pred)
            corr = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]
            betas = np.hstack([item["beta"] for item in values]).T
            #
            ## Compute beta similarity measures
            #
            # Correlation
            R = np.corrcoef(betas)
            R = R[np.triu_indices_from(R, 1)]
            # Fisher z-transformation / average
            z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
            # back-transform
            r_bar = (np.exp(2 * z_bar) - 1) /  (np.exp(2 * z_bar) + 1)
            #
            # threshold betas to compute fleiss_kappa and DICE
            try:
                betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in xrange(betas.shape[0])])
                print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
                print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
                                   rtol=0, atol=1e-02)
                #
                # Compute fleiss kappa statistics
                beta_signed = np.sign(betas_t)
                table = np.zeros((beta_signed.shape[1], 3))
                table[:, 0] = np.sum(beta_signed == 0, 0)
                table[:, 1] = np.sum(beta_signed == 1, 0)
                table[:, 2] = np.sum(beta_signed == -1, 0)
                fleiss_kappa_stat = fleiss_kappa(table)
                #
                # Pairwise Dice coefficient
                beta_n0 = betas_t != 0
                ij = [[i, j] for i in xrange(5) for j in xrange(i+1, 5)]
                #print [[idx[0], idx[1]] for idx in ij]
                dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /\
                     (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                     for idx in ij])
            except:
                dice_bar = fleiss_kappa_stat = 0.
            #
            r2_perms[perm_i] = r2
            corr_perms[perm_i] = corr
            r_bar_perms[perm_i] = r_bar
            fleiss_kappa_stat_perms[perm_i] = fleiss_kappa_stat
            dice_bar_perms[perm_i] = dice_bar
        # END PERMS
        print "save", key
        np.savez_compressed(OUTPUT+"/perms_"+key+".npz",
                            r2=r2_perms, corr=corr_perms,
                            r_bar=r_bar_perms, fleiss_kappa=fleiss_kappa_stat_perms,
                            dice_bar=dice_bar_perms)
        #
        perms = dict()
        fig, axis = plt.subplots(len(keys), 4)#, sharex='col')
        for i, key in enumerate(keys):
            perms[key] = np.load(OUTPUT+"/perms_"+key+".npz")
            n, bins, patches = axis[i, 0].hist(perms[key]['r2'], 50, normed=1, histtype='stepfilled')
            axis[i, 0].set_title(key + "_r2")
            n, bins, patches = axis[i, 1].hist(perms[key]['r_bar'], 50, normed=1, histtype='stepfilled')
            axis[i, 1].set_title(key + "_r_bar")
            n, bins, patches = axis[i, 2].hist(perms[key]['fleiss_kappa'], 50, histtype='stepfilled')
            axis[i, 2].set_title(key + "_fleiss_kappa")
            n, bins, patches = axis[i, 3].hist(perms[key]['dice_bar'], 50)#, 50, normed=1, histtype='stepfilled')
            axis[i, 3].set_title(key + "_dice_bar")
        plt.show()

        l1l2tv, l1tv, l1l2, l1 = ["0.001_0.3335_0.3335_0.333_-1",  "0.001_0.5_0_0.5_-1",  
                             "0.001_0.5_0.5_0_-1",  "0.001_1_0_0_-1"]

        # Read true scores
        import pandas as pd
        true = pd.read_csv(os.path.join(BASE, "..", "ADAS11-MCIc-CTL.csv"))
        true = true[true.a == 0.001]
        true_l1l2tv = true[true.l1 == 0.3335].iloc[0]
        true_l1l2 = true[(true.l1 == 0.5) & (true.l2 == 0.5)].iloc[0]
        true_l1tv = true[(true.l1 == 0.5) & (true.tv == 0.5)].iloc[0]
        true_l1 = true[(true.l1 == 1.)].iloc[0]

        # pvals
        nperms = float(len(perms[l1]['r2']))
        from collections import OrderedDict
        pvals = OrderedDict()
        pvals["cond"] = ['l1', 'l1tv', 'l1l2', 'l1l2tv'] * 4 + \
                ['l1 vs l1tv'] * 4  + ['l1l2 vs l1l2tv'] * 4
        pvals["stat"] = ['r2'] * 4 + ['r_bar'] * 4 + ['fleiss_kappa'] * 4 + ['dice_bar'] * 4 +\
                ['r2', 'r_bar', 'fleiss_kappa', 'dice_bar'] * 2
        pvals["pval"] = [
            np.sum(perms[l1]['r2'] > true_l1["r2"]),
            np.sum(perms[l1tv]['r2'] > true_l1tv["r2"]),
            np.sum(perms[l1l2]['r2'] > true_l1l2["r2"]),
            np.sum(perms[l1l2tv]['r2'] > true_l1l2tv["r2"]),
    
            np.sum(perms[l1]['r_bar'] > true_l1["beta_r_bar"]),
            np.sum(perms[l1tv]['r_bar'] > true_l1tv["beta_r_bar"]),
            np.sum(perms[l1l2]['r_bar'] > true_l1l2["beta_r_bar"]),
            np.sum(perms[l1l2tv]['r_bar'] > true_l1l2tv["beta_r_bar"]),
    
            np.sum(perms[l1]['fleiss_kappa'] > true_l1["beta_fleiss_kappa"]),
            np.sum(perms[l1tv]['fleiss_kappa'] > true_l1tv["beta_fleiss_kappa"]),
            np.sum(perms[l1l2]['fleiss_kappa'] > true_l1l2["beta_fleiss_kappa"]),
            np.sum(perms[l1l2tv]['fleiss_kappa'] > true_l1l2tv["beta_fleiss_kappa"]),
    
            np.sum(perms[l1]['dice_bar'] > true_l1["beta_dice_bar"]),
            np.sum(perms[l1tv]['dice_bar'] > true_l1tv["beta_dice_bar"]),
            np.sum(perms[l1l2]['dice_bar'] > true_l1l2["beta_dice_bar"]),
            np.sum(perms[l1l2tv]['dice_bar'] > true_l1l2tv["beta_dice_bar"]),
    
            # l1 vs l1tv
            np.sum((perms[l1tv]['r2'] - perms[l1]['r2']) > (true_l1tv["r2"] - true_l1["r2"])),
            np.sum((perms[l1tv]['r_bar'] - perms[l1]['r_bar']) > (true_l1tv["beta_r_bar"] - true_l1["beta_r_bar"])),
            np.sum((perms[l1tv]['fleiss_kappa'] - perms[l1]['fleiss_kappa']) > (true_l1tv["beta_fleiss_kappa"] - true_l1["beta_fleiss_kappa"])),
            np.sum((perms[l1tv]['dice_bar'] - perms[l1]['dice_bar']) > (true_l1tv["beta_dice_bar"] - true_l1["beta_dice_bar"])),
    
            # l1l2 vs l1l2tv
            np.sum((perms[l1l2]['r2'] - perms[l1l2tv]['r2']) > (true_l1l2["r2"] - true_l1l2tv["r2"])),
            np.sum((perms[l1l2tv]['r_bar'] - perms[l1l2]['r_bar']) > (true_l1l2tv["beta_r_bar"] - true_l1l2["beta_r_bar"])),
            np.sum((perms[l1l2tv]['fleiss_kappa'] - perms[l1l2]['fleiss_kappa']) > (true_l1l2tv["beta_fleiss_kappa"] - true_l1l2["beta_fleiss_kappa"])),
            np.sum((perms[l1l2tv]['dice_bar'] - perms[l1l2]['dice_bar']) > (true_l1l2tv["beta_dice_bar"] - true_l1l2["beta_dice_bar"]))]

        pvals = pd.DataFrame(pvals)
        pvals["pval"] /= nperms
        pvals.to_csv(os.path.join(OUTPUT, "pvals_stats_permutations.csv"), index=False)
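
Examples #10 and #11 turn permutation scores into empirical one-sided p-values by counting how often a permuted statistic exceeds the statistic obtained on the unpermuted data, then dividing by the number of permutations. A compact sketch of that step with hypothetical numbers:

# Empirical one-sided permutation p-value (made-up inputs).
import numpy as np

perm_stats = np.array([0.48, 0.52, 0.55, 0.61, 0.50])  # statistic under permuted labels
true_stat = 0.60                                        # statistic on the real labels
pval = np.sum(perm_stats > true_stat) / float(len(perm_stats))
print(pval)  # one permutation out of five exceeds the true value -> 0.2
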
Example #11
def scores(key, paths, config, ret_y=False):
    import glob, mapreduce
    print key
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    recall_mean_std = np.std([
        np.mean(
            precision_recall_fscore_support(
                item["y_true"].ravel(), item["y_pred"])[1]) for item in values
    ]) / np.sqrt(len(values))
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under the ROC curve
    n_ite = None
    betas = np.hstack(
        [item["beta"][config['penalty_start']:, :] for item in values]).T
    ## Compute beta similarity measures
    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    print R
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in xrange(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        print np.allclose(np.sqrt(np.sum(betas_t**2, 1)) /
                          np.sqrt(np.sum(betas**2, 1)), [0.99] * 5,
                          rtol=0,
                          atol=1e-02)

        # Compute fleiss kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)

        # Pairwise Dice coefficient
        ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except Exception:
        dice_bar = fleiss_kappa_stat = 0.
        dices = []

    scores = OrderedDict()
    try:
        a, l1, l2, tv, k = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0: left = 1.
        scores['l1_ratio'] = float(l1) / left
        scores['k'] = k
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores['recall_mean_std'] = recall_mean_std
    scores['auc'] = auc
    #    scores['beta_cor_mean'] = beta_cor_mean
    scores['precision_0'] = p[0]
    scores['precision_1'] = p[1]
    scores['precision_mean'] = p.mean()
    scores['f1_0'] = f[0]
    scores['f1_1'] = f[1]
    scores['f1_mean'] = f.mean()
    scores['support_0'] = s[0]
    scores['support_1'] = s[1]
    #    scores['corr']= corr
    scores['beta_r'] = str(R)
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice'] = str(dices)
    scores['beta_dice_bar'] = dice_bar
    scores['n_ite'] = n_ite
    scores['param_key'] = key
    if ret_y:
        scores["y_true"], scores["y_pred"], scores[
            "prob_pred"] = y_true, y_pred, prob_pred
    return scores
def reducer(key, values):
    # key : string of intermediary key
    # load returns a dict corresponding to the mapper output; entries need to be loaded.
    # DEBUG
    import mapreduce as GLOBAL
    output_permutations = GLOBAL.OUTPUT_PERMUTATIONS
    map_output = GLOBAL.MAP_OUTPUT
    output_path = GLOBAL.OUTPUT_PATH
    roi = GLOBAL.ROI
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi,
                        map_output)
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../" + output_path
    if not os.path.exists(OUTPUT):
        os.makedirs(OUTPUT)
    criteria = GLOBAL.CRITERIA
    keys = ['_'.join(str(e) for e in a) for a in criteria]
    OK = 0
    # params = criteria = ['recall_mean', 'min_recall', 'max_pvalue_recall',
    #                     'accuracy', 'pvalue_accuracy']
    if not OK:
        for key in keys:
            print "key: ", key
            paths_CV_all = [INPUT % (perm, key) \
                    for perm in xrange(NFOLDS * NRNDPERMS)]
            idx_CV_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
            recall_0_perms = np.zeros(NRNDPERMS)
            recall_1_perms = np.zeros(NRNDPERMS)
            recall_mean_perms = np.zeros(NRNDPERMS)
            accuracy_perms = np.zeros(NRNDPERMS)
            auc_perms = np.zeros(NRNDPERMS)
            crit = key[0:len(key):2]
            if not os.path.isfile(OUTPUT + \
                                  "/perms_validation_" + crit + ".npz"):
                for perm in xrange(NRNDPERMS):
                    print "perm: ", perm
                    paths_CV_blocks = paths_CV_all[idx_CV_blocks[perm]:\
                                                    idx_CV_blocks[perm + 1]]
                    values = [GLOBAL.OutputCollector(p) \
                                for p in paths_CV_blocks]
                    values = [item.load() for item in values]
                    y_true = [item["y_true"].ravel() for item in values]
                    y_pred = [item["y_pred"].ravel() for item in values]
                    prob_pred = [item["proba_pred"].ravel() for item in values]
                    y_true = np.concatenate(y_true)
                    y_pred = np.concatenate(y_pred)
                    prob_pred = np.concatenate(prob_pred)
                    p, r, f, s = precision_recall_fscore_support(y_true,
                                                                 y_pred,
                                                                 average=None)
                    auc = roc_auc_score(y_true, prob_pred)
                    success = r * s
                    success = success.astype('int')
                    accuracy = (r[0] * s[0] + r[1] * s[1])
                    accuracy = accuracy.astype('int')
                    recall_0_perms[perm] = r[0]
                    recall_1_perms[perm] = r[1]
                    recall_mean_perms[perm] = r.mean()
                    accuracy_perms[perm] = accuracy / float(s[0] + s[1])
                    auc_perms[perm] = auc
                # END PERMS
                print "save", crit
                np.savez_compressed(OUTPUT + \
                                    "/perms_validation_" + crit + ".npz",
                                recall_0=recall_0_perms,
                                recall_1=recall_1_perms,
                                recall_mean=recall_mean_perms,
                                accuracy=accuracy_perms,
                                auc=auc_perms)
        OK = 1
    #pvals
    if  not os.path.isfile(os.path.join(OUTPUT, output_permutations)):
        print "Derive p-values"
        perms = dict()
        for i, key in enumerate(keys):
            print "crit: ", crit
            crit = key[0:len(key):2]
            perms[crit] = np.load(OUTPUT + \
                                    "/perms_validation_" + crit + ".npz")
        print keys
        [recall_mean, min_recall, accuracy] = [keys[0][0:len(keys[0]):2],
                                               keys[1][0:len(keys[1]):2],
                                               keys[2][0:len(keys[2]):2]]
        print [recall_mean, min_recall, accuracy]
        # Read true scores
        true = pd.read_csv(os.path.join(BASE, "..",
                                        "results_dCV_validation.csv"))
        true_recall_mean = true[true.params == recall_mean].iloc[0]
        true_min_recall = true[true.params == min_recall].iloc[0]
        true_accuracy = true[true.params == accuracy].iloc[0]
        # pvals corrected for multiple comparisons
        nperms = float(len(perms[recall_mean]['recall_0']))
        from collections import OrderedDict
        pvals = OrderedDict()
        #cond: criterion used to select the model
        pvals["cond"] = ['recall_mean'] * 5 + ['min_recall'] * 5 + \
                        ['accuracy'] * 5
        # stat: statistic associated with the p-value
        pvals["stat"] = ['recall_0', 'recall_1', 'recall_mean',
                         'accuracy', 'auc'] * 3
        pvals["pval"] = [
        np.sum(perms[recall_mean]['recall_0'] > true_recall_mean["recall_0"]),
        np.sum(perms[recall_mean]['recall_1'] > true_recall_mean["recall_1"]),
        np.sum(perms[recall_mean]['recall_mean'] > true_recall_mean["recall_mean"]),
        np.sum(perms[recall_mean]['accuracy'] > true_recall_mean["accuracy"]),
        np.sum(perms[recall_mean]['auc'] > true_recall_mean["auc"]),
    
        np.sum(perms[min_recall]['recall_0'] > true_min_recall["recall_0"]),
        np.sum(perms[min_recall]['recall_1'] > true_min_recall["recall_1"]),
        np.sum(perms[min_recall]['recall_mean'] > true_min_recall["recall_mean"]),
        np.sum(perms[min_recall]['accuracy'] > true_min_recall["accuracy"]),
        np.sum(perms[min_recall]['auc'] > true_min_recall["auc"]),
    
        np.sum(perms[accuracy]['recall_0'] > true_accuracy["recall_0"]),
        np.sum(perms[accuracy]['recall_1'] > true_accuracy["recall_1"]),
        np.sum(perms[accuracy]['recall_mean'] > true_accuracy["recall_mean"]),
        np.sum(perms[accuracy]['accuracy'] > true_accuracy["accuracy"]),
        np.sum(perms[accuracy]['auc'] > true_accuracy["auc"])]
    
        pvals = pd.DataFrame(pvals)
        pvals["pval"] /= float(nperms)
        pvals.to_csv(os.path.join(OUTPUT, output_permutations),
                     index=False)
    return {}