def scores(key, paths, config):
    import mapreduce
    print(key)
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under curve score.
    # betas = np.hstack([item["beta"] for item in values]).T
    # threshold betas to compute fleiss_kappa and DICE
    # betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
    #                      for i in range(betas.shape[0])])
    # Compute pvalue
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1], s[0] + s[1],
                                    p=0.5, alternative='greater')
    scores = OrderedDict()
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    # scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
    #     float(np.prod(betas.shape))
    scores['param_key'] = key
    return scores
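# Usage sketch (not from the original scripts): assuming a mapreduce results tree laid
# out as in the path comments below, e.g. "5cv/cv00/refit/<param_key>", one row of
# scores per parameter key could be aggregated into a DataFrame roughly as follows.
# The directory layout and helper name are assumptions for illustration only.
import os
import glob
import pandas as pd

def scores_summary_sketch(results_dir, config, n_folds=5):
    # Hypothetical helper: list parameter keys found under the first fold,
    # then call scores() on the matching paths of all folds.
    param_keys = [os.path.basename(p)
                  for p in glob.glob(os.path.join(results_dir, "cv00", "refit", "*"))]
    rows = []
    for key in param_keys:
        paths = [os.path.join(results_dir, "cv%02d" % fold, "refit", key)
                 for fold in range(n_folds)]
        rows.append(scores(key, paths, config))
    return pd.DataFrame(rows)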
def scores(key, paths, config, ret_y=False):
    import mapreduce
    print(key)
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    # prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    # prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, y_pred)  # area under curve score.
    betas = np.hstack([item["beta"] for item in values]).T
    # Compute pvalue
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1], s[0] + s[1],
                                    p=0.5, alternative='greater')
    scores = OrderedDict()
    try:
        c = float(key[0])
        scores['c'] = c
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas)) / \
        float(np.prod(betas.shape))
    scores['param_key'] = key
    if ret_y:
        scores["y_true"], scores["y_pred"] = y_true, y_pred
    return scores
def scores(key, paths, config):
    key_parts = key.split("_")
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None
    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)
    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)
    # Balanced accuracy (recall_mean) and AUC per CV split
    bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f],
                                average=None).mean()
                   for f in range(len(y_true_splits))]
    auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f])
                  for f in range(len(y_true_splits))]
    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1],
                             p=0.5, alternative='greater')
    # Betas' measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # Back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # Threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])])
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                         (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0
    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]
    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()
    return scores
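# Self-contained illustration of the pair-wise Dice statistic used above: two fold-wise
# weight maps agree on a voxel when their signs match within the intersection of their
# supports, and the count is normalised by the sum of the two support sizes. This is a
# reference sketch mirroring the in-function computation, not part of the original code.
import numpy as np

def dice_signed_support(beta_a, beta_b):
    a, b = np.sign(beta_a), np.sign(beta_b)
    agree = np.sum((a == b)[(a != 0) & (b != 0)])
    return float(agree) / (np.sum(a != 0) + np.sum(b != 0))

# Toy example: identical supports and signs give 0.5 with this normalisation
# (agreement count over the *sum* of both support sizes), as in the code above.
# dice_signed_support(np.array([1., 0., -2.]), np.array([0.5, 0., -1.]))  # -> 0.5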
def scores(key, paths, config):
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None
    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)
    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)
    # Balanced accuracy (recall_mean) and AUC per CV split
    bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f],
                                average=None).mean()
                   for f in range(len(y_true_splits))]
    auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f])
                  for f in range(len(y_true_splits))]
    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1],
                             p=0.5, alternative='greater')
    # Betas' measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # Back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    return scores
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    params = [None, None, None]
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except:
            params = [None, None, None]
    print(algo, params)
    # Commented out: this is a 4 x 5 cross-validation, so the check below wrongly
    # flags keys as failed.
    # if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
    #     print("Failed for key %s" % key)
    #     return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None
    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)
    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)
    # Balanced accuracy (recall_mean) and AUC per CV split
    bacc_splits = [recall_score(y_true_splits[f], y_pred_splits[f],
                                average=None).mean()
                   for f in range(len(y_true_splits))]
    auc_splits = [roc_auc_score(y_true_splits[f], prob_pred_splits[f])
                  for f in range(len(y_true_splits))]
    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1],
                             p=0.5, alternative='greater')
    # Betas' measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # Back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # Threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])])
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                         (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0
    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]
    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))
    return scores
def reducer(key, values):
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # DEBUG
    import mapreduce as GLOBAL
    criteria = {'recall_mean': [np.argmax, np.max],
                'min_recall': [np.argmax, np.max],
                'accuracy': [np.argmax, np.max]}
    output_summary = GLOBAL.OUTPUT_SUMMARY
    output_path = GLOBAL.OUTPUT_PATH
    map_output = GLOBAL.MAP_OUTPUT
    roi = GLOBAL.ROI
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi, map_output)
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../" + output_path
    if not os.path.exists(OUTPUT):
        os.makedirs(OUTPUT)
    params = GLOBAL.PARAMS
    keys = ['_'.join(str(e) for e in a) for a in params]
    compt = 0
    print("Model Construction, first cross-validation")
    for key in keys:
        if not os.path.isfile(OUTPUT + "/perms_selection_" + key + ".npz"):
            print("key: ", key)
            paths_dCV_all = [INPUT % (perm, key)
                             for perm in range(NFOLDS * NFOLDS * NRNDPERMS)]
            idx_dCV_blocks = range(0,
                                   (NFOLDS * NFOLDS * NRNDPERMS) + NFOLDS * NFOLDS,
                                   NFOLDS * NFOLDS)
            permutation_perms = np.zeros(NRNDPERMS * NFOLDS)
            n_fold_perms = np.zeros(NRNDPERMS * NFOLDS)
            parameters_perms = np.zeros(NRNDPERMS * NFOLDS, dtype='a50')
            recall_0_perms = np.zeros(NRNDPERMS * NFOLDS)
            recall_1_perms = np.zeros(NRNDPERMS * NFOLDS)
            min_recall_perms = np.zeros(NRNDPERMS * NFOLDS)
            recall_mean_perms = np.zeros(NRNDPERMS * NFOLDS)
            accuracy_perms = np.zeros(NRNDPERMS * NFOLDS)
            compt = 0
            for perm in range(NRNDPERMS):
                print("perm: ", perm)
                paths_dCV_blocks = paths_dCV_all[idx_dCV_blocks[perm]:
                                                 idx_dCV_blocks[perm + 1]]
                idx_fold_blocks = range(0, NFOLDS * NFOLDS + NFOLDS, NFOLDS)
                # for each outer fold
                for fold in range(0, NFOLDS):
                    path_fold_blocks = paths_dCV_blocks[idx_fold_blocks[fold]:
                                                        idx_fold_blocks[fold + 1]]
                    values = [GLOBAL.OutputCollector(p)
                              for p in path_fold_blocks]
                    values = [item.load() for item in values]
                    n_fold = [item["nfold"] for item in values]
                    assert n_fold == [fold for i in range(NFOLDS)]
                    y_true = [item["y_true"].ravel() for item in values]
                    y_true = np.hstack(y_true)
                    y_pred = [item["y_pred"].ravel() for item in values]
                    y_pred = np.hstack(y_pred)
                    prob_pred = [item["proba_pred"].ravel() for item in values]
                    prob_pred = np.hstack(prob_pred)
                    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                                 average=None)
                    accuracy = (r[0] * s[0] + r[1] * s[1])
                    accuracy = accuracy.astype('int')
                    permutation_perms[compt] = perm
                    n_fold_perms[compt] = n_fold[0]
                    parameters_perms[compt] = key
                    recall_0_perms[compt] = r[0]
                    recall_1_perms[compt] = r[1]
                    min_recall_perms[compt] = np.minimum(r[0], r[1])
                    recall_mean_perms[compt] = r.mean()
                    accuracy_perms[compt] = accuracy / float(s[0] + s[1])
                    compt += 1
                    print("compt = ", compt)
            print("save", key)
            np.savez_compressed(OUTPUT + "/perms_selection_" + key + ".npz",
                                permutation=permutation_perms,
                                n_fold=n_fold_perms,
                                parameters=parameters_perms,
                                recall_0=recall_0_perms,
                                recall_1=recall_1_perms,
                                min_recall=min_recall_perms,
                                recall_mean=recall_mean_perms,
                                accuracy=accuracy_perms)
    if not os.path.isfile(os.path.join(OUTPUT, output_summary)):
        print("Model Selection")
        perms = dict()
        scores = OrderedDict()
        scores['permutation'] = []
        scores['n_fold'] = []
        scores['parameters'] = []
        scores['recall_0'] = []
        scores['recall_1'] = []
        scores['min_recall'] = []
        scores['recall_mean'] = []
        scores['accuracy'] = []
        for i, key in enumerate(keys):
            print("key: ", key)
            perms = np.load(OUTPUT + "/perms_selection_" + key + ".npz")
            for s in perms:
                scores[s] += perms[s].tolist()
        compt = 0
        scores_tab = pd.DataFrame(scores)
        perm_groups = scores_tab.groupby('permutation')
        for perm_val, perm_group in perm_groups:
            fold_groups = perm_group.groupby('n_fold')
            for fold_val, fold_group in fold_groups:
                scores_dCV = OrderedDict()
                scores_dCV['permutation'] = perm_val
                scores_dCV['n_fold'] = fold_val
                n_crit = 0
                for item, val in criteria.items():
                    n_crit += 1
                    scores_dCV['criteria_' + item] = item
                    loc_opt = val[0](fold_group[item])
                    value_opt = val[1](fold_group[item])
                    scores_dCV['value_opt_' + item] = value_opt
                    param_opt = fold_group.parameters[loc_opt]
                    scores_dCV['param_opt_' + item] = param_opt
                if compt == 0:
                    scores_select_model = pd.DataFrame(
                        columns=list(scores_dCV.keys()))
                scores_select_model.loc[compt, ] = list(scores_dCV.values())
                compt += 1
        scores_select_model.to_csv(os.path.join(OUTPUT, output_summary),
                                   index=False)
    return {}
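# Simplified illustration (not part of the original reducer) of the model-selection
# pattern above: within each (permutation, fold) group, pick the parameter key that
# maximises a criterion. pandas' idxmax is used here for label-safe indexing; the
# toy values below are made up.
import pandas as pd

toy = pd.DataFrame({
    'permutation': [0, 0, 0, 0],
    'n_fold':      [0, 0, 1, 1],
    'parameters':  ['a', 'b', 'a', 'b'],
    'recall_mean': [0.60, 0.72, 0.55, 0.50],
})
best = toy.loc[toy.groupby(['permutation', 'n_fold'])['recall_mean'].idxmax(),
               ['permutation', 'n_fold', 'parameters', 'recall_mean']]
print(best)  # parameter 'b' wins in fold 0, 'a' in fold 1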
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)
    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under curve score.
    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1], s[0] + s[1],
                                    p=0.5, alternative='greater')
    # Betas' measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    # print(R)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # Back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # Threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])])
        # print("--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)))
        # print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)),
        #                   [0.99] * 5, rtol=0, atol=1e-02))
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                         (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0
    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))
    return scores
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    if algo is not None:
        key_parts.remove(algo)
    params = [None, None, None]
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except:
            params = [None, None, None]
    print(algo, params)
    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None
    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        y_true, y_pred)
    betas = np.hstack([item["beta"] for item in values]).T
    # Threshold betas to compute fleiss_kappa and DICE
    betas_t = np.vstack([
        array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
        for i in range(betas.shape[0])])
    # Compute pvalue
    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['slope'] = slope
    scores['intercept'] = intercept
    scores['r_value'] = r_value
    scores['p_value'] = p_value
    scores['std_err'] = std_err
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['param_key'] = key
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))
    return scores
def reducer(key, values):
    import mapreduce as GLOBAL
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # Compute sd; i.e.: compute results on each fold
    roi = GLOBAL.ROI
    criteria = {'recall_mean': [np.argmax, np.max],
                'min_recall': [np.argmax, np.max],
                'accuracy': [np.argmax, np.max]}
    output_selection = GLOBAL.OUTPUT_SELECTION
    output_summary = GLOBAL.OUTPUT_SUMMARY
    map_output = GLOBAL.MAP_OUTPUT
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi, map_output)
    INPUT = BASE + "/%i/%s"
    penalty_start = GLOBAL.PENALTY_START
    prob_class1 = GLOBAL.PROB_CLASS1
    params = GLOBAL.PARAMS
    # load all keys (sets of parameters)
    keys = ['_'.join(str(e) for e in a) for a in params]
    compt = 0
    if not os.path.isfile(output_selection):
        print("Model Construction, first cross-validation")
        # loop for the selection of the model
        for fold in range(0, NFOLDS + 1):  # outer folds
            # inner folds (NFOLDS) associated to the outer fold
            idx_block = range(fold * (NFOLDS + 1), (fold + 1) * (NFOLDS + 1) - 1)
            for key in keys:
                # paths of the map results of all inner folds associated to
                # a key and an outer fold
                paths_dCV = [INPUT % (idx, key) for idx in idx_block]
                scores_CV = OrderedDict()
                # get values
                values = [GLOBAL.OutputCollector(p) for p in paths_dCV]
                values = [item.load() for item in values]
                n_fold = [item["n_fold"] for item in values]
                assert n_fold == [fold for i in range(NFOLDS)]
                recall_mean_std = np.std(
                    [np.mean(precision_recall_fscore_support(
                        item["y_true"].ravel(), item["y_pred"])[1])
                     for item in values]) / np.sqrt(len(values))
                recall = [precision_recall_fscore_support(
                    item["y_true"].ravel(), item["y_pred"].ravel(),
                    average=None)[1] for item in values]
                support = [precision_recall_fscore_support(
                    item["y_true"].ravel(), item["y_pred"].ravel(),
                    average=None)[3] for item in values]
                accuracy_std = np.std(
                    [((recall[i][0] * support[i][0] +
                       recall[i][1] * support[i][1]) /
                      float(support[i][0] + support[i][1]))
                     for i in range(len(values))]) / np.sqrt(len(values))
                y_true = [item["y_true"].ravel() for item in values]
                y_true = np.hstack(y_true)
                y_pred = [item["y_pred"].ravel() for item in values]
                y_pred = np.hstack(y_pred)
                prob_pred = [item["proba_pred"].ravel() for item in values]
                prob_pred = np.hstack(prob_pred)
                p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                             average=None)
                auc = roc_auc_score(y_true, prob_pred)
                betas = [item["beta"][penalty_start:] for item in values]
                betas = np.hstack(betas).T
                n_ite = np.mean(np.array([item["n_iter"] for item in values]))
                R = np.corrcoef(betas)
                beta_cor_mean = np.mean(R[np.triu_indices_from(R, 1)])
                success = r * s
                success = success.astype('int')
                accuracy = (r[0] * s[0] + r[1] * s[1])
                accuracy = accuracy.astype('int')
                pvalue_class0 = binom_test(success[0], s[0], 1 - prob_class1)
                pvalue_class1 = binom_test(success[1], s[1], prob_class1)
                pvalue_accuracy = binom_test(accuracy, s[0] + s[1], p=0.5)
                k = key.split('_')
                a, l1 = float(k[0]), float(k[1])
                l2, tv = float(k[2]), float(k[3])
                left = float(1 - tv)
                if left == 0:
                    left = 1.
                scores_CV['n_fold'] = n_fold[0]
                scores_CV['parameters'] = key
                scores_CV['a'] = a
                scores_CV['l1'] = l1
                scores_CV['l2'] = l2
                scores_CV['tv'] = tv
                scores_CV['recall_0'] = r[0]
                scores_CV['pvalue_recall_0'] = pvalue_class0
                scores_CV['recall_1'] = r[1]
                scores_CV['pvalue_recall_1'] = pvalue_class1
                scores_CV['min_recall'] = np.minimum(r[0], r[1])
                scores_CV['max_pvalue_recall'] = np.maximum(pvalue_class0,
                                                            pvalue_class1)
                scores_CV['recall_mean'] = r.mean()
                scores_CV['recall_mean_std'] = recall_mean_std
                scores_CV['accuracy'] = accuracy / float(s[0] + s[1])
                scores_CV['pvalue_accuracy'] = pvalue_accuracy
                scores_CV['accuracy_std'] = accuracy_std
                scores_CV['precision_0'] = p[0]
                scores_CV['precision_1'] = p[1]
                scores_CV['precision_mean'] = p.mean()
                scores_CV['f1_0'] = f[0]
                scores_CV['f1_1'] = f[1]
                scores_CV['f1_mean'] = f.mean()
                scores_CV['support_0'] = s[0]
                scores_CV['support_1'] = s[1]
                scores_CV['n_ite_mean'] = n_ite
                scores_CV['auc'] = auc
                scores_CV['beta_cor_mean'] = beta_cor_mean
                scores_CV['prop_non_zeros_mean'] = float(np.count_nonzero(betas)) \
                    / float(np.prod(betas.shape))
                # store results in dataframe scores_tab
                if compt == 0:
                    scores_tab = pd.DataFrame(columns=list(scores_CV.keys()))
                scores_tab.loc[compt, ] = list(scores_CV.values())
                compt += 1
        print("save results of the inner cross-validation: ", output_selection)
        scores_tab.to_csv(output_selection, index=False)
    if not os.path.isfile(output_summary):
        print("Model Selection")
        scores_tab = pd.read_csv(output_selection)
        fold_groups = scores_tab.groupby('n_fold')
        compt = 0
        for fold_val, fold_group in fold_groups:
            scores_dCV = OrderedDict()
            scores_dCV['n_fold'] = fold_val
            # for each outer fold and each criterion, select the set of
            # parameters that optimizes the criterion
            for item, val in criteria.items():
                scores_dCV['criteria_' + item] = item
                loc_opt = val[0](fold_group[item])
                value_opt = val[1](fold_group[item])
                scores_dCV['value_opt_' + item] = value_opt
                param_opt = fold_group.parameters[loc_opt]
                a_opt = fold_group.a[loc_opt]
                l1_opt = fold_group.l1[loc_opt]
                tv_opt = fold_group.tv[loc_opt]
                scores_dCV['param_opt_' + item] = param_opt
                scores_dCV['a_opt_' + item] = a_opt
                scores_dCV['l1_opt_' + item] = l1_opt
                scores_dCV['tv_opt_' + item] = tv_opt
            # store results in dataframe scores_select_model
            if compt == 0:
                scores_select_model = pd.DataFrame(columns=list(scores_dCV.keys()))
            scores_select_model.loc[compt, ] = list(scores_dCV.values())
            compt += 1
        print("save results of the model selection: ", output_summary)
        scores_select_model.to_csv(output_summary, index=False)
    return {}
def reducer_(key, values):
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # DEBUG
    import glob, mapreduce
    BASE = "/neurospin/brainomics/2013_adni/ADAS11-MCIc-CTL/rndperm"
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../results/rndperm"
    keys = ["0.001_0.3335_0.3335_0.333_-1", "0.001_0.5_0_0.5_-1",
            "0.001_0.5_0.5_0_-1", "0.001_1_0_0_-1"]
    for key in keys:
        # key = keys[0]
        paths_5cv_all = [INPUT % (perm, key)
                         for perm in range(NFOLDS * NRNDPERMS)]
        idx_5cv_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
        cpt = 0
        qc = dict()
        r2_perms = np.zeros(NRNDPERMS)
        corr_perms = np.zeros(NRNDPERMS)
        r_bar_perms = np.zeros(NRNDPERMS)
        fleiss_kappa_stat_perms = np.zeros(NRNDPERMS)
        dice_bar_perms = np.zeros(NRNDPERMS)
        for perm_i in range(len(idx_5cv_blocks) - 1):
            paths_5cv = paths_5cv_all[idx_5cv_blocks[perm_i]:
                                      idx_5cv_blocks[perm_i + 1]]
            for p in paths_5cv:
                if os.path.exists(p) and not (p in qc):
                    if p in qc:
                        qc[p] += 1
                    else:
                        qc[p] = 1
                    cpt += 1
            #
            values = [mapreduce.OutputCollector(p) for p in paths_5cv]
            values = [item.load() for item in values]
            y_true = [item["y_true"].ravel() for item in values]
            y_pred = [item["y_pred"].ravel() for item in values]
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            r2 = r2_score(y_true, y_pred)
            corr = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]
            betas = np.hstack([item["beta"] for item in values]).T
            #
            ## Compute beta similarity measures
            #
            # Correlation
            R = np.corrcoef(betas)
            R = R[np.triu_indices_from(R, 1)]
            # Fisher z-transformation / average
            z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
            # Back-transform
            r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
            #
            # threshold betas to compute fleiss_kappa and DICE
            try:
                betas_t = np.vstack([
                    array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
                    for i in range(betas.shape[0])])
                print("--", np.sqrt(np.sum(betas_t ** 2, 1)) /
                      np.sqrt(np.sum(betas ** 2, 1)))
                print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) /
                                  np.sqrt(np.sum(betas ** 2, 1)),
                                  [0.99] * 5, rtol=0, atol=1e-02))
                #
                # Compute Fleiss' kappa statistics
                beta_signed = np.sign(betas_t)
                table = np.zeros((beta_signed.shape[1], 3))
                table[:, 0] = np.sum(beta_signed == 0, 0)
                table[:, 1] = np.sum(beta_signed == 1, 0)
                table[:, 2] = np.sum(beta_signed == -1, 0)
                fleiss_kappa_stat = fleiss_kappa(table)
                #
                # Pair-wise Dice coefficient
                beta_n0 = betas_t != 0
                ij = [[i, j] for i in range(5) for j in range(i + 1, 5)]
                # print([[idx[0], idx[1]] for idx in ij])
                dice_bar = np.mean(
                    [float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /
                     (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                     for idx in ij])
            except:
                dice_bar = fleiss_kappa_stat = 0.
            #
            r2_perms[perm_i] = r2
            corr_perms[perm_i] = corr
            r_bar_perms[perm_i] = r_bar
            fleiss_kappa_stat_perms[perm_i] = fleiss_kappa_stat
            dice_bar_perms[perm_i] = dice_bar
        # END PERMS
        print("save", key)
        np.savez_compressed(OUTPUT + "/perms_" + key + ".npz",
                            r2=r2_perms, corr=corr_perms, r_bar=r_bar_perms,
                            fleiss_kappa=fleiss_kappa_stat_perms,
                            dice_bar=dice_bar_perms)
    #
    perms = dict()
    fig, axis = plt.subplots(len(keys), 4)  # , sharex='col')
    for i, key in enumerate(keys):
        perms[key] = np.load(OUTPUT + "/perms_" + key + ".npz")
        n, bins, patches = axis[i, 0].hist(perms[key]['r2'], 50,
                                           normed=1, histtype='stepfilled')
        axis[i, 0].set_title(key + "_r2")
        n, bins, patches = axis[i, 1].hist(perms[key]['r_bar'], 50,
                                           normed=1, histtype='stepfilled')
        axis[i, 1].set_title(key + "_r_bar")
        n, bins, patches = axis[i, 2].hist(perms[key]['fleiss_kappa'], 50,
                                           histtype='stepfilled')
        axis[i, 2].set_title(key + "_fleiss_kappa")
        n, bins, patches = axis[i, 3].hist(perms[key]['dice_bar'], 50)
        axis[i, 3].set_title(key + "_dice_bar")
    plt.show()
    l1l2tv, l1tv, l1l2, l1 = ["0.001_0.3335_0.3335_0.333_-1",
                              "0.001_0.5_0_0.5_-1",
                              "0.001_0.5_0.5_0_-1",
                              "0.001_1_0_0_-1"]
    # Read true scores
    import pandas as pd
    true = pd.read_csv(os.path.join(BASE, "..", "ADAS11-MCIc-CTL.csv"))
    true = true[true.a == 0.001]
    true_l1l2tv = true[true.l1 == 0.3335].iloc[0]
    true_l1l2 = true[(true.l1 == 0.5) & (true.l2 == 0.5)].iloc[0]
    true_l1tv = true[(true.l1 == 0.5) & (true.tv == 0.5)].iloc[0]
    true_l1 = true[(true.l1 == 1.)].iloc[0]
    # pvals
    nperms = float(len(perms[l1]['r2']))
    from collections import OrderedDict
    pvals = OrderedDict()
    pvals["cond"] = ['l1', 'l1tv', 'l1l2', 'l1l2tv'] * 4 + \
        ['l1 vs l1tv'] * 4 + ['l1l2 vs l1l2tv'] * 4
    pvals["stat"] = ['r2'] * 4 + ['r_bar'] * 4 + ['fleiss_kappa'] * 4 + \
        ['dice_bar'] * 4 + ['r2', 'r_bar', 'fleiss_kappa', 'dice_bar'] * 2
    pvals["pval"] = [
        np.sum(perms[l1]['r2'] > true_l1["r2"]),
        np.sum(perms[l1tv]['r2'] > true_l1tv["r2"]),
        np.sum(perms[l1l2]['r2'] > true_l1l2["r2"]),
        np.sum(perms[l1l2tv]['r2'] > true_l1l2tv["r2"]),
        np.sum(perms[l1]['r_bar'] > true_l1["beta_r_bar"]),
        np.sum(perms[l1tv]['r_bar'] > true_l1tv["beta_r_bar"]),
        np.sum(perms[l1l2]['r_bar'] > true_l1l2["beta_r_bar"]),
        np.sum(perms[l1l2tv]['r_bar'] > true_l1l2tv["beta_r_bar"]),
        np.sum(perms[l1]['fleiss_kappa'] > true_l1["beta_fleiss_kappa"]),
        np.sum(perms[l1tv]['fleiss_kappa'] > true_l1tv["beta_fleiss_kappa"]),
        np.sum(perms[l1l2]['fleiss_kappa'] > true_l1l2["beta_fleiss_kappa"]),
        np.sum(perms[l1l2tv]['fleiss_kappa'] > true_l1l2tv["beta_fleiss_kappa"]),
        np.sum(perms[l1]['dice_bar'] > true_l1["beta_dice_bar"]),
        np.sum(perms[l1tv]['dice_bar'] > true_l1tv["beta_dice_bar"]),
        np.sum(perms[l1l2]['dice_bar'] > true_l1l2["beta_dice_bar"]),
        np.sum(perms[l1l2tv]['dice_bar'] > true_l1l2tv["beta_dice_bar"]),
        # l1 vs l1tv
        np.sum((perms[l1tv]['r2'] - perms[l1]['r2']) >
               (true_l1tv["r2"] - true_l1["r2"])),
        np.sum((perms[l1tv]['r_bar'] - perms[l1]['r_bar']) >
               (true_l1tv["beta_r_bar"] - true_l1["beta_r_bar"])),
        np.sum((perms[l1tv]['fleiss_kappa'] - perms[l1]['fleiss_kappa']) >
               (true_l1tv["beta_fleiss_kappa"] - true_l1["beta_fleiss_kappa"])),
        np.sum((perms[l1tv]['dice_bar'] - perms[l1]['dice_bar']) >
               (true_l1tv["beta_dice_bar"] - true_l1["beta_dice_bar"])),
        # l1l2 vs l1l2tv
        np.sum((perms[l1l2]['r2'] - perms[l1l2tv]['r2']) >
               (true_l1l2["r2"] - true_l1l2tv["r2"])),
        np.sum((perms[l1l2tv]['r_bar'] - perms[l1l2]['r_bar']) >
               (true_l1l2tv["beta_r_bar"] - true_l1l2["beta_r_bar"])),
        np.sum((perms[l1l2tv]['fleiss_kappa'] - perms[l1l2]['fleiss_kappa']) >
               (true_l1l2tv["beta_fleiss_kappa"] - true_l1l2["beta_fleiss_kappa"])),
        np.sum((perms[l1l2tv]['dice_bar'] - perms[l1l2]['dice_bar']) >
               (true_l1l2tv["beta_dice_bar"] - true_l1l2["beta_dice_bar"]))]
    pvals = pd.DataFrame(pvals)
    pvals["pval"] /= nperms
    pvals.to_csv(os.path.join(OUTPUT, "pvals_stats_permutations.csv"),
                 index=False)
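# The permutation p-values above follow the standard empirical recipe: count how often
# the statistic computed on permuted data exceeds the observed one, divided by the
# number of permutations. A minimal reference sketch; the "+1" smoothed variant is a
# common alternative, not what the script above does.
import numpy as np

def empirical_pvalue(perm_stats, observed, smoothed=False):
    perm_stats = np.asarray(perm_stats, dtype=float)
    exceed = np.sum(perm_stats > observed)
    if smoothed:
        return (exceed + 1.0) / (len(perm_stats) + 1.0)
    return exceed / float(len(perm_stats))  # matches pvals["pval"] /= nperms above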
def scores(key, paths, config, ret_y=False):
    import glob, mapreduce
    print(key)
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    recall_mean_std = np.std(
        [np.mean(precision_recall_fscore_support(
            item["y_true"].ravel(), item["y_pred"])[1])
         for item in values]) / np.sqrt(len(values))
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under curve score.
    n_ite = None
    betas = np.hstack(
        [item["beta"][config['penalty_start']:, :] for item in values]).T
    ## Compute beta similarity measures
    # Correlation
    R = np.corrcoef(betas)
    # print(R)
    R = R[np.triu_indices_from(R, 1)]
    print(R)
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # Back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])])
        # print("--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)))
        print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) /
                          np.sqrt(np.sum(betas ** 2, 1)),
                          [0.99] * 5, rtol=0, atol=1e-02))
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(5) for j in range(i + 1, 5)]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                         (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0.
    scores = OrderedDict()
    try:
        a, l1, l2, tv, k = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
        scores['k'] = k
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores['recall_mean_std'] = recall_mean_std
    scores['auc'] = auc
    # scores['beta_cor_mean'] = beta_cor_mean
    scores['precision_0'] = p[0]
    scores['precision_1'] = p[1]
    scores['precision_mean'] = p.mean()
    scores['f1_0'] = f[0]
    scores['f1_1'] = f[1]
    scores['f1_mean'] = f.mean()
    scores['support_0'] = s[0]
    scores['support_1'] = s[1]
    # scores['corr'] = corr
    scores['beta_r'] = str(R)
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice'] = str(dices)
    scores['beta_dice_bar'] = dice_bar
    scores['n_ite'] = n_ite
    scores['param_key'] = key
    if ret_y:
        scores["y_true"], scores["y_pred"], scores["prob_pred"] = \
            y_true, y_pred, prob_pred
    return scores
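# array_utils.arr_threshold_from_norm2_ratio is used throughout to binarise weight maps
# before the kappa/Dice statistics; its implementation is not shown here. A plausible
# reading, stated as an assumption only: keep the largest-magnitude coefficients whose
# cumulative L2 norm reaches the requested fraction of the full norm, and return the
# thresholded array together with the threshold.
import numpy as np

def arr_threshold_from_norm2_ratio_sketch(arr, ratio=0.99):
    # Assumed behaviour, not the actual library function.
    arr = np.asarray(arr, dtype=float)
    order = np.argsort(np.abs(arr))[::-1]          # largest magnitudes first
    cum = np.sqrt(np.cumsum(arr[order] ** 2))      # cumulative L2 norm
    n_keep = np.searchsorted(cum, ratio * np.sqrt(np.sum(arr ** 2))) + 1
    threshold = np.abs(arr[order])[min(n_keep, arr.size) - 1]
    out = np.where(np.abs(arr) >= threshold, arr, 0.0)
    return out, threshold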
def reducer(key, values):
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # DEBUG
    import mapreduce as GLOBAL
    output_permutations = GLOBAL.OUTPUT_PERMUTATIONS
    map_output = GLOBAL.MAP_OUTPUT
    output_path = GLOBAL.OUTPUT_PATH
    roi = GLOBAL.ROI
    BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/",
                        "MRI_" + roi, map_output)
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../" + output_path
    if not os.path.exists(OUTPUT):
        os.makedirs(OUTPUT)
    criteria = GLOBAL.CRITERIA
    keys = ['_'.join(str(e) for e in a) for a in criteria]
    OK = 0
    # params = criteria = ['recall_mean', 'min_recall', 'max_pvalue_recall',
    #                      'accuracy', 'pvalue_accuracy']
    if not OK:
        for key in keys:
            print("key: ", key)
            paths_CV_all = [INPUT % (perm, key)
                            for perm in range(NFOLDS * NRNDPERMS)]
            idx_CV_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
            recall_0_perms = np.zeros(NRNDPERMS)
            recall_1_perms = np.zeros(NRNDPERMS)
            recall_mean_perms = np.zeros(NRNDPERMS)
            accuracy_perms = np.zeros(NRNDPERMS)
            auc_perms = np.zeros(NRNDPERMS)
            crit = key[0:len(key):2]
            if not os.path.isfile(OUTPUT + "/perms_validation_" + crit + ".npz"):
                for perm in range(NRNDPERMS):
                    print("perm: ", perm)
                    paths_CV_blocks = paths_CV_all[idx_CV_blocks[perm]:
                                                   idx_CV_blocks[perm + 1]]
                    values = [GLOBAL.OutputCollector(p)
                              for p in paths_CV_blocks]
                    values = [item.load() for item in values]
                    y_true = [item["y_true"].ravel() for item in values]
                    y_pred = [item["y_pred"].ravel() for item in values]
                    prob_pred = [item["proba_pred"].ravel() for item in values]
                    y_true = np.concatenate(y_true)
                    y_pred = np.concatenate(y_pred)
                    prob_pred = np.concatenate(prob_pred)
                    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                                 average=None)
                    auc = roc_auc_score(y_true, prob_pred)
                    success = r * s
                    success = success.astype('int')
                    accuracy = (r[0] * s[0] + r[1] * s[1])
                    accuracy = accuracy.astype('int')
                    recall_0_perms[perm] = r[0]
                    recall_1_perms[perm] = r[1]
                    recall_mean_perms[perm] = r.mean()
                    accuracy_perms[perm] = accuracy / float(s[0] + s[1])
                    auc_perms[perm] = auc
                # END PERMS
                print("save", crit)
                np.savez_compressed(OUTPUT + "/perms_validation_" + crit + ".npz",
                                    recall_0=recall_0_perms,
                                    recall_1=recall_1_perms,
                                    recall_mean=recall_mean_perms,
                                    accuracy=accuracy_perms,
                                    auc=auc_perms)
        OK = 1
    # pvals
    if not os.path.isfile(os.path.join(OUTPUT, output_permutations)):
        print("Derive p-values")
        perms = dict()
        for i, key in enumerate(keys):
            crit = key[0:len(key):2]
            print("crit: ", crit)
            perms[crit] = np.load(OUTPUT + "/perms_validation_" + crit + ".npz")
        print(keys)
        [recall_mean, min_recall, accuracy] = [keys[0][0:len(keys[0]):2],
                                               keys[1][0:len(keys[1]):2],
                                               keys[2][0:len(keys[2]):2]]
        print([recall_mean, min_recall, accuracy])
        # Read true scores
        true = pd.read_csv(os.path.join(BASE, "..",
                                        "results_dCV_validation.csv"))
        true_recall_mean = true[true.params == recall_mean].iloc[0]
        true_min_recall = true[true.params == min_recall].iloc[0]
        true_accuracy = true[true.params == accuracy].iloc[0]
        # pvals corrected for multiple comparisons
        nperms = float(len(perms[recall_mean]['recall_0']))
        from collections import OrderedDict
        pvals = OrderedDict()
        # cond: criterion used to select the model
        pvals["cond"] = ['recall_mean'] * 5 + ['min_recall'] * 5 + \
            ['accuracy'] * 5
        # stat: statistic associated to the p-value
        pvals["stat"] = ['recall_0', 'recall_1', 'recall_mean',
                         'accuracy', 'auc'] * 3
        pvals["pval"] = [
            np.sum(perms[recall_mean]['recall_0'] > true_recall_mean["recall_0"]),
            np.sum(perms[recall_mean]['recall_1'] > true_recall_mean["recall_1"]),
            np.sum(perms[recall_mean]['recall_mean'] > true_recall_mean["recall_mean"]),
            np.sum(perms[recall_mean]['accuracy'] > true_recall_mean["accuracy"]),
            np.sum(perms[recall_mean]['auc'] > true_recall_mean["auc"]),
            np.sum(perms[min_recall]['recall_0'] > true_min_recall["recall_0"]),
            np.sum(perms[min_recall]['recall_1'] > true_min_recall["recall_1"]),
            np.sum(perms[min_recall]['recall_mean'] > true_min_recall["recall_mean"]),
            np.sum(perms[min_recall]['accuracy'] > true_min_recall["accuracy"]),
            np.sum(perms[min_recall]['auc'] > true_min_recall["auc"]),
            np.sum(perms[accuracy]['recall_0'] > true_accuracy["recall_0"]),
            np.sum(perms[accuracy]['recall_1'] > true_accuracy["recall_1"]),
            np.sum(perms[accuracy]['recall_mean'] > true_accuracy["recall_mean"]),
            np.sum(perms[accuracy]['accuracy'] > true_accuracy["accuracy"]),
            np.sum(perms[accuracy]['auc'] > true_accuracy["auc"])]
        pvals = pd.DataFrame(pvals)
        pvals["pval"] /= float(nperms)
        pvals.to_csv(os.path.join(OUTPUT, output_permutations), index=False)
    return {}