def mapper(key, output_collector):
    import mapreduce as GLOBAL
    Xtr = GLOBAL.DATA_RESAMPLED["X"][0]
    Xte = GLOBAL.DATA_RESAMPLED["X"][1]
    ytr = GLOBAL.DATA_RESAMPLED["y"][0]
    yte = GLOBAL.DATA_RESAMPLED["y"][1]
    l2 = float(key[0])
    print(l2)
    class_weight = 'auto'  # unbiased
    scaler = preprocessing.StandardScaler().fit(Xtr)
    Xtr = scaler.transform(Xtr)
    Xte = scaler.transform(Xte)
    mod = estimators.RidgeLogisticRegression(l2, class_weight=class_weight,
                                             penalty_start=penalty_start)
    mod.fit(Xtr, ytr.ravel())
    y_pred = mod.predict(Xte)
    proba_pred = mod.predict_probability(Xte)
    ret = dict(y_pred=y_pred, y_true=yte, proba_pred=proba_pred, beta=mod.beta)
    if output_collector:
        output_collector.collect(key, ret)
    else:
        return ret
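# Standalone sketch of what one mapper call computes, outside the mapreduce
# framework: standardize train/test, fit an l2-penalized logistic regression,
# collect predictions. Random data and sklearn stand in for the resampled
# data and the parsimony estimator (hypothetical stand-ins, not this
# pipeline's actual inputs).
if __name__ == "__main__":
    import numpy as np
    from sklearn import preprocessing
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(42)
    Xtr, Xte = rng.randn(80, 10), rng.randn(20, 10)
    ytr, yte = rng.randint(0, 2, 80), rng.randint(0, 2, 20)

    scaler = preprocessing.StandardScaler().fit(Xtr)
    Xtr_s, Xte_s = scaler.transform(Xtr), scaler.transform(Xte)
    l2 = 0.1  # plays the role of key[0]
    mod = LogisticRegression(C=1. / (l2 * len(ytr)), class_weight='balanced')
    mod.fit(Xtr_s, ytr)
    ret = dict(y_pred=mod.predict(Xte_s), y_true=yte,
               proba_pred=mod.predict_proba(Xte_s)[:, 1])
    print(ret["y_pred"])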
def init():
    INPUT_DATA_X = os.path.join(WD_ORIGINAL, 'X.npy')
    INPUT_DATA_y = os.path.join(WD_ORIGINAL, 'y.npy')
    INPUT_MASK_PATH = os.path.join(WD_ORIGINAL, 'mask.nii')
    # INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/30yo/Atv.npz'
    # INPUT_CSV = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/population_30yo.csv'
    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    # shutil.copy(INPUT_LINEAR_OPE_PATH, WD)

    ## Create config file
    os.chdir(WD)
    X = np.load("X.npy")
    y = np.load("y.npy")

    # Build and check the TV linear operator from the mask
    if not os.path.exists(os.path.join(WD, "Atv.npz")):
        import nibabel
        import parsimony.functions.nesterov.tv as nesterov_tv
        from parsimony.utils.linalgs import LinearOperatorNesterov
        img = nibabel.load(os.path.join(WD, "mask.nii"))
        Atv = nesterov_tv.linear_operator_from_mask(img.get_data(),
                                                    calc_lambda_max=True)
        Atv.save(os.path.join(WD, "Atv.npz"))
        Atv_ = LinearOperatorNesterov(filename=os.path.join(WD, "Atv.npz"))
        assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
        assert np.allclose(Atv_.get_singular_values(0), 11.942045760666732,
                           rtol=1e-03, atol=1e-03)
        assert np.all([a.shape == (X.shape[1] - penalty_start,
                                   X.shape[1] - penalty_start) for a in Atv])

    # Precompute warm-start betas (branch disabled with `if False`)
    if False and not os.path.exists(os.path.join(WD, "beta_start.npz")):
        betas = dict()
        import time
        alphas = [.01, 0.1, 1.0, 10]
        for alpha in alphas:
            mod = estimators.RidgeLogisticRegression(
                l=alpha, class_weight="auto", penalty_start=penalty_start)
            t_ = time.time()
            mod.fit(X, y.ravel())
            print(time.time() - t_)  # 11564
            betas["lambda_%.2f" % alpha] = mod.beta
        np.savez(os.path.join(WD, "beta_start.npz"), **betas)
        beta_start = np.load(os.path.join(WD, "beta_start.npz"))
        assert np.all([np.all(beta_start[a] == betas[a])
                       for a in beta_start.keys()])
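    # Side note on the "Atv.npz" operator saved above: it holds one sparse
    # first-difference matrix per spatial axis, each n_features x n_features,
    # consumed by the TV penalty. A minimal sketch with a hypothetical small
    # 3D shape instead of the real mask:
    #   import parsimony.functions.nesterov.tv as nesterov_tv
    #   A_demo = nesterov_tv.linear_operator_from_shape((2, 3, 4))
    #   [a.shape for a in A_demo]  # -> three (24, 24) sparse matrices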
    ## Create config file
    # ########################################################################
    # Setting 1: 5cv + large range of parameters: cv_largerange
    # with sub-sampled training sets of size 50, 100
    # 5cv/cv0*[_sub50]/refit/*
    # sub_sizes = [50, 100]
    sub_sizes = []
    cv_outer = [[tr, te] for tr, te in
                StratifiedKFold(n_splits=NFOLDS_OUTER,
                                random_state=42).split(np.zeros(y.shape[0]),
                                                       y.ravel())]
    # Check that we get the same CV as previously
    cv_old = json.load(
        open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")))["resample"]
    cv_outer_old = [cv_old[k]
                    for k in ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]]
    assert np.all([np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
                   for i in range(NFOLDS_OUTER)])
    assert np.all([np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
                   for i in range(NFOLDS_OUTER)])
    # check END

    import collections
    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        # Simple CV
        cv["cv%02d/refit" % cv_outer_i] = [tr_val, te]
        # Nested CV
        # cv_inner = StratifiedKFold(y[tr_val].ravel(), n_folds=NFOLDS_INNER, random_state=42)
        # for cv_inner_i, (tr, val) in enumerate(cv_inner):
        #     cv["cv%02d/cvnested%02d" % (cv_outer_i, cv_inner_i)] = [tr_val[tr], tr_val[val]]
        # Sub-sample the training set, preserving class proportions,
        # with sizes 50, 100 => cv*_sub[50|100]/refit
        grps = np.unique(y[tr_val]).astype(int)
        ytr = y.copy()
        ytr[te] = np.nan  # mask the test samples
        g_idx = [np.where(ytr == g)[0] for g in grps]
        assert np.all([np.all(ytr[g_idx[g]] == g) for g in grps])
        g_size = np.array([len(g) for g in g_idx])
        g_prop = g_size / g_size.sum()
        for sub_size in sub_sizes:
            # sub_size = sub_sizes[0]
            sub_g_size = np.round(g_prop * sub_size).astype(int)
            g_sub_idx = [np.random.choice(g_idx[g], sub_g_size[g],
                                          replace=False) for g in grps]
            assert np.all([np.all(y[g_sub_idx[g]] == g) for g in grps])
            tr_val_sub = np.concatenate(g_sub_idx)
            assert len(tr_val_sub) == sub_size
            assert np.all([idx in tr_val for idx in tr_val_sub])
            assert np.all(np.logical_not([idx in te for idx in tr_val_sub]))
            cv["cv%02d_sub%i/refit" % (cv_outer_i, sub_size)] = [tr_val_sub, te]

    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}
    # Nested CV
    # assert len(cv) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1
    # Simple CV
    # assert len(cv) == NFOLDS_OUTER + 1
    # Simple CV + sub-sampled training sets of size 50, 100:
    assert len(cv) == NFOLDS_OUTER * (1 + len(sub_sizes)) + 1
    print(list(cv.keys()))

    # Large grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1, 1.0]  # grid used in the first run
    tv_ratio = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    l1l2_ratio = [0.1, 0.5, 0.9]
    # l1l2_ratio = [0, 0.1, 0.5, 0.9, 1.0]  # grid used in the first run
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio, tv_ratio)]
    assert len(params_enet_tvgn) == 240  # was 300 with the first grid
    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 12  # was 15 with the first grid
    params = params_enet_tvgn + params_enet
    assert len(params) == 252  # was 315 with the first grid
    # Simple CV
    # assert len(params) * len(cv) == 1890
    # Simple CV + sub-sampled training sets of size 50, 100:
    assert len(params) * len(cv) == 1512  # was 1890 with the first grid

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params,
                  resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    json.dump(config, open(os.path.join(WD, "config_cv_largerange.json"), "w"))

    # Build utility files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)
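    # Sanity-check sketch (commented out): reload the config just written and
    # recount the map jobs, mirroring the asserts above.
    #   config_ = json.load(open(os.path.join(WD, "config_cv_largerange.json")))
    #   len(config_["params"]) * len(config_["resample"])  # -> 1512 = 252 * 6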
    # ########################################################################
    # Setting 2: dcv + reduced range of parameters: dcv_reducedrange
    # 5cv/cv0*/cvnested0*/*
    cv_outer = [[tr, te] for tr, te in
                StratifiedKFold(n_splits=NFOLDS_OUTER,
                                random_state=42).split(np.zeros(y.shape[0]),
                                                       y.ravel())]
    # Check that we get the same CV as previously
    cv_old = json.load(
        open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")))["resample"]
    cv_outer_old = [cv_old[k]
                    for k in ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]]
    assert np.all([np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
                   for i in range(NFOLDS_OUTER)])
    assert np.all([np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
                   for i in range(NFOLDS_OUTER)])
    # check END

    import collections
    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/refit" % cv_outer_i] = [tr_val, te]
        cv_inner = StratifiedKFold(n_splits=NFOLDS_INNER,
                                   random_state=42).split(
                                       np.zeros(y[tr_val].shape[0]),
                                       y[tr_val].ravel())
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            cv["cv%02d/cvnested%02d" % (cv_outer_i, cv_inner_i)] = \
                [tr_val[tr], tr_val[val]]
    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}
    # assert len(cv) == NFOLDS_OUTER + 1  # simple CV
    assert len(cv) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1
    print(list(cv.keys()))

    # Reduced grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    tv_ratio = [0.2, 0.8]
    l1l2_ratio = [0.1, 0.9]
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio, tv_ratio)]
    assert len(params_enet_tvgn) == 32  # was 16 with the earlier grid
    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 8  # was 4 with the earlier grid
    params = params_enet_tvgn + params_enet
    assert len(params) == 40  # was 20 with the earlier grid
    assert len(params) * len(cv) == 1240  # was 620 with the earlier grid

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params,
                  resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    json.dump(config,
              open(os.path.join(WD, "config_dcv_reducedrange.json"), "w"))

    # Build utility files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map %s/config_dcv_reducedrange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_dcv_reducedrange",
                                            freecores=2)
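    # Reading note: the 31 resample keys are 1 "refit/refit" + 5 "cv*/refit"
    # (outer folds) + 25 "cv*/cvnested*" (5 inner folds per outer fold), so
    # the map step runs len(params) * len(cv) = 40 * 31 = 1240 jobs. A
    # commented-out sketch to re-check after init() has run:
    #   config_ = json.load(open(os.path.join(WD, "config_dcv_reducedrange.json")))
    #   len(config_["params"]) * len(config_["resample"])  # -> 1240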
def mapper(key, output_collector):
    """
    # debug mapper
    config = json.load(open(os.path.join(WD, "config_cv_largerange.json"), "r"))
    load_globals(config)
    resample(config, 'refit/refit')
    key = ('enettv', 0.01, 0.1, 0.3)
    """
    import mapreduce as GLOBAL
    Xtr = GLOBAL.DATA_RESAMPLED["X"][0]
    Xte = GLOBAL.DATA_RESAMPLED["X"][1]
    ytr = GLOBAL.DATA_RESAMPLED["y"][0]
    yte = GLOBAL.DATA_RESAMPLED["y"][1]

    # key = 'enettv_0.01_0.1_0.2'.split("_")
    algo, alpha, l1l2ratio, tvratio = \
        key[0], float(key[1]), float(key[2]), float(key[3])
    # Split the global penalty alpha into tv, l1 and l2 terms
    tv = alpha * tvratio
    l1 = alpha * float(1 - tv) * l1l2ratio
    l2 = alpha * float(1 - tv) * (1 - l1l2ratio)
    print(key, algo, alpha, l1, l2, tv)
    # alpha = float(key[0])
    # l1, l2, tv = alpha * float(key[1]), alpha * float(key[2]), alpha * float(key[3])
    # print("l1:%f, l2:%f, tv:%f" % (l1, l2, tv))

    class_weight = "auto"  # unbiased
    # beta_start = GLOBAL.beta_start["lambda_%.4f" % alpha]
    # mask = np.ones(Xtr.shape[0], dtype=bool)
    # scaler = preprocessing.StandardScaler().fit(Xtr)
    # Xtr = scaler.transform(Xtr)
    # Xte = scaler.transform(Xte)

    if algo == 'enettv':
        conesta = algorithms.proximal.CONESTA(max_iter=10000)
        mod = estimators.LogisticRegressionL1L2TV(
            l1, l2, tv, GLOBAL.Atv, algorithm=conesta,
            class_weight=class_weight, penalty_start=penalty_start)
    elif algo == 'enetgn':
        fista = algorithms.proximal.FISTA(max_iter=5000)
        mod = estimators.LogisticRegressionL1L2GraphNet(
            l1, l2, tv, GLOBAL.Agn, algorithm=fista,
            class_weight=class_weight, penalty_start=penalty_start)
    elif algo == 'enet':
        fista = algorithms.proximal.FISTA(max_iter=5000)
        mod = estimators.ElasticNetLogisticRegression(
            l1l2ratio, alpha, algorithm=fista,
            class_weight=class_weight, penalty_start=penalty_start)
    elif algo == 'ridge':  # not in the parameter grids above
        mod = estimators.RidgeLogisticRegression(
            alpha, class_weight=class_weight, penalty_start=penalty_start)
    else:
        raise Exception('Algo %s not handled' % algo)

    mod.fit(Xtr, ytr.ravel())
    y_pred = mod.predict(Xte)
    proba_pred = mod.predict_probability(Xte)
    ret = dict(y_pred=y_pred, y_true=yte, proba_pred=proba_pred,
               beta=mod.beta)  # , mask=mask)
    if output_collector:
        output_collector.collect(key, ret)
    else:
        return ret
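# Worked example of the penalty split above for one key, say
# key = ('enettv', 0.01, 0.1, 0.3) (plain arithmetic, no framework needed):
#   tv = 0.01 * 0.3                = 0.003
#   l1 = 0.01 * (1 - 0.003) * 0.1  = 0.000997
#   l2 = 0.01 * (1 - 0.003) * 0.9  = 0.008973
# i.e. tvratio fixes the TV share of alpha, and l1l2ratio then splits
# alpha * (1 - tv) between the l1 and l2 terms.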
algorithm_params = dict(eps=1e-4, max_iter=20000, info=info)

## Get data structure from array shape

# l2 penalty + gradient descent
if has_sklearn:
    MODELS["2d_l2_sklearn"] = \
        sklearn.linear_model.LogisticRegression(
            C=1. / alpha, fit_intercept=False, class_weight=None, dual=False)

# Parsimony: minimize f(beta, X, y) = - loglik + alpha/2 * ||beta||^2_2
MODELS["2d_l2_grad_descnt"] = \
    estimators.RidgeLogisticRegression(
        alpha, class_weight=None, mean=False,
        algorithm_params=algorithm_params)

if has_sklearn:
    MODELS["2d_l2_inter_sklearn"] = \
        sklearn.linear_model.LogisticRegression(
            C=1. / alpha, fit_intercept=True, class_weight=None, dual=False)

MODELS["2d_l2_inter_grad_descnt"] = \
    estimators.RidgeLogisticRegression(
        alpha, class_weight=None, mean=False, penalty_start=1,
        algorithm_params=algorithm_params)
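# Hypothetical consistency check (random data, not part of the original
# experiment; assumes the MODELS entries above and that has_sklearn is set):
# with fit_intercept=False, mean=False and C = 1. / alpha, the sklearn and
# parsimony objectives coincide up to a constant factor, so the fitted
# weights should nearly match.
if has_sklearn:
    import numpy as np
    rng = np.random.RandomState(0)
    X_chk = rng.randn(200, 2)
    y_chk = (X_chk[:, 0] + .5 * rng.randn(200) > 0).astype(float)
    w_sk = MODELS["2d_l2_sklearn"].fit(X_chk, y_chk).coef_.ravel()
    w_pr = MODELS["2d_l2_grad_descnt"].fit(X_chk,
                                           y_chk.reshape(-1, 1)).beta.ravel()
    print(np.max(np.abs(w_sk - w_pr)))  # expected small, up to solver tolerance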
# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(Xtr, ytr)

###############################################################################
# Ridge sklearn
# Min f(beta) = - C * loglik + 1/2 * ||beta||^2_2
ridge_sklrn = LogisticRegression(C=1. / (alpha * n_train), fit_intercept=False)
yte_pred_ridge = ridge_sklrn.fit(Xtr, ytr.ravel()).predict(Xte)
_, recall_ridge_sklrn, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_ridge, average=None)

# Ridge Parsimony
# Min f(beta, X, y) = - loglik / n_train + alpha/2 * ||beta||^2_2
ridge_prsmy = estimators.RidgeLogisticRegression(alpha)
yte_pred_ridge_prsmy = ridge_prsmy.fit(Xtr, ytr).predict(Xte)
_, recall_ridge_prsmy, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_ridge_prsmy, average=None)

# ElasticNet
enet = estimators.ElasticNetLogisticRegression(l=0.5, alpha=alpha)
yte_pred_enet = enet.fit(Xtr, ytr).predict(Xte)
_, recall_enet, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_enet, average=None)

# GraphNet
# l1, l2, gn = alpha * np.array((.05, .75, .2))  # l1, l2, gn penalties
l1, l2, gn = alpha * np.array((.33, .33, .33))  # l1, l2, gn penalties
A = sparse.vstack(nesterov_tv.linear_operator_from_shape(shape))
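# Why C = 1. / (alpha * n_train) in the ridge comparison above: sklearn
# minimizes
#     1/2 * ||beta||^2_2 + C * sum_i loss_i(beta)
# while parsimony (mean loss) minimizes
#     (1/n_train) * sum_i loss_i(beta) + alpha/2 * ||beta||^2_2
# Multiplying the parsimony objective by n_train and matching the penalty
# weights gives 1/C = alpha * n_train, i.e. C = 1. / (alpha * n_train).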
# Reload with alphas = [.01, 0.1, 1.0]
betas = np.load(os.path.join(OUTPUT, "beta_start.npz"))
betas = {"lambda_%.4f" % float(k.split("_")[1]): betas[k]
         for k in betas.keys()}
[[k, np.sum(betas[k] ** 2)] for k in betas.keys()]
[[k, betas[k].shape] for k in betas.keys()]
B = np.hstack([betas[k] for k in betas.keys()])
np.corrcoef(B.T)
"""
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0]
# alphas = [0.0001, 0.001]
for alpha in alphas:
    mod = estimators.RidgeLogisticRegression(
        l=alpha, class_weight="auto", penalty_start=penalty_start,
        algorithm_params=dict(max_iter=10000))
    t_ = time.clock()
    mod.fit(Xs, y.ravel())
    print(time.clock() - t_, mod.algorithm.num_iter)  # 11564
    betas["lambda_%.4f" % alpha] = mod.beta

# np.savez(os.path.join(OUTPUT, "beta_start_1000ite.npz"), **betas)
np.savez(os.path.join(OUTPUT, "beta_start.npz"), **betas)
betas.keys()
beta_start = np.load(os.path.join(OUTPUT, "beta_start.npz"))
assert np.all([np.all(beta_start[a] == betas[a]) for a in beta_start.keys()])
"""
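# What np.corrcoef(B.T) above reports (a reading note): B stacks one beta
# column per alpha, so entry (i, j) is the Pearson correlation between the
# coefficient vectors obtained with alpha_i and alpha_j; values near 1 mean
# the warm-start betas are nearly proportional across the penalty range.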