def one_edge_tv(coord_ij, tria_ij, data_ij, Y_ij, params):
    """Fit an L1 + L2 + TV penalised linear regression and return its test R2.

    The TV linear operator is built from the mesh (``coord_ij``, ``tria_ij``),
    the samples are split 67% / 33% into train / test sets with a fixed seed,
    and train / test MSE and R2 are printed.

    Parameters
    ----------
    coord_ij, tria_ij :
        Mesh description forwarded to ``tv.linear_operator_from_mesh``.
    data_ij, Y_ij :
        Predictors and targets forwarded to ``train_test_split``.
    params : mapping
        Must provide the penalty weights under the keys ``'l1'``, ``'l2'``
        and ``'tv'``.

    Returns
    -------
    The R2 score computed on the held-out test split.
    """
    A_ij = tv.linear_operator_from_mesh(coord_ij, tria_ij)
    X_train, X_test, y_train, y_test = train_test_split(
        data_ij, Y_ij, shuffle=True, random_state=1, test_size=0.33)

    tv_reg = LinearRegressionL1L2TV(l1=params['l1'], l2=params['l2'],
                                    tv=params['tv'], A=A_ij)
    tv_reg.fit(X_train, y_train)
    y_train_pred = tv_reg.predict(X_train)
    y_test_pred = tv_reg.predict(X_test)

    mse_test = mean_squared_error(y_test, y_test_pred)
    # Score the test predictions computed above; the previous code referenced
    # an undefined name (``y_test_pred_tv``) and failed with NameError.
    r2_test = r2_score(y_test, y_test_pred)

    print('TRAIN MSE TV: {}, TEST MSE TV: {}'.format(
        mean_squared_error(y_train, y_train_pred), mse_test))
    print('TRAIN R2 TV: {}, TEST R2 TV: {}\n'.format(
        r2_score(y_train, y_train_pred), r2_test))
    return r2_test
def test_tvhelper_linear_operator_from_mesh(self):
    """Check the sparsity pattern of the TV operator built from a tiny mesh.

    The mesh has six vertices laid out on a 2 x 3 grid and four triangles.
    For every matrix yielded by the operator, the column indices of the
    non-zero entries of each row are compared with hard-coded references.
    """
    import parsimony.functions.nesterov.tv as tv_helper

    mesh_coord = np.array([[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]])
    mesh_triangles = np.array([[0, 1, 3], [0, 2, 3], [2, 3, 5], [2, 4, 5]])
    A = tv_helper.linear_operator_from_mesh(mesh_coord, mesh_triangles)

    # One entry per matrix; within it, one list of non-zero columns per row.
    observed = []
    for matrix in A:
        nonzero_cols = [np.where(row)[0].tolist() for row in matrix.toarray()]
        observed.append(nonzero_cols)

    expected = [
        [[], [0, 1], [0, 2], [0, 3], [2, 4], [2, 5]],
        [[], [], [], [1, 3], [], [3, 5]],
        [[], [], [], [2, 3], [], [4, 5]],
    ]
    assert observed == expected
def load_globals(config):
    """Populate the ``mapreduce`` module with the data shared by all jobs.

    Sets ``DATA``, ``mesh_coord``, ``mesh_triangles``, ``mask``, ``A`` (the
    TV linear operator built from the masked mesh) and ``CONFIG`` as
    attributes of the ``mapreduce`` module.

    ``config`` must provide ``config["data"]`` plus the paths
    ``config["structure"]["mesh"]`` and ``config["structure"]["mask"]``.
    """
    import mapreduce as GLOBAL  # access to global variables
    GLOBAL.DATA = GLOBAL.load_data(config["data"])

    import brainomics.mesh_processing as mesh_utils
    structure_cfg = config["structure"]
    coords, triangles = mesh_utils.mesh_arrays(structure_cfg["mesh"])
    vertex_mask = np.load(structure_cfg["mask"])

    GLOBAL.mesh_coord = coords
    GLOBAL.mesh_triangles = triangles
    GLOBAL.mask = vertex_mask
    GLOBAL.A = tv_helper.linear_operator_from_mesh(coords, triangles,
                                                   vertex_mask)
    GLOBAL.CONFIG = config
def load_globals(config):
    """Populate the ``mapreduce`` module with the data shared by all jobs.

    Sets ``DATA``, ``A`` (operator built from the structure mask),
    ``STRUCTURE``, ``N_COMP``, ``Atv`` (operator built from the masked
    ``lrh.pial.gii`` mesh) and ``FULL_RESAMPLE`` as attributes of the
    ``mapreduce`` module.

    ``config`` must provide the keys ``"data"``, ``"structure"`` (path of a
    ``.npy`` file), ``"N_COMP"`` and ``"full_resample"``.
    """
    import mapreduce as GLOBAL  # access to global variables
    # This import must precede the first use of ``tv_helper``: importing it
    # anywhere in the function makes the name local to the whole function,
    # so the earlier placement (after ``A_from_mask``) raised
    # UnboundLocalError.
    # NOTE(review): ``A_from_mask`` is therefore resolved on
    # parsimony.functions.nesterov.tv -- confirm that no other module-level
    # ``tv_helper`` alias was intended for that call.
    import parsimony.functions.nesterov.tv as tv_helper

    GLOBAL.DATA = GLOBAL.load_data(config["data"])
    STRUCTURE = np.load(config["structure"])
    A = tv_helper.A_from_mask(STRUCTURE)
    N_COMP = config["N_COMP"]
    GLOBAL.A, GLOBAL.STRUCTURE, GLOBAL.N_COMP = A, STRUCTURE, N_COMP

    mesh_coord, mesh_triangles = mesh_utils.mesh_arrays(
        os.path.join(TEMPLATE_PATH, "lrh.pial.gii"))
    mask = np.load(os.path.join(INPUT_BASE_DIR, "mask.npy"))
    Atv = tv_helper.linear_operator_from_mesh(mesh_coord, mesh_triangles,
                                              mask=mask)
    GLOBAL.Atv = Atv
    GLOBAL.FULL_RESAMPLE = config['full_resample']
# Persist the mask and keep only the masked columns of the full matrix.
np.save(os.path.join(OUTPUT, "mask.npy"), mask)
X = Xtot[:, mask]
assert X.shape == (280, 299806)

#############################################################################
# Prepend the columns of Z (three extra columns, per the shape checks).
X = np.hstack([Z, X])
assert X.shape == (280, 299809)

# Remove the samples whose target is NaN
X = X[np.logical_not(np.isnan(y)).ravel(), :]
y = y[np.logical_not(np.isnan(y))]
assert X.shape == (280, 299809)

np.save(os.path.join(OUTPUT, "X.npy"), X)
np.save(os.path.join(OUTPUT, "y.npy"), y)

#############################################################################
# Build the TV linear operator on the masked mesh, save it, then reload it
# and check that the stored singular value and the matrix shapes are the
# expected ones.
import parsimony.functions.nesterov.tv as nesterov_tv
from parsimony.utils.linalgs import LinearOperatorNesterov

Atv = nesterov_tv.linear_operator_from_mesh(
    cor, tri, mask, calc_lambda_max=True)
Atv.save(os.path.join(OUTPUT, "Atv.npz"))
Atv_ = LinearOperatorNesterov(filename=os.path.join(OUTPUT, "Atv.npz"))

assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
assert np.allclose(
    Atv_.get_singular_values(0), 8.999, rtol=1e-03, atol=1e-03)
assert np.all([a.shape == (299806, 299806) for a in Atv])
# Logistic regression with L1 + L2 + TV penalties on the 2D operator,
# solved with inexact FISTA; the "inter" variant sets penalty_start=1.
MODELS["2d_l1l2tv_inexactfista"] = estimators.LogisticRegressionL1L2TVInexactFISTA(
    l1, l2, tv, Al1tv,
    algorithm_params=algorithm_params)

MODELS["2d_l1l2tv_inter_inexactfista"] = estimators.LogisticRegressionL1L2TVInexactFISTA(
    l1, l2, tv, Al1tv,
    penalty_start=1,
    algorithm_params=algorithm_params)

## Get data structure from mesh
# build a cylinder mesh with the same topology as the 2D grid
xyz, tri = mesh.cylinder(shape[1], shape[0])
Atvmesh = nesterov_tv.linear_operator_from_mesh(xyz, tri)

# Same penalties on the mesh operator, solved with CONESTA.
MODELS["mesh_l1l2tv_conesta"] = estimators.LogisticRegressionL1L2TV(
    l1, l2, tv, Atvmesh,
    algorithm=algorithms.proximal.CONESTA(),
    algorithm_params=algorithm_params)

MODELS["mesh_l1l2tv_inter_conesta"] = estimators.LogisticRegressionL1L2TV(
    l1, l2, tv, Atvmesh,
    penalty_start=1,
    algorithm=algorithms.proximal.CONESTA(),
    algorithm_params=algorithm_params)

Atvl1mesh = l1tv.linear_operator_from_mesh(xyz, tri)
# Split one global penalty into its TV, L1 and L2 parts: `tv_ratio` is the
# share given to TV and `l1_ratio` divides the remainder between L1 and L2.
global_pen = 0.1
tv_ratio =1e-05# 0.5
l1_ratio = 0.1
ltv = global_pen * tv_ratio
ll1 = l1_ratio * global_pen * (1 - tv_ratio)
ll2 = (1 - l1_ratio) * global_pen * (1 - tv_ratio)
# The three parts must add up to the global penalty.
assert(np.allclose(ll1 + ll2 + ltv, global_pen))

# Build the TV linear operator from the template mesh restricted to the mask.
mesh_coord, mesh_triangles = mesh_utils.mesh_arrays(os.path.join(TEMPLATE_PATH, "lrh.pial.gii"))
mask = np.load(os.path.join(INPUT_BASE_DIR, "mask.npy"))
import parsimony.functions.nesterov.tv as tv_helper
Atv = tv_helper.linear_operator_from_mesh(mesh_coord, mesh_triangles, mask=mask)

# PARSIMONY
########################################
from parsimony.algorithms.utils import AlgorithmSnapshot
# `snapshot` is the bound `save_conesta` method of a snapshot object set up
# with a saving period of 1 and the output directory below.
snapshot = AlgorithmSnapshot('/neurospin/brainomics/2014_pca_struct/adni/adni_time/enet_1e-6/',saving_period=1).save_conesta

# NOTE(review): the call below is cut off in the visible source; its remaining
# arguments and closing parenthesis are outside this view and are left as-is.
mod = pca_tv.PCA_L1_L2_TV(n_components=3,
                          l1=ll1, l2=ll2, ltv=ltv,
                          Atv=Atv,
                          criterion="frobenius",
                          eps=1e-6,
                          inner_eps=1e-1,
                          max_iter=100,
                          inner_max_iter=int(1e4),
def _outer_cv_folds(y):
    """Return the outer stratified folds of ``y`` as ``[train, test]`` pairs.

    The folds are checked against the ``cvNN/refit`` entries stored in
    ``config_modselectcv.json`` of the original working directory; an
    AssertionError is raised if they differ.
    """
    # No ``random_state``: the splitter does not shuffle, so the folds are
    # already deterministic, and recent scikit-learn versions raise
    # ValueError when a seed is given together with shuffle=False.
    splitter = StratifiedKFold(n_splits=NFOLDS_OUTER)
    cv_outer = [[tr, te] for tr, te in
                splitter.split(np.zeros(y.shape[0]), y.ravel())]

    # check we got the same CV as previously
    with open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")) as fd:
        cv_old = json.load(fd)["resample"]
    cv_outer_old = [cv_old[k] for k in
                    ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]]
    assert np.all([np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
                   for i in range(NFOLDS_OUTER)])
    assert np.all([np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
                   for i in range(NFOLDS_OUTER)])
    # check END
    return cv_outer


def init():
    """Prepare the working directory ``WD`` and write the job configurations.

    Steps:

    1. Copy ``X.npy``, ``y.npy``, ``mask.npy`` and the mesh file into ``WD``
       and change the current directory to ``WD``.
    2. Build and save the TV linear operator (``Atv.npz``) if it is missing.
    3. Fit ridge logistic regressions for several penalties and save their
       coefficients (``beta_start.npz``) if that file is missing.
    4. Setting 1: write ``config_cv_largerange.json`` (simple outer CV, large
       parameter grid) and the matching cluster job files.
    5. Setting 2: write ``config_dcv_reducedrange.json`` (nested CV, reduced
       parameter grid) and the matching cluster job files.

    Raises AssertionError when one of the sanity checks fails (operator
    shape / singular value, folds differing from the original run, grid
    sizes).
    """
    INPUT_DATA_X = os.path.join(WD_ORIGINAL, 'X.npy')
    INPUT_DATA_y = os.path.join(WD_ORIGINAL, 'y.npy')
    INPUT_MASK_PATH = os.path.join(WD_ORIGINAL, 'mask.npy')
    INPUT_MESH_PATH = '/neurospin/brainomics/2013_adni/MCIc-CTL-FS_cs/lrh.pial.gii'
    #INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/30yo/Atv.npz'
    # INPUT_CSV = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/population_30yo.csv'

    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_MESH_PATH, WD)
    #shutil.copy(INPUT_LINEAR_OPE_PATH, WD)

    ## Create config file
    os.chdir(WD)
    X = np.load("X.npy")
    y = np.load("y.npy")

    if not os.path.exists(os.path.join(WD, "Atv.npz")):
        import brainomics.mesh_processing as mesh_utils
        cor, tri = mesh_utils.mesh_arrays(os.path.join(WD, "lrh.pial.gii"))
        mask = np.load(os.path.join(WD, 'mask.npy'))

        import parsimony.functions.nesterov.tv as nesterov_tv
        from parsimony.utils.linalgs import LinearOperatorNesterov
        Atv = nesterov_tv.linear_operator_from_mesh(cor, tri, mask,
                                                    calc_lambda_max=True)
        Atv.save(os.path.join(WD, "Atv.npz"))
        # Reload the saved operator and check it round-trips.
        Atv_ = LinearOperatorNesterov(filename=os.path.join(WD, "Atv.npz"))
        assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
        assert np.allclose(Atv_.get_singular_values(0), 8.999,
                           rtol=1e-03, atol=1e-03)
        assert np.all([a.shape == (317089, 317089) for a in Atv])

    if not os.path.exists(os.path.join(WD, "beta_start.npz")):
        betas = dict()
        import time
        alphas = [.01, 0.1, 1.0, 10]
        for alpha in alphas:
            mod = estimators.RidgeLogisticRegression(
                l=alpha, class_weight="auto", penalty_start=penalty_start)
            t_ = time.time()
            mod.fit(X, y.ravel())
            print(time.time() - t_)  # 11564
            betas["lambda_%.2f" % alpha] = mod.beta

        np.savez(os.path.join(WD, "beta_start.npz"), **betas)
        # Reload and check that what was written matches what was fitted.
        beta_start = np.load(os.path.join(WD, "beta_start.npz"))
        assert np.all([np.all(beta_start[a] == betas[a])
                       for a in beta_start.keys()])

    ## Create config file

    # ########################################################################
    # Setting 1: 5cv + large range of parameters: cv_largerange
    # with sub-sample training set with size 50, 100
    # 5cv/cv0*[_sub50]/refit/*

    # sub_sizes = [50, 100]
    sub_sizes = []

    cv_outer = _outer_cv_folds(y)

    import collections
    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        # Simple CV
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]

        # Nested CV
        # cv_inner = StratifiedKFold(y[tr_val].ravel(), n_folds=NFOLDS_INNER, random_state=42)
        # for cv_inner_i, (tr, val) in enumerate(cv_inner):
        #     cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = [tr_val[tr], tr_val[val]]

        # Sub-sample training set with size 50, 100
        # => cv*_sub[50|100]/refit
        grps = np.unique(y[tr_val]).astype(int)
        # Blank out the test samples so they can never be drawn below.
        ytr = y.copy()
        ytr[te] = np.nan
        g_idx = [np.where(ytr == g)[0] for g in grps]
        assert np.all([np.all(ytr[g_idx[g]] == g) for g in grps])

        g_size = np.array([len(g) for g in g_idx])
        g_prop = g_size / g_size.sum()

        for sub_size in sub_sizes:
            # sub_size = sub_sizes[0]
            # Keep the group proportions of the training set in the sample.
            sub_g_size = np.round(g_prop * sub_size).astype(int)
            g_sub_idx = [np.random.choice(g_idx[g], sub_g_size[g],
                                          replace=False) for g in grps]
            assert np.all([np.all(y[g_sub_idx[g]] == g) for g in grps])
            tr_val_sub = np.concatenate(g_sub_idx)
            assert len(tr_val_sub) == sub_size
            assert np.all([idx in tr_val for idx in tr_val_sub])
            assert np.all(np.logical_not([idx in te for idx in tr_val_sub]))
            cv["cv%02d_sub%i/refit" % (cv_outer_i, sub_size)] = \
                [tr_val_sub, te]

    # Index arrays are not JSON serialisable: convert them to plain lists.
    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}

    # Nested CV
    # assert len(cv_largerange) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1

    # Simple CV
    # assert len(cv) == NFOLDS_OUTER + 1

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(cv) == NFOLDS_OUTER * (1 + len(sub_sizes)) + 1

    print(list(cv.keys()))

    # Large grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1, 1.0] # first ran with this grid
    tv_ratio = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    l1l2_ratio = [0.1, 0.5, 0.9]
    # l1l2_ratio = [0, 0.1, 0.5, 0.9, 1.0] # first ran with this grid
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio, tv_ratio)]
    assert len(params_enet_tvgn) == 240  # old 300

    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 12  # old 15

    params = params_enet_tvgn + params_enet
    assert len(params) == 252  # 315
    # Simple CV
    # assert len(params) * len(cv) == 1890

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(params) * len(cv) == 1512  # 5040

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    # Close the file explicitly so the config is fully written before the
    # job files that reference it are generated.
    with open(os.path.join(WD, "config_cv_largerange.json"), "w") as fd:
        json.dump(config, fd)

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)

    # ########################################################################
    # Setting 2: dcv + reduced range of parameters: dcv_reducedrange
    # 5cv/cv0*/cvnested0*/*

    cv_outer = _outer_cv_folds(y)

    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]
        # Inner folds are positions within tr_val; map them back to sample
        # indices. As for the outer folds, no seed is passed because the
        # splitter does not shuffle.
        cv_inner = StratifiedKFold(n_splits=NFOLDS_INNER).split(
            np.zeros(y[tr_val].shape[0]), y[tr_val].ravel())
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = \
                [tr_val[tr], tr_val[val]]

    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}
    #assert len(cv) == NFOLDS_OUTER + 1
    assert len(cv) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1
    print(list(cv.keys()))

    # Reduced grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1] # original
    tv_ratio = [0.2, 0.8]
    l1l2_ratio = [0.1, 0.9]
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio, tv_ratio)]
    assert len(params_enet_tvgn) == 32  # 16

    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 8  # 4

    params = params_enet_tvgn + params_enet
    assert len(params) == 40  # 20
    assert len(params) * len(cv) == 1240  # 620

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    with open(os.path.join(WD, "config_dcv_reducedrange.json"), "w") as fd:
        json.dump(config, fd)

    # Build utils files: sync (push/pull) and PBS
    cmd = "mapreduce.py --map %s/config_dcv_reducedrange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_dcv_reducedrange",
                                            freecores=2)