def init():
    INPUT_DATA_X = os.path.join('X.npy')
    INPUT_DATA_y = os.path.join('y.npy')
    INPUT_MASK_PATH = os.path.join("mask.nii")
    #WD = os.path.join(WD, 'logistictvenet_5cv')
    if not os.path.exists(WD):
        os.makedirs(WD)
    os.chdir(WD)

    #########################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)
    X = np.load(INPUT_DATA_X)

    from parsimony.utils.penalties import l1_max_logistic_loss
    assert l1_max_logistic_loss(X[:, 3:], y) == 0.20434911093262279

    if os.path.exists(config_filenane()):
        old_conf = json.load(open(config_filenane()))
        cv = old_conf["resample"]
    else:
        cv_outer = [[tr, te] for tr, te in
                    StratifiedKFold(y.ravel(), n_folds=NFOLDS_OUTER,
                                    random_state=42)]
        """
        cv_outer = [[np.array(tr), np.array(te)] for tr, te in
                    json.load(open("/neurospin/brainomics/2013_adni/MCIc-CTL_csi/config.json", "r"))["resample"][1:]]
        """
        import collections
        cv = collections.OrderedDict()
        for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
            cv["cv%02d/refit" % cv_outer_i] = [tr_val, te]
            cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                       n_folds=NFOLDS_INNER, random_state=42)
            for cv_inner_i, (tr, val) in enumerate(cv_inner):
                cv["cv%02d/cvnested%02d" % (cv_outer_i, cv_inner_i)] = \
                    [tr_val[tr], tr_val[val]]
        for k in cv:
            cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]
    print cv.keys()

    # Some QC: folds are disjoint, stratified and of the expected size
    N = float(len(y))
    p0 = np.sum(y == 0) / N
    p1 = np.sum(y == 1) / N
    for k in cv:
        tr, val = cv[k]
        tr, val = np.array(tr), np.array(val)
        print k, "\t: tr+val=", len(tr) + len(val)
        assert not set(tr).intersection(val)
        assert abs(np.sum(y[tr] == 0) / float(len(y[tr])) - p0) < 0.01
        assert abs(np.sum(y[tr] == 1) / float(len(y[tr])) - p1) < 0.01
        if k.count("refit"):
            te = val
            assert len(tr) + len(te) == len(y)
            assert abs(len(y[tr]) / N - (1 - 1. / NFOLDS_OUTER)) < 0.01
        else:
            te = np.array(cv[k.split("/")[0] + "/refit"])[1]
            assert abs(len(y[tr]) / N -
                       (1 - 1. / NFOLDS_OUTER) * (1 - 1. / NFOLDS_INNER)) < 0.01
            assert not set(tr).intersection(te)
            assert not set(val).intersection(te)
            assert len(tr) + len(val) + len(te) == len(y)

    tv_ratios = [0., .2, .8]
    l1_ratios = [np.array([1., .1, .9, 1]),
                 np.array([1., .9, .1, 1])]  # [alpha, l1, l2, tv]
    alphas_l1l2tv = [.01, .1]
    alphas_l2tv = [round(alpha, 10) for alpha in 10. ** np.arange(-2, 4)]
    k_range = [-1]
    l1l2tv = [np.array([alpha, float(1 - tv), float(1 - tv), tv]) * l1_ratio
              for alpha in alphas_l1l2tv for tv in tv_ratios
              for l1_ratio in l1_ratios]
    # Specific case without l1, since it supports larger penalties
    l2tv = [np.array([alpha, 0., float(1 - tv), tv])
            for alpha in alphas_l2tv for tv in tv_ratios]
    params = l1l2tv + l2tv
    params = [param.tolist() + [k] for k in k_range for param in params]
    params = {"_".join([str(p) for p in param]): param for param in params}
    assert len(params) == 30
    user_func_filename = os.path.join(os.environ["HOME"], "git", "scripts",
                                      "2013_adni", "MCIc-CTL",
                                      "02_tvenet_modselectcv_csi.py")
    #print __file__, os.path.abspath(__file__)
    print "user_func", user_func_filename
    #import sys
    #sys.exit(0)

    # Use relative paths from config.json
    config = dict(data=dict(X=INPUT_DATA_X, y=INPUT_DATA_y),
                  params=params, resample=cv,
                  mask_filename=INPUT_MASK_PATH,
                  penalty_start=3,
                  map_output="modselectcv",
                  user_func=user_func_filename,
                  #reduce_input="rndperm/*/*",
                  reduce_group_by="user_defined",
                  reduce_output="MCIc-CTL_csi_modselectcv.csv")
    json.dump(config, open(os.path.join(WD, "config_modselectcv.json"), "w"))

    #########################################################################
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, WD_CLUSTER = \
        clust_utils.gabriel_make_sync_data_files(WD)
    cmd = "mapreduce.py --map %s/config_modselectcv.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd)

    #########################################################################
    # Sync to cluster
    print "Sync data to gabriel.intra.cea.fr:"
    os.system(sync_push_filename)

    #########################################################################
    print "# Start by running locally with 2 cores, to check that everything is OK"
    print "# Interrupt after a while with CTRL-C"
    print "mapreduce.py --map %s/config_modselectcv.json --ncore 2" % WD
    #os.system("mapreduce.py --mode map --config %s/config.json" % WD)
    print "# 1) Log on gabriel:"
    print 'ssh -t gabriel.intra.cea.fr'
    print "# 2) Run one job to test"
    print "qsub -I"
    print "cd %s" % WD_CLUSTER
    print "./job_Global_long.pbs"
    print "# 3) Run on cluster"
    print "qsub job_Global_long.pbs"
    print "# 4) Log out and pull"
    print "exit"
    print sync_pull_filename

    #########################################################################
    print "# Reduce"
    print "mapreduce.py --reduce %s/config_modselectcv.json" % WD
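
#############################################################################
# A minimal decoding sketch (not part of the original pipeline): the mapper
# keys built above join [alpha, l1, l2, tv, k] with "_", so a reducer can
# recover the numeric values by splitting the key back apart. The example
# key below is hypothetical.
def parse_param_key(key):
    values = key.split("_")
    return [float(v) for v in values[:4]] + [int(values[4])]

assert parse_param_key("0.01_0.08_0.72_0.2_-1") == [0.01, 0.08, 0.72, 0.2, -1]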
ytr = y[tr]
Xte = X[te, :]
yte = y[te]
beta_start = weights.RandomUniformWeights().get_weights(Xtr.shape[1])
# Check that ytr is balanced
#assert ytr.sum() / ytr.shape[0] == 0.5
#assert yte.sum() / yte.shape[0] == 0.53500000000000003

# Dataset with intercept: prepend a column of ones
Xtr_i = np.c_[np.ones((Xtr.shape[0], 1)), Xtr]
Xte_i = np.c_[np.ones((Xte.shape[0], 1)), Xte]
beta_start_i = weights.RandomUniformWeights().get_weights(Xtr_i.shape[1])

# Global penalty
alpha = l1_max_logistic_loss(Xtr, ytr)

from parsimony.algorithms.utils import Info
info = [Info.converged, Info.num_iter, Info.time, Info.func_val]

###############################################################################
## Models
###############################################################################
MODELS = collections.OrderedDict()
algorithm_params = dict(eps=1e-4, max_iter=20000, info=info)

## Get data structure from array shape
# l2 + gradient descent
if has_sklearn:
shape = (50, 50, 1)

X3d, y, beta3d, proba = datasets.classification.dice5.load(
    n_samples=n_samples, shape=shape, snr=10, random_seed=1)
X = X3d.reshape((n_samples, np.prod(beta3d.shape)))

###
n_train = 300
Xtr = X[:n_train, :]
ytr = y[:n_train]
Xte = X[n_train:, :]
yte = y[n_train:]

# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(Xtr, ytr)

###########################################################################
## Use sklearn l2-penalized LogisticRegression
# Minimize:
#     f(beta) = - C * loglik + 1/2 * ||beta||^2_2
ridge_sklrn = LogisticRegression(C=1. / (alpha * n_train),
                                 fit_intercept=False)
yte_pred_ridge = ridge_sklrn.fit(Xtr, ytr.ravel()).predict(Xte)
_, recall_ridge_sklrn, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_ridge, average=None)

###########################################################################
## Use parsimony l2-penalized LogisticRegression:
## LogisticRegressionL1L2TV with l1 = tv = 0
# Minimize:
#     f(beta, X, y) = - loglik / n_train + k/2 * ||beta||^2_2
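# A minimal sketch of the parsimony fit announced just above, assuming the
# `estimators` and `nesterov_tv` imports used elsewhere in this document.
# With l1 = tv = 0 only the l2 (ridge) term of LogisticRegressionL1L2TV
# remains, so the result should match the sklearn fit up to solver tolerance.
A = nesterov_tv.linear_operator_from_shape(shape)
ridge_prsmy = estimators.LogisticRegressionL1L2TV(
    l1=0., l2=alpha, tv=0., A=A,
    algorithm_params=dict(eps=1e-4, max_iter=20000))
yte_pred_prsmy = ridge_prsmy.fit(Xtr, ytr).predict(Xte)
_, recall_ridge_prsmy, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_prsmy, average=None)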
l1l2tv = np.concatenate(l1l2tv)
params = [param.tolist() for param in l1l2tv]

#############################################################################
# File to store classification scores
f = open(os.path.join(BASE_PATH, 'results', 'Logistic_L1_L2_TV_withHC',
                      'parameters_scores.csv'), 'wb')
c = csv.writer(f, delimiter=',')
c.writerow(["alpha", "l1", "l2", "tv",
            "accuracy", "recall_0", "recall_1",
            "precision_0", "precision_1", "auc"])

# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(T, y)
conesta = algorithms.proximal.CONESTA(max_iter=500)
A = nesterov_tv.linear_operator_from_mask(mask_bool)

# Messages for communication between processes
FLAG_STOP_PROCESS = "STOP_WORK"
FLAG_PROCESS_FINISHED = "PROCESS_HAS_FINISHED"
nb_processes = 30

# Data structures for parallel processing
manager = Manager()  # multiprocessing.Manager()
work_queue, result_queue = manager.Queue(), manager.Queue()

# Add jobs to work_queue
for p in params:
    #print p
    work_queue.put(p)
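
#############################################################################
# A minimal sketch (not in the original script) of the worker that consumes
# this queue: loop until the stop sentinel arrives, fit one parameter setting,
# and push one CSV row back. `fit_one` is a hypothetical helper standing in
# for the elided CONESTA fitting code.
def worker(work_queue, result_queue):
    while True:
        job = work_queue.get()
        if job == FLAG_STOP_PROCESS:      # sentinel pushed by the parent
            result_queue.put(FLAG_PROCESS_FINISHED)
            break
        alpha, l1, l2, tv = job           # one [alpha, l1, l2, tv] setting
        result_queue.put(fit_one(alpha, l1, l2, tv))  # hypothetical: one row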
Xd = Xd[keep]
ALPHA, L1_PROP = 1 / np.log2(Xd.shape[1]), 0.0
print(ALPHA, L1_PROP)
"""
Xd2 = Xd.copy()
Xd2["TRANSITION"] = yd
Xd2.to_csv(WD + "/data/transitionPREP_ARonly_CAARMSonly.csv", index=False)
"""
X = np.asarray(Xd)
y = np.asarray(yd)
y = y.astype(float)[:, np.newaxis]
#A, n_compacts = A_empty(X.shape[1] - 1)
l1_max_logistic_loss(X, y)
assert X.shape == (27, 26)

############################################################################
## FIXED PARAMETERS
############################################################################
#N_FOLDS = 10
N_PERMS = 10001
NBOOT = 1000
ALPHAS = [.01, .05, .1, 1 / np.log2(X.shape[1]), .5, 1, 10, 100]
#L1_PROP = np.arange(0, 1.1, 0.1)
L1_PROPS = np.arange(0, 1.1, 0.1)


def scores(y_true, y_pred, prob_pred):
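    # The body below is a hedged sketch (not in the original fragment): it
    # assumes sklearn-style metrics and that `prob_pred` holds the predicted
    # probability of class 1, matching the score columns used elsewhere in
    # this document (accuracy, per-class recall/precision, auc).
    from sklearn import metrics
    p, r, f, s = metrics.precision_recall_fscore_support(y_true, y_pred,
                                                         average=None)
    return dict(accuracy=metrics.accuracy_score(y_true, y_pred),
                recall_0=r[0], recall_1=r[1],
                precision_0=p[0], precision_1=p[1],
                auc=metrics.roc_auc_score(y_true, prob_pred))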
###############################################################################
# Fetch dice5 dataset
dataset_name = "%s_%s_%ix%ix%i_%i_dataset_v%s.npz" % \
    tuple(["dice5", "classif", 50, 50, 1, 500, '0.3.1'])
_, data = datasets.utils.download_dataset(dataset_name)
X, y, beta3d = data['X'], data['y'], data['beta3d']

###############################################################################
# Solve many Enet-TV problems in parallel
# ---------------------------------------
#
# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(X, y)

###############################################################################
# Penalization parameters are now vectors of equal length
l1 = alpha * np.array([0.5, 0.5, 0.5])
l2 = alpha * np.array([0.5, 0.5, 0.5])
tv = alpha * np.array([0.01, 0.2, 0.8])
max_iter = 1000

###############################################################################
# Build the linear operator and fit the model
A = nesterov_tv.linear_operator_from_shape(beta3d.shape, calc_lambda_max=True)
enettv = estimators.LogisticRegressionL1L2TV(
    l1=l1, l2=l2, tv=tv, A=A,
    algorithm_params=dict(max_iter=max_iter))
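
###############################################################################
# A hedged usage sketch (not in the original excerpt): fit, then map the
# coefficients back onto the 50x50x1 grid. It assumes the fitted estimator
# exposes its coefficients as `enettv.beta` with one column per penalty
# triple; that layout is an assumption about this vectorized API, not a
# documented fact.
enettv.fit(X, y)
for j in range(enettv.beta.shape[1]):
    weight_map = enettv.beta[:, j].reshape(beta3d.shape)  # (50, 50, 1) image
    print(weight_map.shape, (weight_map != 0).sum())      # sparsity per fit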
l1l2tv = np.concatenate(l1l2tv)
params = [param.tolist() for param in l1l2tv]

#############################################################################
# File to store classification scores and weights
f = open(os.path.join(BASE_PATH, 'results', 'Logistic_L1_L2_TV',
                      'parameters_scoresBetas.csv'), 'wb')
c = csv.writer(f, delimiter=',')
c.writerow(["alpha", "l1", "l2", "tv",
            "accuracy", "recall_0", "recall_1",
            "precision_0", "precision_1", "auc"])

# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(T_all, y_all)
conesta = algorithms.proximal.CONESTA(max_iter=500)
A = nesterov_tv.linear_operator_from_mask(mask_bool)

# Messages for communication between processes
FLAG_STOP_PROCESS = "STOP_WORK"
FLAG_PROCESS_FINISHED = "PROCESS_HAS_FINISHED"
nb_processes = 15

# Data structures for parallel processing
manager = Manager()  # multiprocessing.Manager()
work_queue, result_queue = manager.Queue(), manager.Queue()

# Add jobs to work_queue
for p in params:
    #print p
    work_queue.put(p)
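
#############################################################################
# A hedged sketch (not in the original script) of the parent side of this
# queue protocol: start the workers, push one stop sentinel per worker, then
# drain result_queue into the CSV until every worker has signalled that it is
# done. The `worker` target is the hypothetical consumer sketched earlier.
from multiprocessing import Process

procs = [Process(target=worker, args=(work_queue, result_queue))
         for _ in range(nb_processes)]
for proc in procs:
    proc.start()
for _ in procs:
    work_queue.put(FLAG_STOP_PROCESS)
finished = 0
while finished < nb_processes:
    res = result_queue.get()
    if res == FLAG_PROCESS_FINISHED:
        finished += 1
    else:
        c.writerow(res)  # one row of scores per parameter setting
f.close()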