def init():
    INPUT_DATA_X = os.path.join('X.npy')
    INPUT_DATA_y = os.path.join('y.npy')
    INPUT_MASK_PATH = os.path.join("mask.nii")
    #WD = os.path.join(WD, 'logistictvenet_5cv')
    if not os.path.exists(WD):
        os.makedirs(WD)

    os.chdir(WD)

    #############################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)
    X = np.load(INPUT_DATA_X)
    from parsimony.utils.penalties import l1_max_logistic_loss
    assert l1_max_logistic_loss(X[:, 3:], y) == 0.20434911093262279
    if os.path.exists(config_filenane()):
        old_conf = json.load(open(config_filenane()))
        cv = old_conf["resample"]
    else:
        cv_outer = [[tr, te] for tr,te in StratifiedKFold(y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
        """
        cv_outer = [[np.array(tr), np.array(te)] for tr,te in json.load(open("/neurospin/brainomics/2013_adni/MCIc-CTL_csi/config.json", "r"))["resample"][1:]]
        """
        import collections
        cv = collections.OrderedDict()
        for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
            cv["cv%02d/refit" % cv_outer_i] = [tr_val, te]
            cv_inner = StratifiedKFold(y[tr_val].ravel(), n_folds=NFOLDS_INNER, random_state=42)
            for cv_inner_i, (tr, val) in enumerate(cv_inner):
                cv["cv%02d/cvnested%02d" % (cv_outer_i, cv_inner_i)] = [tr_val[tr], tr_val[val]]
        for k in cv:
            cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]

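    # The resulting keys are "cv%02d/refit" for each outer fold, plus
    # "cv%02d/cvnested%02d" for each of its NFOLDS_INNER inner train/validation splits.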
    print cv.keys()
    # Some QC
    N = float(len(y))
    p0 = np.sum(y == 0) / N
    p1 = np.sum(y == 1) / N
    for k in cv:
        tr, val = cv[k]
        tr, val = np.array(tr), np.array(val)
        print k, "\t: tr+val=", len(tr) + len(val)
        assert not set(tr).intersection(val)
        assert abs(np.sum(y[tr]==0)/float(len(y[tr])) - p0) < 0.01
        assert abs(np.sum(y[tr]==1)/float(len(y[tr])) - p1) < 0.01
        if k.count("refit"):
            te = val
            assert len(tr) + len(te) == len(y)
            assert abs(len(y[tr])/N - (1 - 1./NFOLDS_OUTER)) < 0.01
        else:
            te = np.array(cv[k.split("/")[0] + "/refit"])[1]
            assert abs(len(y[tr])/N - (1 - 1./NFOLDS_OUTER) * (1 - 1./NFOLDS_INNER)) < 0.01
            assert not set(tr).intersection(te)
            assert not set(val).intersection(te)
            assert len(tr) + len(val) + len(te) == len(y)

    tv_ratios = [0., .2, .8]
    l1_ratios = [np.array([1., .1, .9, 1]), np.array([1., .9, .1, 1])]  # [alpha, l1, l2, tv]
    alphas_l1l2tv = [.01, .1]
    alphas_l2tv = [round(alpha, 10) for alpha in 10. ** np.arange(-2, 4)]
    k_range = [-1]
    l1l2tv = [np.array([alpha, float(1 - tv), float(1 - tv), tv]) * l1_ratio
              for alpha in alphas_l1l2tv for tv in tv_ratios for l1_ratio in l1_ratios]
    # specific case without l1, since it supports larger penalties
    l2tv = [np.array([alpha, 0., float(1 - tv), tv])
            for alpha in alphas_l2tv for tv in tv_ratios]
    params = l1l2tv + l2tv
    params = [param.tolist() + [k] for k in k_range for param in params]
    params = {"_".join([str(p) for p in param]):param for param in params}
    assert len(params) == 30
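    # Each key in "params" encodes its parameter vector as "<alpha>_<l1>_<l2>_<tv>_<k>".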
    user_func_filename = os.path.join(os.environ["HOME"],
        "git", "scripts", "2013_adni", "MCIc-CTL",
        "02_tvenet_modselectcv_csi.py")
    #print __file__, os.path.abspath(__file__)
    print "user_func", user_func_filename
    #import sys
    #sys.exit(0)
    # Use relative path from config.json
    config = dict(data=dict(X=INPUT_DATA_X, y=INPUT_DATA_y),
                  params=params, resample=cv,
                  mask_filename=INPUT_MASK_PATH,
                  penalty_start = 3,
                  map_output="modselectcv",
                  user_func=user_func_filename,
                  #reduce_input="rndperm/*/*",
                  reduce_group_by="user_defined",
                  reduce_output="MCIc-CTL_csi_modselectcv.csv")
    json.dump(config, open(os.path.join(WD, "config_modselectcv.json"), "w"))

    #############################################################################
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, WD_CLUSTER = \
        clust_utils.gabriel_make_sync_data_files(WD)
    cmd = "mapreduce.py --map  %s/config_modselectcv.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd)
    #############################################################################
    # Sync to cluster
    print "Sync data to gabriel.intra.cea.fr: "
    os.system(sync_push_filename)
    #############################################################################
    print "# Start by running Locally with 2 cores, to check that everything os OK)"
    print "Interrupt after a while CTL-C"
    print "mapreduce.py --map %s/config_modselectcv.json --ncore 2" % WD
    #os.system("mapreduce.py --mode map --config %s/config.json" % WD)
    print "# 1) Log on gabriel:"
    print 'ssh -t gabriel.intra.cea.fr'
    print "# 2) Run one Job to test"
    print "qsub -I"
    print "cd %s" % WD_CLUSTER
    print "./job_Global_long.pbs"
    print "# 3) Run on cluster"
    print "qsub job_Global_long.pbs"
    print "# 4) Log out and pull Pull"
    print "exit"
    print sync_pull_filename
    #############################################################################
    print "# Reduce"
    print "mapreduce.py --reduce %s/config_modselectcv.json" % WD
Xtr = X[tr, :]
ytr = y[tr]
Xte = X[te, :]
yte = y[te]
beta_start = weights.RandomUniformWeights().get_weights(Xtr.shape[1])

# check that ytr is balanced
#assert ytr.sum() / ytr.shape[0] == 0.5
#assert yte.sum() / yte.shape[0] == 0.53500000000000003

# Dataset with intercept
Xtr_i = np.c_[np.ones((Xtr.shape[0], 1)), Xtr]
Xte_i = np.c_[np.ones((Xte.shape[0], 1)), Xte]
beta_start_i = weights.RandomUniformWeights().get_weights(Xtr_i.shape[1])

# global penalty
alpha = l1_max_logistic_loss(Xtr, ytr)

from parsimony.algorithms.utils import Info
info = [Info.converged, Info.num_iter, Info.time, Info.func_val]

###############################################################################
## Models
###############################################################################
MODELS = collections.OrderedDict()

algorithm_params = dict(eps=1e-4, max_iter=20000, info=info)
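
# A minimal sketch (not from the original script): register one model in MODELS, an
# l2-penalized logistic regression solved by gradient descent. The imports, the key
# name and the use of alpha as the ridge weight are assumptions for illustration.
import parsimony.estimators as estimators
import parsimony.algorithms.gradient as gradient

MODELS["l2__grad_descent"] = estimators.RidgeLogisticRegression(
    alpha, algorithm=gradient.GradientDescent(), algorithm_params=algorithm_params)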

## Get data structure from array shape

# l2 + grad_descent
if has_sklearn:
shape = (50, 50, 1)

X3d, y, beta3d, proba = datasets.classification.dice5.load(
    n_samples=n_samples, shape=shape, snr=10, random_seed=1)
X = X3d.reshape((n_samples, np.prod(beta3d.shape)))

###
n_train = 300

Xtr = X[:n_train, :]
ytr = y[:n_train]
Xte = X[n_train:, :]
yte = y[n_train:]

# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(Xtr, ytr)

###########################################################################
## Use sklearn l2 penalized LogisticRegression
# Minimize:
# f(beta) = - C * loglik + 1/2 * ||beta||^2_2
ridge_sklrn = LogisticRegression(C=1. / (alpha * n_train), fit_intercept=False)

yte_pred_ridge = ridge_sklrn.fit(Xtr, ytr.ravel()).predict(Xte)
_, recall_ridge_sklrn, _, _ = precision_recall_fscore_support(yte, yte_pred_ridge, average=None)


###########################################################################
## Use parsimony l2 penalized LogisticRegression: LogisticRegressionL1L2TV with l1=tv=0
# Minimize:
#    f(beta, X, y) = - loglik/n_train + k/2 * ||beta||^2_2
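
# A minimal sketch (assumed, since the original snippet is truncated here): the
# parsimony counterpart of the sklearn ridge above, i.e. LogisticRegressionL1L2TV
# with l1 = tv = 0 and l2 = alpha. Imports and solver settings are assumptions.
import parsimony.estimators as estimators
import parsimony.functions.nesterov.tv as nesterov_tv

A_ = nesterov_tv.linear_operator_from_shape(beta3d.shape)  # TV operator, inactive since tv=0
ridge_prsmy = estimators.LogisticRegressionL1L2TV(
    l1=0., l2=alpha, tv=0., A=A_,
    algorithm_params=dict(eps=1e-4, max_iter=20000))
yte_pred_ridge_prsmy = ridge_prsmy.fit(Xtr, ytr).predict(Xte)
_, recall_ridge_prsmy, _, _ = \
    precision_recall_fscore_support(yte, yte_pred_ridge_prsmy, average=None)
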
Example #4
l1l2tv = np.concatenate(l1l2tv)
params = [params.tolist() for params in l1l2tv]
#############################################################################

# File to store classification scores
f = open(
    os.path.join(BASE_PATH, 'results', 'Logistic_L1_L2_TV_withHC',
                 'parameters_sores.csv'), 'wb')
c = csv.writer(f, delimiter=',')
c.writerow([
    "alpha", "l1", "l2", "tv", "accuracy", "recall_0", "recall_1",
    "precision_0", "precision_1", "auc"
])

# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(T, y)
conesta = algorithms.proximal.CONESTA(max_iter=500)
A = nesterov_tv.linear_operator_from_mask(mask_bool)

# Messages for communication between processes
FLAG_STOP_PROCESS = "STOP_WORK"
FLAG_PROCESS_FINISHED = "PROCESS_HAS_FINISHED"
nb_processes = 30
# Data structures for parallel processing
manager = Manager()  # multiprocessing.Manager()
work_queue, result_queue = manager.Queue(), manager.Queue()

# Add jobs in work_queue
for p in params:
    #print p
    work_queue.put(p)
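
# A minimal sketch (not from the original script) of workers consuming the queue
# filled above. It reuses the CONESTA solver and the TV operator A defined earlier;
# the train/test split (Xtr, ytr, Xte, yte), the accuracy-only scoring and the
# scaling of l1/l2/tv by the global alpha are assumptions for illustration.
from multiprocessing import Process
import numpy as np
import parsimony.estimators as estimators

def worker(work_queue, result_queue):
    while True:
        p = work_queue.get()
        if p == FLAG_STOP_PROCESS:
            result_queue.put(FLAG_PROCESS_FINISHED)
            return
        alpha, l1, l2, tv = p
        model = estimators.LogisticRegressionL1L2TV(
            l1=alpha * l1, l2=alpha * l2, tv=alpha * tv, A=A, algorithm=conesta)
        model.fit(Xtr, ytr)
        acc = float(np.mean(model.predict(Xte).ravel() == yte.ravel()))
        result_queue.put([alpha, l1, l2, tv, acc])

processes = [Process(target=worker, args=(work_queue, result_queue))
             for _ in range(nb_processes)]
for proc in processes:
    work_queue.put(FLAG_STOP_PROCESS)  # one stop flag per worker, queued after the real jobs
    proc.start()
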
Example #5
Xd = Xd[keep]
ALPHA, L1_PROP = 1 / np.log2(Xd.shape[1]), 0.0

print(ALPHA, L1_PROP)

"""
Xd2 = Xd.copy()
Xd2["TRANSITION"] = yd
Xd2.to_csv(WD+"/data/transitionPREP_ARonly_CAARMSonly.csv", index=False)
"""

X = np.asarray(Xd)
y = np.asarray(yd)
y = y.astype(float)[:, np.newaxis]
#A, n_compacts = A_empty(X.shape[1]-1)
l1_max_logistic_loss(X, y)

assert X.shape == (27, 26)

############################################################################
## FIXED PARAMETERS
############################################################################

#N_FOLDS = 10
N_PERMS = 10001
NBOOT = 1000
ALPHAS = [.01, 0.05, .1, 1 / np.log2(X.shape[1]), .5, 1, 10, 100]
#L1_PROP = np.arange(0, 1.1, 0.1)
L1_PROPS = np.arange(0, 1.1, 0.1)
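
# A minimal sketch (not from the original script) expanding the (ALPHAS, L1_PROPS)
# grid. The mapping l1 = alpha * l1_prop and l2 = alpha * (1 - l1_prop) is an assumed
# convention, mirroring ALPHA / L1_PROP above (a global penalty and an l1 proportion).
import itertools

PARAM_GRID = [(alpha, l1_prop, alpha * l1_prop, alpha * (1. - l1_prop))
              for alpha, l1_prop in itertools.product(ALPHAS, L1_PROPS)]
# Each entry: (alpha, l1_prop, l1 penalty, l2 penalty); 8 alphas x 11 proportions = 88 settings.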

def scores(y_true, y_pred, prob_pred):
###############################################################################
# Fetch dice5 dataset
dataset_name = "%s_%s_%ix%ix%i_%i_dataset_v%s.npz" % \
    tuple(["dice5", "classif", 50, 50, 1, 500, '0.3.1'])
_, data = datasets.utils.download_dataset(dataset_name)

X, y, beta3d = data['X'], data['y'], data['beta3d']

###############################################################################
# Solve in parallel many Enet-TV problems
# ---------------------------------------
#
# Empirically set the global penalty, based on the maximum l1 penalty

alpha = l1_max_logistic_loss(X, y)

###############################################################################
# Penalization parameters are now vectors of equal length

l1 = alpha * np.array([0.5, 0.5, 0.5])
l2 = alpha * np.array([0.5, 0.5, 0.5])
tv = alpha * np.array([0.01, 0.2, 0.8])
max_iter = 1000

###############################################################################
# Build linear operator and fit the model:
A = nesterov_tv.linear_operator_from_shape(beta3d.shape, calc_lambda_max=True)
enettv = estimators.LogisticRegressionL1L2TV(
    l1=l1, l2=l2, tv=tv, A=A,
    algorithm_params=dict(max_iter=max_iter))
Example #7
l1l2tv = np.concatenate(l1l2tv)
params = [params.tolist() for params in l1l2tv]
#############################################################################

# File to store classification scores
f = open(
    os.path.join(BASE_PATH, 'results', 'Logistic_L1_L2_TV',
                 'parameters_soresBetas.csv'), 'wb')
c = csv.writer(f, delimiter=',')
c.writerow([
    "alpha", "l1", "l2", "tv", "accuracy", "recall_0", "recall_1",
    "precision_0", "precision_1", "auc"
])

# Empirically set the global penalty, based on the maximum l1 penalty
alpha = l1_max_logistic_loss(T_all, y_all)
conesta = algorithms.proximal.CONESTA(max_iter=500)
A = nesterov_tv.linear_operator_from_mask(mask_bool)

# Messages for communication between processes
FLAG_STOP_PROCESS = "STOP_WORK"
FLAG_PROCESS_FINISHED = "PROCESS_HAS_FINISHED"
nb_processes = 15
# Data structures for parallel processing
manager = Manager()  # multiprocessing.Manager()
work_queue, result_queue = manager.Queue(), manager.Queue()

# Add jobs in work_queue
for p in params:
    #print p
    work_queue.put(p)