def linear_operator_from_shape(shape, weights=None, calc_lambda_max=False):
    """Builds the total variation Nesterov linear operator of a regular grid.

    One sparse forward-difference matrix is built per image axis (x, y and
    z). The rows of the points lying on the last slice along an axis are
    emptied, since those points have no forward neighbour in that direction.
    An axis of length 1 gets an all-zero matrix.

    Parameters
    ----------
    shape : Integer, or list or tuple with 1, 2 or 3 integers. The shape of
            the 1D, 2D or 3D image, given as X, (X,), (Y, X) or (Z, Y, X),
            where Z is the number of "layers", Y is the number of rows and X
            is the number of columns. Missing leading dimensions are set
            to 1. The shape does not involve any intercept variables.

    weights : Sequence, e.g. list or numpy (p-by-1) array. Weights put on
            the groups. Default is None, i.e. weight 1 for each group.

    calc_lambda_max : Boolean. Whether the largest singular value should be
            precomputed and stored on the returned operator.

    Returns
    -------
    A : LinearOperatorNesterov built from (Ax, Ay, Az). Its attribute
            n_compacts is set to nz * ny * nx - 1.
    """
    if not isinstance(shape, (list, tuple)):
        shape = [shape]
    # Left-pad with ones so that the shape always reads (Z, Y, X).
    while len(shape) < 3:
        shape = tuple([1] + list(shape))
    nz, ny, nx = shape[0], shape[1], shape[2]

    p = nx * ny * nz
    ind = np.arange(p).reshape((nz, ny, nx))

    if weights is not None:
        weights = np.array(weights).ravel()

    def _forward_difference(n_along, step, axis):
        # Difference matrix along one axis; `step` is the distance, in the
        # flattened image, between a point and its forward neighbour.
        if n_along <= 1:
            return sparse.csr_matrix((p, p), dtype=float)

        if weights is None:
            D = sparse.eye(p, p, step, format='csr') - sparse.eye(p, p)
        else:
            D = sparse.spdiags(weights, -step, p, p).T - \
                sparse.spdiags(weights, 0, p, p)
            D = D.tocsr()

        # Flat indices of the last slice along `axis`: no forward neighbour.
        border = ind[(slice(None),) * axis + (-1,)].ravel()
        for row in border:
            D.data[D.indptr[row]:D.indptr[row + 1]] = 0
        D.eliminate_zeros()
        return D

    Ax = _forward_difference(nx, 1, 2)
    Ay = _forward_difference(ny, nx, 1)
    Az = _forward_difference(nz, ny * nx, 0)

    A = LinearOperatorNesterov(Ax, Ay, Az)
    A.n_compacts = (nz * ny * nx - 1)
    if calc_lambda_max:
        A.singular_values = [TotalVariation(l=0., A=A).lambda_max()]
    return A
def linear_operator_from_mesh(mesh_coord, mesh_triangles, mask=None, offset=0,
                              weights=None, calc_lambda_max=False):
    """Generates the linear operator for the total variation Nesterov function
    from a mesh.

    Every edge of the mesh is attached to exactly one of its two end nodes.
    The k-th edge attached to a node contributes one row (-w, +w) to the
    k-th sparse matrix of the operator, so the operator holds as many
    matrices as the largest number of edges attached to a single node.

    Parameters
    ----------
    mesh_coord : Numpy array [n, 3] of float. The coordinates of the nodes.

    mesh_triangles : Numpy array, n_triangles-by-3. The (integer) indices of
            the three nodes forming the triangle.

    mask : Numpy array (shape (n,)) of integers/boolean. Non-null values
            correspond to columns of X. An edge is used only if both of its
            end nodes have a non-null value in the mask. Default is None,
            i.e. all nodes are used.

    offset : Non-negative integer. The index of the first column, variable,
            where TV applies. This is different from penalty_start which
            define where the penalty applies. The offset defines where TV
            applies within the penalised variables.

                Example: X := [Intercept, Age, Weight, Image]. Intercept is
                not penalized, TV does not apply on Age and Weight but only on
                Image. Thus: penalty_start = 1, offset = 2 (skip Age and
                Weight).

    weights : Numpy array. The weight put on the gradient of every point.
            Default is weight 1 for each point, or equivalently, no weight.
            The weights is a numpy array of the same shape as mask, indexed
            by node.

    calc_lambda_max : Boolean. Whether the largest singular value should be
            precomputed and stored on the returned operator.

    Returns
    -------
    A : LinearOperatorNesterov. The linear operator for the total variation
            Nesterov function computed over the mesh. Its attribute
            n_compacts holds the number of masked nodes that have at least
            one edge attached to them with both end nodes in the mask.

    Examples
    --------
    >>> import numpy as np
    >>> import parsimony.functions.nesterov.tv as tv_helper
    >>> mesh_coord = np.array([[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]])
    >>> mesh_triangles = np.array([[0 ,1, 3], [0, 2 ,3], [2, 3, 5], [2, 4, 5]])
    >>> A = tv_helper.linear_operator_from_mesh(mesh_coord, mesh_triangles)
    """
    if mask is None:
        mask = np.ones(mesh_coord.shape[0], dtype=bool)
    assert mask.shape[0] == mesh_coord.shape[0]
    mask_bool = mask != 0
    mask_idx = np.where(mask_bool)[0]
    n_masked = np.sum(mask_bool)

    # Mapping from full array to masked array (-1 for nodes outside the mask).
    map_full2masked = np.zeros(mask.shape, dtype=int)
    map_full2masked[:] = -1
    map_full2masked[mask_bool] = np.arange(n_masked) + offset

    # 1) Associate edges to nodes
    nodes_with_edges = [[] for _ in range(mesh_coord.shape[0])]

    def connect_edge_to_node(node_idx1, node_idx2, nodes_with_edges):
        # The sign of the summed coordinate difference decides which end node
        # owns the edge; the owner is always stored first in the edge.
        if np.sum(mesh_coord[node_idx1] - mesh_coord[node_idx2]) >= 0:
            edge = [node_idx1, node_idx2]
            if edge not in nodes_with_edges[node_idx1]:
                nodes_with_edges[node_idx1].append(edge)
        else:
            edge = [node_idx2, node_idx1]
            if edge not in nodes_with_edges[node_idx2]:
                nodes_with_edges[node_idx2].append(edge)

    for t in mesh_triangles:
        connect_edge_to_node(t[0], t[1], nodes_with_edges)
        connect_edge_to_node(t[0], t[2], nodes_with_edges)
        connect_edge_to_node(t[1], t[2], nodes_with_edges)

    max_connectivity = np.max(np.array([len(n) for n in nodes_with_edges]))

    # 2) Build sparse matrices: one (i, j, value) triplet list per edge slot
    # 0..max_connectivity - 1.
    A = [[[], [], []] for _ in range(max_connectivity)]
    n_compacts = 0
    for node_idx in mask_idx:
        # Weights are given per node (same shape as mask), not per edge slot.
        if weights is not None:
            w = weights[node_idx]
        else:
            w = 1.0

        # A node counts as one compact as soon as ANY of its edges is kept,
        # so the flag must not be reset between edges.
        found = False
        for i, (node1_idx, node2_idx) in enumerate(nodes_with_edges[node_idx]):
            if mask_bool[node1_idx] and mask_bool[node2_idx]:
                found = True
                A[i][0] += [map_full2masked[node1_idx],
                            map_full2masked[node1_idx]]
                A[i][1] += [map_full2masked[node1_idx],
                            map_full2masked[node2_idx]]
                A[i][2] += [-w, w]
        if found:
            n_compacts += 1

    # Count the masked nodes (mask.sum() would be wrong for integer-valued
    # masks) and leave room for the `offset` leading columns.
    p = n_masked + offset
    A = [sparse.csr_matrix((A[i][2], (A[i][0], A[i][1])), shape=(p, p))
         for i in range(len(A))]

    A = LinearOperatorNesterov(*A)
    A.n_compacts = n_compacts
    if calc_lambda_max:
        A.singular_values = [TotalVariation(l=0., A=A).lambda_max()]
    return A
def linear_operator_from_mask(mask, offset=0, weights=None,
                              calc_lambda_max=False):
    """Builds the total variation Nesterov linear operator from the mask of
    a 3D image.

    Each point of the mask is linked to its forward neighbour along every
    axis, provided that the neighbour exists and carries the same mask value.
    Each such link adds the entries (-w, +w) to the point's row in the matrix
    of that axis.

    Parameters
    ----------
    mask : Numpy array of integers. The mask has the same shape as the
            original data. Non-null values correspond to columns of X.
            Groups may be defined using different values in the mask. TV
            will be applied within groups of the same value in the mask.
            Arrays with fewer than three dimensions get trailing axes of
            length 1.

    offset : Non-negative integer. The index of the first column, variable,
            where TV applies. This is different from penalty_start which
            define where the penalty applies. The offset defines where TV
            applies within the penalised variables.

                Example: X := [Intercept, Age, Weight, Image]. Intercept is
                not penalized, TV does not apply on Age and Weight but only on
                Image. Thus: penalty_start = 1, offset = 2 (skip Age and
                Weight).

    weights : Numpy array. The weight put on the gradient of every point.
            Default is weight 1 for each point, or equivalently, no weight.
            The weights is a numpy array of the same shape as mask.

    calc_lambda_max : Boolean. Whether the largest singular value should be
            precomputed and stored on the returned operator.

    Returns
    -------
    A : LinearOperatorNesterov built from (Ax, Ay, Az). Its attribute
            n_compacts is the number of masked points that have at least one
            forward neighbour in their own group.
    """
    # Append trailing singleton axes until the arrays are three-dimensional.
    while mask.ndim < 3:
        mask = mask[..., np.newaxis]
    if weights is not None:
        while weights.ndim < 3:
            weights = weights[..., np.newaxis]

    nx, ny, nz = mask.shape
    extents = (nx, ny, nz)

    inside = mask != 0
    n_inside = np.sum(inside)
    p = n_inside + offset

    # Image coordinate -> index in the flat masked array (-1 outside mask).
    flat_of = np.zeros(mask.shape, dtype=int)
    flat_of[:] = -1
    flat_of[inside] = np.arange(n_inside) + offset

    # One (rows, cols, values) triplet per axis, in the order x, y, z.
    rows = ([], [], [])
    cols = ([], [], [])
    vals = ([], [], [])

    n_compacts = 0
    for x, y, z in zip(*np.where(inside)):
        here = flat_of[x, y, z]
        label = mask[x, y, z]
        w = 1.0 if weights is None else weights[x, y, z]

        linked = False
        for axis in range(3):
            nb = [x, y, z]
            nb[axis] += 1
            if nb[axis] < extents[axis] \
                    and mask[nb[0], nb[1], nb[2]] == label:
                linked = True
                rows[axis].extend((here, here))
                cols[axis].extend((here, flat_of[nb[0], nb[1], nb[2]]))
                vals[axis].extend((-w, w))
        if linked:
            n_compacts += 1

    Ax, Ay, Az = (sparse.csr_matrix((vals[k], (rows[k], cols[k])),
                                    shape=(p, p))
                  for k in range(3))

    A = LinearOperatorNesterov(Ax, Ay, Az)
    A.n_compacts = n_compacts
    if calc_lambda_max:
        A.singular_values = [TotalVariation(l=0., A=A).lambda_max()]
    return A
def linear_operator_from_subset_mask(mask, weights=None,
                                     calc_lambda_max=False):
    """Builds the total variation Nesterov linear operator from a mask that
    selects a subset of a 3D image.

    The binary mask marks the variables that are supposed to be smoothed.
    The matrices are indexed by the flattened (Z, Y, X) position in the full
    image, so they have one row and one column per image point, whether or
    not the point is in the mask.

    Parameters
    ----------
    mask : Numpy array. The mask. It is cast to boolean. The mask does not
            involve any intercept variables. Arrays with fewer than three
            dimensions get leading axes of length 1.

    weights : Numpy array. The weight put on the gradient of every point.
            Default is weight 1 for each point, or equivalently, no weight.
            The weights is a numpy array of the same shape as mask.

    calc_lambda_max : Boolean. Whether the largest singular value should be
            precomputed and stored on the returned operator.

    Returns
    -------
    A : LinearOperatorNesterov built from (Ax, Ay, Az). Its attribute
            n_compacts is the number of masked points that have at least one
            masked forward neighbour.
    """
    # Prepend singleton axes until the arrays are three-dimensional.
    while mask.ndim < 3:
        mask = mask[np.newaxis, :]
    if weights is not None:
        while weights.ndim < 3:
            weights = weights[np.newaxis, :]

    nz, ny, nx = mask.shape
    mask = mask.astype(bool)
    p = np.prod(mask.shape)

    # Axis order is z, y, x. `strides` gives the distance in the flattened
    # image between a point and its forward neighbour along each axis.
    extents = (nz, ny, nx)
    strides = (ny * nx, nx, 1)

    rows = ([], [], [])
    cols = ([], [], [])
    vals = ([], [], [])

    num_compacts = 0
    for z, y, x in zip(*np.where(mask)):
        here = z * strides[0] + y * strides[1] + x
        w = 1.0 if weights is None else weights[z, y, x]

        linked = False
        for axis in range(3):
            nb = [z, y, x]
            nb[axis] += 1
            if nb[axis] < extents[axis] and mask[nb[0], nb[1], nb[2]]:
                linked = True
                rows[axis].extend((here, here))
                cols[axis].extend((here, here + strides[axis]))
                vals[axis].extend((-w, w))
        if linked:
            num_compacts += 1

    Az, Ay, Ax = (sparse.csr_matrix((vals[k], (rows[k], cols[k])),
                                    shape=(p, p))
                  for k in range(3))

    A = LinearOperatorNesterov(Ax, Ay, Az)
    A.n_compacts = num_compacts
    if calc_lambda_max:
        A.singular_values = [TotalVariation(l=0., A=A).lambda_max()]
    return A
#Use mean imputation, we could have used median for age
#imput = sklearn.preprocessing.Imputer(strategy = 'median',axis=0)
#Z = imput.fit_transform(Z)

# Prepend the columns of Z to X.
X = np.hstack([Z, X])
assert X.shape == (526, 140364)

# Remove the rows whose target is NaN. The shape is asserted unchanged, i.e.
# no row is expected to be dropped for this dataset.
X = X[np.logical_not(np.isnan(y)).ravel(), :]
y = y[np.logical_not(np.isnan(y))]
assert X.shape == (526, 140364)

np.save(os.path.join(OUTPUT, "X.npy"), X)
np.save(os.path.join(OUTPUT, "y.npy"), y)

###############################################################################
###############################################################################
# precompute linearoperator

X = np.load(os.path.join(OUTPUT, "X.npy"))
y = np.load(os.path.join(OUTPUT, "y.npy"))

mask = nibabel.load(os.path.join(OUTPUT, "mask.nii"))

import parsimony.functions.nesterov.tv as nesterov_tv
from parsimony.utils.linalgs import LinearOperatorNesterov

# `img.get_data()` is deprecated and removed from recent nibabel releases;
# `np.asanyarray(img.dataobj)` is its documented drop-in replacement.
Atv = nesterov_tv.linear_operator_from_mask(np.asanyarray(mask.dataobj),
                                            calc_lambda_max=True)
Atv.save(os.path.join(OUTPUT, "Atv.npz"))

# Reload the operator to check that the singular value survives the save.
Atv_ = LinearOperatorNesterov(filename=os.path.join(OUTPUT, "Atv.npz"))
assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
def load_globals(config):
    """Fills the attributes of the ``mapreduce`` module from ``config``.

    Sets ``DATA`` from ``GLOBAL.load_data(config["data"])``, ``Atv`` from the
    operator file named by ``config["structure_linear_operator_tv"]`` and
    ``FULL_RESAMPLE`` from ``config['full_resample']``.
    """
    import mapreduce as GLOBAL  # module used to access global variables
    data_spec = config["data"]
    GLOBAL.DATA = GLOBAL.load_data(data_spec)
    GLOBAL.Atv = LinearOperatorNesterov(
        filename=config["structure_linear_operator_tv"])
    GLOBAL.FULL_RESAMPLE = config['full_resample']
def linear_operator_from_mesh(mesh_coord, mesh_triangles, mask=None, offset=0,
                              weights=None, calc_lambda_max=False):
    """Generates the linear operator for the total variation Nesterov function
    from a mesh.

    Every edge of the mesh is attached to exactly one of its two end nodes.
    The k-th edge attached to a node contributes one row (-w, +w) to the
    k-th sparse matrix of the operator, so the operator holds as many
    matrices as the largest number of edges attached to a single node.

    Parameters
    ----------
    mesh_coord : Numpy array [n, 3] of float. The coordinates of the nodes.

    mesh_triangles : Numpy array, n_triangles-by-3. The (integer) indices of
            the three nodes forming the triangle.

    mask : Numpy array (shape (n,)) of integers/boolean. Non-null values
            correspond to columns of X. An edge is used only if both of its
            end nodes have a non-null value in the mask. Default is None,
            i.e. all nodes are used.

    offset : Non-negative integer. The index of the first column, variable,
            where TV applies. This is different from penalty_start which
            define where the penalty applies. The offset defines where TV
            applies within the penalised variables.

                Example: X := [Intercept, Age, Weight, Image]. Intercept is
                not penalized, TV does not apply on Age and Weight but only on
                Image. Thus: penalty_start = 1, offset = 2 (skip Age and
                Weight).

    weights : Numpy array. The weight put on the gradient of every point.
            Default is weight 1 for each point, or equivalently, no weight.
            The weights is a numpy array of the same shape as mask, indexed
            by node.

    calc_lambda_max : Boolean. Whether the largest singular value should be
            precomputed and stored on the returned operator.

    Returns
    -------
    A : LinearOperatorNesterov. The linear operator for the total variation
            Nesterov function computed over the mesh. Its attribute
            n_compacts holds the number of masked nodes that have at least
            one edge attached to them with both end nodes in the mask.

    Examples
    --------
    >>> import numpy as np
    >>> import parsimony.functions.nesterov.tv as tv_helper
    >>> mesh_coord = np.array([[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]])
    >>> mesh_triangles = np.array([[0 ,1, 3], [0, 2 ,3], [2, 3, 5], [2, 4, 5]])
    >>> A = tv_helper.linear_operator_from_mesh(mesh_coord, mesh_triangles)
    """
    if mask is None:
        mask = np.ones(mesh_coord.shape[0], dtype=bool)
    assert mask.shape[0] == mesh_coord.shape[0]
    mask_bool = mask != 0
    mask_idx = np.where(mask_bool)[0]
    n_masked = np.sum(mask_bool)

    # Mapping from full array to masked array (-1 for nodes outside the mask).
    map_full2masked = np.zeros(mask.shape, dtype=int)
    map_full2masked[:] = -1
    map_full2masked[mask_bool] = np.arange(n_masked) + offset

    # 1) Associate edges to nodes
    nodes_with_edges = [[] for _ in range(mesh_coord.shape[0])]

    def connect_edge_to_node(node_idx1, node_idx2, nodes_with_edges):
        # The sign of the summed coordinate difference decides which end node
        # owns the edge; the owner is always stored first in the edge.
        if np.sum(mesh_coord[node_idx1] - mesh_coord[node_idx2]) >= 0:
            edge = [node_idx1, node_idx2]
            if edge not in nodes_with_edges[node_idx1]:
                nodes_with_edges[node_idx1].append(edge)
        else:
            edge = [node_idx2, node_idx1]
            if edge not in nodes_with_edges[node_idx2]:
                nodes_with_edges[node_idx2].append(edge)

    for t in mesh_triangles:
        connect_edge_to_node(t[0], t[1], nodes_with_edges)
        connect_edge_to_node(t[0], t[2], nodes_with_edges)
        connect_edge_to_node(t[1], t[2], nodes_with_edges)

    max_connectivity = np.max(np.array([len(n) for n in nodes_with_edges]))

    # 2) Build sparse matrices: one (i, j, value) triplet list per edge slot
    # 0..max_connectivity - 1.
    A = [[[], [], []] for _ in range(max_connectivity)]
    n_compacts = 0
    for node_idx in mask_idx:
        # Weights are given per node (same shape as mask), not per edge slot.
        if weights is not None:
            w = weights[node_idx]
        else:
            w = 1.0

        # A node counts as one compact as soon as ANY of its edges is kept,
        # so the flag must not be reset between edges.
        found = False
        for i, (node1_idx, node2_idx) in enumerate(nodes_with_edges[node_idx]):
            if mask_bool[node1_idx] and mask_bool[node2_idx]:
                found = True
                A[i][0] += [map_full2masked[node1_idx],
                            map_full2masked[node1_idx]]
                A[i][1] += [map_full2masked[node1_idx],
                            map_full2masked[node2_idx]]
                A[i][2] += [-w, w]
        if found:
            n_compacts += 1

    # Count the masked nodes (mask.sum() would be wrong for integer-valued
    # masks) and leave room for the `offset` leading columns.
    p = n_masked + offset
    A = [sparse.csr_matrix((A[i][2], (A[i][0], A[i][1])), shape=(p, p))
         for i in range(len(A))]

    A = LinearOperatorNesterov(*A)
    A.n_compacts = n_compacts
    if calc_lambda_max:
        A.singular_values = [TotalVariation(l=0., A=A).lambda_max()]
    return A
np.save(os.path.join(OUTPUT, "mask.npy"), mask)

# Keep only the columns selected by the mask.
X = Xtot[:, mask]
assert X.shape == (80, 299798)

#############################################################################
# Prepend the columns of Z to the masked data.
X = np.hstack([Z, X])
assert X.shape == (80, 299800)

# Remove nan lines (the shape is asserted unchanged afterwards).
X = X[~np.isnan(y).ravel(), :]
y = y[~np.isnan(y)]
assert X.shape == (80, 299800)

np.save(os.path.join(OUTPUT, "X.npy"), X)
np.save(os.path.join(OUTPUT, "y.npy"), y)

#############################################################################
# Precompute the TV linear operator on the mesh, save it, and check that the
# saved file gives back the same largest singular value.
import parsimony.functions.nesterov.tv as nesterov_tv
from parsimony.utils.linalgs import LinearOperatorNesterov

Atv = nesterov_tv.linear_operator_from_mesh(cor, tri, mask,
                                            calc_lambda_max=True)
Atv.save(os.path.join(OUTPUT, "Atv.npz"))
Atv_ = LinearOperatorNesterov(filename=os.path.join(OUTPUT, "Atv.npz"))

assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
assert np.allclose(Atv_.get_singular_values(0), 8.999,
                   rtol=1e-03, atol=1e-03)
assert all(a.shape == (299798, 299798) for a in Atv)
# Save data X and y
X = Xtot[:, mask_bool.ravel()]

#Use mean imputation, we could have used median for age
#imput = sklearn.preprocessing.Imputer(strategy = 'median',axis=0)
#Z = imput.fit_transform(Z)

# Prepend the columns of Z to X.
X = np.hstack([Z, X])
assert X.shape == (606, 125962)

# Remove the rows whose target is NaN. The shape is asserted unchanged, i.e.
# no row is expected to be dropped for this dataset.
X = X[np.logical_not(np.isnan(y)).ravel(), :]
y = y[np.logical_not(np.isnan(y))]
assert X.shape == (606, 125962)

np.save(os.path.join(OUTPUT, "X.npy"), X)
np.save(os.path.join(OUTPUT, "y.npy"), y)

###############################################################################
# precompute linearoperator
X = np.load(os.path.join(OUTPUT, "X.npy"))
y = np.load(os.path.join(OUTPUT, "y.npy"))

mask = nibabel.load(os.path.join(OUTPUT, "mask.nii"))

# `img.get_data()` is deprecated and removed from recent nibabel releases;
# `np.asanyarray(img.dataobj)` is its documented drop-in replacement.
Atv = nesterov_tv.linear_operator_from_mask(np.asanyarray(mask.dataobj),
                                            calc_lambda_max=True)
Atv.save(os.path.join(OUTPUT, "Atv.npz"))

# Reload the operator to check that the singular value survives the save.
Atv_ = LinearOperatorNesterov(filename=os.path.join(OUTPUT, "Atv.npz"))
assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
assert np.allclose(Atv_.get_singular_values(0), 11.909770107366217)
def load_globals(config):
    """Fills the attributes of the ``mapreduce`` module from ``config``.

    Sets ``DATA`` from ``GLOBAL.load_data(config["data"])`` and ``A`` from
    the operator file named by ``config["structure_linear_operator_tv"]``.
    """
    import mapreduce as GLOBAL  # module used to access global variables
    data_spec = config["data"]
    GLOBAL.DATA = GLOBAL.load_data(data_spec)
    GLOBAL.A = LinearOperatorNesterov(
        filename=config["structure_linear_operator_tv"])
def init():
    """Prepares the working directory WD and writes the mapreduce configs.

    Steps, in order:

    1. Copy X.npy, y.npy, mask.npy (from WD_ORIGINAL) and the mesh file into
       WD, then change the current directory to WD.
    2. If WD/Atv.npz is missing, build the TV linear operator from the mesh
       and the mask, save it and check the saved copy.
    3. If WD/beta_start.npz is missing, fit one RidgeLogisticRegression per
       alpha and save the resulting beta vectors.
    4. Setting 1: outer CV (optionally with sub-sampled training sets) and a
       large parameter grid, written to WD/config_cv_largerange.json.
    5. Setting 2: nested CV and a reduced parameter grid, written to
       WD/config_dcv_reducedrange.json.

    For both settings the outer folds are checked against those stored in
    WD_ORIGINAL/config_modselectcv.json, and the cluster job files are
    generated with clust_utils.gabriel_make_qsub_job_files.

    The many ``assert`` statements are sanity checks tied to this specific
    dataset and parameter grid.
    """
    INPUT_DATA_X = os.path.join(WD_ORIGINAL, 'X.npy')
    INPUT_DATA_y = os.path.join(WD_ORIGINAL, 'y.npy')
    INPUT_MASK_PATH = os.path.join(WD_ORIGINAL, 'mask.npy')
    INPUT_MESH_PATH = '/neurospin/brainomics/2013_adni/MCIc-CTL-FS_cs/lrh.pial.gii'
    #INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/30yo/Atv.npz'
    # INPUT_CSV = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/population_30yo.csv'

    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_MESH_PATH, WD)
    #shutil.copy(INPUT_LINEAR_OPE_PATH, WD)

    ## Create config file
    os.chdir(WD)
    X = np.load("X.npy")
    y = np.load("y.npy")

    # Build the TV linear operator only once; it is reused on later runs.
    if not os.path.exists(os.path.join(WD, "Atv.npz")):
        import brainomics.mesh_processing as mesh_utils
        cor, tri = mesh_utils.mesh_arrays(os.path.join(WD, "lrh.pial.gii"))
        mask = np.load(os.path.join(WD, 'mask.npy'))

        import parsimony.functions.nesterov.tv as nesterov_tv
        from parsimony.utils.linalgs import LinearOperatorNesterov
        Atv = nesterov_tv.linear_operator_from_mesh(cor, tri, mask,
                                                    calc_lambda_max=True)
        Atv.save(os.path.join(WD, "Atv.npz"))
        Atv_ = LinearOperatorNesterov(filename=os.path.join(WD, "Atv.npz"))
        assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
        assert np.allclose(Atv_.get_singular_values(0), 8.999,
                           rtol=1e-03, atol=1e-03)
        assert np.all([a.shape == (317089, 317089) for a in Atv])

    # Fit the ridge models used as starting points only once.
    if not os.path.exists(os.path.join(WD, "beta_start.npz")):
        betas = dict()
        import time
        alphas = [.01, 0.1, 1.0, 10]
        for alpha in alphas:
            mod = estimators.RidgeLogisticRegression(
                l=alpha, class_weight="auto", penalty_start=penalty_start)
            t_ = time.time()
            mod.fit(X, y.ravel())
            print(time.time() - t_)  # 11564
            betas["lambda_%.2f" % alpha] = mod.beta

        np.savez(os.path.join(WD, "beta_start.npz"), **betas)
        # Read the archive back to check it; `with` closes the file handle.
        with np.load(os.path.join(WD, "beta_start.npz")) as beta_start:
            assert np.all([np.all(beta_start[a] == betas[a])
                           for a in beta_start.keys()])

    ## Create config file

    # ########################################################################
    # Setting 1: 5cv + large range of parameters: cv_largerange
    # with sub-sample training set with size 50, 100
    # 5cv/cv0*[_sub50]/refit/*

    # sub_sizes = [50, 100]
    sub_sizes = []

    cv_outer = [[tr, te] for tr, te in
                StratifiedKFold(n_splits=NFOLDS_OUTER, random_state=42).split(
                    np.zeros(y.shape[0]), y.ravel())]

    # check we got the same CV than previoulsy
    with open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")) as fd:
        cv_old = json.load(fd)["resample"]
    cv_outer_old = [cv_old[k] for k in
                    ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]]
    assert np.all([np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
                   for i in range(NFOLDS_OUTER)])
    assert np.all([np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
                   for i in range(NFOLDS_OUTER)])
    # check END

    import collections
    cv = collections.OrderedDict()

    # "refit/refit" trains and tests on the whole dataset.
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        # Simple CV
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]

        # Nested CV
        # cv_inner = StratifiedKFold(y[tr_val].ravel(), n_folds=NFOLDS_INNER, random_state=42)
        # for cv_inner_i, (tr, val) in enumerate(cv_inner):
        #     cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = [tr_val[tr], tr_val[val]]

        # Sub-sample training set with size 50, 100
        # => cv*_sub[50|100]/refit
        grps = np.unique(y[tr_val]).astype(int)
        ytr = y.copy()
        # Hide the test samples so that they cannot be drawn below.
        ytr[te] = np.nan
        g_idx = [np.where(ytr == g)[0] for g in grps]
        assert np.all([np.all(ytr[g_idx[g]] == g) for g in grps])

        g_size = np.array([len(g) for g in g_idx])
        g_prop = g_size / g_size.sum()

        for sub_size in sub_sizes:
            # sub_size = sub_sizes[0]
            # Draw from each group proportionally to its size.
            sub_g_size = np.round(g_prop * sub_size).astype(int)
            g_sub_idx = [np.random.choice(g_idx[g], sub_g_size[g],
                                          replace=False) for g in grps]
            assert np.all([np.all(y[g_sub_idx[g]] == g) for g in grps])
            tr_val_sub = np.concatenate(g_sub_idx)
            assert len(tr_val_sub) == sub_size
            assert np.all([idx in tr_val for idx in tr_val_sub])
            assert np.all(np.logical_not([idx in te for idx in tr_val_sub]))
            cv["cv%02d_sub%i/refit" % (cv_outer_i, sub_size)] = \
                [tr_val_sub, te]

    # Numpy arrays are not JSON serialisable: convert the indices to lists.
    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}

    # Nested CV
    # assert len(cv_largerange) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1

    # Simple CV
    # assert len(cv) == NFOLDS_OUTER + 1

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(cv) == NFOLDS_OUTER * (1 + len(sub_sizes)) + 1

    print(list(cv.keys()))

    # Large grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1, 1.0] # first ran with this grid
    tv_ratio = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    l1l2_ratio = [0.1, 0.5, 0.9]
    # l1l2_ratio = [0, 0.1, 0.5, 0.9, 1.0] # first ran with this grid
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio,
                                          tv_ratio)]
    assert len(params_enet_tvgn) == 240  # old 300

    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 12  # old 15

    params = params_enet_tvgn + params_enet
    assert len(params) == 252  # 315
    # Simple CV
    # assert len(params) * len(cv) == 1890

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(params) * len(cv) == 1512  # 5040

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    # `with` guarantees the file is flushed and closed before it is used.
    with open(os.path.join(WD, "config_cv_largerange.json"), "w") as fd:
        json.dump(config, fd)

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map  %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)

    # ########################################################################
    # Setting 2: dcv + reduced range of parameters: dcv_reducedrange
    # 5cv/cv0*/cvnested0*/*

    cv_outer = [[tr, te] for tr, te in
                StratifiedKFold(n_splits=NFOLDS_OUTER, random_state=42).split(
                    np.zeros(y.shape[0]), y.ravel())]

    # check we got the same CV than previoulsy
    with open(os.path.join(WD_ORIGINAL, "config_modselectcv.json")) as fd:
        cv_old = json.load(fd)["resample"]
    cv_outer_old = [cv_old[k] for k in
                    ['cv%02d/refit' % i for i in range(NFOLDS_OUTER)]]
    assert np.all([np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
                   for i in range(NFOLDS_OUTER)])
    assert np.all([np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
                   for i in range(NFOLDS_OUTER)])
    # check END

    import collections
    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]
        # Inner folds are computed on the outer training set; `tr` and `val`
        # index into `tr_val`, hence the re-indexing below.
        cv_inner = StratifiedKFold(n_splits=NFOLDS_INNER,
                                   random_state=42).split(
            np.zeros(y[tr_val].shape[0]), y[tr_val].ravel())
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = \
                [tr_val[tr], tr_val[val]]

    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}
    #assert len(cv) == NFOLDS_OUTER + 1
    assert len(cv) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1
    print(list(cv.keys()))

    # Reduced grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1] # original
    tv_ratio = [0.2, 0.8]
    l1l2_ratio = [0.1, 0.9]
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio,
                                          tv_ratio)]
    assert len(params_enet_tvgn) == 32  # 16

    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 8  # 4

    params = params_enet_tvgn + params_enet
    assert len(params) == 40  # 20
    assert len(params) * len(cv) == 1240  # 620

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    with open(os.path.join(WD, "config_dcv_reducedrange.json"), "w") as fd:
        json.dump(config, fd)

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map  %s/config_dcv_reducedrange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_dcv_reducedrange",
                                            freecores=2)
# [0.19778403 0.04279359 0.03579749] assert pca.components_.shape == (N_COMP, 371278) PC = pca.transform(X) #U = pca.transform(X) d = pca.singular_values_ V = pca.components_.T U = pca.transform(X) explained_variance = pca.explained_variance_ratio_.cumsum() if options.algo == 'enettv': ######################################################################################################################## # PCA TV from parsimony.utils.linalgs import LinearOperatorNesterov #mask_img = nibabel.Nifti1Image(mask_arr.astype(float), affine=ref_img.affine) Atv = LinearOperatorNesterov(filename=os.path.join(ANALYSIS_DATA_PATH, "Atv.npz")) #assert Atv.get_singular_values(0) == Atv_.get_singular_values(0) assert np.allclose(Atv.get_singular_values(0), 11.974760295502465) inner_max_iter = int(1e3) l1max = pca_tv.PCAL1L2TV.l1_max(X) * .9 # 0.03899665773990707 assert np.allclose(l1max, 0.03509699196591636) if False: # Not to bad, TV too low # ll1 < 0.01 * l1max, tv = 0.01 * 1/3 ll1, ll2, ltv = 0.01 * l1max, 1, 0.01 key_pca_enettv = "pca_enettv_%.4f_%.3f_%.3f" % (ll1, ll2, ltv) # Corr with old PC[-0.99966211718252285, -0.99004655401439967, -0.74332811780676245] if False:# Too much l1, not enough tv