def _split_train_val(patient_folders):
    """Split the patient folders into train and validation sets.

    :param patient_folders: iterable of per-patient folder paths.
    :return: tuple of (train_patient_folders, validation_patient_folders,
        validation_patients_indices, train_patients_indices).
    """
    # Construct train and validation splits using default parameters
    if paths.SUBMISSION_NR == 1:
        print("Using proper validation set")
        validation_patients_indices = validation_set.get_cross_validation_indices(
            indices=ALL_TRAIN_PATIENT_IDS, validation_index=0)
    else:
        print("WARNING: no validation set!!")
        validation_patients_indices = [1]

    # Set gives O(1) membership tests; also fixes the non-idiomatic
    # `not x in y` spelling (PEP 8: use `x not in y`).
    validation_ids = set(validation_patients_indices)
    train_patients_indices = [
        i for i in ALL_TRAIN_PATIENT_IDS if i not in validation_ids
    ]
    train_patient_folders = [
        folder for folder in patient_folders
        if _extract_id_from_path(folder) not in validation_ids
    ]
    # Equivalent to "folder not in train_patient_folders" (the two lists
    # partition patient_folders by validation id), but O(n) instead of O(n^2).
    validation_patient_folders = [
        folder for folder in patient_folders
        if _extract_id_from_path(folder) in validation_ids
    ]
    return (train_patient_folders, validation_patient_folders,
            validation_patients_indices, train_patients_indices)
def test_default_split(self):
    """Generated cross-validation splits must match the stored reference."""
    # Create the cross-validation splits, one per validation fold
    splits = [
        validation_set.get_cross_validation_indices(
            _TEST_INDICES, i, _TEST_NO_SPLITS, _TEST_SEED)
        for i in xrange(_TEST_NO_SPLITS)]
    # Load the reference splits. Pickle files must be opened in binary
    # mode ('rb'); text mode works only accidentally on POSIX and breaks
    # on Windows / Python 3.
    with open(_DEFAULT_SPLIT_RESULT_FILE, 'rb') as f:
        target_splits = pickle.load(f)
    # Check for errors
    for gen, target in zip(splits, target_splits):
        self.assertEqual(gen, target)
def _split_train_val(patient_folders): """Splits the patient folders into train and validation splits. """ # Construct train and validation splits using default parameters if paths.SUBMISSION_NR == 1: print "Using proper validation set" validation_patients_indices = validation_set.get_cross_validation_indices( indices=ALL_TRAIN_PATIENT_IDS, validation_index=0) else: print "WARNING: no validation set!!" validation_patients_indices = [1] train_patients_indices = [i for i in ALL_TRAIN_PATIENT_IDS if i not in validation_patients_indices] train_patient_folders = [ folder for folder in patient_folders if not _extract_id_from_path(folder) in validation_patients_indices] validation_patient_folders = [ folder for folder in patient_folders if folder not in train_patient_folders] return ( train_patient_folders, validation_patient_folders, validation_patients_indices, train_patients_indices)
# Patient counts per split; NUM_PATIENTS is the grand total.
NUM_TRAIN_PATIENTS = num_patients['train']
NUM_VALID_PATIENTS = num_patients['validation']
NUM_TEST_PATIENTS = num_patients['test']
NUM_PATIENTS = NUM_TRAIN_PATIENTS + NUM_VALID_PATIENTS + NUM_TEST_PATIENTS

##############
# Sunny data #
##############
# This small dataset is loaded into memory
_SUNNY_DATA_PATH = os.path.join(_DATA_FOLDER, "pkl_annotated", "data.pkl")
# Presumably a dict with "images" and "labels" entries — both are indexed
# below; other keys (if any) are unused here.
_sunny_data = _load_file(_SUNNY_DATA_PATH)

num_sunny_images = len(_sunny_data["images"])

# Cross-validation split over image indices, using default parameters.
_validation_sunny_indices = validation_set.get_cross_validation_indices(
    indices=range(num_sunny_images))
_train_sunny_indices = [
    i for i in range(num_sunny_images) if i not in _validation_sunny_indices]

# Fancy-index the image/label arrays into train and validation subsets.
sunny_train_images = np.array(_sunny_data["images"])[_train_sunny_indices]
sunny_train_labels = np.array(_sunny_data["labels"])[_train_sunny_indices]
sunny_validation_images = np.array(_sunny_data["images"])[_validation_sunny_indices]
sunny_validation_labels = np.array(_sunny_data["labels"])[_validation_sunny_indices]

###########################
# Data form preprocessing #
###########################
_HOUGH_ROI_PATHS = (
    TEMP_FILES_PATH + 'pkl_train_slice2roi.pkl',
# Script-level setup: enumerate the training patient folders and split
# them into train/validation sets via cross-validation indices.
import re
from configuration import config
import cPickle as pickle
import utils
from validation_set import get_cross_validation_indices
import random

print "Loading data"

# Sort numerically by the patient id embedded in the path, since raw
# glob() order is filesystem-dependent.
patient_folders = sorted(
    glob.glob("/data/dsb15_pkl/pkl_train/*/study/"),
    key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)
                           ))  # glob is non-deterministic!

# Patient ids run 1..500 — assumed to match the dataset; TODO confirm.
validation_patients_indices = get_cross_validation_indices(indices=range(
    1, 501), validation_index=0)
train_patients_indices = [
    i for i in range(1, 501) if i not in validation_patients_indices
]

# Alternation regex matching any "/<validation id>/" path component.
VALIDATION_REGEX = "|".join(
    ["(/%d/)" % i for i in validation_patients_indices])

# A folder is a training folder iff its path contains no validation id.
train_patient_folders = [
    folder for folder in patient_folders
    if re.search(VALIDATION_REGEX, folder) is None
]
validation_patient_folders = [
    folder for folder in patient_folders
    if folder not in train_patient_folders
]
def optimize_expert_weights(expert_predictions,
                            average_distribution,
                            mask_matrix=None,
                            targets=None,
                            num_cross_validation_masks=2,
                            num_folds=1,
                            eps=1e-14,
                            cutoff=0.01,
                            do_optimization=True,
                            expert_weights=None,
                            optimal_params=None,
                            special_average=False,
                            *args, **kwargs):
    """
    Optimize (or just apply) a weighted ensemble of expert CDF predictions
    by minimizing CRPS with Adam over theano-compiled expressions.

    :param expert_predictions: experts x validation_samples x 600 x
    :param mask_matrix: experts x validation_samples x
    :param targets: validation_samples x 600 x
    :param average_distribution: 600 x
    :param eps:
    :return: when do_optimization is False: the ensembled cumulative
        distribution for the first sample; otherwise a tuple
        (expert weights after softmax, score, raw parameter vector).
    """
    if expert_weights is not None:
        # Prune experts whose previously-found weight is below the cutoff.
        mask_matrix = mask_matrix[expert_weights>cutoff,:]  # remove
        expert_predictions = expert_predictions[expert_weights>cutoff,:,:]  # remove

    NUM_EXPERTS = expert_predictions.shape[0]
    NUM_FILTER_PARAMETERS = 2
    WINDOW_SIZE = 599

    # optimizing weights
    X = theano.shared(expert_predictions.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
    x_coor = theano.shared(np.linspace(-(WINDOW_SIZE-1)/2, (WINDOW_SIZE-1)/2, num=WINDOW_SIZE, dtype='float32'))

    # targets = (NUM_VALIDATIONS, 600)
    NUM_VALIDATIONS = expert_predictions.shape[1]
    # `ind` selects which validation samples the compiled expressions see;
    # it is re-set below for each train/validation fold.
    ind = theano.shared(np.zeros((NUM_VALIDATIONS,), dtype='int32'))  # targets = (NUM_VALIDATIONS, 600)

    if optimal_params is None:
        # One raw weight per expert plus the filter parameters, all ones.
        params_init = np.concatenate([
            np.ones((NUM_EXPERTS,), dtype='float32'),
            np.ones((NUM_FILTER_PARAMETERS,), dtype='float32')
        ])
    else:
        params_init = optimal_params.astype('float32')

    params = theano.shared(params_init.astype('float32'))
    #params = T.vector('params', dtype='float32')

    # expert weights = (NUM_EXPERTS,)
    C = 0.0001  # l1 regularization strength on the softmaxed weights

    if not special_average:
        # Create theano expression
        # inputs:
        W = params[:NUM_EXPERTS]
        # Softmax the raw weights so they form a convex combination.
        weights = T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0)

        preds = X.take(ind, axis=1)
        mask = theano.shared(mask_matrix.astype('float32')).take(ind, axis=1)

        # expression
        masked_weights = mask * weights
        # Clip so a fully-masked sample cannot divide by zero.
        tot_masked_weights = T.clip(masked_weights.sum(axis=0), 1e-7, utils.maxfloat)
        preds_weighted_masked = preds * masked_weights.dimshuffle(0, 1, 'x')
        # Weighted arithmetic mean of the expert CDFs per sample.
        cumulative_distribution = preds_weighted_masked.sum(axis=0) / tot_masked_weights.dimshuffle(0, 'x')

        # loss
        l1_loss = weights.sum()
    else:
        # calculate the weighted average for each of these experts
        weights = generate_information_weight_matrix(expert_predictions, average_distribution)
        # = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        weight_matrix = theano.shared((mask_matrix[:,:,None]*weights).astype('float32'))

        # Work in log space on the pdf for a weighted geometric mean.
        pdf = utils.cdf_to_pdf(expert_predictions)
        x_log = np.log(pdf)
        x_log[pdf<=0] = np.log(eps)  # avoid -inf from zero-probability bins
        # Compute the mean
        X_log = theano.shared(x_log.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        X_log_i = X_log.take(ind, axis=1)
        w_i = weight_matrix.take(ind, axis=1)

        W = params[:NUM_EXPERTS]
        w_i = w_i * T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0, 'x')

        #the different predictions, are the experts
        geom_av_log = T.sum(X_log_i * w_i, axis=0) / (T.sum(w_i, axis=0) + eps)
        geom_av_log = geom_av_log - T.max(geom_av_log,axis=-1).dimshuffle(0,'x')  # stabilizes rounding errors?

        geom_av = T.exp(geom_av_log)
        geom_pdf = geom_av/T.sum(geom_av,axis=-1).dimshuffle(0,'x')  # renormalize

        l1_loss = 0
        cumulative_distribution = T.cumsum(geom_pdf, axis=-1)

    if not do_optimization:
        # Just evaluate the ensemble on all validation samples as-is.
        ind.set_value(list(range(NUM_VALIDATIONS)))
        f_eval = theano.function([], cumulative_distribution)
        cumulative_distribution = f_eval()
        return cumulative_distribution[0]
    else:
        # convert to theano_values (for regularization)
        t_valid = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)
        t_train = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)

        # CRPS objective; the training variant adds the l1 penalty.
        CRPS_train = T.mean((cumulative_distribution - t_train.take(ind, axis=0))**2) + C * l1_loss
        CRPS_valid = T.mean((cumulative_distribution - t_valid.take(ind, axis=0))**2)

        iter_optimize = theano.function([], CRPS_train, on_unused_input="ignore",
                                        updates=lasagne.updates.adam(CRPS_train, [params], 1.0))
        f_val = theano.function([], CRPS_valid)

        def optimize_my_params():
            # Fixed iteration budget. NOTE(review): the trailing comment says
            # "early stopping" but no stopping criterion is actually applied.
            for _ in range(40 if special_average else 100):  # early stopping
                score = iter_optimize()
            result = params.get_value()
            return result, score

        if num_cross_validation_masks==0:
            # Train on all samples, then map the optimized weights back onto
            # the full expert vector; pruned experts get -1e10 so they are
            # effectively zero after softmax.
            ind.set_value(list(range(NUM_VALIDATIONS)))
            params.set_value(params_init)
            optimal_params, train_score = optimize_my_params()
            final_weights = -1e10 * np.ones(expert_weights.shape,)
            final_weights[np.where(expert_weights>cutoff)] = optimal_params[:NUM_EXPERTS]
            final_params = np.concatenate(( final_weights, optimal_params[NUM_EXPERTS:]))
            return softmax(final_weights), train_score, final_params
        else:
            # Cross-validate: optimize on each train fold, score on the
            # held-out fold, and average parameters/losses over all folds.
            final_params = []
            final_losses = []
            print()
            print()
            print()
            for fold in range(num_folds):
                for i_cross_validation in range(num_cross_validation_masks):
                    # \033[F moves the cursor up to overwrite the progress line.
                    print("\r\033[F\033[F\033[Fcross_validation %d/%d"%(fold*num_cross_validation_masks+i_cross_validation+1, num_folds*num_cross_validation_masks))
                    val_indices = get_cross_validation_indices(list(range(NUM_VALIDATIONS)),
                                                               validation_index=i_cross_validation,
                                                               number_of_splits=num_cross_validation_masks,
                                                               rng_seed=fold,
                                                               )
                    indices = [i for i in range(NUM_VALIDATIONS) if i not in val_indices]

                    #out, crps, d = scipy.optimize.fmin_l_bfgs_b(f, w_init, fprime=g, pgtol=1e-09, epsilon=1e-08, maxfun=10000)
                    ind.set_value(indices)
                    params.set_value(params_init)
                    result, train_score = optimize_my_params()

                    final_params.append(result)

                    ind.set_value(val_indices)
                    validation_score = f_val()
                    print(" Current train value: %.6f" % train_score)
                    print(" Current validation value: %.6f" % validation_score)
                    final_losses.append(validation_score)

            optimal_params = np.mean(final_params, axis=0)
            average_loss = np.mean(final_losses)

            expert_weights_result = softmax(optimal_params[:NUM_EXPERTS])
            filter_param_result = optimal_params[NUM_EXPERTS:NUM_EXPERTS+NUM_FILTER_PARAMETERS]
            #print "filter param result:", filter_param_result

            return expert_weights_result, average_loss, optimal_params  # (NUM_EXPERTS,)
# Patient counts per split; NUM_PATIENTS is the grand total.
NUM_TRAIN_PATIENTS = num_patients['train']
NUM_VALID_PATIENTS = num_patients['validation']
NUM_TEST_PATIENTS = num_patients['test']
NUM_PATIENTS = NUM_TRAIN_PATIENTS + NUM_VALID_PATIENTS + NUM_TEST_PATIENTS

##############
# Sunny data #
##############
# This small dataset is loaded into memory
_SUNNY_DATA_PATH = os.path.join(_DATA_FOLDER, "pkl_annotated", "data.pkl")
# Presumably a dict with "images" and "labels" entries — both indexed below.
_sunny_data = _load_file(_SUNNY_DATA_PATH)

num_sunny_images = len(_sunny_data["images"])

# Cross-validation split over image indices, using default parameters.
_validation_sunny_indices = validation_set.get_cross_validation_indices(
    indices=range(num_sunny_images))
_train_sunny_indices = [
    i for i in range(num_sunny_images) if i not in _validation_sunny_indices
]

# Fancy-index the image/label arrays into train and validation subsets.
sunny_train_images = np.array(_sunny_data["images"])[_train_sunny_indices]
sunny_train_labels = np.array(_sunny_data["labels"])[_train_sunny_indices]
sunny_validation_images = np.array(
    _sunny_data["images"])[_validation_sunny_indices]
sunny_validation_labels = np.array(
    _sunny_data["labels"])[_validation_sunny_indices]

###########################
# Data form preprocessing #
###########################
# Script-level setup: enumerate the training patient folders, split them
# into train/validation, then copy each folder into split directories.
import glob
import numpy as np
import re
from configuration import config
import cPickle as pickle
import utils
from validation_set import get_cross_validation_indices
import random

print "Loading data"

# Sort numerically by the patient id embedded in the path, since raw
# glob() order is filesystem-dependent.
patient_folders = sorted(glob.glob("/data/dsb15_pkl/pkl_train/*/study/"),
                         key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))  # glob is non-deterministic!

# Patient ids run 1..500 — assumed to match the dataset; TODO confirm.
validation_patients_indices = get_cross_validation_indices(indices=range(1,501), validation_index=0)
train_patients_indices = [i for i in range(1,501) if i not in validation_patients_indices]

# Alternation regex matching any "/<validation id>/" path component.
VALIDATION_REGEX = "|".join(["(/%d/)"%i for i in validation_patients_indices])

# A folder is a training folder iff its path contains no validation id.
train_patient_folders = [folder for folder in patient_folders
                         if re.search(VALIDATION_REGEX, folder) is None]
validation_patient_folders = [folder for folder in patient_folders
                              if folder not in train_patient_folders]

import os
import os.path


def copy(from_folder, to_folder):
    # Recursively copy from_folder into to_folder by shelling out to `cp`.
    # NOTE(review): folder names are interpolated unquoted into a shell
    # command — paths with spaces/metacharacters would break.
    command = "cp -r %s %s/."%(from_folder, to_folder)
    print command
    os.system(command)

# Loop body continues beyond this chunk of the file.
for folder in train_patient_folders:
def optimize_expert_weights(expert_predictions,
                            average_distribution,
                            mask_matrix=None,
                            targets=None,
                            num_cross_validation_masks=2,
                            num_folds=1,
                            eps=1e-14,
                            cutoff=0.01,
                            do_optimization=True,
                            expert_weights=None,
                            optimal_params=None,
                            special_average=False,
                            *args, **kwargs):
    """
    Optimize (or just apply) a weighted ensemble of expert CDF predictions
    by minimizing CRPS with Adam over theano-compiled expressions.

    :param expert_predictions: experts x validation_samples x 600 x
    :param mask_matrix: experts x validation_samples x
    :param targets: validation_samples x 600 x
    :param average_distribution: 600 x
    :param eps:
    :return: when do_optimization is False: the ensembled cumulative
        distribution for the first sample; otherwise a tuple
        (expert weights after softmax, score, raw parameter vector).
    """
    if expert_weights is not None:
        # Prune experts whose previously-found weight is below the cutoff.
        mask_matrix = mask_matrix[expert_weights>cutoff,:]  # remove
        expert_predictions = expert_predictions[expert_weights>cutoff,:,:]  # remove

    NUM_EXPERTS = expert_predictions.shape[0]
    NUM_FILTER_PARAMETERS = 2
    WINDOW_SIZE = 599

    # optimizing weights
    X = theano.shared(expert_predictions.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
    x_coor = theano.shared(np.linspace(-(WINDOW_SIZE-1)/2, (WINDOW_SIZE-1)/2, num=WINDOW_SIZE, dtype='float32'))

    # targets = (NUM_VALIDATIONS, 600)
    NUM_VALIDATIONS = expert_predictions.shape[1]
    # `ind` selects which validation samples the compiled expressions see;
    # it is re-set below for each train/validation fold.
    ind = theano.shared(np.zeros((NUM_VALIDATIONS,), dtype='int32'))  # targets = (NUM_VALIDATIONS, 600)

    if optimal_params is None:
        # One raw weight per expert plus the filter parameters, all ones.
        params_init = np.concatenate([
            np.ones((NUM_EXPERTS,), dtype='float32'),
            np.ones((NUM_FILTER_PARAMETERS,), dtype='float32')
        ])
    else:
        params_init = optimal_params.astype('float32')

    params = theano.shared(params_init.astype('float32'))
    #params = T.vector('params', dtype='float32')

    # expert weights = (NUM_EXPERTS,)
    C = 0.0001  # l1 regularization strength on the softmaxed weights

    if not special_average:
        # Create theano expression
        # inputs:
        W = params[:NUM_EXPERTS]
        # Softmax the raw weights so they form a convex combination.
        weights = T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0)

        preds = X.take(ind, axis=1)
        mask = theano.shared(mask_matrix.astype('float32')).take(ind, axis=1)

        # expression
        masked_weights = mask * weights
        # Clip so a fully-masked sample cannot divide by zero.
        tot_masked_weights = T.clip(masked_weights.sum(axis=0), 1e-7, utils.maxfloat)
        preds_weighted_masked = preds * masked_weights.dimshuffle(0, 1, 'x')
        # Weighted arithmetic mean of the expert CDFs per sample.
        cumulative_distribution = preds_weighted_masked.sum(axis=0) / tot_masked_weights.dimshuffle(0, 'x')

        # loss
        l1_loss = weights.sum()
    else:
        # calculate the weighted average for each of these experts
        weights = generate_information_weight_matrix(expert_predictions, average_distribution)
        # = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        weight_matrix = theano.shared((mask_matrix[:,:,None]*weights).astype('float32'))

        # Work in log space on the pdf for a weighted geometric mean.
        pdf = utils.cdf_to_pdf(expert_predictions)
        x_log = np.log(pdf)
        x_log[pdf<=0] = np.log(eps)  # avoid -inf from zero-probability bins
        # Compute the mean
        X_log = theano.shared(x_log.astype('float32'))  # source predictions = (NUM_EXPERTS, NUM_VALIDATIONS, 600)
        X_log_i = X_log.take(ind, axis=1)
        w_i = weight_matrix.take(ind, axis=1)

        W = params[:NUM_EXPERTS]
        w_i = w_i * T.nnet.softmax(W.dimshuffle('x',0)).dimshuffle(1, 0, 'x')

        #the different predictions, are the experts
        geom_av_log = T.sum(X_log_i * w_i, axis=0) / (T.sum(w_i, axis=0) + eps)
        geom_av_log = geom_av_log - T.max(geom_av_log,axis=-1).dimshuffle(0,'x')  # stabilizes rounding errors?

        geom_av = T.exp(geom_av_log)
        geom_pdf = geom_av/T.sum(geom_av,axis=-1).dimshuffle(0,'x')  # renormalize

        l1_loss = 0
        cumulative_distribution = T.cumsum(geom_pdf, axis=-1)

    if not do_optimization:
        # Just evaluate the ensemble on all validation samples as-is.
        ind.set_value(range(NUM_VALIDATIONS))
        f_eval = theano.function([], cumulative_distribution)
        cumulative_distribution = f_eval()
        return cumulative_distribution[0]
    else:
        # convert to theano_values (for regularization)
        t_valid = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)
        t_train = theano.shared(targets.astype('float32'))  # targets = (NUM_VALIDATIONS, 600)

        # CRPS objective; the training variant adds the l1 penalty.
        CRPS_train = T.mean((cumulative_distribution - t_train.take(ind, axis=0))**2) + C * l1_loss
        CRPS_valid = T.mean((cumulative_distribution - t_valid.take(ind, axis=0))**2)

        iter_optimize = theano.function([], CRPS_train, on_unused_input="ignore",
                                        updates=lasagne.updates.adam(CRPS_train, [params], 1.0))
        f_val = theano.function([], CRPS_valid)

        def optimize_my_params():
            # Fixed iteration budget. NOTE(review): the trailing comment says
            # "early stopping" but no stopping criterion is actually applied.
            for _ in xrange(40 if special_average else 100):  # early stopping
                score = iter_optimize()
            result = params.get_value()
            return result, score

        if num_cross_validation_masks==0:
            # Train on all samples, then map the optimized weights back onto
            # the full expert vector; pruned experts get -1e10 so they are
            # effectively zero after softmax.
            ind.set_value(range(NUM_VALIDATIONS))
            params.set_value(params_init)
            optimal_params, train_score = optimize_my_params()
            final_weights = -1e10 * np.ones(expert_weights.shape,)
            final_weights[np.where(expert_weights>cutoff)] = optimal_params[:NUM_EXPERTS]
            final_params = np.concatenate(( final_weights, optimal_params[NUM_EXPERTS:]))
            return softmax(final_weights), train_score, final_params
        else:
            # Cross-validate: optimize on each train fold, score on the
            # held-out fold, and average parameters/losses over all folds.
            final_params = []
            final_losses = []
            print
            print
            print
            for fold in xrange(num_folds):
                for i_cross_validation in xrange(num_cross_validation_masks):
                    # \033[F moves the cursor up to overwrite the progress line.
                    print "\r\033[F\033[F\033[Fcross_validation %d/%d"%(fold*num_cross_validation_masks+i_cross_validation+1, num_folds*num_cross_validation_masks)
                    val_indices = get_cross_validation_indices(range(NUM_VALIDATIONS),
                                                               validation_index=i_cross_validation,
                                                               number_of_splits=num_cross_validation_masks,
                                                               rng_seed=fold,
                                                               )
                    indices = [i for i in range(NUM_VALIDATIONS) if i not in val_indices]

                    #out, crps, d = scipy.optimize.fmin_l_bfgs_b(f, w_init, fprime=g, pgtol=1e-09, epsilon=1e-08, maxfun=10000)
                    ind.set_value(indices)
                    params.set_value(params_init)
                    result, train_score = optimize_my_params()

                    final_params.append(result)

                    ind.set_value(val_indices)
                    validation_score = f_val()
                    print " Current train value: %.6f" % train_score
                    print " Current validation value: %.6f" % validation_score
                    final_losses.append(validation_score)

            optimal_params = np.mean(final_params, axis=0)
            average_loss = np.mean(final_losses)

            expert_weights_result = softmax(optimal_params[:NUM_EXPERTS])
            filter_param_result = optimal_params[NUM_EXPERTS:NUM_EXPERTS+NUM_FILTER_PARAMETERS]
            #print "filter param result:", filter_param_result

            return expert_weights_result, average_loss, optimal_params  # (NUM_EXPERTS,)