args = parser.parse_args()
config = load_config(args.config_path)

results_path = config['results_path']
class_name = config['class_name']
dataset_name = config['dataset_name']
pooling_operator = config['pooling_operator']

image_prediction_method = 'as_production'
predictions_unique_name = 'test_set_CV1_0'
# The parent folder can be 'CV', 'subsets' or 'exploratory_exp';
# all three scripts produce prediction files.
predictions_folder_name = 'subsets'

predictions_path = build_path_results(results_path, dataset_name, pooling_operator,
                                      script_suffix=predictions_folder_name,
                                      result_suffix='predictions')
performance_path = build_path_results(results_path, dataset_name, pooling_operator,
                                      script_suffix=predictions_folder_name,
                                      result_suffix='performance')
make_directory(performance_path)

# Map each configured pooling operator to its pooling family, and to the
# r parameter of log-sum-exp (LSE) pooling.
pool_dict = {'nor': 'nor', 'lse': 'lse', 'lse01': 'lse', 'max': 'max'}
r = {'nor': 0, 'lse': 1.0, 'lse01': 0.1, 'max': 0}

image_labels, image_predictions, \
    has_bbox, accurate_localizations, dice_scores = keras_preds.process_prediction(config, predictions_unique_name,
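# The `r` dictionary above holds the sharpness parameter of LSE pooling
# ('lse' uses r=1.0, 'lse01' uses r=0.1; large r approaches max pooling,
# small r approaches mean pooling). Below is a minimal, hypothetical sketch of
# how such a pooling step could aggregate patch scores into one image score;
# it is not the project's actual implementation, only an illustration of the
# r semantics. Requires r > 0.
import numpy as np

def lse_pool(patch_scores, r):
    """Log-sum-exp pooling over the patch scores of one image."""
    x = np.asarray(patch_scores, dtype=np.float64)
    # Numerically stable LSE: shift by the max before exponentiating.
    m = x.max()
    return m + np.log(np.mean(np.exp(r * (x - m)))) / r

# e.g. lse_pool(scores, r=1.0) for 'lse', lse_pool(scores, r=0.1) for 'lse01'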
dataset_name = config['dataset_name']
res_path = config['results_path']
pooling_operator = config['pooling_operator']

set_name1 = 'test_set_CV1_0'
set_name2 = 'test_set_CV1_1'
set_name3 = 'test_set_CV1_2'
set_name4 = 'test_set_CV1_3'
set_name5 = 'test_set_CV1_4'

use_xray, use_pascal = set_dataset_flag(dataset_name)
classifiers = [set_name1, set_name2, set_name3, set_name4, set_name5]

parent_folder_predictions = 'subsets'
predictions_path = build_path_results(res_path, dataset_name, pooling_operator,
                                      script_suffix=parent_folder_predictions,
                                      result_suffix='predictions')
performance_path = build_path_results(res_path, dataset_name, pooling_operator,
                                      script_suffix=parent_folder_predictions,
                                      result_suffix='performance')
stability_path = build_path_results(res_path, dataset_name, pooling_operator,
                                    script_suffix=parent_folder_predictions,
                                    result_suffix='stability')
make_directory(stability_path)

if use_xray:
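# The five `test_set_CV1_*` prediction sets above are compared pairwise in the
# stability step. A minimal, hypothetical sketch of one such comparison (the
# real stability module may use different scores); `preds_a` / `preds_b` are
# assumed to be binarized image predictions loaded from the .npy files in
# predictions_path:
import numpy as np

def positive_jaccard(preds_a, preds_b):
    """Jaccard overlap of the positive predictions of two classifiers."""
    a = np.asarray(preds_a, dtype=bool)
    b = np.asarray(preds_b, dtype=bool)
    union = np.logical_or(a, b).sum()
    return np.logical_and(a, b).sum() / union if union else 1.0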
def cross_validation(config, number_splits=5):
    """
    Performs cross validation on a specific architecture.

    :param config: yaml config file
    :param number_splits: number of different cross validation splits to test on
    :return: saves predictions, image indices and patch labels to .npy files for
        the train, test and validation set of each CV split.
    """
    skip_processing = config['skip_processing_labels']
    image_path = config['image_path']
    classication_labels_path = config['classication_labels_path']
    localization_labels_path = config['localization_labels_path']
    results_path = config['results_path']
    train_mode = config['train_mode']
    dataset_name = config['dataset_name']
    class_name = config['class_name']
    mura_test_img_path = config['mura_test_img_path']
    mura_train_labels_path = config['mura_train_labels_path']
    mura_train_img_path = config['mura_train_img_path']
    mura_test_labels_path = config['mura_test_labels_path']
    mura_processed_train_labels_path = config['mura_processed_train_labels_path']
    mura_processed_test_labels_path = config['mura_processed_test_labels_path']
    mura_interpolation = config['mura_interpolation']
    pascal_image_path = config['pascal_image_path']
    resized_images_before_training = config['resized_images_before_training']
    nr_epochs = config['nr_epochs']
    lr = config['lr']
    reg_weight = config['reg_weight']
    pooling_operator = config['pooling_operator']

    use_xray, use_pascal = set_dataset_flag(dataset_name)

    script_suffix = 'CV'
    trained_models_path = build_path_results(results_path, dataset_name, pooling_operator,
                                             script_suffix=script_suffix,
                                             result_suffix='trained_models')
    prediction_results_path = build_path_results(results_path, dataset_name, pooling_operator,
                                                 script_suffix=script_suffix,
                                                 result_suffix='predictions')
    make_directory(trained_models_path)
    make_directory(prediction_results_path)

    if use_xray:
        if resized_images_before_training:
            xray_df = fetch_preprocessed_images_csv(image_path, 'processed_imgs')
            # todo: delete - just for testing
            # xray_df = xray_df[-50:]
        else:
            xray_df = load_process_xray14(config)
    elif use_pascal:
        pascal_df = load_pascal(pascal_image_path)
    else:
        df_train_val, test_df_all_classes = load_mura(
            skip_processing, mura_processed_train_labels_path,
            mura_processed_test_labels_path, mura_train_img_path,
            mura_train_labels_path, mura_test_labels_path, mura_test_img_path)

    for split in range(number_splits):
        if use_xray:
            df_train, df_val, df_test, _, _, _ = ld.split_xray_cv(
                xray_df, number_splits, split, class_name)
        elif use_pascal:
            df_train, df_val, df_test = construct_train_test_cv(
                pascal_df, number_splits, split)
        else:
            df_train, df_val = split_data_cv(df_train_val, number_splits, split,
                                             random_seed=1, diagnose_col=class_name,
                                             ratio_to_keep=None)
            # df_test = filter_rows_on_class(test_df_all_classes, class_name=class_name)
            df_test = filter_rows_and_columns(test_df_all_classes, class_name)

        if train_mode:
            tf.keras.backend.clear_session()
            K.clear_session()

            ################################ TRAIN ################################
            train_generator = gen.BatchGenerator(
                instances=df_train.values,
                resized_image=resized_images_before_training,
                batch_size=BATCH_SIZE,
                net_h=IMAGE_SIZE,
                net_w=IMAGE_SIZE,
                norm=keras_utils.normalize,
                box_size=BOX_SIZE,
                processed_y=skip_processing,
                interpolation=mura_interpolation,
                shuffle=True)
            valid_generator = gen.BatchGenerator(
                instances=df_val.values,
                resized_image=resized_images_before_training,
                batch_size=BATCH_SIZE,
                net_h=IMAGE_SIZE,
                net_w=IMAGE_SIZE,
                box_size=BOX_SIZE,
                norm=keras_utils.normalize,
                processed_y=skip_processing,
                interpolation=mura_interpolation,
                shuffle=True)

            model = keras_model.build_model(reg_weight)
            model = keras_model.compile_model_accuracy(model, lr, pool_op=pooling_operator)

            # NOTE: checkpointing on every epoch is not strictly needed here; the
            # LearningRateScheduler below is created but not passed to fit_generator,
            # only the per-epoch checkpoint callback is used.
            filepath = trained_models_path + "CV_" + str(split) + \
                       "_epoch-{epoch:02d}-{val_loss:.2f}.hdf5"
            checkpoint_on_epoch_end = ModelCheckpoint(filepath, monitor='val_loss',
                                                      verbose=1, save_best_only=False,
                                                      mode='min')
            lrate = LearningRateScheduler(keras_model.step_decay, verbose=1)

            print("df train STEPS")
            print(len(df_train) // BATCH_SIZE)
            print(len(train_generator))

            history = model.fit_generator(
                generator=train_generator,
                steps_per_epoch=len(train_generator),
                epochs=nr_epochs,
                validation_data=valid_generator,
                validation_steps=len(valid_generator),
                verbose=1,
                callbacks=[checkpoint_on_epoch_end])

            print("history")
            print(history.history)
            print(history.history['keras_accuracy'])
            np.save(trained_models_path + 'train_info_' + str(split) + '.npy',
                    history.history)

            settings = np.array({'lr: ': lr,
                                 'reg_weight: ': reg_weight,
                                 'pooling_operator: ': pooling_operator})
            np.save(trained_models_path + 'train_settings.npy', settings)

            keras_utils.plot_train_validation(history.history['loss'],
                                              history.history['val_loss'],
                                              'train loss', 'validation loss',
                                              'CV_loss' + str(split), 'loss',
                                              trained_models_path)

            ############################ PREDICTIONS ############################
            predict_patch_and_save_results(model, 'test_set_CV' + str(split), df_test,
                                           skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, 'train_set_CV' + str(split), df_train,
                                           skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, 'val_set_CV' + str(split), df_val,
                                           skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)

            ############################## EVALUATE ##############################
            print("evaluate validation")
            evaluate = model.evaluate_generator(generator=valid_generator,
                                                steps=len(valid_generator), verbose=1)
            evaluate_train = model.evaluate_generator(generator=train_generator,
                                                      steps=len(train_generator), verbose=1)

            test_generator = gen.BatchGenerator(
                instances=df_test.values,
                resized_image=resized_images_before_training,
                batch_size=BATCH_SIZE,
                net_h=IMAGE_SIZE,
                net_w=IMAGE_SIZE,
                shuffle=True,
                norm=keras_utils.normalize,
                box_size=BOX_SIZE,
                processed_y=skip_processing,
                interpolation=mura_interpolation)
            evaluate_test = model.evaluate_generator(generator=test_generator,
                                                     steps=len(test_generator), verbose=1)

            print("Evaluate Train")
            print(evaluate_train)
            print("Evaluate Valid")
            print(evaluate)
            print("Evaluate Test")
            print(evaluate_test)
        else:
            files_found = 0
            model_file = None
            print(trained_models_path)
            # Expects exactly one saved model per split matching this pattern.
            for file_path in Path(trained_models_path).glob(
                    "CV_patient_split_" + str(split) + "*.hdf5"):
                print(file_path)
                model_file = file_path
                files_found += 1
            assert files_found == 1, \
                "No model found or multiple models found; not clear which one to use"
            print(str(files_found))

            model = load_model(str(model_file),
                               custom_objects={'keras_loss_v3_nor': keras_loss_v3_nor,
                                               'keras_accuracy': keras_accuracy,
                                               'accuracy_asloss': accuracy_asloss})
            model = keras_model.compile_model_accuracy(model, lr, pooling_operator)

            predict_patch_and_save_results(model, 'train_set_CV' + str(split), df_train,
                                           skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, 'val_set_CV' + str(split), df_val,
                                           skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
            predict_patch_and_save_results(model, 'test_set_CV' + str(split), df_test,
                                           skip_processing, BATCH_SIZE_TEST, BOX_SIZE,
                                           IMAGE_SIZE, prediction_results_path,
                                           mura_interpolation,
                                           resized_images_before_training)
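# A minimal usage sketch, assuming this script is driven by a YAML config as in
# the rest of the repo; the argparse wiring below is illustrative, not the
# project's actual entry point:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run cross validation')
    parser.add_argument('-c', '--config_path', type=str, required=True,
                        help='path to the YAML configuration file')
    args = parser.parse_args()
    cross_validation(load_config(args.config_path), number_splits=5)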
def train_on_subsets(config, number_splits, CV_split_to_use, number_classifiers,
                     subset_seeds, overlap_ratio):
    """
    Trains several classifiers on overlapping subsets of the same training set,
    while keeping the validation and test sets fixed. The aim is to compare the
    performance of these classifiers later in the stability module.

    The script takes a specific cross validation split of training, validation
    and test set, and then drops a portion of the samples from the training set.
    The validation and test sets are left unchanged, exactly as in the original
    split. A classifier is then trained on each of the training subsets.

    :param config: yaml config file
    :param number_splits: number of cross validation (CV) splits used in
        cross validation (run_cross_validation.py)
    :param CV_split_to_use: specific CV split defining the train/test/validation
        sets; value is in [0, number_splits - 1]
    :param number_classifiers: number of classifiers to train
    :param subset_seeds: seeds used to drop observations from the original
        training set
    :param overlap_ratio: ratio of observations preserved from the original
        training set of the specific CV split
    :return: saves predictions, image indices and patch labels to .npy files for
        the train/test/validation set of each subset.
    """
    skip_processing = config['skip_processing_labels']
    image_path = config['image_path']
    classication_labels_path = config['classication_labels_path']
    localization_labels_path = config['localization_labels_path']
    results_path = config['results_path']
    processed_labels_path = config['processed_labels_path']
    train_mode = config['train_mode']
    dataset_name = config['dataset_name']
    class_name = config['class_name']
    mura_test_img_path = config['mura_test_img_path']
    mura_train_labels_path = config['mura_train_labels_path']
    mura_train_img_path = config['mura_train_img_path']
    mura_test_labels_path = config['mura_test_labels_path']
    mura_processed_train_labels_path = config['mura_processed_train_labels_path']
    mura_processed_test_labels_path = config['mura_processed_test_labels_path']
    mura_interpolation = config['mura_interpolation']
    pascal_image_path = config['pascal_image_path']
    resized_images_before_training = config['resized_images_before_training']
    nr_epochs = config['nr_epochs']
    lr = config['lr']
    reg_weight = config['reg_weight']
    pooling_operator = config['pooling_operator']

    IMAGE_SIZE = 512
    BATCH_SIZE = 10
    BATCH_SIZE_TEST = 1
    BOX_SIZE = 16

    use_xray, use_pascal = set_dataset_flag(dataset_name)

    script_suffix = 'subsets'
    trained_models_path = build_path_results(results_path, dataset_name, pooling_operator,
                                             script_suffix=script_suffix,
                                             result_suffix='trained_models')
    prediction_results_path = build_path_results(results_path, dataset_name, pooling_operator,
                                                 script_suffix=script_suffix,
                                                 result_suffix='predictions')
    make_directory(trained_models_path)
    make_directory(prediction_results_path)

    if use_xray:
        if resized_images_before_training:
            xray_df = fetch_preprocessed_images_csv(image_path, 'processed_imgs')
            # todo: delete - just for testing
            # xray_df = xray_df[-50:]
        else:
            xray_df = load_xray(skip_processing, processed_labels_path,
                                classication_labels_path, image_path,
                                localization_labels_path, results_path)
            xray_df = ld.filter_observations(xray_df, class_name, 'No Finding')
    elif use_pascal:
        pascal_df = load_pascal(pascal_image_path)
    else:
        df_train_val, test_df_all_classes = load_mura(
            skip_processing, mura_processed_train_labels_path,
            mura_processed_test_labels_path, mura_train_img_path,
            mura_train_labels_path, mura_test_labels_path, mura_test_img_path)
    for split in range(number_splits):
        if use_xray:
            df_train, df_val, df_test, df_bbox_train, \
                df_bbox_test, train_only_class = split_xray_cv(
                    xray_df, number_splits, split, class_name)
        elif use_pascal:
            df_train, df_val, df_test = construct_train_test_cv(
                pascal_df, number_splits, split)
        else:
            df_train, df_val = split_data_cv(df_train_val, number_splits, split,
                                             random_seed=1, diagnose_col=class_name,
                                             ratio_to_keep=None)
            df_test = filter_rows_and_columns(test_df_all_classes, class_name)

        for curr_classifier in range(number_classifiers):
            if train_mode and split == CV_split_to_use:
                print("#####################################################")
                print("SPLIT: " + str(split))
                print("classifier #: " + str(curr_classifier))

                if use_xray:
                    class_train_subset = ld.get_train_subset_xray(
                        train_only_class, df_bbox_train.shape[0],
                        random_seed=subset_seeds[curr_classifier],
                        ratio_to_keep=overlap_ratio)
                    print("new subset is: " + str(class_train_subset.shape))
                    df_train_subset = pd.concat([df_bbox_train, class_train_subset])
                    print(df_bbox_train.shape)
                    print(class_train_subset.shape)
                elif use_pascal:
                    df_train_subset = get_train_subset_mura(
                        df_train, random_seed=subset_seeds[curr_classifier],
                        ratio_to_keep=overlap_ratio)
                else:
                    df_train_subset = get_train_subset_mura(
                        df_train, random_seed=subset_seeds[curr_classifier],
                        ratio_to_keep=overlap_ratio)

                tf.keras.backend.clear_session()
                K.clear_session()

                ################################ TRAIN ################################
                train_generator = gen.BatchGenerator(
                    instances=df_train_subset.values,
                    resized_image=resized_images_before_training,
                    batch_size=BATCH_SIZE,
                    net_h=IMAGE_SIZE,
                    net_w=IMAGE_SIZE,
                    norm=keras_utils.normalize,
                    box_size=BOX_SIZE,
                    processed_y=skip_processing,
                    interpolation=mura_interpolation,
                    shuffle=True)
                valid_generator = gen.BatchGenerator(
                    instances=df_val.values,
                    resized_image=resized_images_before_training,
                    batch_size=BATCH_SIZE,
                    net_h=IMAGE_SIZE,
                    net_w=IMAGE_SIZE,
                    box_size=BOX_SIZE,
                    norm=keras_utils.normalize,
                    processed_y=skip_processing,
                    interpolation=mura_interpolation,
                    shuffle=True)

                model = keras_model.build_model(reg_weight)
                model = keras_model.compile_model_accuracy(model, lr, pooling_operator)

                lrate = LearningRateScheduler(keras_model.step_decay, verbose=1)
                filepath = trained_models_path + "CV_" + str(split) + '_' + \
                           str(curr_classifier) + "_-{epoch:02d}-{val_loss:.2f}.hdf5"
                checkpoint_on_epoch_end = ModelCheckpoint(filepath, monitor='val_loss',
                                                          verbose=1, save_best_only=False,
                                                          mode='min')

                print("df train STEPS")
                print(len(df_train) // BATCH_SIZE)
                print(len(train_generator))

                history = model.fit_generator(
                    generator=train_generator,
                    steps_per_epoch=len(train_generator),
                    epochs=nr_epochs,
                    validation_data=valid_generator,
                    validation_steps=len(valid_generator),
                    verbose=1)

                filepath = trained_models_path + 'subset_' + class_name + "_CV" + \
                           str(split) + '_' + str(curr_classifier) + '_' + \
                           str(overlap_ratio) + ".hdf5"
                model.save(filepath)

                print("history")
                print(history.history)
                print(history.history['keras_accuracy'])
                np.save(trained_models_path + 'train_info_' + str(split) + '_' +
                        str(curr_classifier) + '_' + str(overlap_ratio) + '.npy',
                        history.history)

                settings = np.array({'lr: ': lr,
                                     'reg_weight: ': reg_weight,
                                     'pooling_operator: ': pooling_operator})
                np.save(trained_models_path + 'train_settings.npy', settings)

                keras_utils.plot_train_validation(history.history['loss'],
                                                  history.history['val_loss'],
                                                  'train loss', 'validation loss',
                                                  'CV_loss' + str(split) + str(curr_classifier),
                                                  'loss', trained_models_path)

                ############################ PREDICTIONS ############################
                # training set
                predict_patch_and_save_results(
                    model, 'train_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_train, skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                    prediction_results_path, mura_interpolation,
                    resized_images_before_training)
                # validation set
                predict_patch_and_save_results(
                    model, 'val_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_val, skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                    prediction_results_path, mura_interpolation,
                    resized_images_before_training)
                # test set
                predict_patch_and_save_results(
                    model, 'test_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_test, skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                    prediction_results_path, mura_interpolation,
                    resized_images_before_training)

            elif not train_mode and split == CV_split_to_use:
                files_found = 0
                model_file = None
                print(trained_models_path)
                # Expects exactly one saved subset model per classifier matching
                # the naming used by model.save() above.
                for file_path in Path(trained_models_path).glob(
                        'subset_' + class_name + '_CV' + str(split) + '_' +
                        str(curr_classifier) + '*.hdf5'):
                    print(file_path)
                    model_file = file_path
                    files_found += 1
                assert files_found == 1, \
                    "No model found or multiple models found; not clear which one to use"
                print(str(files_found))

                model = load_model(str(model_file),
                                   custom_objects={'keras_loss_v3_nor': keras_loss_v3_nor,
                                                   'keras_accuracy': keras_accuracy,
                                                   'accuracy_asloss': accuracy_asloss})
                model = keras_model.compile_model_accuracy(model, lr, pooling_operator)

                predict_patch_and_save_results(
                    model, 'train_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_train, skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                    prediction_results_path, mura_interpolation,
                    resized_images_before_training)
                predict_patch_and_save_results(
                    model, 'val_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_val, skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                    prediction_results_path, mura_interpolation,
                    resized_images_before_training)
                predict_patch_and_save_results(
                    model, 'test_set_CV' + str(split) + '_' + str(curr_classifier),
                    df_test, skip_processing, BATCH_SIZE_TEST, BOX_SIZE, IMAGE_SIZE,
                    prediction_results_path, mura_interpolation,
                    resized_images_before_training)
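# A minimal usage sketch for the subset experiment, assuming the same YAML
# config. The seed list and overlap ratio are illustrative values (one seed per
# classifier, 95% of the original CV training set kept); CV_split_to_use=1 and
# five classifiers match the 'test_set_CV1_0'..'test_set_CV1_4' prediction sets
# consumed by the stability script:
if __name__ == '__main__':
    config = load_config('config.yml')  # hypothetical config path
    train_on_subsets(config,
                     number_splits=5,
                     CV_split_to_use=1,
                     number_classifiers=5,
                     subset_seeds=[0, 1, 2, 3, 4],
                     overlap_ratio=0.95)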
nr_epochs = config['nr_epochs']
lr = config['lr']
reg_weight = config['reg_weight']
pooling_operator = config['pooling_operator']
class_name = config['class_name']

IMAGE_SIZE = 512
BATCH_SIZE = 10
BATCH_SIZE_TEST = 1
BOX_SIZE = 16

use_xray, use_pascal = set_dataset_flag(dataset_name)

script_suffix = 'exploratory_exp'
trained_models_path = build_path_results(results_path, dataset_name, pooling_operator,
                                         script_suffix=script_suffix,
                                         result_suffix='trained_models')
prediction_results_path = build_path_results(results_path, dataset_name, pooling_operator,
                                             script_suffix=script_suffix,
                                             result_suffix='predictions')
make_directory(trained_models_path)
make_directory(prediction_results_path)

if use_xray:
    if resized_images_before_training:
        xray_df = fetch_preprocessed_images_csv(image_path, 'processed_imgs')
        # todo: delete after testing
        # xray_df = xray_df[-50:]
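# For reference, a hypothetical sketch of the directory layout that
# `build_path_results` might produce for the three script suffixes used in
# these scripts ('CV', 'subsets', 'exploratory_exp'). The real helper lives
# elsewhere in the repo and its exact layout may differ:
import os

def build_path_results_sketch(results_path, dataset_name, pooling_operator,
                              script_suffix, result_suffix):
    """e.g. <results>/<dataset>/<pooling>/exploratory_exp/predictions/"""
    return os.path.join(results_path, dataset_name, pooling_operator,
                        script_suffix, result_suffix) + os.sep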