def __init__(self, estimator, override_config=None, **kwargs):
    """Initialize."""
    self.estimator = estimator
    self.logger = getLogger(__name__)
    fx.init(**kwargs)
    # Avoid a mutable default argument; only update the plan when overrides are supplied.
    if override_config:
        fx.update_plan(override_config)
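# Usage sketch (hedged): the names below are illustrative, not part of the snippet above.
# override_config takes the same dotted-path keys that fx.update_plan() consumes, and any
# extra keyword arguments are forwarded to fx.init() (e.g. the workspace template).
#
#   runner = FederatedRunner(                       # hypothetical wrapper class name
#       estimator=my_estimator,                     # any pre-built estimator object
#       override_config={'aggregator.settings.rounds_to_train': 10},
#       workspace_template='torch_cnn_mnist')       # passed through **kwargs to fx.init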
def model_outputs_to_disc(data_path,
                          validation_csv,
                          output_path,
                          native_model_path,
                          outputtag='',
                          device='cpu'):
    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    generate_validation_csv(data_path, validation_csv, working_dir=work)

    overrides = {
        'task_runner.settings.device': device,
        'task_runner.settings.val_csv': 'validation_paths.csv',
        'task_runner.settings.train_csv': None,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)
    plan.config['task_runner']['settings']['fets_config_dict']['save_output'] = True
    plan.config['task_runner']['settings']['fets_config_dict']['output_dir'] = output_path

    # overwrite datapath value for a single 'InferenceCol' collaborator
    plan.cols_data_paths['InferenceCol'] = data_path

    # get the inference data loader
    data_loader = copy(plan).get_data_loader('InferenceCol')

    # get the task runner, passing the data loader
    task_runner = copy(plan).get_task_runner(data_loader)

    # Populate model weights
    device = torch.device(device)
    task_runner.load_native(filepath=native_model_path, map_location=device)
    task_runner.opt_treatment = 'RESET'

    logger.info('Starting inference using data from {}\n'.format(data_path))
    task_runner.inference('aggregator', -1, task_runner.get_tensor_dict(), apply='global')
    logger.info(f"\nFinished generating predictions to output folder {output_path}")
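# Usage sketch (hedged): a hypothetical call to the inference helper above. All paths
# are placeholders, not values taken from this repository.
#
#   model_outputs_to_disc(
#       data_path='/data/validation',                              # directory of validation subjects
#       validation_csv='validation.csv',                           # CSV describing that data
#       output_path='/data/outputs',                               # where segmentations are written
#       native_model_path='checkpoint/experiment_1/best_model.pkl',
#       device='cpu')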
# Copyright (C) 2020-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Python native tests."""

import numpy as np
import json

import openfl.native as fx


def one_hot(labels, classes):
    """One-hot encode `labels` using `classes` classes."""
    return np.eye(classes)[labels]


fx.init('torch_cnn_mnist')

if __name__ == '__main__':
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    import torchvision
    import torchvision.transforms as transforms

    from openfl.federated import FederatedModel, FederatedDataSet

    def cross_entropy(output, target):
        """Binary cross-entropy metric."""
        return F.binary_cross_entropy_with_logits(input=output, target=target)
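    # Sketch of how the pieces above are typically wired together with openfl's native
    # API (hedged: the model class `Net`, the data arrays, and the collaborator count are
    # assumptions for illustration, not definitions from this test file).
    #
    #   fl_data = FederatedDataSet(train_images, train_labels, valid_images, valid_labels,
    #                              batch_size=32, num_classes=10)
    #   fl_model = FederatedModel(build_model=Net,
    #                             optimizer=lambda x: optim.Adam(x, lr=1e-4),
    #                             loss_fn=cross_entropy,
    #                             data_loader=fl_data)
    #   collaborator_models = fl_model.setup(num_collaborators=2)
    #   collaborators = {'one': collaborator_models[0], 'two': collaborator_models[1]}
    #   final_model = fx.run_experiment(
    #       collaborators,
    #       override_config={'aggregator.settings.rounds_to_train': 5})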
def run_challenge_experiment(aggregation_function,
                             choose_training_collaborators,
                             training_hyper_parameters_for_round,
                             institution_split_csv_filename,
                             brats_training_data_parent_dir,
                             db_store_rounds=5,
                             rounds_to_train=5,
                             device='cpu',
                             save_checkpoints=True,
                             restore_from_checkpoint_folder=None,
                             include_validation_with_hausdorff=True,
                             use_pretrained_model=True):
    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    # create gandlf_csv and get collaborator names
    gandlf_csv_path = os.path.join(work, 'gandlf_paths.csv')
    # split_csv_path = os.path.join(work, institution_split_csv_filename)
    collaborator_names = construct_fedsim_csv(brats_training_data_parent_dir,
                                              institution_split_csv_filename,
                                              0.8,
                                              gandlf_csv_path)

    aggregation_wrapper = CustomAggregationWrapper(aggregation_function)

    overrides = {
        'aggregator.settings.rounds_to_train': rounds_to_train,
        'aggregator.settings.db_store_rounds': db_store_rounds,
        'tasks.train.aggregation_type': aggregation_wrapper,
        'task_runner.settings.device': device,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)

    if not include_validation_with_hausdorff:
        plan.config['task_runner']['settings']['fets_config_dict']['metrics'] = [
            'dice', 'dice_per_label']

    # Overwrite collaborator names
    plan.authorized_cols = collaborator_names
    # overwrite datapath values with the collaborator name itself
    for col in collaborator_names:
        plan.cols_data_paths[col] = col

    # get the data loaders for each collaborator
    collaborator_data_loaders = {
        col: copy(plan).get_data_loader(col) for col in collaborator_names
    }

    transformed_csv_dict = extract_csv_partitions(
        os.path.join(work, 'gandlf_paths.csv'))

    # get the task runner, passing the first data loader
    for col in collaborator_data_loaders:
        # insert logic to serialize train / val CSVs here
        transformed_csv_dict[col]['train'].to_csv(
            os.path.join(work, 'seg_test_train.csv'))
        transformed_csv_dict[col]['val'].to_csv(
            os.path.join(work, 'seg_test_val.csv'))
        task_runner = copy(plan).get_task_runner(collaborator_data_loaders[col])

    if use_pretrained_model:
        print('Loading pretrained model...')
        # both branches load the same checkpoint; only the map_location differs
        if device == 'cpu':
            checkpoint = torch.load(
                f'{root}/pretrained_model/resunet_pretrained.pth',
                map_location=torch.device('cpu'))
        else:
            checkpoint = torch.load(
                f'{root}/pretrained_model/resunet_pretrained.pth')
        task_runner.model.load_state_dict(checkpoint['model_state_dict'])
        task_runner.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    tensor_pipe = plan.get_tensor_pipe()

    # Initialize model weights
    init_state_path = plan.config['aggregator']['settings']['init_state_path']
    tensor_dict, _ = split_tensor_dict_for_holdouts(
        logger, task_runner.get_tensor_dict(False))

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    # get the aggregator, now that we have the initial weights file set up
    logger.info('Creating aggregator...')
    aggregator = plan.get_aggregator()
    # manually override the aggregator UUID (for checkpoint resume when rounds change)
    aggregator.uuid = 'aggregator'
    aggregator._load_initial_tensors()

    # create our collaborators
    logger.info('Creating collaborators...')
    collaborators = {
        col: copy(plan).get_collaborator(col,
                                         task_runner=task_runner,
                                         client=aggregator)
        for col in collaborator_names
    }

    collaborator_time_stats = gen_collaborator_time_stats(plan.authorized_cols)

    collaborators_chosen_each_round = {}
    collaborator_times_per_round = {}

    logger.info('Starting experiment')

    total_simulated_time = 0
    best_dice = -1.0
    best_dice_over_time_auc = 0

    # results dataframe data
    experiment_results = {
        'round': [],
        'time': [],
        'convergence_score': [],
        'round_dice': [],
        'dice_label_0': [],
        'dice_label_1': [],
        'dice_label_2': [],
        'dice_label_4': [],
    }
    if include_validation_with_hausdorff:
        experiment_results.update({
            'hausdorff95_label_0': [],
            'hausdorff95_label_1': [],
            'hausdorff95_label_2': [],
            'hausdorff95_label_4': [],
        })

    if restore_from_checkpoint_folder is None:
        checkpoint_folder = setup_checkpoint_folder()
        logger.info(f'\nCreated experiment folder {checkpoint_folder}...')
        starting_round_num = 0
    else:
        if not Path(f'checkpoint/{restore_from_checkpoint_folder}').exists():
            logger.warning(
                f'Could not find provided checkpoint folder: '
                f'{restore_from_checkpoint_folder}. Exiting...')
            exit(1)
        else:
            logger.info(
                f'Attempting to load last completed round from '
                f'{restore_from_checkpoint_folder}')
            state = load_checkpoint(restore_from_checkpoint_folder)
            checkpoint_folder = restore_from_checkpoint_folder

            [loaded_collaborator_names, starting_round_num,
             collaborator_time_stats, total_simulated_time, best_dice,
             best_dice_over_time_auc, collaborators_chosen_each_round,
             collaborator_times_per_round, experiment_results, summary,
             agg_tensor_db] = state

            if loaded_collaborator_names != collaborator_names:
                logger.error(
                    f'Collaborator names found in checkpoint ({loaded_collaborator_names}) '
                    f'do not match provided collaborators ({collaborator_names})')
                exit(1)

            logger.info(f'Previous summary for round {starting_round_num}')
            logger.info(summary)

            starting_round_num += 1
            aggregator.tensor_db.tensor_db = agg_tensor_db
            aggregator.round_number = starting_round_num

    for round_num in range(starting_round_num, rounds_to_train):
        # pick collaborators to train for the round
        training_collaborators = choose_training_collaborators(
            collaborator_names,
            aggregator.tensor_db._iterate(),
            round_num,
            collaborators_chosen_each_round,
            collaborator_times_per_round)

        logger.info('Collaborators chosen to train for round {}:\n\t{}'.format(
            round_num, training_collaborators))

        # save the collaborators chosen this round
        collaborators_chosen_each_round[round_num] = training_collaborators

        # get the hyper-parameters from the competitor
        hparams = training_hyper_parameters_for_round(
            collaborator_names,
            aggregator.tensor_db._iterate(),
            round_num,
            collaborators_chosen_each_round,
            collaborator_times_per_round)

        learning_rate, epochs_per_round, batches_per_round = hparams

        if (epochs_per_round is None) == (batches_per_round is None):
            logger.error(
                'Hyper-parameter function error: function must return "None" for either '
                '"epochs_per_round" or "batches_per_round" but not both.'
            )
            return

        hparam_message = "\n\tlearning rate: {}".format(learning_rate)

        # None gets mapped to -1 in the tensor_db
        if epochs_per_round is None:
            epochs_per_round = -1
            hparam_message += "\n\tbatches_per_round: {}".format(batches_per_round)
        elif batches_per_round is None:
            batches_per_round = -1
            hparam_message += "\n\tepochs_per_round: {}".format(epochs_per_round)

        logger.info("Hyper-parameters for round {}:{}".format(round_num, hparam_message))

        # cache each tensor in the aggregator tensor_db
        hparam_dict = {}
        tk = TensorKey(tensor_name='learning_rate',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(learning_rate)
        tk = TensorKey(tensor_name='epochs_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(epochs_per_round)
        tk = TensorKey(tensor_name='batches_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(batches_per_round)
        aggregator.tensor_db.cache_tensor(hparam_dict)

        # pre-compute the times for each collaborator
        times_per_collaborator = compute_times_per_collaborator(
            collaborator_names,
            training_collaborators,
            batches_per_round,
            epochs_per_round,
            collaborator_data_loaders,
            collaborator_time_stats,
            round_num)
        collaborator_times_per_round[round_num] = times_per_collaborator

        aggregator.assigner.set_training_collaborators(training_collaborators)

        # update the state in the aggregation wrapper
        aggregation_wrapper.set_state_data_for_round(
            collaborators_chosen_each_round, collaborator_times_per_round)

        # turn the times list into a list of tuples and sort it
        times_list = [(t, col) for col, t in times_per_collaborator.items()]
        times_list = sorted(times_list)

        # now call each collaborator in order of time
        # FIXME: this doesn't break up each task. We need this if we're doing straggler handling
        for t, col in times_list:
            # set the task_runner data loader
            task_runner.data_loader = collaborator_data_loaders[col]

            # run the collaborator
            collaborators[col].run_simulation()

            logger.info("Collaborator {} took simulated time: {} minutes".format(
                col, round(t / 60, 2)))

        # the round time is the max of the times_list
        round_time = max([t for t, _ in times_list])
        total_simulated_time += round_time

        # get the performance validation scores for the round
        round_dice = get_metric('valid_dice', round_num, aggregator.tensor_db)
        dice_label_0 = get_metric('valid_dice_per_label_0', round_num, aggregator.tensor_db)
        dice_label_1 = get_metric('valid_dice_per_label_1', round_num, aggregator.tensor_db)
        dice_label_2 = get_metric('valid_dice_per_label_2', round_num, aggregator.tensor_db)
        dice_label_4 = get_metric('valid_dice_per_label_4', round_num, aggregator.tensor_db)
        if include_validation_with_hausdorff:
            hausdorff95_label_0 = get_metric('valid_hd95_per_label_0', round_num, aggregator.tensor_db)
            hausdorff95_label_1 = get_metric('valid_hd95_per_label_1', round_num, aggregator.tensor_db)
            hausdorff95_label_2 = get_metric('valid_hd95_per_label_2', round_num, aggregator.tensor_db)
            hausdorff95_label_4 = get_metric('valid_hd95_per_label_4', round_num, aggregator.tensor_db)

        # update best score
        if best_dice < round_dice:
            best_dice = round_dice
            # Set the weights for the final model
            if round_num == 0:
                # here the initial model was validated (temp model does not exist)
                logger.info(
                    'Skipping best model saving to disk as it is a random initialization.'
                )
            elif not os.path.exists(f'checkpoint/{checkpoint_folder}/temp_model.pkl'):
                raise ValueError(
                    f'Expected temporary model at: checkpoint/{checkpoint_folder}/temp_model.pkl '
                    'to exist but it was not found.')
            else:
                # here the temp model was the one validated
                shutil.copyfile(
                    src=f'checkpoint/{checkpoint_folder}/temp_model.pkl',
                    dst=f'checkpoint/{checkpoint_folder}/best_model.pkl')
                logger.info(
                    f'Saved model with best average binary DICE: {best_dice} to '
                    f'~/.local/workspace/checkpoint/{checkpoint_folder}/best_model.pkl')

        ## RUN VALIDATION ON INTERMEDIATE CONSENSUS MODEL
        # set the task_runner data loader
        # task_runner.data_loader = collaborator_data_loaders[col]  ### DELETE THIS LINE ###
        # print(f'Collaborator {col} training data count = {task_runner.data_loader.get_train_data_size()}')
        # run the collaborator
        # collaborators[col].run_simulation()

        ## CONVERGENCE METRIC COMPUTATION
        # update the auc score
        best_dice_over_time_auc += best_dice * round_time

        # project the auc score as remaining time * best dice
        # this projection assumes that the current best score is carried forward for the entire week
        projected_auc = (MAX_SIMULATION_TIME - total_simulated_time) * best_dice \
            + best_dice_over_time_auc
        projected_auc /= MAX_SIMULATION_TIME

        # End of round summary
        summary = '"**** END OF ROUND {} SUMMARY *****"'.format(round_num)
        summary += "\n\tSimulation Time: {} minutes".format(round(total_simulated_time / 60, 2))
        summary += "\n\t(Projected) Convergence Score: {}".format(projected_auc)
        summary += "\n\tDICE Label 0: {}".format(dice_label_0)
        summary += "\n\tDICE Label 1: {}".format(dice_label_1)
        summary += "\n\tDICE Label 2: {}".format(dice_label_2)
        summary += "\n\tDICE Label 4: {}".format(dice_label_4)
        if include_validation_with_hausdorff:
            summary += "\n\tHausdorff95 Label 0: {}".format(hausdorff95_label_0)
            summary += "\n\tHausdorff95 Label 1: {}".format(hausdorff95_label_1)
            summary += "\n\tHausdorff95 Label 2: {}".format(hausdorff95_label_2)
            summary += "\n\tHausdorff95 Label 4: {}".format(hausdorff95_label_4)

        experiment_results['round'].append(round_num)
        experiment_results['time'].append(total_simulated_time)
        experiment_results['convergence_score'].append(projected_auc)
        experiment_results['round_dice'].append(round_dice)
        experiment_results['dice_label_0'].append(dice_label_0)
        experiment_results['dice_label_1'].append(dice_label_1)
        experiment_results['dice_label_2'].append(dice_label_2)
        experiment_results['dice_label_4'].append(dice_label_4)
        if include_validation_with_hausdorff:
            experiment_results['hausdorff95_label_0'].append(hausdorff95_label_0)
            experiment_results['hausdorff95_label_1'].append(hausdorff95_label_1)
            experiment_results['hausdorff95_label_2'].append(hausdorff95_label_2)
            experiment_results['hausdorff95_label_4'].append(hausdorff95_label_4)

        logger.info(summary)

        if save_checkpoints:
            logger.info(f'Saving checkpoint for round {round_num}')
            logger.info(
                f'To resume from this checkpoint, set the restore_from_checkpoint_folder '
                f'parameter to \'{checkpoint_folder}\'')
            save_checkpoint(checkpoint_folder, aggregator,
                            collaborator_names, collaborators,
                            round_num, collaborator_time_stats,
                            total_simulated_time, best_dice,
                            best_dice_over_time_auc,
                            collaborators_chosen_each_round,
                            collaborator_times_per_round,
                            experiment_results, summary)

        # if the total_simulated_time has exceeded the maximum time, we break
        # in practice, this means that the previous round's model is the last model scored,
        # so a long final round should not actually benefit the competitor, since that final
        # model is never globally validated
        if total_simulated_time > MAX_SIMULATION_TIME:
            logger.info("Simulation time exceeded. Ending Experiment")
            break

        # save the most recent aggregated model in native format to be copied over as best when appropriate
        # (note this model has not been validated by the collaborators yet)
        task_runner.rebuild_model(round_num, aggregator.last_tensor_dict, validation=True)
        task_runner.save_native(f'checkpoint/{checkpoint_folder}/temp_model.pkl')

    return pd.DataFrame.from_dict(experiment_results), checkpoint_folder
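# Usage sketch (hedged): how a competitor might invoke the simulation above. The three
# customization functions and the data paths are placeholders for competitor-supplied
# code, not definitions from this module.
#
#   scores_df, checkpoint_folder = run_challenge_experiment(
#       aggregation_function=weighted_average_aggregation,           # custom aggregation logic
#       choose_training_collaborators=all_collaborators_train,       # collaborator selection policy
#       training_hyper_parameters_for_round=constant_hyper_parameters,
#       institution_split_csv_filename='small_split.csv',
#       brats_training_data_parent_dir='/data/MICCAI_FeTS2022_TrainingData',
#       rounds_to_train=5,
#       device='cpu',
#       save_checkpoints=True)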
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss=ke.losses.categorical_crossentropy,
                  optimizer=ke.optimizers.Adam(),
                  metrics=['accuracy'])

    # initialize the optimizer variables
    opt_vars = model.optimizer.variables()

    for v in opt_vars:
        v.initializer.run(session=sess)

    return model


fx.init('keras_cnn_mnist')

if __name__ == '__main__':
    from openfl.federated import FederatedModel, FederatedDataSet
    from tensorflow.python.keras.utils.data_utils import get_file

    origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
    path = get_file(
        'mnist.npz',
        origin=origin_folder + 'mnist.npz',
        file_hash='731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1')

    with np.load(path) as f:
        # get all of mnist
        X_train = f['x_train']