Example #1
def __init__(self, estimator, override_config=None, **kwargs):
    """Initialize the wrapper: store the estimator and set up the OpenFL workspace."""
    self.estimator = estimator
    self.logger = getLogger(__name__)
    fx.init(**kwargs)
    # Avoid the mutable-default-argument pitfall; only update the plan when overrides exist.
    if override_config:
        fx.update_plan(override_config)
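
# Usage sketch for the constructor above. The wrapper class name is
# hypothetical, and the keyword shown is assumed to be forwarded to fx.init()
# via **kwargs; only the __init__ body itself comes from the original source.
from sklearn.linear_model import LogisticRegression

wrapper = FederatedEstimatorWrapper(           # hypothetical class owning __init__
    LogisticRegression(),
    override_config={'aggregator.settings.rounds_to_train': 10},
    workspace_template='torch_cnn_mnist',      # assumed keyword accepted by fx.init
)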
Example #2
def model_outputs_to_disc(data_path,
                          validation_csv,
                          output_path,
                          native_model_path,
                          outputtag='',
                          device='cpu'):
    """Run inference with a natively saved model and write its predictions to disk."""

    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    generate_validation_csv(data_path, validation_csv, working_dir=work)

    overrides = {
        'task_runner.settings.device': device,
        'task_runner.settings.val_csv': 'validation_paths.csv',
        'task_runner.settings.train_csv': None,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)
    plan.config['task_runner']['settings']['fets_config_dict'][
        'save_output'] = True
    plan.config['task_runner']['settings']['fets_config_dict'][
        'output_dir'] = output_path

    # overwrite datapath value for a single 'InferenceCol' collaborator
    plan.cols_data_paths['InferenceCol'] = data_path

    # get the inference data loader
    data_loader = copy(plan).get_data_loader('InferenceCol')

    # get the task runner, passing the data loader
    task_runner = copy(plan).get_task_runner(data_loader)

    # Populate model weights
    device = torch.device(device)
    task_runner.load_native(filepath=native_model_path, map_location=device)
    task_runner.opt_treatment = 'RESET'

    logger.info('Starting inference using data from {}\n'.format(data_path))

    task_runner.inference('aggregator',
                          -1,
                          task_runner.get_tensor_dict(),
                          apply='global')
    logger.info(
        f"\nFinished generating predictions to output folder {output_path}")
Example #3
# Copyright (C) 2020-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Python native tests."""

import json

import numpy as np

import openfl.native as fx


def one_hot(labels, classes):
    """One-hot encode `labels` using `classes` classes."""
    return np.eye(classes)[labels]


fx.init('torch_cnn_mnist')

if __name__ == '__main__':
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    import torchvision
    import torchvision.transforms as transforms

    from openfl.federated import FederatedModel, FederatedDataSet

    def cross_entropy(output, target):
        """Binary cross-entropy metric."""
        return F.binary_cross_entropy_with_logits(input=output, target=target)
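
    # Sketch of how such a test typically continues (an assumption, not part of
    # the original snippet): prepare MNIST data, define a small network, wrap
    # everything with OpenFL's native API, and run a short experiment.
    train_set = torchvision.datasets.MNIST('./data', train=True, download=True)
    valid_set = torchvision.datasets.MNIST('./data', train=False, download=True)

    # Scale images to [0, 1] and one-hot encode the labels with the helper above.
    train_images = train_set.data.float().numpy() / 255.0
    train_labels = one_hot(train_set.targets.numpy(), 10).astype(np.float32)
    valid_images = valid_set.data.float().numpy() / 255.0
    valid_labels = one_hot(valid_set.targets.numpy(), 10).astype(np.float32)

    class Net(nn.Module):
        """Minimal placeholder classifier."""

        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(28 * 28, 10)

        def forward(self, x):
            return self.fc(x.view(x.shape[0], -1))

    optimizer = lambda x: optim.Adam(x, lr=1e-4)

    fl_data = FederatedDataSet(train_images, train_labels,
                               valid_images, valid_labels,
                               batch_size=32, num_classes=10)
    fl_model = FederatedModel(build_model=Net, optimizer=optimizer,
                              loss_fn=cross_entropy, data_loader=fl_data)
    collaborator_models = fl_model.setup(num_collaborators=2)
    collaborators = {'one': collaborator_models[0], 'two': collaborator_models[1]}
    final_model = fx.run_experiment(
        collaborators,
        override_config={'aggregator.settings.rounds_to_train': 2})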
Example #4
def run_challenge_experiment(aggregation_function,
                             choose_training_collaborators,
                             training_hyper_parameters_for_round,
                             institution_split_csv_filename,
                             brats_training_data_parent_dir,
                             db_store_rounds=5,
                             rounds_to_train=5,
                             device='cpu',
                             save_checkpoints=True,
                             restore_from_checkpoint_folder=None,
                             include_validation_with_hausdorff=True,
                             use_pretrained_model=True):
    """Simulate a FeTS Challenge federation; return per-round results and the checkpoint folder."""

    fx.init('fets_challenge_workspace')

    from sys import path, exit

    file = Path(__file__).resolve()
    root = file.parent.resolve()  # interface root, containing command modules
    work = Path.cwd().resolve()

    path.append(str(root))
    path.insert(0, str(work))

    # create gandlf_csv and get collaborator names
    gandlf_csv_path = os.path.join(work, 'gandlf_paths.csv')
    # split_csv_path = os.path.join(work, institution_split_csv_filename)
    collaborator_names = construct_fedsim_csv(brats_training_data_parent_dir,
                                              institution_split_csv_filename,
                                              0.8, gandlf_csv_path)

    aggregation_wrapper = CustomAggregationWrapper(aggregation_function)

    overrides = {
        'aggregator.settings.rounds_to_train': rounds_to_train,
        'aggregator.settings.db_store_rounds': db_store_rounds,
        'tasks.train.aggregation_type': aggregation_wrapper,
        'task_runner.settings.device': device,
    }

    # Update the plan if necessary
    plan = fx.update_plan(overrides)

    if not include_validation_with_hausdorff:
        plan.config['task_runner']['settings']['fets_config_dict'][
            'metrics'] = ['dice', 'dice_per_label']

    # Overwrite collaborator names
    plan.authorized_cols = collaborator_names
    # overwrite datapath values with the collaborator name itself
    for col in collaborator_names:
        plan.cols_data_paths[col] = col

    # get the data loaders for each collaborator
    collaborator_data_loaders = {
        col: copy(plan).get_data_loader(col)
        for col in collaborator_names
    }

    transformed_csv_dict = extract_csv_partitions(
        os.path.join(work, 'gandlf_paths.csv'))
    # get the task runner, passing the first data loader
    for col in collaborator_data_loaders:
        # serialize the train / val CSV partitions for this collaborator
        transformed_csv_dict[col]['train'].to_csv(
            os.path.join(work, 'seg_test_train.csv'))
        transformed_csv_dict[col]['val'].to_csv(
            os.path.join(work, 'seg_test_val.csv'))
        task_runner = copy(plan).get_task_runner(
            collaborator_data_loaders[col])

    if use_pretrained_model:
        print('Loading pretrained model...')
        # Map the checkpoint onto the CPU when requested; otherwise keep the
        # default device placement.
        map_location = torch.device('cpu') if device == 'cpu' else None
        checkpoint = torch.load(
            f'{root}/pretrained_model/resunet_pretrained.pth',
            map_location=map_location)
        task_runner.model.load_state_dict(checkpoint['model_state_dict'])
        task_runner.optimizer.load_state_dict(
            checkpoint['optimizer_state_dict'])

    tensor_pipe = plan.get_tensor_pipe()

    # Initialize model weights
    init_state_path = plan.config['aggregator']['settings']['init_state_path']
    tensor_dict, _ = split_tensor_dict_for_holdouts(
        logger, task_runner.get_tensor_dict(False))

    model_snap = utils.construct_model_proto(tensor_dict=tensor_dict,
                                             round_number=0,
                                             tensor_pipe=tensor_pipe)

    utils.dump_proto(model_proto=model_snap, fpath=init_state_path)

    # get the aggregator, now that we have the initial weights file set up
    logger.info('Creating aggregator...')
    aggregator = plan.get_aggregator()
    # manually override the aggregator UUID (for checkpoint resume when rounds change)
    aggregator.uuid = 'aggregator'
    aggregator._load_initial_tensors()

    # create our collaborators
    logger.info('Creating collaborators...')
    collaborators = {
        col: copy(plan).get_collaborator(col,
                                         task_runner=task_runner,
                                         client=aggregator)
        for col in collaborator_names
    }

    collaborator_time_stats = gen_collaborator_time_stats(plan.authorized_cols)

    collaborators_chosen_each_round = {}
    collaborator_times_per_round = {}

    logger.info('Starting experiment')

    total_simulated_time = 0
    best_dice = -1.0
    best_dice_over_time_auc = 0

    # results dataframe data
    experiment_results = {
        'round': [],
        'time': [],
        'convergence_score': [],
        'round_dice': [],
        'dice_label_0': [],
        'dice_label_1': [],
        'dice_label_2': [],
        'dice_label_4': [],
    }
    if include_validation_with_hausdorff:
        experiment_results.update({
            'hausdorff95_label_0': [],
            'hausdorff95_label_1': [],
            'hausdorff95_label_2': [],
            'hausdorff95_label_4': [],
        })

    if restore_from_checkpoint_folder is None:
        checkpoint_folder = setup_checkpoint_folder()
        logger.info(f'\nCreated experiment folder {checkpoint_folder}...')
        starting_round_num = 0
    else:
        if not Path(f'checkpoint/{restore_from_checkpoint_folder}').exists():
            logger.warning(
                f'Could not find provided checkpoint folder: {restore_from_checkpoint_folder}. Exiting...'
            )
            exit(1)
        else:
            logger.info(
                f'Attempting to load last completed round from {restore_from_checkpoint_folder}'
            )
            state = load_checkpoint(restore_from_checkpoint_folder)
            checkpoint_folder = restore_from_checkpoint_folder

            [
                loaded_collaborator_names, starting_round_num,
                collaborator_time_stats, total_simulated_time, best_dice,
                best_dice_over_time_auc, collaborators_chosen_each_round,
                collaborator_times_per_round, experiment_results, summary,
                agg_tensor_db
            ] = state

            if loaded_collaborator_names != collaborator_names:
                logger.error(
                    f'Collaborator names found in checkpoint ({loaded_collaborator_names}) '
                    f'do not match provided collaborators ({collaborator_names})'
                )
                exit(1)

            logger.info(f'Previous summary for round {starting_round_num}')
            logger.info(summary)

            starting_round_num += 1
            aggregator.tensor_db.tensor_db = agg_tensor_db
            aggregator.round_number = starting_round_num

    for round_num in range(starting_round_num, rounds_to_train):
        # pick collaborators to train for the round
        training_collaborators = choose_training_collaborators(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        logger.info('Collaborators chosen to train for round {}:\n\t{}'.format(
            round_num, training_collaborators))

        # save the collaborators chosen this round
        collaborators_chosen_each_round[round_num] = training_collaborators

        # get the hyper-parameters from the competitor
        hparams = training_hyper_parameters_for_round(
            collaborator_names, aggregator.tensor_db._iterate(), round_num,
            collaborators_chosen_each_round, collaborator_times_per_round)

        learning_rate, epochs_per_round, batches_per_round = hparams

        if (epochs_per_round is None) == (batches_per_round is None):
            logger.error(
                'Hyper-parameter function error: exactly one of "epochs_per_round" '
                'and "batches_per_round" must be None (specify one, not both and not neither).'
            )
            return

        hparam_message = "\n\tlearning rate: {}".format(learning_rate)

        # None gets mapped to -1 in the tensor_db
        if epochs_per_round is None:
            epochs_per_round = -1
            hparam_message += "\n\tbatches_per_round: {}".format(
                batches_per_round)
        elif batches_per_round is None:
            batches_per_round = -1
            hparam_message += "\n\tepochs_per_round: {}".format(
                epochs_per_round)

        logger.info("Hyper-parameters for round {}:{}".format(
            round_num, hparam_message))

        # cache each tensor in the aggregator tensor_db
        hparam_dict = {}
        tk = TensorKey(tensor_name='learning_rate',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(learning_rate)
        tk = TensorKey(tensor_name='epochs_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(epochs_per_round)
        tk = TensorKey(tensor_name='batches_per_round',
                       origin=aggregator.uuid,
                       round_number=round_num,
                       report=False,
                       tags=('hparam', 'model'))
        hparam_dict[tk] = np.array(batches_per_round)
        aggregator.tensor_db.cache_tensor(hparam_dict)

        # pre-compute the times for each collaborator
        times_per_collaborator = compute_times_per_collaborator(
            collaborator_names, training_collaborators, batches_per_round,
            epochs_per_round, collaborator_data_loaders,
            collaborator_time_stats, round_num)
        collaborator_times_per_round[round_num] = times_per_collaborator

        aggregator.assigner.set_training_collaborators(training_collaborators)

        # update the state in the aggregation wrapper
        aggregation_wrapper.set_state_data_for_round(
            collaborators_chosen_each_round, collaborator_times_per_round)

        # turn the times list into a list of tuples and sort it
        times_list = [(t, col) for col, t in times_per_collaborator.items()]
        times_list = sorted(times_list)

        # now call each collaborator in order of time
        # FIXME: this doesn't break up each task. We need this if we're doing straggler handling
        for t, col in times_list:
            # set the task_runner data loader
            task_runner.data_loader = collaborator_data_loaders[col]

            # run the collaborator
            collaborators[col].run_simulation()

            logger.info(
                "Collaborator {} took simulated time: {} minutes".format(
                    col, round(t / 60, 2)))

        # the round time is the max of the times_list
        round_time = max([t for t, _ in times_list])
        total_simulated_time += round_time

        # get the performance validation scores for the round
        round_dice = get_metric('valid_dice', round_num, aggregator.tensor_db)
        dice_label_0 = get_metric('valid_dice_per_label_0', round_num,
                                  aggregator.tensor_db)
        dice_label_1 = get_metric('valid_dice_per_label_1', round_num,
                                  aggregator.tensor_db)
        dice_label_2 = get_metric('valid_dice_per_label_2', round_num,
                                  aggregator.tensor_db)
        dice_label_4 = get_metric('valid_dice_per_label_4', round_num,
                                  aggregator.tensor_db)
        if include_validation_with_hausdorff:
            hausdorff95_label_0 = get_metric('valid_hd95_per_label_0',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_1 = get_metric('valid_hd95_per_label_1',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_2 = get_metric('valid_hd95_per_label_2',
                                             round_num, aggregator.tensor_db)
            hausdorff95_label_4 = get_metric('valid_hd95_per_label_4',
                                             round_num, aggregator.tensor_db)

        # update best score
        if best_dice < round_dice:
            best_dice = round_dice
            # Set the weights for the final model
            if round_num == 0:
                # here the initial model was validated (temp model does not exist)
                logger.info(
                    f'Skipping best model saving to disk as it is a random initialization.'
                )
            elif not os.path.exists(
                    f'checkpoint/{checkpoint_folder}/temp_model.pkl'):
                raise ValueError(
                    f'Expected temporary model at: checkpoint/{checkpoint_folder}/temp_model.pkl to exist but it was not found.'
                )
            else:
                # here the temp model was the one validated
                shutil.copyfile(
                    src=f'checkpoint/{checkpoint_folder}/temp_model.pkl',
                    dst=f'checkpoint/{checkpoint_folder}/best_model.pkl')
                logger.info(
                    f'Saved model with best average binary DICE: {best_dice} to ~/.local/workspace/checkpoint/{checkpoint_folder}/best_model.pkl'
                )


        ## CONVERGENCE METRIC COMPUTATION
        # update the auc score
        best_dice_over_time_auc += best_dice * round_time

        # project the auc score as remaining time * best dice
        # this projection assumes that the current best score is carried forward for the entire week
        projected_auc = (MAX_SIMULATION_TIME - total_simulated_time
                         ) * best_dice + best_dice_over_time_auc
        projected_auc /= MAX_SIMULATION_TIME

        # End of round summary
        summary = '**** END OF ROUND {} SUMMARY ****'.format(round_num)
        summary += "\n\tSimulation Time: {} minutes".format(
            round(total_simulated_time / 60, 2))
        summary += "\n\t(Projected) Convergence Score: {}".format(
            projected_auc)
        summary += "\n\tDICE Label 0: {}".format(dice_label_0)
        summary += "\n\tDICE Label 1: {}".format(dice_label_1)
        summary += "\n\tDICE Label 2: {}".format(dice_label_2)
        summary += "\n\tDICE Label 4: {}".format(dice_label_4)
        if include_validation_with_hausdorff:
            summary += "\n\tHausdorff95 Label 0: {}".format(
                hausdorff95_label_0)
            summary += "\n\tHausdorff95 Label 1: {}".format(
                hausdorff95_label_1)
            summary += "\n\tHausdorff95 Label 2: {}".format(
                hausdorff95_label_2)
            summary += "\n\tHausdorff95 Label 4: {}".format(
                hausdorff95_label_4)

        experiment_results['round'].append(round_num)
        experiment_results['time'].append(total_simulated_time)
        experiment_results['convergence_score'].append(projected_auc)
        experiment_results['round_dice'].append(round_dice)
        experiment_results['dice_label_0'].append(dice_label_0)
        experiment_results['dice_label_1'].append(dice_label_1)
        experiment_results['dice_label_2'].append(dice_label_2)
        experiment_results['dice_label_4'].append(dice_label_4)
        if include_validation_with_hausdorff:
            experiment_results['hausdorff95_label_0'].append(
                hausdorff95_label_0)
            experiment_results['hausdorff95_label_1'].append(
                hausdorff95_label_1)
            experiment_results['hausdorff95_label_2'].append(
                hausdorff95_label_2)
            experiment_results['hausdorff95_label_4'].append(
                hausdorff95_label_4)
        logger.info(summary)

        if save_checkpoints:
            logger.info(f'Saving checkpoint for round {round_num}')
            logger.info(
                f'To resume from this checkpoint, set the restore_from_checkpoint_folder parameter to \'{checkpoint_folder}\''
            )
            save_checkpoint(checkpoint_folder, aggregator, collaborator_names,
                            collaborators, round_num, collaborator_time_stats,
                            total_simulated_time, best_dice,
                            best_dice_over_time_auc,
                            collaborators_chosen_each_round,
                            collaborator_times_per_round, experiment_results,
                            summary)

        # if the total_simulated_time has exceeded the maximum time, we break
        # in practice, this means that the previous round's model is the last model scored,
        # so a long final round should not actually benefit the competitor, since that final
        # model is never globally validated
        if total_simulated_time > MAX_SIMULATION_TIME:
            logger.info("Simulation time exceeded. Ending Experiment")
            break

        # save the most recent aggregated model in native format to be copied over as best when appropriate
        # (note this model has not been validated by the collaborators yet)
        task_runner.rebuild_model(round_num,
                                  aggregator.last_tensor_dict,
                                  validation=True)
        task_runner.save_native(
            f'checkpoint/{checkpoint_folder}/temp_model.pkl')

    return pd.DataFrame.from_dict(experiment_results), checkpoint_folder
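
# Illustrative invocation of the simulation above. The aggregation and
# hyper-parameter policies are placeholders a competitor would supply; only
# the selection policy is spelled out here, matching the call signature used
# inside run_challenge_experiment.
def all_collaborators_train(collaborators, db_iterator, fl_round,
                            collaborators_chosen_each_round,
                            collaborator_times_per_round):
    """Trivial selection policy: train every collaborator every round."""
    return collaborators


results_df, checkpoint_dir = run_challenge_experiment(
    aggregation_function=weighted_average_aggregation,              # placeholder
    choose_training_collaborators=all_collaborators_train,
    training_hyper_parameters_for_round=constant_hyper_parameters,  # placeholder
    institution_split_csv_filename='small_split.csv',
    brats_training_data_parent_dir='/data/MICCAI_FeTS2022_TrainingData',
    rounds_to_train=5,
    device='cpu',
)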
Example #5
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss=ke.losses.categorical_crossentropy,
                  optimizer=ke.optimizers.Adam(),
                  metrics=['accuracy'])

    # initialize the optimizer variables (TF1-style; `sess` is presumably the
    # tf.Session created earlier in the original script)
    opt_vars = model.optimizer.variables()

    for v in opt_vars:
        v.initializer.run(session=sess)

    return model


fx.init('keras_cnn_mnist')

if __name__ == '__main__':
    from openfl.federated import FederatedModel, FederatedDataSet
    from tensorflow.python.keras.utils.data_utils import get_file

    origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
    path = get_file(
        'mnist.npz',
        origin=origin_folder + 'mnist.npz',
        file_hash='731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1')

    with np.load(path) as f:
        # get all of mnist
        X_train = f['x_train']
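        # Continuation sketch (an assumption; the original snippet is truncated
        # here). The remaining arrays in mnist.npz follow the standard layout.
        y_train = f['y_train']
        X_valid = f['x_test']
        y_valid = f['y_test']
    # The arrays would then typically be normalized, one-hot encoded and
    # wrapped in a FederatedDataSet before calling fx.run_experiment.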