def run_fedavg_round(aggregator: 'BaseAggregatorParticipant',
                     participants: List['BaseTrainingParticipant'],
                     training_args: TrainArgs,
                     client_fraction=1.0):
    """
    Routine to run a training round with the given clients based on the server model
    and then aggregate the results.
    :param aggregator: aggregator participant that will aggregate the resulting training models
    :param participants: training participants in this round
    :param training_args: training arguments for this round
    :param client_fraction: fraction of clients to train with
    :return:
    """
    logger.debug('distributing the initial model to the clients.')
    initial_model_state = aggregator.model.state_dict()
    success_threshold = (max(int(len(participants) * client_fraction), 1)
                         if client_fraction < 1.0 else -1)
    participant_fraction = sample_randomly_by_fraction(participants, client_fraction)
    logger.debug(f'starting training round with '
                 f'{len(participant_fraction)}/{len(participants)} participants.')
    trained_participants = run_fedavg_train_round(initial_model_state,
                                                  participant_fraction,
                                                  training_args,
                                                  success_threshold=success_threshold)
    logger.debug('starting aggregation.')
    num_train_samples = [p.num_train_samples for p in trained_participants]
    aggregator.aggregate(trained_participants, num_train_samples=num_train_samples)
    logger.debug('distributing the aggregated global model to the clients.')
    resulting_model_state = aggregator.model.state_dict()
    overwrite_participants_models(resulting_model_state, participants)
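# A minimal reference sketch of the sample-weighted averaging that FedAvg
# performs; in this repo the actual update lives inside
# `aggregator.aggregate(...)`, so the standalone helper below (hypothetical
# name) only illustrates theta_global = sum_k (n_k / n_total) * theta_k.
def fedavg_aggregate_sketch(model_states: List[Dict[str, Tensor]],
                            num_train_samples: List[int]) -> Dict[str, Tensor]:
    """Average client state dicts, weighted by their train-sample counts."""
    total = float(sum(num_train_samples))
    return {
        key: sum((n / total) * state[key].float()
                 for state, n in zip(model_states, num_train_samples))
        for key in model_states[0].keys()
    }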
def reptile_train_step(aggregator: ReptileServer,
                       participants: List[ReptileClient],
                       inner_training_args: TrainArgs,
                       meta_training_args: TrainArgs = None,
                       evaluation_mode: bool = False,
                       *args, **kwargs):
    """
    Routine to run a Reptile training step
    :param aggregator: aggregator participant that will aggregate the resulting training models
    :param participants: training participants in this round
    :param inner_training_args: training arguments for participant models
    :param meta_training_args: training arguments for the meta model
    :param evaluation_mode: whether this is an evaluation step
    :return:
    """
    logger.debug('distributing the initial model to the clients.')
    initial_model_state = copy.deepcopy(aggregator.model.state_dict())
    overwrite_participants_models(initial_model_state, participants)
    logger.debug('starting training round.')
    run_train_round(participants, inner_training_args)
    # Aggregate only when not in evaluation mode
    if not evaluation_mode:
        assert meta_training_args is not None, ('Argument meta_training_args '
                                                'must not be None when not in evaluation_mode')
        logger.debug(f"starting aggregation: num_participants={len(participants)}, "
                     f"meta_learning_rate={meta_training_args.kwargs['meta_learning_rate']}")
        aggregator.aggregate(
            participants=participants,
            meta_learning_rate=meta_training_args.kwargs['meta_learning_rate'])
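# A minimal reference sketch of the Reptile meta-update that
# `aggregator.aggregate(participants=..., meta_learning_rate=...)` is expected
# to apply: the meta model moves part of the way towards the average of the
# locally trained models, theta <- theta + eps * (mean_k(phi_k) - theta).
# The helper below is hypothetical; the real update lives in ReptileServer.
def reptile_meta_update_sketch(meta_state: Dict[str, Tensor],
                               client_states: List[Dict[str, Tensor]],
                               meta_learning_rate: float) -> Dict[str, Tensor]:
    """Move the meta parameters towards the mean of the client parameters."""
    num_clients = float(len(client_states))
    return {
        key: meta_state[key] + meta_learning_rate
        * (sum(state[key].float() for state in client_states) / num_clients
           - meta_state[key])
        for key in meta_state.keys()
    }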
def run_fedavg_train_round(initial_model_state: Dict[str, Tensor],
                           participants: List['BaseTrainingParticipant'],
                           training_args: TrainArgs,
                           success_threshold=-1) -> List['BaseTrainingParticipant']:
    """
    Routine to run a single round of training on the clients and return the results.
    Additional args are passed to the clients' training routines.
    :param initial_model_state: model state to communicate before training
    :param participants: participants to train in this round
    :param training_args: arguments passed for training
    :param success_threshold: minimum number of clients that have to participate
        successfully in the round (-1 disables the check)
    :return:
    """
    overwrite_participants_models(initial_model_state, participants)
    successful_participants = []
    for participant in participants:
        try:
            logger.debug(f'invoking training on participant {participant._name}')
            participant.train(training_args)
            successful_participants.append(participant)
            if success_threshold != -1 and success_threshold <= len(successful_participants):
                break
        except GradientExplodingError as gradient_exception:
            logger.error(f'participant {participant._name} failed due to exploding gradients',
                         exc_info=gradient_exception)
        except Exception as e:
            logger.error(f'training on participant {participant._name} failed',
                         exc_info=e)
    if success_threshold != -1 and len(successful_participants) < success_threshold:
        raise ExecutionError(
            'Failed to execute training round: not enough clients participated successfully')
    return successful_participants
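# Worked example of the success-threshold semantics above: with ten clients
# and client_fraction=0.4, run_fedavg_round computes
# success_threshold = max(int(10 * 0.4), 1) = 4, so this routine stops early
# once 4 of the sampled clients trained successfully and raises ExecutionError
# if fewer than 4 succeed; with success_threshold=-1 all participants are
# trained and individual failures are only logged.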
def run_hierarchical_clustering(local_evaluation_steps, seed, lr, name,
                                total_fedavg_rounds, cluster_initialization_rounds,
                                client_fraction, local_epochs, batch_size,
                                num_clients, sample_threshold, num_label_limit,
                                train_args, dataset, partitioner_class,
                                linkage_mech, criterion, dis_metric,
                                max_value_criterion, reallocate_clients,
                                threshold_min_client_cluster, use_colored_images,
                                use_pattern, train_cluster_args=None,
                                mean=None, std=None):
    fix_random_seeds(seed)
    global_tag = 'global_performance'
    global_tag_local = 'global_performance_personalized'
    initialize_clients_fn = DEFAULT_CLIENT_INIT_FN

    if dataset == 'ham10k':
        fed_dataset = load_ham10k_federated(partitions=num_clients,
                                            batch_size=batch_size,
                                            mean=mean, std=std)
        initialize_clients_fn = initialize_ham10k_clients
    else:
        raise ValueError(f'dataset "{dataset}" unknown')

    if not hasattr(max_value_criterion, '__iter__'):
        max_value_criterion = [max_value_criterion]
    if not hasattr(lr, '__iter__'):
        lr = [lr]

    for cf in client_fraction:
        for lr_i in lr:
            optimizer_args = OptimizerArgs(optim.SGD, lr=lr_i)
            model_args = ModelArgs(MobileNetV2Lightning,
                                   optimizer_args=optimizer_args,
                                   num_classes=7)
            fedavg_context = FedAvgExperimentContext(name=name,
                                                     client_fraction=cf,
                                                     local_epochs=local_epochs,
                                                     lr=lr_i,
                                                     batch_size=batch_size,
                                                     optimizer_args=optimizer_args,
                                                     model_args=model_args,
                                                     train_args=train_args,
                                                     dataset_name=dataset)
            experiment_specification = f'{fedavg_context}'
            experiment_logger = create_tensorboard_logger(fedavg_context.name,
                                                          experiment_specification)
            fedavg_context.experiment_logger = experiment_logger

            for init_rounds, max_value in generate_configuration(
                    cluster_initialization_rounds, max_value_criterion):
                # load the model state
                round_model_state = load_fedavg_state(fedavg_context, init_rounds)
                server = FedAvgServer('initial_server',
                                      fedavg_context.model_args,
                                      fedavg_context)
                server.overwrite_model_state(round_model_state)
                logger.info('initializing clients ...')
                clients = initialize_clients_fn(fedavg_context, fed_dataset,
                                                server.model.state_dict())
                overwrite_participants_models(round_model_state, clients)
                # initialize the cluster configuration
                round_configuration = {
                    'num_rounds_init': init_rounds,
                    'num_rounds_cluster': total_fedavg_rounds - init_rounds
                }
                if partitioner_class == DatadependentPartitioner:
                    clustering_dataset = load_femnist_colored_dataset(
                        str((REPO_ROOT / 'data').absolute()),
                        num_clients=num_clients,
                        batch_size=batch_size,
                        sample_threshold=sample_threshold)
                    dataloader = load_n_of_each_class(
                        clustering_dataset,
                        n=5,
                        tabu=list(fed_dataset.train_data_local_dict.keys()))
                    cluster_args = ClusterArgs(
                        partitioner_class,
                        linkage_mech=linkage_mech,
                        criterion=criterion,
                        dis_metric=dis_metric,
                        max_value_criterion=max_value,
                        plot_dendrogram=False,
                        reallocate_clients=reallocate_clients,
                        threshold_min_client_cluster=threshold_min_client_cluster,
                        dataloader=dataloader,
                        **round_configuration)
                else:
                    cluster_args = ClusterArgs(
                        partitioner_class,
                        linkage_mech=linkage_mech,
                        criterion=criterion,
                        dis_metric=dis_metric,
                        max_value_criterion=max_value,
                        plot_dendrogram=False,
                        reallocate_clients=reallocate_clients,
                        threshold_min_client_cluster=threshold_min_client_cluster,
                        **round_configuration)
                # create a new logger for the cluster experiment
                experiment_specification = f'{fedavg_context}_{cluster_args}'
                experiment_logger = create_tensorboard_logger(fedavg_context.name,
                                                              experiment_specification)
                fedavg_context.experiment_logger = experiment_logger

                initial_train_fn = partial(run_fedavg_train_round,
                                           round_model_state,
                                           training_args=train_cluster_args)
                create_aggregator_fn = partial(FedAvgServer,
                                               model_args=model_args,
                                               context=fedavg_context)
                federated_round_fn = partial(run_fedavg_round,
                                             training_args=train_args,
                                             client_fraction=cf)
                after_post_clustering_evaluation = [
                    partial(log_after_round_evaluation, experiment_logger,
                            'post_clustering')
                ]
                after_clustering_round_evaluation = [
                    partial(log_after_round_evaluation, experiment_logger)
                ]
                after_federated_round_evaluation = [
                    partial(log_after_round_evaluation, experiment_logger,
                            ['final hierarchical', global_tag])
                ]
                after_clustering_fn = [
                    partial(log_cluster_distribution, experiment_logger,
                            num_classes=fed_dataset.class_num),
                    partial(log_sample_images_from_each_client, experiment_logger)
                ]
                after_federated_round_fn = [
                    partial(log_personalized_global_cluster_performance,
                            experiment_logger,
                            ['final hierarchical personalized', global_tag_local],
                            local_evaluation_steps)
                ]
                run_fedavg_hierarchical(
                    server, clients, cluster_args, initial_train_fn,
                    federated_round_fn, create_aggregator_fn,
                    after_post_clustering_evaluation,
                    after_clustering_round_evaluation,
                    after_federated_round_evaluation,
                    after_clustering_fn,
                    after_federated_round=after_federated_round_fn)
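# Usage note: the partials above pre-bind the experiment configuration, so
# run_fedavg_hierarchical can later invoke them with only the varying
# arguments, e.g. `initial_train_fn(clients)` (binding `round_model_state`
# positionally and `training_args` by keyword) or
# `create_aggregator_fn('cluster_server<id>')` to spawn per-cluster servers.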
def clustering_test(mean, std, seed, lr, local_epochs, client_fraction,
                    optimizer_args, total_fedavg_rounds, batch_size, num_clients,
                    model_args, train_args, train_cluster_args,
                    initialization_rounds, partitioner_class, linkage_mech,
                    criterion, dis_metric, max_value_criterion):
    fix_random_seeds(seed)
    fed_dataset = load_ham10k_federated(partitions=num_clients,
                                        batch_size=batch_size,
                                        mean=mean, std=std)
    initialize_clients_fn = initialize_ham10k_clients
    fedavg_context = FedAvgExperimentContext(name='ham10k_clustering',
                                             client_fraction=client_fraction,
                                             local_epochs=local_epochs,
                                             lr=lr,
                                             batch_size=batch_size,
                                             optimizer_args=optimizer_args,
                                             model_args=model_args,
                                             train_args=train_args,
                                             dataset_name='ham10k')
    experiment_specification = f'{fedavg_context}'
    experiment_logger = create_tensorboard_logger(fedavg_context.name,
                                                  experiment_specification)
    log_dataset_distribution(experiment_logger, 'full dataset', fed_dataset)

    server, clients = run_fedavg(context=fedavg_context,
                                 num_rounds=total_fedavg_rounds,
                                 dataset=fed_dataset,
                                 save_states=True,
                                 restore_state=True,
                                 evaluate_rounds=False,
                                 initialize_clients_fn=initialize_clients_fn)

    for init_rounds in initialization_rounds:
        # load the model state
        round_model_state = load_fedavg_state(fedavg_context, init_rounds)
        overwrite_participants_models(round_model_state, clients)
        run_fedavg_train_round(round_model_state,
                               training_args=train_cluster_args,
                               participants=clients)
        for max_value in max_value_criterion:
            # initialize the cluster configuration
            round_configuration = {
                'num_rounds_init': init_rounds,
                'num_rounds_cluster': total_fedavg_rounds - init_rounds
            }
            cluster_args = ClusterArgs(partitioner_class,
                                       linkage_mech=linkage_mech,
                                       criterion=criterion,
                                       dis_metric=dis_metric,
                                       max_value_criterion=max_value,
                                       plot_dendrogram=False,
                                       reallocate_clients=False,
                                       threshold_min_client_cluster=-1,
                                       **round_configuration)
            experiment_logger = create_tensorboard_logger(
                fedavg_context.name, f'{experiment_specification}{cluster_args}')
            partitioner = cluster_args()
            cluster_clients_dic = partitioner.cluster(clients, server)
            log_cluster_distribution(experiment_logger, cluster_clients_dic, 7)
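# A minimal sketch of the agglomerative clustering step the partitioner built
# from ClusterArgs is expected to perform on flattened client model updates;
# linkage_mech, dis_metric, criterion and max_value_criterion map onto the
# corresponding scipy arguments. The helper below is an illustrative
# assumption, not the repo's partitioner implementation.
def cluster_updates_sketch(client_updates, linkage_mech='ward',
                           dis_metric='euclidean', criterion='distance',
                           max_value_criterion=10.0):
    """Group clients whose model updates are close in parameter space."""
    import numpy as np
    from scipy.cluster.hierarchy import fcluster, linkage
    update_matrix = np.stack([u.ravel() for u in client_updates])
    dendrogram = linkage(update_matrix, method=linkage_mech, metric=dis_metric)
    # cut the dendrogram, e.g. at the given distance threshold
    return fcluster(dendrogram, t=max_value_criterion, criterion=criterion)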
def run_hierarchical_clustering_reptile(
        seed, name, dataset, num_clients, batch_size, num_label_limit,
        use_colored_images, sample_threshold, hc_lr,
        hc_cluster_initialization_rounds, hc_client_fraction, hc_local_epochs,
        hc_train_args, hc_partitioner_class, hc_linkage_mech, hc_criterion,
        hc_dis_metric,
        hc_max_value_criterion,  # distance threshold
        hc_reallocate_clients,
        hc_threshold_min_client_cluster,  # only with hc_reallocate_clients=True;
        # results in clusters having at least this number of clients
        hc_train_cluster_args,
        rp_sgd,  # True -> use SGD as inner optimizer; False -> use Adam
        rp_adam_betas,  # used only if rp_sgd is False
        rp_meta_batch_size, rp_num_meta_steps, rp_meta_learning_rate_initial,
        rp_meta_learning_rate_final, rp_eval_interval, rp_inner_learning_rate,
        rp_num_inner_steps, rp_num_inner_steps_eval):
    fix_random_seeds(seed)
    global_tag = 'global_performance'

    if dataset == 'femnist':
        if use_colored_images:
            fed_dataset = load_femnist_colored_dataset(
                data_dir=str((REPO_ROOT / 'data').absolute()),
                num_clients=num_clients,
                batch_size=batch_size,
                sample_threshold=sample_threshold)
        else:
            fed_dataset = load_femnist_dataset(
                data_dir=str((REPO_ROOT / 'data').absolute()),
                num_clients=num_clients,
                batch_size=batch_size,
                sample_threshold=sample_threshold)
        if num_label_limit != -1:
            fed_dataset = scratch_labels(fed_dataset, num_label_limit)
    else:
        raise ValueError(f'dataset "{dataset}" unknown')

    if not hasattr(hc_max_value_criterion, '__iter__'):
        hc_max_value_criterion = [hc_max_value_criterion]
    if not hasattr(hc_lr, '__iter__'):
        hc_lr = [hc_lr]
    input_channels = 3 if use_colored_images else 1
    data_distribution_logged = False

    for cf in hc_client_fraction:
        for lr_i in hc_lr:
            # Initialize experiment context parameters
            fedavg_optimizer_args = OptimizerArgs(optim.SGD, lr=lr_i)
            fedavg_model_args = ModelArgs(CNNLightning,
                                          optimizer_args=fedavg_optimizer_args,
                                          input_channels=input_channels,
                                          only_digits=False)
            fedavg_context = FedAvgExperimentContext(
                name=name,
                client_fraction=cf,
                local_epochs=hc_local_epochs,
                lr=lr_i,
                batch_size=batch_size,
                optimizer_args=fedavg_optimizer_args,
                model_args=fedavg_model_args,
                train_args=hc_train_args,
                dataset_name=dataset)
            reptile_context = ReptileExperimentContext(
                name=name,
                dataset_name=dataset,
                swap_labels=False,
                num_classes_per_client=0,
                num_shots_per_class=0,
                seed=seed,
                model_class=CNNLightning,
                sgd=rp_sgd,
                adam_betas=rp_adam_betas,
                num_clients_train=num_clients,
                num_clients_test=0,
                meta_batch_size=rp_meta_batch_size,
                num_meta_steps=rp_num_meta_steps,
                meta_learning_rate_initial=rp_meta_learning_rate_initial,
                meta_learning_rate_final=rp_meta_learning_rate_final,
                eval_interval=rp_eval_interval,
                num_eval_clients_training=-1,
                do_final_evaluation=True,
                num_eval_clients_final=-1,
                inner_batch_size=batch_size,
                inner_learning_rate=rp_inner_learning_rate,
                num_inner_steps=rp_num_inner_steps,
                num_inner_steps_eval=rp_num_inner_steps_eval)
            experiment_specification = f'{fedavg_context}'
            experiment_logger = create_tensorboard_logger(name,
                                                          experiment_specification)
            if not data_distribution_logged:
                log_dataset_distribution(experiment_logger, 'full dataset',
                                         fed_dataset)
                data_distribution_logged = True

            log_after_round_evaluation_fns = [
                partial(log_after_round_evaluation, experiment_logger, 'fedavg'),
                partial(log_after_round_evaluation, experiment_logger, global_tag)
            ]
            server, clients = run_fedavg(
                context=fedavg_context,
                num_rounds=max(hc_cluster_initialization_rounds),
                dataset=fed_dataset,
                save_states=True,
                restore_state=True,
                after_round_evaluation=log_after_round_evaluation_fns)

            for init_rounds, max_value in generate_configuration(
                    hc_cluster_initialization_rounds, hc_max_value_criterion):
                # load the model state
                round_model_state = load_fedavg_state(fedavg_context, init_rounds)
                overwrite_participants_models(round_model_state, clients)
                # initialize the cluster configuration
                round_configuration = {
                    'num_rounds_init': init_rounds,
                    'num_rounds_cluster': 0
                }
                cluster_args = ClusterArgs(
                    hc_partitioner_class,
                    linkage_mech=hc_linkage_mech,
                    criterion=hc_criterion,
                    dis_metric=hc_dis_metric,
                    max_value_criterion=max_value,
                    plot_dendrogram=False,
                    reallocate_clients=hc_reallocate_clients,
                    threshold_min_client_cluster=hc_threshold_min_client_cluster,
                    **round_configuration)
                # create a new logger for the cluster experiment
                experiment_specification = f'{fedavg_context}_{cluster_args}_{reptile_context}'
                experiment_logger = create_tensorboard_logger(name,
                                                              experiment_specification)
                fedavg_context.experiment_logger = experiment_logger

                initial_train_fn = partial(run_fedavg_train_round,
                                           round_model_state,
                                           training_args=hc_train_cluster_args)
                create_aggregator_fn = partial(FedAvgServer,
                                               model_args=fedavg_model_args,
                                               context=fedavg_context)

                # HIERARCHICAL CLUSTERING
                logger.debug('starting local training before clustering.')
                trained_participants = initial_train_fn(clients)
                if len(trained_participants) != len(clients):
                    raise ValueError(
                        'not all clients successfully participated in the clustering round')

                # Clustering of participants by model updates
                partitioner = cluster_args()
                cluster_clients_dic = partitioner.cluster(clients, server)
                _cluster_clients_dic = dict()
                for cluster_id, participants in cluster_clients_dic.items():
                    _cluster_clients_dic[cluster_id] = [c._name for c in participants]
                log_cluster_distribution(experiment_logger, cluster_clients_dic, 62)

                # Initialize cluster models
                cluster_server_dic = {}
                for cluster_id, participants in cluster_clients_dic.items():
                    intermediate_cluster_server = create_aggregator_fn(
                        'cluster_server' + cluster_id)
                    intermediate_cluster_server.aggregate(participants)
                    cluster_server = ReptileServer(
                        participant_name=f'cluster_server{cluster_id}',
                        model_args=reptile_context.meta_model_args,
                        context=reptile_context,
                        initial_model_state=intermediate_cluster_server.model.state_dict())
                    cluster_server_dic[cluster_id] = cluster_server

                # REPTILE TRAINING INSIDE CLUSTERS
                after_round_evaluation = [log_after_round_evaluation]
                RANDOM = random.Random(seed)

                # Perform training
                for i in range(reptile_context.num_meta_steps):
                    for cluster_id, participants in cluster_clients_dic.items():
                        if reptile_context.meta_batch_size == -1:
                            meta_batch = participants
                        else:
                            meta_batch = [
                                participants[k] for k in cyclerange(
                                    start=i * reptile_context.meta_batch_size % len(participants),
                                    interval=reptile_context.meta_batch_size,
                                    total_len=len(participants))
                            ]
                        # Meta training step
                        reptile_train_step(
                            aggregator=cluster_server_dic[cluster_id],
                            participants=meta_batch,
                            inner_training_args=reptile_context.get_inner_training_args(),
                            meta_training_args=reptile_context.get_meta_training_args(
                                frac_done=i / reptile_context.num_meta_steps))

                    # Evaluation on train and test clients
                    if i % reptile_context.eval_interval == 0:
                        global_step = init_rounds + i
                        global_loss, global_acc = [], []
                        for cluster_id, participants in cluster_clients_dic.items():
                            # Test on all clients inside the cluster
                            reptile_train_step(
                                aggregator=cluster_server_dic[cluster_id],
                                participants=participants,
                                inner_training_args=reptile_context.get_inner_training_args(eval=True),
                                evaluation_mode=True)
                            result = evaluate_local_models(participants=participants)
                            loss = result.get('test/loss')
                            acc = result.get('test/acc')
                            # Log per-cluster metrics
                            if after_round_evaluation is not None:
                                for c in after_round_evaluation:
                                    c(experiment_logger, f'cluster_{cluster_id}',
                                      loss, acc, global_step)
                            loss_list = loss.tolist()
                            acc_list = acc.tolist()
                            global_loss.extend(
                                loss_list if isinstance(loss_list, list) else [loss_list])
                            global_acc.extend(
                                acc_list if isinstance(acc_list, list) else [acc_list])
                        if after_round_evaluation is not None:
                            for c in after_round_evaluation:
                                c(experiment_logger, 'mean_over_all_clients',
                                  Tensor(global_loss), Tensor(global_acc), global_step)
                    logger.info(f'finished Reptile training round {i}')

                # Final evaluation at the end of training
                if reptile_context.do_final_evaluation:
                    global_loss, global_acc = [], []
                    for cluster_id, participants in cluster_clients_dic.items():
                        # Final evaluation on all clients inside the cluster
                        reptile_train_step(
                            aggregator=cluster_server_dic[cluster_id],
                            participants=participants,
                            inner_training_args=reptile_context.get_inner_training_args(eval=True),
                            evaluation_mode=True)
                        result = evaluate_local_models(participants=participants)
                        loss = result.get('test/loss')
                        acc = result.get('test/acc')
                        logger.info(f'cluster {cluster_id} ({len(participants)} participants): '
                                    f'loss = {loss}, acc = {acc}')
                        loss_list = loss.tolist()
                        acc_list = acc.tolist()
                        global_loss.extend(
                            loss_list if isinstance(loss_list, list) else [loss_list])
                        global_acc.extend(
                            acc_list if isinstance(acc_list, list) else [acc_list])
                        # Log per-cluster metrics
                        if after_round_evaluation is not None:
                            for c in after_round_evaluation:
                                c(experiment_logger, f'cluster_{cluster_id}',
                                  loss, acc, reptile_context.num_meta_steps)
                    log_loss_and_acc('overall_mean', Tensor(global_loss),
                                     Tensor(global_acc), experiment_logger, 0)
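# The meta-batch selection above relies on `cyclerange`; a minimal sketch of
# its assumed wrap-around semantics (a window of `interval` indices starting
# at `start`, modulo `total_len`), so consecutive meta steps cycle through all
# clients of a cluster:
def cyclerange_sketch(start: int, interval: int, total_len: int) -> List[int]:
    """Indices of a window of size `interval` starting at `start`, wrapping at `total_len`."""
    return [(start + offset) % total_len for offset in range(interval)]

# `get_meta_training_args(frac_done=...)` presumably interpolates the meta
# learning rate linearly between its initial and final value over the course
# of training, e.g. meta_lr = initial + frac_done * (final - initial).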