class FEDAVGCombiner(CombinerClient):
    """ A Local SGD / Federated Averaging (FedAvg) combiner. """

    def __init__(self, address, port, id, role, storage):
        super().__init__(address, port, id, role)

        self.storage = storage
        self.id = id
        self.model_id = None

        # TODO refactor since we are now getting config on RUN cmd.
        self.db = connect_to_mongodb()
        self.coll = self.db['orchestrators']

        self.config = {}
        # TODO: Use MongoDB
        self.validations = {}

        # TODO: make choice of helper configurable
        self.helper = KerasSequentialHelper()
        # Queue for model updates to be processed.
        self.model_updates = queue.Queue()

    def get_model_id(self):
        return self.model_id

    def report_status(self, msg, log_level=alliance.Status.INFO, type=None, request=None, flush=True):
        print("COMBINER({}):{} {}".format(self.id, log_level, msg), flush=flush)

    def receive_model_candidate(self, model_id):
        """ Callback when a new model version is reported by a client.
            We simply put the model_id on a queue to be processed later. """
        try:
            self.report_status("COMBINER: callback received model {}".format(model_id),
                               log_level=alliance.Status.INFO)
            # TODO - here would be a place to do some additional validation of the model contribution.
            self.model_updates.put(model_id)
        except Exception as e:
            self.report_status("COMBINER: Failed to receive candidate model! {}".format(e),
                               log_level=alliance.Status.WARNING)
            print("Failed to receive candidate model!")

    def receive_validation(self, validation):
        """ Callback for a validation request. """
        # TODO: Track this in a DB
        model_id = validation.model_id
        data = json.loads(validation.data)
        try:
            self.validations[model_id].append(data)
        except KeyError:
            self.validations[model_id] = [data]

        self.report_status("COMBINER: callback processed validation {}".format(validation.model_id),
                           log_level=alliance.Status.INFO)

    def combine_models(self, nr_expected_models=None, timeout=120):
        """ Compute an iterative/running average of models arriving to the combiner. """

        round_time = 0.0
        print("COMBINER: combining model updates...")

        # First model in the update round
        try:
            model_id = self.model_updates.get(timeout=timeout)
            print("combining ", model_id)
            # Fetch the model data blob from storage
            model_str = self.get_model(model_id)
            model = self.helper.load_model(model_str.getbuffer())
            nr_processed_models = 1
            self.model_updates.task_done()
        except queue.Empty:
            self.report_status("COMBINER: training round timed out.",
                               log_level=alliance.Status.WARNING)
            return None

        while nr_processed_models < nr_expected_models:
            try:
                model_id = self.model_updates.get(block=False)
                self.report_status("Received model update with id {}".format(model_id))

                model_next = self.helper.load_model(self.get_model(model_id).getbuffer())
                self.helper.increment_average(model, model_next, nr_processed_models)

                nr_processed_models += 1
                self.model_updates.task_done()
            except Exception as e:
                self.report_status("COMBINER failcode: {}".format(e))
                time.sleep(1.0)
                round_time += 1.0

            if round_time >= timeout:
                self.report_status("COMBINER: training round timed out.",
                                   log_level=alliance.Status.WARNING)
                print("COMBINER: Round timed out.")
                return None

        self.report_status("ORCHESTRATOR: Training round completed, combined {} models.".format(nr_processed_models),
                           log_level=alliance.Status.INFO)
        print("DONE, combined {} models".format(nr_processed_models))
        return model

    def __assign_clients(self, n):
        """ Obtain a list of clients to talk to in a round. """
        # TODO: If we want global sampling without replacement the server needs to assign clients

        active_trainers = self.get_active_trainers()
        # If the number of requested trainers exceeds the number available, use all available.
        if n > len(active_trainers):
            n = len(active_trainers)

        import random
        self.trainers = random.sample(active_trainers, n)
        # TODO: In the general case, validators could be other clients as well
        self.validators = self.trainers

    def __training_round(self):

        # We flush the queue at the beginning of a round (no stragglers allowed).
        # TODO: Support other ways to handle stragglers.
        with self.model_updates.mutex:
            self.model_updates.queue.clear()

        self.report_status("COMBINER: Initiating training round, participating members: {}".format(self.trainers))
        self.request_model_update(self.model_id, clients=self.trainers)

        # Apply combiner
        model = self.combine_models(nr_expected_models=len(self.trainers),
                                    timeout=self.config['round_timeout'])
        return model

    def __validation_round(self):
        self.request_model_validation(self.model_id, from_clients=self.validators)

    def run(self, config):
        """ Coordinate training and validation tasks with clients,
            as specified in the config (CombinerConfiguration). """

        self.config = config
        self.model_id = self.config['model_id']

        print("COMBINER starting from model {}".format(self.model_id))

        # Fetch the input model blob from storage and load in local memory
        timeout_retry = 3
        import time
        tries = 0
        while True:
            try:
                model = self.storage.get_model_stream(self.model_id)
                if model:
                    break
            except Exception:
                print("COMBINER could not fetch model from bucket. retrying in {}".format(timeout_retry),
                      flush=True)
                time.sleep(timeout_retry)
                tries += 1
                if tries > 2:
                    print("COMBINER exiting. could not fetch seed model.")
                    return

        self.set_model(model, self.model_id)

        # Check that the minimal number of required clients to start a round are connected
        ready = False
        while not ready:
            active = self.nr_active_trainers()
            if active >= config['clients_required']:
                ready = True
            else:
                print("waiting for {} clients to get started, currently: {}".format(
                    config['clients_required'] - active, active), flush=True)
                time.sleep(1)

        # Execute the configured number of rounds
        for r in range(1, config['rounds'] + 1):
            print("STARTING ROUND {}".format(r), flush=True)
            print("\t FEDAVG: Starting training round {}".format(r), flush=True)

            self.__assign_clients(self.config['clients_requested'])
            model = self.__training_round()

            if model:
                print("\t FEDAVG: Round completed.", flush=True)

                # TODO: Use configuration to decide if we use a scratchspace to checkpoint the model.
                fd, outfile_name = tempfile.mkstemp(suffix='.h5')
                os.close(fd)  # mkstemp returns an open descriptor; only the path is needed here.
                model.save(outfile_name)

                # Upload the new model to the storage repository (persistent)
                # and save to local storage for sharing with clients.
                # TODO: Refactor - checkpointing in the configured combiner-private storage
                # should probably be handled by self.set_model.
                model_id = self.storage.set_model(outfile_name, is_file=True)

                from io import BytesIO
                a = BytesIO()
                with open(outfile_name, 'rb') as f:
                    a.write(f.read())

                # Stream aggregated model to server
                # TODO: Not strictly necessary to stream the model here, can be a slight waste of resources.
                self.set_model(a, model_id)
                os.unlink(outfile_name)

                self.model_id = model_id
                print("...done. New aggregated model: {}".format(self.model_id))

                print("\t Starting validation round {}".format(r))
                self.__validation_round()

                print("------------------------------------------")
                print("FEDAVG: ROUND COMPLETED.", flush=True)
                print("\n")
            else:
                print("\t Failed to update global model in round {0}!".format(r))
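# ----------------------------------------------------------------------------
# Illustration (not part of the original source). combine_models() above
# delegates the actual weight averaging to KerasSequentialHelper.increment_average,
# whose implementation is not shown in this listing. The sketch below shows the
# running (incremental) average that call is assumed to perform, on plain numpy
# weight lists; the function name and signature are made up for illustration.

import numpy as np


def running_average_sketch(avg_weights, next_weights, nr_models):
    """Fold one more model into a running average of layer weights.

    avg_weights  -- list of numpy arrays: the average of the models seen so far
    next_weights -- list of numpy arrays: the newly arrived model update
    nr_models    -- number of models already folded into avg_weights
    """
    return [w_avg + (w_next - w_avg) / (nr_models + 1.0)
            for w_avg, w_next in zip(avg_weights, next_weights)]


# Usage sketch: fold three single-layer updates into a mean, one at a time.
#
#   avg = [np.array([1.0, 1.0])]
#   avg = running_average_sketch(avg, [np.array([3.0, 3.0])], 1)
#   avg = running_average_sketch(avg, [np.array([5.0, 5.0])], 2)
#   # avg is now [array([3., 3.])], the mean of the three updates.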
class ReducerControl:

    def __init__(self, statestore):
        self.__state = ReducerState.idle
        self.statestore = statestore
        self.combiners = []

        # TODO remove temporary hardcoded config of storage persistence backend
        s3_config = {
            'storage_access_key': os.environ['FEDN_MINIO_ACCESS_KEY'],
            'storage_secret_key': os.environ['FEDN_MINIO_SECRET_KEY'],
            'storage_bucket': 'models',
            'storage_secure_mode': False,
            'storage_hostname': os.environ['FEDN_MINIO_HOST'],
            'storage_port': int(os.environ['FEDN_MINIO_PORT'])
        }

        from fedn.common.storage.s3.s3repo import S3ModelRepository
        self.model_repository = S3ModelRepository(s3_config)
        self.bucket_name = s3_config["storage_bucket"]

        # TODO: Make configurable
        self.helper = KerasSequentialHelper()

    def get_latest_model(self):
        return self.statestore.get_latest()

    def get_model_info(self):
        return self.statestore.get_model_info()

    def commit(self, model_id, model=None):
        """ Commit a model. This establishes this model as the latest consensus model. """
        if model:
            fd, outfile_name = tempfile.mkstemp(suffix='.h5')
            os.close(fd)  # only the temporary path is needed
            model.save(outfile_name)
            model_id = self.model_repository.set_model(outfile_name, is_file=True)
            os.unlink(outfile_name)

        self.statestore.set_latest(model_id)

    def _out_of_sync(self, combiners=None):
        if not combiners:
            combiners = self.combiners

        osync = []
        for combiner in combiners:
            model_id = combiner.get_model_id()
            if model_id != self.get_latest_model():
                osync.append(combiner)
        return osync

    def check_round_participation_policy(self, compute_plan, combiner_state):
        """ Evaluate the reducer-level policy for combiner round participation.

            This is a decision on ReducerControl level; additional checks apply on combiner level.
            Not all reducer control flows might need or want to use a participation policy. """
        if int(compute_plan['clients_required']) <= int(combiner_state['nr_active_clients']):
            return True
        else:
            return False

    def check_round_start_policy(self, combiners):
        """ Check if the overall network state meets a policy to start the round. """
        if len(combiners) > 0:
            return True
        else:
            return False

    def check_round_validity_policy(self, combiners):
        """ Before committing a model we check if a round validity policy has been met. """
        if len(combiners) > 0:
            return True
        else:
            return False

    def round(self, config):
        """ Execute one global round. """

        # TODO: Set / update reducer states and such
        if len(self.combiners) < 1:
            print("REDUCER: No combiners connected!")
            return

        # 1. Formulate compute plans for this round and decide which combiners should participate in the round.
        compute_plan = copy.deepcopy(config)
        compute_plan['rounds'] = 1
        compute_plan['task'] = 'training'
        compute_plan['model_id'] = self.get_latest_model()

        combiners = []
        for combiner in self.combiners:
            combiner_state = combiner.report()
            is_participating = self.check_round_participation_policy(compute_plan, combiner_state)
            if is_participating:
                combiners.append((combiner, compute_plan))

        print("PARTICIPATING: {}".format(combiners), flush=True)

        round_start = self.check_round_start_policy(combiners)
        print("ROUND START POLICY: {}".format(round_start), flush=True)
        if not round_start:
            return None

        # 2. Sync up and ask participating combiners to coordinate model updates
        for combiner, compute_plan in combiners:
            self.sync_combiners([combiner], self.get_latest_model())
            print(combiner, compute_plan, flush=True)
            response = combiner.start(compute_plan)

        # Wait until all participating combiners have a model that is out of sync with the current global model.
        # TODO: Implement strategies to handle timeouts.
        # TODO: We do not need to wait until all combiners complete before we start reducing.
        cl = []
        for combiner, plan in combiners:
            cl.append(combiner)

        wait = 0.0
        while len(self._out_of_sync(cl)) < len(combiners):
            time.sleep(1.0)
            wait += 1.0
            if wait >= config['round_timeout']:
                break

        # OBS! Here we are checking against all combiners, not just those that computed in this round.
        # This means we let straggling combiners participate in the update.
        updated = self._out_of_sync()
        print("UPDATED: {}".format(updated), flush=True)

        round_valid = self.check_round_validity_policy(updated)
        if not round_valid:
            # TODO: Should we reset combiner state here?
            return None

        # 3. Reduce combiner models into a global model
        # TODO: check success
        model = self.reduce(updated)

        if model:
            import uuid
            model_id = uuid.uuid4()
            self.commit(model_id, model)

            # 4. Trigger participating combiner nodes to execute a validation round for the current model
            combiner_config = copy.deepcopy(config)
            combiner_config['model_id'] = self.get_latest_model()
            combiner_config['task'] = 'validation'

            for combiner in updated:
                combiner.start(combiner_config)

            return model_id
        else:
            print("REDUCER: failed to update model in round with config {}".format(config), flush=True)
            return None

    def sync_combiners(self, combiners, model_id):
        """ Spread the current consensus model to all active combiner nodes. """
        if not model_id:
            print("GOT NO MODEL TO SET! Have you seeded the FedML model?", flush=True)
            return

        for combiner in combiners:
            response = combiner.set_model_id(model_id)

    def instruct(self, config):
        """ Main entrypoint, executes the compute plan. """

        if self.__state == ReducerState.instructing:
            print("Already set in INSTRUCTING state", flush=True)
            return

        self.__state = ReducerState.instructing

        if not self.get_latest_model():
            print("No model in model chain, please seed the alliance!")

        self.__state = ReducerState.monitoring

        for round in range(int(config['rounds'])):
            model_id = self.round(config)
            if model_id:
                print("REDUCER: Global round completed, new model: {}".format(model_id), flush=True)
            else:
                print("REDUCER: Global round failed!")

        self.__state = ReducerState.idle

    def reduce(self, combiners):
        """ Combine current models at Combiner nodes into one global model. """
        model = None
        i = 1
        for combiner in combiners:
            data = combiner.get_model()
            if data:
                if model is None:
                    # The first available combiner model seeds the average.
                    model = self.helper.load_model(data.getbuffer())
                else:
                    model_next = self.helper.load_model(data.getbuffer())
                    self.helper.increment_average(model, model_next, i)
                i = i + 1
        return model

    def reduce_random(self, combiners):
        """ Pick one combiner model at random. This is only used for debugging purposes. """
        import random
        combiner = random.sample(combiners, 1)[0]
        import uuid
        model_id = uuid.uuid4()
        return self.helper.load_model(combiner.get_model().getbuffer()), model_id

    def resolve(self):
        """ At the end of resolve, all combiners have the same model state. """
        model = None
        combiners = self._out_of_sync()
        if len(combiners) > 0:
            model = self.reduce(combiners)
        return model

    def monitor(self, config=None):
        """ monitor """
        #if self.__state == ReducerState.monitoring:
        #    print("monitoring")

    def add(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot add additional combiner")
            return
        if self.find(combiner.name):
            return
        print("adding combiner {}".format(combiner.name), flush=True)
        self.combiners.append(combiner)

    def remove(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot remove combiner")
            return
        self.combiners.remove(combiner)

    def find(self, name):
        for combiner in self.combiners:
            if name == combiner.name:
                return combiner
        return None

    def find_available_combiner(self):
        # TODO: Extend with more types of client allocation schemes.
        for combiner in self.combiners:
            if combiner.allowing_clients():
                return combiner
        return None

    def state(self):
        return self.__state
class ReducerControl:

    def __init__(self, statestore):
        self.__state = ReducerState.setup
        self.statestore = statestore
        self.combiners = []

        s3_config = {
            'storage_access_key': os.environ['FEDN_MINIO_ACCESS_KEY'],
            'storage_secret_key': os.environ['FEDN_MINIO_SECRET_KEY'],
            'storage_bucket': 'models',
            'storage_secure_mode': False,
            'storage_hostname': os.environ['FEDN_MINIO_HOST'],
            'storage_port': int(os.environ['FEDN_MINIO_PORT'])
        }

        from fedn.common.storage.s3.s3repo import S3ModelRepository
        self.model_repository = S3ModelRepository(s3_config)
        self.bucket_name = s3_config["storage_bucket"]

        # TODO: Make configurable
        self.helper = KerasSequentialHelper()

        if self.statestore.is_inited():
            self.__state = ReducerState.idle

    def get_latest_model(self):
        return self.statestore.get_latest()

    def get_model_info(self):
        return self.statestore.get_model_info()

    def get_compute_context(self):
        definition = self.statestore.get_compute_context()
        if definition:
            try:
                context = definition['filename']
                return context
            except KeyError:
                print("No context filename set for compute context definition", flush=True)
        else:
            return None

    def set_compute_context(self, filename):
        self.statestore.set_compute_context(filename)

    def commit(self, model_id, model=None):
        """ Commit a model. This establishes this model as the latest consensus model. """
        if model:
            fd, outfile_name = tempfile.mkstemp(suffix='.h5')
            os.close(fd)  # only the temporary path is needed
            model.save(outfile_name)
            model_id = self.model_repository.set_model(outfile_name, is_file=True)
            os.unlink(outfile_name)

        self.statestore.set_latest(model_id)

    def _out_of_sync(self, combiners=None):
        if not combiners:
            combiners = self.combiners

        osync = []
        for combiner in combiners:
            try:
                model_id = combiner.get_model_id()
            except CombinerUnavailableError:
                self._handle_unavailable_combiner(combiner)
                model_id = None
            if model_id and (model_id != self.get_latest_model()):
                osync.append(combiner)
        return osync

    def check_round_participation_policy(self, compute_plan, combiner_state):
        """ Evaluate the reducer-level policy for combiner round participation.

            This is a decision on ReducerControl level; additional checks apply on combiner level.
            Not all reducer control flows might need or want to use a participation policy. """
        if int(compute_plan['clients_required']) <= int(combiner_state['nr_active_clients']):
            return True
        else:
            return False

    def check_round_start_policy(self, combiners):
        """ Check if the overall network state meets the policy to start a round. """
        if len(combiners) > 0:
            return True
        else:
            return False

    def check_round_validity_policy(self, combiners):
        """ At the end of the round, before committing a model to the model ledger,
            we check if a round validity policy has been met. This can involve e.g.
            asserting that a certain number of combiners have reported an updated model,
            or that criteria on model performance have been met. """
        if combiners == []:
            return False
        else:
            return True

    def _handle_unavailable_combiner(self, combiner):
        """ This callback is triggered if a combiner is found to be unresponsive. """
        # TODO: Implement
        print("REDUCER CONTROL: Combiner {} unavailable.".format(combiner.name), flush=True)

    def round(self, config):
        """ Execute one global round. """

        # TODO: Set / update reducer states and such
        # TODO: Do a general health check on combiners at the beginning of the round.
        if len(self.combiners) < 1:
            print("REDUCER: No combiners connected!")
            return

        # 1. Formulate compute plans for this round and decide which combiners should participate in the round.
        compute_plan = copy.deepcopy(config)
        compute_plan['rounds'] = 1
        compute_plan['task'] = 'training'
        compute_plan['model_id'] = self.get_latest_model()

        combiners = []
        for combiner in self.combiners:
            try:
                combiner_state = combiner.report()
            except CombinerUnavailableError:
                self._handle_unavailable_combiner(combiner)
                combiner_state = None

            if combiner_state:
                is_participating = self.check_round_participation_policy(compute_plan, combiner_state)
                if is_participating:
                    combiners.append((combiner, compute_plan))

        print("REDUCER CONTROL: Participating combiners: {}".format(combiners), flush=True)

        round_start = self.check_round_start_policy(combiners)
        print("ROUND START POLICY: {}".format(round_start), flush=True)
        if not round_start:
            print("REDUCER CONTROL: Round start policy not met, skipping round!", flush=True)
            return None

        # 2. Sync up and ask participating combiners to coordinate model updates
        for combiner, compute_plan in combiners:
            try:
                self.sync_combiners([combiner], self.get_latest_model())
                response = combiner.start(compute_plan)
            except CombinerUnavailableError:
                # This is OK, handled by the round accept policy
                self._handle_unavailable_combiner(combiner)
            except:
                # Unknown error
                raise

        # Wait until participating combiners have a model that is out of sync with the current global model.
        # TODO: Implement strategies to handle timeouts.
        # TODO: We do not need to wait until all combiners complete before we start reducing.
        cl = []
        for combiner, plan in combiners:
            cl.append(combiner)

        wait = 0.0
        while len(self._out_of_sync(cl)) < len(combiners):
            time.sleep(1.0)
            wait += 1.0
            if wait >= config['round_timeout']:
                break

        # OBS! Here we are checking against all combiners, not just those that computed in this round.
        # This means we let straggling combiners participate in the update.
        updated = self._out_of_sync()
        print("UPDATED: {}".format(updated), flush=True)

        round_valid = self.check_round_validity_policy(updated)
        if not round_valid:
            # TODO: Should we reset combiner state here?
            print("REDUCER CONTROL: Round invalid!", flush=True)
            return None

        # 3. Reduce combiner models into a global model
        try:
            model = self.reduce(updated)
        except Exception:
            print("REDUCER CONTROL: Failed to reduce models from combiners: {}".format(updated), flush=True)
            return None

        if model:
            # Commit to model ledger
            import uuid
            model_id = uuid.uuid4()
            self.commit(model_id, model)
        else:
            print("REDUCER: failed to update model in round with config {}".format(config), flush=True)
            return None

        # 4. Trigger participating combiner nodes to execute a validation round for the current model
        # TODO: Move to config - are we validating in a round, and if so, in what way.
        validate = True
        if validate:
            combiner_config = copy.deepcopy(config)
            combiner_config['model_id'] = self.get_latest_model()
            combiner_config['task'] = 'validation'

            for combiner in updated:
                try:
                    combiner.start(combiner_config)
                except CombinerUnavailableError:
                    # OK if validation fails for a combiner
                    self._handle_unavailable_combiner(combiner)

        return model_id

    def sync_combiners(self, combiners, model_id):
        """ Spread the current consensus model to all active combiner nodes. """
        if not model_id:
            print("GOT NO MODEL TO SET! Have you seeded the FedML model?", flush=True)
            return

        for combiner in combiners:
            response = combiner.set_model_id(model_id)

    def instruct(self, config):
        """ Main entrypoint, executes the compute plan. """

        if self.__state == ReducerState.instructing:
            print("Already set in INSTRUCTING state", flush=True)
            return

        self.__state = ReducerState.instructing

        # TODO - move seeding from config to explicit step, use Reducer REST API reducer/seed/... ?
        if not self.get_latest_model():
            print("No model in model chain, please seed the alliance!")

        self.__state = ReducerState.monitoring

        for round in range(int(config['rounds'])):
            model_id = self.round(config)
            if model_id:
                print("REDUCER: Global round completed, new model: {}".format(model_id), flush=True)
            else:
                print("REDUCER: Global round failed!")

        self.__state = ReducerState.idle

    def reduce(self, combiners):
        """ Combine current models at Combiner nodes into one global model. """
        model = None
        i = 1
        for combiner in combiners:
            # TODO: Handle inactive RPC error in get_model and raise a specific error
            try:
                data = combiner.get_model()
            except Exception:
                data = None

            if data:
                if model is None:
                    # The first available combiner model seeds the average.
                    model = self.helper.load_model(data.getbuffer())
                else:
                    model_next = self.helper.load_model(data.getbuffer())
                    self.helper.increment_average(model, model_next, i)
                i = i + 1
        return model

    def resolve(self):
        """ At the end of resolve, all combiners have the same model state. """
        model = None
        combiners = self._out_of_sync()
        if len(combiners) > 0:
            model = self.reduce(combiners)
        return model

    def monitor(self, config=None):
        """ monitor """
        #if self.__state == ReducerState.monitoring:
        #    print("monitoring")

    def add(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot add additional combiner")
            return
        if self.find(combiner.name):
            return
        print("adding combiner {}".format(combiner.name), flush=True)
        self.combiners.append(combiner)

    def remove(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot remove combiner")
            return
        self.combiners.remove(combiner)

    def find(self, name):
        for combiner in self.combiners:
            if name == combiner.name:
                return combiner
        return None

    def find_available_combiner(self):
        for combiner in self.combiners:
            if combiner.allowing_clients():
                return combiner
        return None

    def state(self):
        return self.__state
class FEDAVGCombiner:
    """ A Local SGD / Federated Averaging (FedAvg) combiner.

        This class is responsible for coordinating the update of the Combiner global model
        by requesting and aggregating model updates from Clients. """

    def __init__(self, id, storage, server, modelservice):

        self.run_configs_lock = Lock()
        self.run_configs = []
        self.storage = storage
        self.id = id
        self.server = server
        self.modelservice = modelservice

        self.config = {}
        self.validations = {}

        # TODO: make choice of helper configurable on Reducer level
        self.helper = KerasSequentialHelper()

        self.model_updates = queue.Queue()

    def report_status(self, msg, log_level=fedn.Status.INFO, type=None, request=None, flush=True):
        print("COMBINER({}):{} {}".format(self.id, log_level, msg), flush=flush)

    def receive_model_candidate(self, model_id):
        """ Callback when a new model version is reported by a client. """
        try:
            self.report_status("COMBINER: callback received model {}".format(model_id),
                               log_level=fedn.Status.INFO)
            # TODO - here would be a place to do some additional validation of the model contribution.
            self.model_updates.put(model_id)
        except Exception as e:
            self.report_status("COMBINER: Failed to receive candidate model! {}".format(e),
                               log_level=fedn.Status.WARNING)
            self.report_status("Failed to receive candidate model!")

    def receive_validation(self, validation):
        """ Callback for a validation request. """
        model_id = validation.model_id
        data = json.loads(validation.data)
        try:
            self.validations[model_id].append(data)
        except KeyError:
            self.validations[model_id] = [data]

        self.report_status("COMBINER: callback processed validation {}".format(validation.model_id),
                           log_level=fedn.Status.INFO)

    def _load_model_fault_tolerant(self, model_id):
        # Try reading it from local disk/combiner memory
        model_str = self.modelservice.models.get(model_id)
        # And if we cannot access that, try downloading from the server
        if model_str is None:
            model_str = self.modelservice.get_model(model_id)
            # TODO: use retrying library
            tries = 0
            while tries < 3:
                tries += 1
                if not model_str or sys.getsizeof(model_str) == 80:
                    self.report_status("COMBINER: Model download failed. retrying", flush=True)
                    import time
                    time.sleep(1)
                    model_str = self.modelservice.get_model(model_id)
        return model_str

    def combine_models(self, nr_expected_models=None, nr_required_models=1, timeout=120):
        """ Compute an iterative/running average of models arriving to the combiner. """

        import time
        round_time = 0.0
        print("COMBINER: combining model updates from Clients...")

        nr_processed_models = 0
        while nr_processed_models < nr_expected_models:
            try:
                model_id = self.model_updates.get(block=False)
                self.report_status("Received model update with id {}".format(model_id))

                model_str = self._load_model_fault_tolerant(model_id)

                if model_str:
                    try:
                        model_next = self.helper.load_model(model_str.getbuffer())
                    except IOError:
                        self.report_status("COMBINER: Failed to load model!")
                        raise
                else:
                    # Escalate to the IOError handler below, which skips this update.
                    raise IOError("Failed to load model update {}".format(model_id))

                if nr_processed_models == 0:
                    model = model_next
                else:
                    self.helper.increment_average(model, model_next, nr_processed_models)

                nr_processed_models += 1
                self.model_updates.task_done()
            except queue.Empty:
                self.report_status("COMBINER: waiting for model updates: {} of {} completed.".format(
                    nr_processed_models, nr_expected_models))
                time.sleep(1.0)
                round_time += 1.0
            except IOError:
                self.report_status("COMBINER: Failed to read model update, skipping!")
                self.model_updates.task_done()
                nr_expected_models -= 1
                if nr_expected_models <= 0:
                    # This hack lets the timeout policy handle the failure
                    round_time = timeout
                    break
            except Exception as e:
                self.report_status("COMBINER: Exception in combine_models: {}".format(e))
                time.sleep(1.0)
                round_time += 1.0

            if round_time >= timeout:
                self.report_status("COMBINER: training round timed out.",
                                   log_level=fedn.Status.WARNING)
                print("COMBINER: Round timed out.")
                # TODO: Generalize policy for what to do in case of timeout.
                if nr_processed_models >= nr_required_models:
                    break
                else:
                    return None

        self.report_status("ORCHESTRATOR: Training round completed, combined {} models.".format(nr_processed_models),
                           log_level=fedn.Status.INFO)
        self.report_status("DONE, combined {} models".format(nr_processed_models))
        return model

    def __training_round(self, config, clients):

        # We flush the queue at the beginning of a round (no stragglers allowed).
        # TODO: Support other ways to handle stragglers.
        with self.model_updates.mutex:
            self.model_updates.queue.clear()

        self.report_status("COMBINER: Initiating training round, participating members: {}".format(clients))
        self.server.request_model_update(config['model_id'], clients=clients)

        model = self.combine_models(nr_expected_models=len(clients),
                                    nr_required_models=int(config['clients_required']),
                                    timeout=int(config['round_timeout']))
        return model

    def __validation_round(self, config, clients, model_id):
        self.server.request_model_validation(model_id, from_clients=clients)

    def push_run_config(self, plan):
        self.run_configs_lock.acquire()
        import uuid
        plan['_job_id'] = str(uuid.uuid4())
        self.run_configs.append(plan)
        self.run_configs_lock.release()
        return plan['_job_id']

    def run(self):
        import time
        try:
            while True:
                time.sleep(1)
                self.run_configs_lock.acquire()
                if len(self.run_configs) > 0:
                    compute_plan = self.run_configs.pop()
                    self.run_configs_lock.release()
                    self.config = compute_plan

                    ready = self.__check_nr_round_clients(compute_plan, timeout=10.0)
                    if ready:
                        if compute_plan['task'] == 'training':
                            self.exec_training(compute_plan)
                        elif compute_plan['task'] == 'validation':
                            self.exec_validation(compute_plan, compute_plan['model_id'])
                        else:
                            self.report_status("COMBINER: Compute plan contains unknown task type.", flush=True)
                    else:
                        self.report_status("COMBINER: Failed to meet client allocation requirements for this compute plan.",
                                           flush=True)

                if self.run_configs_lock.locked():
                    self.run_configs_lock.release()
        except (KeyboardInterrupt, SystemExit):
            pass

    def stage_model(self, model_id):
        """ Download model from persistent storage. """

        # If the model is already in memory at the server we do not need to do anything.
        # TODO: ugly! Needs to be refactored
        if self.modelservice.models.exist(model_id):
            return

        # If it is not there, download it from storage and stage it in memory at the server.
        timeout_retry = 3
        import time
        tries = 0
        while True:
            try:
                model = self.storage.get_model_stream(model_id)
                if model:
                    break
            except Exception:
                self.report_status("COMBINER could not fetch model from bucket. retrying in {}".format(timeout_retry),
                                   flush=True)
                time.sleep(timeout_retry)
                tries += 1
                if tries > 2:
                    self.report_status("COMBINER exiting. could not fetch seed model.", flush=True)
                    return

        self.modelservice.set_model(model, model_id)

    def __assign_round_clients(self, n):
        """ Obtain a list of clients to talk to in a round. """

        active_trainers = self.server.get_active_trainers()
        # If the number of requested trainers exceeds the number available, use all available.
        if n > len(active_trainers):
            n = len(active_trainers)

        # Otherwise, pick a random subsample of all available clients.
        import random
        clients = random.sample(active_trainers, n)
        return clients

    def __check_nr_round_clients(self, config, timeout=10.0):
        """ Check that the minimal number of clients required to start a round are connected. """

        import time
        ready = False
        t = 0.0
        while not ready:
            active = self.server.nr_active_trainers()
            if active >= int(config['clients_requested']):
                return True
            else:
                self.report_status("waiting for {} clients to get started, currently: {}".format(
                    int(config['clients_requested']) - active, active), flush=True)
            time.sleep(1.0)
            t += 1.0
            if t >= timeout:
                if active >= int(config['clients_required']):
                    return True
                else:
                    return False
        return ready

    def exec_validation(self, config, model_id):
        """ Coordinate validation rounds as specified in config. """

        self.report_status("COMBINER orchestrating validation of model {}".format(model_id))
        self.stage_model(model_id)
        #validators = self.__assign_round_clients(int(config['clients_requested']))
        validators = self.__assign_round_clients(self.server.max_clients)
        self.__validation_round(config, validators, model_id)

    def exec_training(self, config):
        """ Coordinate clients to execute training and validation tasks. """

        #print("COMBINER starting from model {}".format(config['model_id']))
        self.stage_model(config['model_id'])

        # Execute the configured number of rounds
        for r in range(1, int(config['rounds']) + 1):
            self.report_status("COMBINER: Starting training round {}".format(r), flush=True)

            #clients = self.__assign_round_clients(int(config['clients_requested']))
            clients = self.__assign_round_clients(self.server.max_clients)
            model = self.__training_round(config, clients)

            if not model:
                self.report_status("\t Failed to update global model in round {0}!".format(r))

            if model:
                fd, outfile_name = tempfile.mkstemp(suffix='.h5')
                os.close(fd)  # only the temporary path is needed
                model.save(outfile_name)

                # Save to local storage for sharing with clients.
                from io import BytesIO
                a = BytesIO()
                a.seek(0, 0)
                with open(outfile_name, 'rb') as f:
                    a.write(f.read())

                # Send aggregated model to server
                model_id = str(uuid.uuid4())
                self.modelservice.set_model(a, model_id)
                os.unlink(outfile_name)

                # Update Combiner latest model
                self.server.set_active_model(model_id)

                print("------------------------------------------")
                self.report_status("COMBINER: TRAINING ROUND COMPLETED.", flush=True)
                print("\n")