Example #1
    def __init__(self, id, storage, server, modelservice):

        self.run_configs_lock = Lock()
        self.run_configs = []
        self.storage = storage
        self.id = id
        self.server = server
        self.modelservice = modelservice

        self.config = {}
        self.validations = {}

        # TODO: make choice of helper configurable on Reducer level
        self.helper = KerasSequentialHelper()
        self.model_updates = queue.Queue()
Example #2
File: control.py Project: jensfrid/fedn
    def __init__(self, statestore):
        self.__state = ReducerState.idle
        self.statestore = statestore
        self.combiners = []

        # TODO remove temporary hardcoded config of storage persistence backend
        s3_config = {'storage_access_key': os.environ['FEDN_MINIO_ACCESS_KEY'],
                     'storage_secret_key': os.environ['FEDN_MINIO_SECRET_KEY'],
                     'storage_bucket': 'models',
                     'storage_secure_mode': False,
                     'storage_hostname': os.environ['FEDN_MINIO_HOST'],
                     'storage_port': int(os.environ['FEDN_MINIO_PORT'])}

        from fedn.common.storage.s3.s3repo import S3ModelRepository
        self.model_repository = S3ModelRepository(s3_config)
        self.bucket_name = s3_config["storage_bucket"]

        # TODO: Make configurable
        self.helper = KerasSequentialHelper()
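
A minimal sketch of the environment this constructor assumes. The variable names are the ones read above; the values are placeholders, and statestore stands in for whatever fedn statestore object the caller provides.

import os

# Placeholder MinIO credentials and endpoint; replace with real values.
os.environ.setdefault('FEDN_MINIO_ACCESS_KEY', 'minio')
os.environ.setdefault('FEDN_MINIO_SECRET_KEY', 'minio123')
os.environ.setdefault('FEDN_MINIO_HOST', 'localhost')
os.environ.setdefault('FEDN_MINIO_PORT', '9000')

# control = ReducerControl(statestore)  # statestore: an assumed fedn statestore instance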
Example #3
    def __init__(self, address, port, id, role, storage):

        super().__init__(address, port, id, role)

        self.storage = storage
        self.id = id
        self.model_id = None

        # TODO  refactor since we are now getting config on RUN cmd.
        self.db = connect_to_mongodb()
        self.coll = self.db['orchestrators']

        self.config = {}
        # TODO: Use MongoDB
        self.validations = {}

        # TODO: make choice of helper configurable
        self.helper = KerasSequentialHelper()
        # Queue for model updates to be processed.
        self.model_updates = queue.Queue()
Example #4
class FEDAVGCombiner(CombinerClient):
    """ A Local SGD / Federated Averaging (FedAvg) combiner. """
    def __init__(self, address, port, id, role, storage):

        super().__init__(address, port, id, role)

        self.storage = storage
        self.id = id
        self.model_id = None

        # TODO  refactor since we are now getting config on RUN cmd.
        self.db = connect_to_mongodb()
        self.coll = self.db['orchestrators']

        self.config = {}
        # TODO: Use MongoDB
        self.validations = {}

        # TODO: make choice of helper configurable
        self.helper = KerasSequentialHelper()
        # Queue for model updates to be processed.
        self.model_updates = queue.Queue()

    def get_model_id(self):
        return self.model_id

    def report_status(self,
                      msg,
                      log_level=alliance.Status.INFO,
                      type=None,
                      request=None,
                      flush=True):
        print("COMBINER({}):{} {}".format(self.id, log_level, msg),
              flush=flush)

    def receive_model_candidate(self, model_id):
        """ Callback when a new model version is reported by a client. 
            We simply put the model_id on a queue to be processed later. """
        try:
            self.report_status(
                "COMBINER: callback received model {}".format(model_id),
                log_level=alliance.Status.INFO)
            # TODO - here would be a place to do some additional validation of the model contribution.
            self.model_updates.put(model_id)
        except Exception as e:
            self.report_status(
                "COMBINER: Failed to receive candidate model! {}".format(e),
                log_level=alliance.Status.WARNING)
            print("Failed to receive candidate model!")
            pass

    def receive_validation(self, validation):
        """ Callback for a validation request """

        # TODO: Track this in a DB
        model_id = validation.model_id
        data = json.loads(validation.data)
        try:
            self.validations[model_id].append(data)
        except KeyError:
            self.validations[model_id] = [data]

        self.report_status("COMBINER: callback processed validation {}".format(
            validation.model_id),
                           log_level=alliance.Status.INFO)

    def combine_models(self, nr_expected_models=None, timeout=120):
        """ Compute an iterative/running average of models arriving to the combiner. """

        round_time = 0.0
        print("COMBINER: combining model updates...")

        # First model in the update round
        try:
            model_id = self.model_updates.get(timeout=timeout)
            print("combining ", model_id)
            # Fetch the model data blob from storage
            model_str = self.get_model(model_id)
            model = self.helper.load_model(model_str.getbuffer())
            nr_processed_models = 1
            self.model_updates.task_done()
        except queue.Empty as e:
            self.report_status("COMBINER: training round timed out.",
                               log_level=alliance.Status.WARNING)
            return None

        while nr_processed_models < nr_expected_models:
            try:
                model_id = self.model_updates.get(block=False)
                self.report_status(
                    "Received model update with id {}".format(model_id))

                model_next = self.helper.load_model(
                    self.get_model(model_id).getbuffer())
                self.helper.increment_average(model, model_next,
                                              nr_processed_models)

                nr_processed_models += 1
                self.model_updates.task_done()
            except Exception as e:
                self.report_status("COMBINER failcode: {}".format(e))
                time.sleep(1.0)
                round_time += 1.0

            if round_time >= timeout:
                self.report_status("COMBINER: training round timed out.",
                                   log_level=alliance.Status.WARNING)
                print("COMBINER: Round timed out.")
                return None

        self.report_status(
            "ORCHESTRATOR: Training round completed, combined {} models.".
            format(nr_processed_models),
            log_level=alliance.Status.INFO)
        print("DONE, combined {} models".format(nr_processed_models))
        return model

    def __assign_clients(self, n):
        """  Obtain a list of clients to talk to in a round. """

        # TODO: If we want global sampling without replacement the server needs to assign clients
        active_trainers = self.get_active_trainers()

        # If the number of requested trainers exceeds the number of available, use all available.
        if n > len(active_trainers):
            n = len(active_trainers)

        import random
        self.trainers = random.sample(active_trainers, n)
        # TODO: In the general case, validators could be other clients as well
        self.validators = self.trainers

    def __training_round(self):

        # We flush the queue at the beginning of a round (no stragglers allowed)
        # TODO: Support other ways to handle stragglers.
        with self.model_updates.mutex:
            self.model_updates.queue.clear()

        self.report_status(
            "COMBINER: Initiating training round, participating members: {}".
            format(self.trainers))
        self.request_model_update(self.model_id, clients=self.trainers)

        # Apply combiner
        model = self.combine_models(nr_expected_models=len(self.trainers),
                                    timeout=self.config['round_timeout'])
        return model

    def __validation_round(self):
        self.request_model_validation(self.model_id,
                                      from_clients=self.validators)

    def run(self, config):
        """ Coordinates training and validation tasks with clints, as specified in the 
            config (CombinerConfiguration) """

        self.config = config
        self.model_id = self.config['model_id']

        print("COMBINER starting from model {}".format(self.model_id))

        # Fetch the input model blob from storage and load in local memory
        timeout_retry = 3
        import time
        tries = 0
        while True:
            try:
                model = self.storage.get_model_stream(self.model_id)
                if model:
                    break
            except Exception as e:
                print(
                    "COMBINER could not fetch model from bucket. retrying in {}"
                    .format(timeout_retry),
                    flush=True)
                time.sleep(timeout_retry)
                tries += 1
                if tries > 2:
                    print("COMBINER exiting. could not fetch seed model.")
                    return

        self.set_model(model, self.model_id)

        # Check that the minimum number of clients required to start a round are connected
        import time
        ready = False
        while not ready:
            active = self.nr_active_trainers()
            if active >= config['clients_required']:
                ready = True
            else:
                print("waiting for {} clients to get started, currently: {}".
                      format(config['clients_required'] - active, active),
                      flush=True)
            time.sleep(1)

        # Execute the configured number of rounds
        for r in range(1, config['rounds'] + 1):
            print("STARTING ROUND {}".format(r), flush=True)
            print("\t FEDAVG: Starting training round {}".format(r),
                  flush=True)

            self.__assign_clients(self.config['clients_requested'])
            model = self.__training_round()

            if model:
                print("\t FEDAVG: Round completed.", flush=True)

                # TODO: Use configuration to decide if we use a scratchspace to checkpoint the model.
                fod, outfile_name = tempfile.mkstemp(suffix='.h5')
                model.save(outfile_name)
                # Upload new model to storage repository (persistent)
                # and save to local storage for sharing with clients.

                # TODO: Refactor - Checkpointing in the configured combiner-private storage
                # should be handled by self.set_model probably.
                model_id = self.storage.set_model(outfile_name, is_file=True)
                from io import BytesIO
                a = BytesIO()
                with open(outfile_name, 'rb') as f:
                    a.write(f.read())

                # Stream aggregated model to server
                # TODO: Not strictly necessary to stream model here, can be slight waste of resources.
                self.set_model(a, model_id)
                os.unlink(outfile_name)

                self.model_id = model_id

                print("...done. New aggregated model: {}".format(
                    self.model_id))

                print("\t Starting validation round {}".format(r))
                self.__validation_round()

                print("------------------------------------------")
                print("FEDAVG: ROUND COMPLETED.", flush=True)
                print("\n")
            else:
                print(
                    "\t Failed to update global model in round {0}!".format(r))
Example #5
File: control.py Project: jensfrid/fedn
class ReducerControl:

    def __init__(self, statestore):
        self.__state = ReducerState.idle
        self.statestore = statestore
        self.combiners = []

        # TODO remove temporary hardcoded config of storage persistence backend
        s3_config = {'storage_access_key': os.environ['FEDN_MINIO_ACCESS_KEY'],
                     'storage_secret_key': os.environ['FEDN_MINIO_SECRET_KEY'],
                     'storage_bucket': 'models',
                     'storage_secure_mode': False,
                     'storage_hostname': os.environ['FEDN_MINIO_HOST'],
                     'storage_port': int(os.environ['FEDN_MINIO_PORT'])}

        from fedn.common.storage.s3.s3repo import S3ModelRepository
        self.model_repository = S3ModelRepository(s3_config)
        self.bucket_name = s3_config["storage_bucket"]

        # TODO: Make configurable
        self.helper = KerasSequentialHelper()

    def get_latest_model(self):
        return self.statestore.get_latest()

    def get_model_info(self):
        return self.statestore.get_model_info()
     
    def commit(self, model_id, model=None):
        """ Commit a model. This establishes this model as the lastest consensus model. """

        if model:
            fod, outfile_name = tempfile.mkstemp(suffix='.h5')
            model.save(outfile_name)
            model_id = self.model_repository.set_model(outfile_name, is_file=True)
            os.unlink(outfile_name)

        self.statestore.set_latest(model_id)

    def _out_of_sync(self,combiners=None):

        if not combiners:
            combiners = self.combiners

        osync = []
        for combiner in combiners:
            model_id = combiner.get_model_id()
            if model_id != self.get_latest_model():
                osync.append(combiner)
        return osync

    def check_round_participation_policy(self,compute_plan,combiner_state):
        """ Evaluate reducer level policy for combiner round-paarticipation. 
            This is a decision on ReducerControl level, additional checks
            applies on combiner level. Not all reducer control flows might
            need or want to use a participation policy.  """
        if int(compute_plan['clients_required']) <= int(combiner_state['nr_active_clients']):
            return True
        else:
            return False

    def check_round_start_policy(self,combiners):
        """ Check if the overall network state meets a policy to start the round. """
        if len(combiners) > 0:
            return True
        else:
            return False

    def check_round_validity_policy(self,combiners):
        """ Before committing a model we check if a round validity policy has been met. """
        if len(combiners) > 0:
            return True
        else:
            return False 


    def round(self, config):
        """ Execute one global round. """

        # TODO: Set / update reducer states and such
        if len(self.combiners) < 1:
            print("REDUCER: No combiners connected!")
            return

        # 1. Formulate compute plans for this round and decide which combiners should participate in the round.
        compute_plan = copy.deepcopy(config)
        compute_plan['rounds'] = 1
        compute_plan['task'] = 'training'
        compute_plan['model_id'] = self.get_latest_model()

        combiners = []
        for combiner in self.combiners:
            combiner_state = combiner.report()
            is_participating = self.check_round_participation_policy(compute_plan,combiner_state)
            if is_participating:
                combiners.append((combiner,compute_plan))

        print("PARTICIPATING: {}".format(combiners),flush=True)

        round_start = self.check_round_start_policy(combiners)
        print("ROUND START POLICY: {}".format(round_start),flush=True)
        if not round_start:
            return None


        # 2. Sync up and ask participating combiners to coordinate model updates
        for combiner,compute_plan in combiners:        
            self.sync_combiners([combiner],self.get_latest_model())
            print(combiner,compute_plan,flush=True)
            response = combiner.start(compute_plan)

        # Wait until all participating combiners have a model that is out of sync with the current global model.
        # TODO: Implement strategies to handle timeouts. 
        # TODO: We do not need to wait until all combiners complete before we start reducing. 
        cl = []
        for combiner,plan in combiners:
            cl.append(combiner)

        wait = 0.0
        while len(self._out_of_sync(cl)) < len(combiners):
            time.sleep(1.0)
            wait += 1.0
            if wait >= config['round_timeout']:
                break

        # OBS! Here we are checking against all combiners, not just those that computed in this round.
        # This means we let straggling combiners participate in the update 
        updated = self._out_of_sync()
        print("UPDATED: {}".format(updated),flush=True)


        round_valid = self.check_round_validity_policy(updated)
        if not round_valid:
            # TODO: Should we reset combiner state here? 
            return None

        # 3. Reduce combiner models into a global model
        # TODO, check success
        model = self.reduce(updated)
        
        if model:
            import uuid
            model_id = uuid.uuid4()
            self.commit(model_id,model)

            # 4. Trigger participating combiner nodes to execute a validation round for the current model
            combiner_config = copy.deepcopy(config)
            combiner_config['model_id'] = self.get_latest_model()
            combiner_config['task'] = 'validation'
            for combiner in updated:
                combiner.start(combiner_config)
            return model_id
        else:
            print("REDUCER: failed to updated model in round with config {}".format(config),flush=True)
            return None

    def sync_combiners(self, combiners, model_id):
        """ Spread the current consensus model to all active combiner nodes. """
        if not model_id:
            print("GOT NO MODEL TO SET! Have you seeded the FedML model?", flush=True)
            return

        for combiner in combiners:
            response = combiner.set_model_id(model_id)

    def instruct(self, config):
        """ Main entrypoint, executes the compute plan. """

        if self.__state == ReducerState.instructing:
            print("Already set in INSTRUCTING state", flush=True)
            return

        self.__state = ReducerState.instructing

        if not self.get_latest_model():
            print("No model in model chain, please seed the alliance!")

        self.__state = ReducerState.monitoring

        for round in range(int(config['rounds'])):
            model_id = self.round(config)
            if model_id:
                print("REDUCER: Global round completed, new model: {}".format(model_id),flush=True)
            else:
                print("REDUCER: Global round failed!")


        self.__state = ReducerState.idle

    def reduce(self, combiners):
        """ Combine current models at Combiner nodes into one global model. """
        i = 1
        for combiner in combiners:
            data = combiner.get_model()
            if data:
                try:
                    model_next = self.helper.load_model(combiner.get_model().getbuffer())
                    self.helper.increment_average(model, model_next, i)
                except:
                    model = self.helper.load_model(data.getbuffer())
                i = i+1
        return model

    def reduce_random(self, combiners):
        """ This is only used for debugging purposes. s"""
        import random
        combiner = random.sample(combiners, 1)[0]
        import uuid
        model_id = uuid.uuid4()
        return self.helper.load_model(combiner.get_model().getbuffer()),model_id

    def resolve(self):
        """ At the end of resolve, all combiners have the same model state. """

        combiners = self._out_of_sync()
        if len(combiners) > 0:
            model = self.reduce(combiners)
        return model

    def monitor(self, config=None):
        """ monitor """
        #if self.__state == ReducerState.monitoring:
            #print("monitoring")

    def add(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot add additional combiner")
            return
        if self.find(combiner.name):
            return
        print("adding combiner {}".format(combiner.name), flush=True)
        self.combiners.append(combiner)

    def remove(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot remove combiner")
            return
        self.combiners.remove(combiner)

    def find(self, name):
        for combiner in self.combiners:
            if name == combiner.name:
                return combiner
        return None

    def find_available_combiner(self):
        # TODO: Extend with more types of client allocation schemes. 
        for combiner in self.combiners:
            if combiner.allowing_clients():
                return combiner
        return None

    def state(self):
        return self.__state
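
A hedged sketch of the round configuration that instruct() and round() above consume. Keys and values are illustrative: rounds, round_timeout and clients_required are read by the reducer code above, while clients_requested is simply forwarded to combiners in the compute plan.

config = {
    'rounds': 5,              # number of global rounds executed by instruct()
    'round_timeout': 180,     # seconds round() waits for combiners to report updated models
    'clients_required': 2,    # per-combiner participation threshold checked by the reducer
    'clients_requested': 8,   # forwarded to combiners in the compute plan
}

# control = ReducerControl(statestore)  # statestore: an assumed fedn statestore instance
# control.instruct(config)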
Example #6
File: control.py Project: ahellander/fedn
class ReducerControl:
    def __init__(self, statestore):
        self.__state = ReducerState.setup
        self.statestore = statestore
        self.combiners = []

        s3_config = {
            'storage_access_key': os.environ['FEDN_MINIO_ACCESS_KEY'],
            'storage_secret_key': os.environ['FEDN_MINIO_SECRET_KEY'],
            'storage_bucket': 'models',
            'storage_secure_mode': False,
            'storage_hostname': os.environ['FEDN_MINIO_HOST'],
            'storage_port': int(os.environ['FEDN_MINIO_PORT'])
        }

        from fedn.common.storage.s3.s3repo import S3ModelRepository
        self.model_repository = S3ModelRepository(s3_config)
        self.bucket_name = s3_config["storage_bucket"]

        # TODO: Make configurable
        self.helper = KerasSequentialHelper()

        if self.statestore.is_inited():
            self.__state = ReducerState.idle

    def get_latest_model(self):
        return self.statestore.get_latest()

    def get_model_info(self):
        return self.statestore.get_model_info()

    def get_compute_context(self):
        definition = self.statestore.get_compute_context()
        if definition:
            try:
                context = definition['filename']
                return context
            except IndexError:
                print("No context filename set for compute context definition",
                      flush=True)
        else:
            return None

    def set_compute_context(self, filename):
        self.statestore.set_compute_context(filename)

    def commit(self, model_id, model=None):
        """ Commit a model. This establishes this model as the lastest consensus model. """

        if model:
            fod, outfile_name = tempfile.mkstemp(suffix='.h5')
            model.save(outfile_name)
            model_id = self.model_repository.set_model(outfile_name,
                                                       is_file=True)
            os.unlink(outfile_name)

        self.statestore.set_latest(model_id)

    def _out_of_sync(self, combiners=None):

        if not combiners:
            combiners = self.combiners

        osync = []
        for combiner in combiners:
            try:
                model_id = combiner.get_model_id()
            except CombinerUnavailableError:
                self._handle_unavailable_combiner(combiner)
                model_id = None
            if model_id and (model_id != self.get_latest_model()):
                osync.append(combiner)
        return osync

    def check_round_participation_policy(self, compute_plan, combiner_state):
        """ Evaluate reducer level policy for combiner round-paarticipation.
            This is a decision on ReducerControl level, additional checks
            applies on combiner level. Not all reducer control flows might
            need or want to use a participation policy.  """
        if int(compute_plan['clients_required']) <= int(
                combiner_state['nr_active_clients']):
            return True
        else:
            return False

    def check_round_start_policy(self, combiners):
        """ Check if the overall network state meets the policy to start a round. """
        if len(combiners) > 0:
            return True
        else:
            return False

    def check_round_validity_policy(self, combiners):
        """ 
            At the end of the round, before committing a model to the model ledger, 
            we check if a round validity policy has been met. This can involve 
            e.g. asserting that a certain number of combiners have reported in an
            updated model, or that criteria on model performance have been met. 
        """
        if combiners == []:
            return False
        else:
            return True

    def _handle_unavailable_combiner(self, combiner):
        """ This callback is triggered if a combiner is found to be unresponsive. """
        # TODO: Implement
        print("REDUCER CONTROL: Combiner {} unavailable.".format(
            combiner.name),
              flush=True)

    def round(self, config):
        """ Execute one global round. """

        # TODO: Set / update reducer states and such
        # TODO: Do a General Health check on Combiners in the beginning of the round.
        if len(self.combiners) < 1:
            print("REDUCER: No combiners connected!")
            return

        # 1. Formulate compute plans for this round and decide which combiners should participate in the round.
        compute_plan = copy.deepcopy(config)
        compute_plan['rounds'] = 1
        compute_plan['task'] = 'training'
        compute_plan['model_id'] = self.get_latest_model()

        combiners = []
        for combiner in self.combiners:

            try:
                combiner_state = combiner.report()
            except CombinerUnavailableError:
                self._handle_unavailable_combiner(combiner)
                combiner_state = None

            if combiner_state:
                is_participating = self.check_round_participation_policy(
                    compute_plan, combiner_state)
                if is_participating:
                    combiners.append((combiner, compute_plan))

        print("REDUCER CONTROL: Participating combiners: {}".format(combiners),
              flush=True)

        round_start = self.check_round_start_policy(combiners)
        print("ROUND START POLICY: {}".format(round_start), flush=True)
        if not round_start:
            print(
                "REDUCER CONTROL: Round start policy not met, skipping round!",
                flush=True)
            return None

        # 2. Sync up and ask participating combiners to coordinate model updates
        for combiner, compute_plan in combiners:
            try:
                self.sync_combiners([combiner], self.get_latest_model())
                response = combiner.start(compute_plan)
            except CombinerUnavailableError:
                # This is OK, handled by round accept policy
                self._handle_unavailable_combiner(combiner)
                pass
            except:
                # Unknown error
                raise

        # Wait until participating combiners have a model that is out of sync with the current global model.
        # TODO: Implement strategies to handle timeouts.
        # TODO: We do not need to wait until all combiners complete before we start reducing.
        cl = []
        for combiner, plan in combiners:
            cl.append(combiner)

        wait = 0.0
        while len(self._out_of_sync(cl)) < len(combiners):
            time.sleep(1.0)
            wait += 1.0
            if wait >= config['round_timeout']:
                break

        # OBS! Here we are checking against all combiners, not just those that computed in this round.
        # This means we let straggling combiners participate in the update
        updated = self._out_of_sync()
        print("UPDATED: {}".format(updated), flush=True)

        round_valid = self.check_round_validity_policy(updated)
        if round_valid == False:
            # TODO: Should we reset combiner state here?
            print("REDUCER CONTROL: Round invalid!", flush=True)
            return None

        # 3. Reduce combiner models into a global model
        try:
            model = self.reduce(updated)
        except:
            print(
                "REDUCER CONTROL: Failed to reduce models from combiners: {}".
                format(updated),
                flush=True)
            return None

        if model:
            # Commit to model ledger
            import uuid
            model_id = uuid.uuid4()
            self.commit(model_id, model)

        else:
            print("REDUCER: failed to update model in round with config {}".
                  format(config),
                  flush=True)
            return None

        # 4. Trigger participating combiner nodes to execute a validation round for the current model
        # TODO: Move to config - are we validating in a round, and if so, in what way.
        validate = True
        if validate:
            combiner_config = copy.deepcopy(config)
            combiner_config['model_id'] = self.get_latest_model()
            combiner_config['task'] = 'validation'
            for combiner in updated:
                try:
                    combiner.start(combiner_config)
                except CombinerUnavailableError:
                    # OK if validation fails for a combiner
                    self._handle_unavailable_combiner(combiner)
                    pass

        return model_id

    def sync_combiners(self, combiners, model_id):
        """ Spread the current consensus model to all active combiner nodes. """
        if not model_id:
            print("GOT NO MODEL TO SET! Have you seeded the FedML model?",
                  flush=True)
            return

        for combiner in combiners:
            response = combiner.set_model_id(model_id)

    def instruct(self, config):
        """ Main entrypoint, executes the compute plan. """

        if self.__state == ReducerState.instructing:
            print("Already set in INSTRUCTING state", flush=True)
            return

        self.__state = ReducerState.instructing

        # TODO - move seeding from config to explicit step, use Reducer REST API reducer/seed/... ?
        if not self.get_latest_model():
            print("No model in model chain, please seed the alliance!")

        self.__state = ReducerState.monitoring

        for round in range(int(config['rounds'])):
            model_id = self.round(config)
            if model_id:
                print("REDUCER: Global round completed, new model: {}".format(
                    model_id),
                      flush=True)
            else:
                print("REDUCER: Global round failed!")

        self.__state = ReducerState.idle

    def reduce(self, combiners):
        """ Combine current models at Combiner nodes into one global model. """
        i = 1
        model = None
        for combiner in combiners:

            # TODO: Handle inactive RPC error in get_model and raise specific error
            try:
                data = combiner.get_model()
            except:
                pass

            if data:
                try:
                    model_next = self.helper.load_model(
                        combiner.get_model().getbuffer())
                    self.helper.increment_average(model, model_next, i)
                except:
                    model = self.helper.load_model(data.getbuffer())
                i = i + 1
        return model

    def resolve(self):
        """ At the end of resolve, all combiners have the same model state. """

        combiners = self._out_of_sync()
        if len(combiners) > 0:
            model = self.reduce(combiners)
        return model

    def monitor(self, config=None):
        pass
        """ monitor """
        #if self.__state == ReducerState.monitoring:
        #print("monitoring")

    def add(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot add additional combiner")
            return
        if self.find(combiner.name):
            return
        print("adding combiner {}".format(combiner.name), flush=True)
        self.combiners.append(combiner)

    def remove(self, combiner):
        if self.__state != ReducerState.idle:
            print("Reducer is not idle, cannot remove combiner")
            return
        self.combiners.remove(combiner)

    def find(self, name):
        for combiner in self.combiners:
            if name == combiner.name:
                return combiner
        return None

    def find_available_combiner(self):
        for combiner in self.combiners:
            if combiner.allowing_clients():
                return combiner
        return None

    def state(self):
        return self.__state
Example #7
class FEDAVGCombiner:
    """ 
        A Local SGD / Federated Averaging (FedAvg) combiner. This 
        class is responsible for coordinating the update of the Combiner global 
        model by requesting and aggregating model updates from Clients. 

    """
    def __init__(self, id, storage, server, modelservice):

        self.run_configs_lock = Lock()
        self.run_configs = []
        self.storage = storage
        self.id = id
        self.server = server
        self.modelservice = modelservice

        self.config = {}
        self.validations = {}

        # TODO: make choice of helper configurable on Reducer level
        self.helper = KerasSequentialHelper()
        self.model_updates = queue.Queue()

    def report_status(self,
                      msg,
                      log_level=fedn.Status.INFO,
                      type=None,
                      request=None,
                      flush=True):
        print("COMBINER({}):{} {}".format(self.id, log_level, msg),
              flush=flush)

    def receive_model_candidate(self, model_id):
        """ Callback when a new model version is reported by a client. """
        try:
            self.report_status(
                "COMBINER: callback received model {}".format(model_id),
                log_level=fedn.Status.INFO)
            # TODO - here would be a place to do some additional validation of the model contribution.
            self.model_updates.put(model_id)
        except Exception as e:
            self.report_status(
                "COMBINER: Failed to receive candidate model! {}".format(e),
                log_level=fedn.Status.WARNING)
            self.report_status("Failed to receive candidate model!")
            pass

    def receive_validation(self, validation):
        """ Callback for a validation request """

        model_id = validation.model_id
        data = json.loads(validation.data)
        try:
            self.validations[model_id].append(data)
        except KeyError:
            self.validations[model_id] = [data]

        self.report_status("COMBINER: callback processed validation {}".format(
            validation.model_id),
                           log_level=fedn.Status.INFO)

    def _load_model_fault_tolerant(self, model_id):
        # Try reading it from local disk/combiner memory
        model_str = self.modelservice.models.get(model_id)
        # And if we cannot access that, try downloading from the server
        if model_str == None:
            model_str = self.modelservice.get_model(model_id)
            # TODO: use retrying library
            tries = 0
            while tries < 3:
                tries += 1
                if not model_str or sys.getsizeof(model_str) == 80:
                    self.report_status(
                        "COMBINER: Model download failed. retrying",
                        flush=True)
                    import time
                    time.sleep(1)
                    model_str = self.modelservice.get_model(model_id)

        return model_str

    def combine_models(self,
                       nr_expected_models=None,
                       nr_required_models=1,
                       timeout=120):
        """ Compute an iterative/running average of models arriving to the combiner. """

        import time
        round_time = 0.0
        print("COMBINER: combining model updates from Clients...")

        nr_processed_models = 0
        while nr_processed_models < nr_expected_models:
            try:
                model_id = self.model_updates.get(block=False)
                self.report_status(
                    "Received model update with id {}".format(model_id))
                model_str = self._load_model_fault_tolerant(model_id)
                if model_str:
                    try:
                        model_next = self.helper.load_model(
                            model_str.getbuffer())
                    except IOError:
                        self.report_status("COMBINER: Failed to load model!")
                        raise
                else:
                    raise

                if nr_processed_models == 0:
                    model = model_next
                else:
                    self.helper.increment_average(model, model_next,
                                                  nr_processed_models)

                nr_processed_models += 1
                self.model_updates.task_done()
            except queue.Empty:
                self.report_status(
                    "COMBINER: waiting for model updates: {} of {} completed.".
                    format(nr_processed_models, nr_expected_models))
                time.sleep(1.0)
                round_time += 1.0
            except IOError:
                self.report_status(
                    "COMBINER: Failed to read model update, skipping!")
                self.model_updates.task_done()
                nr_expected_models -= 1
                if nr_expected_models <= 0:
                    # This hack lets the timeout policy handle the failure
                    round_time = timeout
                    break
            except Exception as e:
                self.report_status(
                    "COMBINER: Exception in combine_models: {}".format(e))
                time.sleep(1.0)
                round_time += 1.0

            if round_time >= timeout:
                self.report_status("COMBINER: training round timed out.",
                                   log_level=fedn.Status.WARNING)
                print("COMBINER: Round timed out.")
                # TODO: Generalize policy for what to do in case of timeout.
                if nr_processed_models >= nr_required_models:
                    break
                else:
                    return None

        self.report_status(
            "ORCHESTRATOR: Training round completed, combined {} models.".
            format(nr_processed_models),
            log_level=fedn.Status.INFO)
        self.report_status(
            "DONE, combined {} models".format(nr_processed_models))
        return model

    def __training_round(self, config, clients):

        # We flush the queue at the beginning of a round (no stragglers allowed)
        # TODO: Support other ways to handle stragglers.
        with self.model_updates.mutex:
            self.model_updates.queue.clear()

        self.report_status(
            "COMBINER: Initiating training round, participating members: {}".
            format(clients))
        self.server.request_model_update(config['model_id'], clients=clients)
        model = self.combine_models(nr_expected_models=len(clients),
                                    nr_required_models=int(
                                        config['clients_required']),
                                    timeout=int(config['round_timeout']))
        return model

    def __validation_round(self, config, clients, model_id):
        self.server.request_model_validation(model_id, from_clients=clients)

    def push_run_config(self, plan):
        self.run_configs_lock.acquire()
        import uuid
        plan['_job_id'] = str(uuid.uuid4())
        self.run_configs.append(plan)
        self.run_configs_lock.release()
        return plan['_job_id']

    def run(self):

        import time
        try:
            while True:
                time.sleep(1)

                self.run_configs_lock.acquire()
                if len(self.run_configs) > 0:

                    compute_plan = self.run_configs.pop()
                    self.run_configs_lock.release()
                    self.config = compute_plan

                    ready = self.__check_nr_round_clients(compute_plan,
                                                          timeout=10.0)
                    if ready:
                        if compute_plan['task'] == 'training':
                            self.exec_training(compute_plan)
                        elif compute_plan['task'] == 'validation':
                            self.exec_validation(compute_plan,
                                                 compute_plan['model_id'])
                        else:
                            self.report_status(
                                "COMBINER: Compute plan contains unkown task type.",
                                flush=True)
                    else:
                        self.report_status(
                            "COMBINER: Failed to meet client allocation requirements for this compute plan.",
                            flush=True)

                if self.run_configs_lock.locked():
                    self.run_configs_lock.release()

        except (KeyboardInterrupt, SystemExit):
            pass

    def stage_model(self, model_id):
        """ Download model from persistent storage. """

        # If the model is already in memory at the server we do not need to do anything.
        # TODO: ugly! Needs to be refactored
        if self.modelservice.models.exist(model_id):
            return

        # If it is not there, download it from storage and stage it in memory at the server.
        timeout_retry = 3
        import time
        tries = 0
        while True:
            try:
                model = self.storage.get_model_stream(model_id)
                if model:
                    break
            except Exception as e:
                self.report_status(
                    "COMBINER could not fetch model from bucket. retrying in {}"
                    .format(timeout_retry),
                    flush=True)
                time.sleep(timeout_retry)
                tries += 1
                if tries > 2:
                    self.report_status(
                        "COMBINER exiting. could not fetch seed model.",
                        flush=True)
                    return

        self.modelservice.set_model(model, model_id)

    def __assign_round_clients(self, n):
        """  Obtain a list of clients to talk to in a round. """

        active_trainers = self.server.get_active_trainers()
        # If the number of requested trainers exceeds the number of available, use all available.
        if n > len(active_trainers):
            n = len(active_trainers)

        # If not, we pick a random subsample of all available clients.
        import random
        clients = random.sample(active_trainers, n)

        return clients

    def __check_nr_round_clients(self, config, timeout=10.0):
        """ Check that the minimal number of required clients to start a round are connected. """

        import time
        ready = False
        t = 0.0
        while not ready:
            active = self.server.nr_active_trainers()

            if active >= int(config['clients_requested']):
                return True
            else:
                self.report_status(
                    "waiting for {} clients to get started, currently: {}".
                    format(int(config['clients_requested']) - active, active),
                    flush=True)
            time.sleep(1.0)
            t += 1.0
            if t >= timeout:
                if active >= int(config['clients_required']):
                    return True
                else:
                    return False

        return ready

    def exec_validation(self, config, model_id):
        """ Coordinate validation rounds as specified in config. """

        self.report_status(
            "COMBINER orchestrating validation of model {}".format(model_id))
        self.stage_model(model_id)
        #validators = self.__assign_round_clients(int(config['clients_requested']))
        validators = self.__assign_round_clients(self.server.max_clients)
        self.__validation_round(config, validators, model_id)

    def exec_training(self, config):
        """ Coordinates clients to executee training and validation tasks. """

        #print("COMBINER starting from model {}".format(config['model_id']))
        self.stage_model(config['model_id'])

        # Execute the configured number of rounds
        for r in range(1, int(config['rounds']) + 1):
            self.report_status(
                "COMBINER: Starting training round {}".format(r), flush=True)
            #clients = self.__assign_round_clients(int(config['clients_requested']))
            clients = self.__assign_round_clients(self.server.max_clients)
            model = self.__training_round(config, clients)

            if not model:
                self.report_status(
                    "\t Failed to update global model in round {0}!".format(r))

        if model:
            fod, outfile_name = tempfile.mkstemp(suffix='.h5')
            model.save(outfile_name)

            # Save to local storage for sharing with clients.
            from io import BytesIO
            a = BytesIO()
            a.seek(0, 0)
            with open(outfile_name, 'rb') as f:
                a.write(f.read())

            # Send aggregated model to server
            model_id = str(uuid.uuid4())
            self.modelservice.set_model(a, model_id)
            os.unlink(outfile_name)

            # Update Combiner latest model
            self.server.set_active_model(model_id)

            print("------------------------------------------")
            self.report_status("COMBINER: TRAINING ROUND COMPLETED.",
                               flush=True)
            print("\n")