Example #1
class Reclaimer:

    def __init__(self, com: ICommunication_Controller, logger: Logger = None):
        self.__com = com
        if logger is None:
            self.__log = Logger(title_info='Retrieve', log_to_file=True)
        else:
            self.__log = logger

    def require_client_log(self):
        """
            Request the client log file from every worker and save the returned files.
        :return: None
        """
        # send request
        for id in self.__com.available_clients:
            self.__com.send_one(id, RequestWorkingLog())
            self.__log.log_message('Acquire log file from worker({}).'.format(id))

        try:
            nodes_ready = set()
            total_nodes = set(self.__com.available_clients)
            while nodes_ready != total_nodes:

                id_from, log = self.__com.get_one()

                if isinstance(log, DoneType):
                    log.restore()
                    file_format = "\n\t\t--> ".join([filename for filename in log.file_list])
                    self.__log.log_message('Save file for {}.\n\tList:\n\t\t--> {}'.format(id_from, file_format))
                    nodes_ready.add(id_from)
                    self.__log.log_message('Node({}) is done, {} is done.'.format(id_from, nodes_ready))

        except Exception as e:
            # print DEBUG message
            import sys
            import traceback
            exc_type, exc_value, exc_tb = sys.exc_info()
            exc_tb = traceback.format_exception(exc_type, exc_value, exc_tb)
            exc_format = "".join(exc_tb)
            self.__log.log_error('Exception occurred: {}\n\t{}'.format(e, exc_format))
            # print DEBUG message

        self.__log.log_message('Done.')
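
A minimal usage sketch for the Reclaimer above. How a concrete ICommunication_Controller is obtained depends on the surrounding network layer, so connect_to_cluster() below is only a hypothetical placeholder:

# hypothetical helper that yields an ICommunication_Controller instance
com = connect_to_cluster()
reclaimer = Reclaimer(com)        # falls back to a default 'Retrieve' Logger
reclaimer.require_client_log()    # requests and saves every worker's log files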
Example #2
class PSGDPSExecutor(AbsExecutor):
    def __init__(self, node_id, offset):
        super().__init__(node_id, offset)
        # resources to wait for
        self.__log = Logger('ParaServer-{}'.format(node_id), log_to_file=True)
        self.__done: bool = False
        self.__transfer: [ITransfer] = None

    def requests(self):
        return [Req.Setting, Req.Transfer_PS]

    def satisfy(self, reply: list) -> list:
        # check list
        for obj in reply:

            if isinstance(obj, net_setting):
                GlobalSettings.deprecated_default_settings = obj.setting()

            if isinstance(obj, ITransfer):
                self.__transfer = obj
                self.__log.log_message('Transfer thread is ready.')

        return []

    def ready(self) -> bool:
        return self.__transfer is not None \
                and GlobalSettings.deprecated_default_settings is not None

    def start(self, com: ICommunication_Controller) -> None:
        data_send_start = com.Com.bytes_sent
        data_recv_start = com.Com.bytes_read

        GlobalSettings.deprecated_global_logger = self.__log
        self.__transfer.start_transfer(com, printer=self.__log, group_offset=0)

        from utils.constants import Initialization_Server
        while set(com.available_clients) - {Initialization_Server} != set():
            sleep(7)

        data_sent_end = com.Com.bytes_sent
        data_recv_end = com.Com.bytes_read
        self.__log.log_message(
            'Execution complete, Total bytes sent: {}.'.format(
                data_sent_end - data_send_start))
        self.__log.log_message(
            'Execution complete, Total bytes read: {}.'.format(
                data_recv_end - data_recv_start))

    def trace_files(self) -> list:
        return [self.__log.File_Name]

    def done(self) -> bool:
        return self.__done
Example #3
class PSGDWorkerExecutor(AbsExecutor):

    def __init__(self, node_id, offset):
        super().__init__(node_id, offset)
        self.__log = Logger('Fit-{}'.format(node_id), log_to_file=True)
        self.__trace_filename = [self.__log.File_Name]
        # waiting for those
        self.__model: [Model] = None
        self.__optimizer: [IPSGDOpContainer] = None
        self.__batch_iter: [IBatchIter] = None
        self.__trans: [ITransfer] = None
        self.__data: [IDataset] = None
        self.__misc: [misc_package] = None
        self.__done: bool = False

    def requests(self) -> List[object]:
        return [Req.Setting, Req.Model, Req.Optimizer, Req.Transfer, Req.Data_Package, Req.Other_Stuff]

    def satisfy(self, reply: list) -> list:
        unsatisfied = []
        # check list
        for obj in reply:

            if isinstance(obj, net_setting):
                GlobalSettings.deprecated_default_settings = obj.setting()

            if isinstance(obj, net_model):
                self.__model = obj.model
                self.__batch_iter = obj.batch_iter

            if isinstance(obj, IPSGDOpContainer):
                self.__optimizer = obj

            if isinstance(obj, ITransfer):
                self.__trans = obj

            if isinstance(obj, misc_package):
                self.__misc = obj

            if isinstance(obj, IDataset):
                if not obj.check():
                    unsatisfied.append(Requests(Req.Data_Content))
                else:
                    self.__data = obj

        return unsatisfied

    def ready(self) -> bool:
        return self.__check()[0]

    def __check(self) -> Tuple[bool, List[str]]:
        status = []
        s1 = isinstance(self.__optimizer, IPSGDOpContainer)
        status.append("Optimizer:{}".format("OK" if s1 else "ABSENT"))
        s2 = isinstance(self.__model, IModel)
        status.append("Model:{}".format("OK" if s2 else "ABSENT"))
        s3 = isinstance(self.__data, IDataset)
        status.append("Dataset:{}".format("OK" if s3 else "ABSENT"))
        s4 = isinstance(self.__misc, misc_package)
        status.append("Others:{}".format("OK" if s4 else "ABSENT"))
        s5 = isinstance(self.__trans, ITransfer)
        status.append("Transfer:{}".format("OK" if s5 else "ABSENT"))
        s6 = isinstance(self.__batch_iter, IBatchIter)
        status.append("Batch Iterator:{}".format("OK" if s6 else "ABSENT"))
        s7 = isinstance(GlobalSettings.deprecated_default_settings, ISetting)
        status.append("Settings:{}".format("OK" if s7 else "ABSENT"))
        return s1 and s2 and s3 and s4 and s5 and s6 and s7, status

    def done(self) -> bool:
        return self.__done

    def start(self, com: ICommunication_Controller) -> object:
        state, report = self.__check()
        self.__log.log_message("Ready:{} \n\t Check List:\n\t\t--> {}".format(state, "\n\t\t--> ".join(report)))
        # get dataset
        train_x, train_y, test_x, test_y = self.__data.load()
        self.__log.log_message('Dataset is ready, type: ({})'.format(self.__data))
        # build data feeder
        block_ids = GlobalSettings.get_default().node_2_block[com.Node_Id]
        feeder = PSGDBlockDataFeeder(train_x, train_y, batch_iter=self.__batch_iter, block_ids=block_ids)
        # assemble optimizer
        self.__optimizer.assemble(transfer=self.__trans, block_mgr=feeder)
        # compile model
        self.__model.compile(self.__optimizer)
        # summary
        summary = self.__model.summary()
        self.__log.log_message(summary)
        trace_head = '{}-N({})'.format(self.__misc.mission_title, self.node_id)
        self.__log.log_message('Model set to ready.')

        log_head = self.__log.Title
        # start !
        GlobalSettings.deprecated_global_logger = self.__log
        self.__trans.start_transfer(com, group_offset=list(self.group)[0], printer=self.__log)
        # record data
        time_start = time.time()
        data_send_start = com.Com.bytes_sent
        data_recv_start = com.Com.bytes_read

        evaluation_history = []
        title = []
        r = {}
        # do until reach the target accuracy
        for i in range(self.__misc.epoch):
            # change title
            self.__log.Title = log_head + "-Epo-{}".format(i + 1)
            history = self.__model.fit(feeder, epoch=1, printer=self.__log)
            # do tests
            r = self.__model.evaluate(test_x, test_y)
            title = list(r.keys())
            row = list(r.values())
            self.__log.log_message('Evaluate result: {}'.format(r))
            evaluation_history.append(row)

            if self.__misc.target_acc is not None:
                # only one metric in model metrics list.
                # evaluation[0] refers to loss
                # evaluation[1] refers to accuracy.
                if row[1] > self.__misc.target_acc:
                    break

        # record data
        time_end = time.time()
        data_sent_end = com.Com.bytes_sent
        data_recv_end = com.Com.bytes_read

        training_history = self.__model.fit_history()
        # save training history data
        training_name = "TR-" + trace_head + ".csv"
        training_trace = pd.DataFrame(training_history.history, columns=training_history.title)
        training_trace.to_csv(training_name, index=False)
        # save evaluation history data
        evaluation_name = "EV-" + trace_head + ".csv"
        evaluation_trace = pd.DataFrame(evaluation_history, columns=title)
        evaluation_trace.to_csv(evaluation_name, index=False)
        # save model
        model_name = "MODEL-" + trace_head + ".model"
        self.__model.compile(nn.gradient_descent.SGDOptimizer(learn_rate=1e-5))
        self.__model.save(model_name)
        self.__trace_filename.append(training_name)
        self.__trace_filename.append(evaluation_name)
        self.__trace_filename.append(model_name)

        self.__log.log_message('Execution complete, time: {}.'.format(time_end - time_start))
        self.__log.log_message('Execution complete, Total bytes sent: {}.'.format(data_sent_end - data_send_start))
        self.__log.log_message('Execution complete, Total bytes read: {}.'.format(data_recv_end - data_recv_start))
        self.__log.log_message('Trace file has been saved to {}.'.format(trace_head))

        # set marker
        self.__done = True
        # dispose
        self.__model.clear()
        del train_x, train_y, test_x, test_y

        # return last evaluation result
        return r

    def trace_files(self) -> list:
        return self.__trace_filename
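
Both executors above follow the same AbsExecutor life cycle: requests() declares what the node needs, satisfy() consumes the replies (and may return follow-up requests), ready() gates the start, and start() runs the job on the given communication controller. A minimal skeleton under those assumptions, using only the methods visible in the examples above:

class MinimalExecutor(AbsExecutor):

    def __init__(self, node_id, offset):
        super().__init__(node_id, offset)
        self.__setting = None
        self.__done: bool = False

    def requests(self) -> list:
        # declare the resources this node needs; the dispatcher replies with matching packages
        return [Req.Setting]

    def satisfy(self, reply: list) -> list:
        for obj in reply:
            if isinstance(obj, net_setting):
                self.__setting = obj.setting()
        # return additional requests here if something is still missing
        return []

    def ready(self) -> bool:
        return self.__setting is not None

    def start(self, com: ICommunication_Controller) -> object:
        self.__done = True          # the actual job goes here
        return None

    def trace_files(self) -> list:
        return []

    def done(self) -> bool:
        return self.__done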
Example #4
class PSGD_Worker:

    Training_TimeOut_Limit = 180

    def __init__(self):
        self.__running_thread = None
        self.client_logger = Logger(title_info='Worker-{}'.format(get_repr()),
                                    log_to_file=True)
        self.__training_log = None

        self.client_logger.log_message(
            'Worker started and ready for job submission.')

    def slave_forever(self):
        # set up listening port
        constructor = Worker_Communication_Constructor(
            '0.0.0.0',
            STAR_NET_WORKING_PORTS,
            worker_register=CLZ_WORKER_REGISTER())
        while True:
            com = None
            try:
                self.client_logger.log_message(
                    'Worker started, prepare for connection...')
                register = constructor.buildCom()
                com = Communication_Controller(CLZ_COM_PROCESS(register))
                com.establish_communication()

                self.client_logger.log_message(
                    'Job submission received. Node assigned node_id({})'.
                    format(com.Node_Id))

                if self.init_PSGD(com):
                    self.do_training(com)

                GlobalSettings.clear_default()
                self.client_logger.log_message(
                    'Current session closed, node_id({}).'.format(com.Node_Id))

            except Exception as e:
                self.client_logger.log_error(
                    'Exception occurred: {}'.format(e))

                # print DEBUG message
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                exc_tb = traceback.format_exception(exc_type, exc_value,
                                                    exc_tb)
                for line in exc_tb:
                    self.client_logger.log_message(line)
                # print DEBUG message

            except KeyboardInterrupt:
                self.client_logger.log_error(
                    'Worker shutdown by interruption.')
                constructor.close()
                break
            finally:
                time.sleep(10)
                if isinstance(com, Communication_Controller):
                    com.close()

            self.client_logger.log_message('Worker restarting...')
            # wait for safe closure

    def init_PSGD(self, com: Communication_Controller) -> bool:
        self.client_logger.log_message(
            'ACK job submission and request global settings.')

        # ignore other data
        def acquire(com):
            id_from, data = com.get_one()
            while id_from != Initialization_Server:
                id_from, data = com.get_one()
            return data

        # initialize global settings
        com.send_one(Initialization_Server, Init.GlobalSettings)
        # get data
        data = acquire(com)
        # restore global settings
        if not isinstance(data, Reply.global_setting_package):
            if data == Reply.I_Need_Your_Working_Log:
                self.client_logger.log_message(
                    'Nothing needs to be done, send back logfile and exit process.'
                )
                com.send_one(Initialization_Server,
                             Binary_File_Package(self.client_logger.File_Name))
                if isinstance(self.__training_log, Logger):
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__training_log.File_Name))
                if isinstance(self.__running_thread, PSGDTraining_Client):
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__running_thread.Trace_Eval))
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__running_thread.Trace_Train))
                com.send_one(Initialization_Server, Done_Type())
            return False

        data.restore()

        self.client_logger.log_message('Request codec and sgd class.')
        # initialize codec and sgd type
        com.send_one(Initialization_Server, Init.Codec_And_SGD_Type)

        data = acquire(com)
        assert isinstance(data, Reply.codec_and_sgd_package)

        codec, sgd = data.restore()

        self.client_logger.log_message('Request weights and layer type.')
        # initialize weights and layer
        com.send_one(Initialization_Server, Init.Weights_And_Layers)
        data = acquire(com)
        assert isinstance(data, Reply.weights_and_layers_package)

        layers = data.restore()

        self.client_logger.log_message('Request other stuff.')
        # others
        com.send_one(Initialization_Server, Init.MISC)
        data = acquire(com)
        assert isinstance(data, Reply.misc_package)

        loss_t = data.loss_type
        target_acc = data.target_acc
        epoch = data.epoch
        learn_rate = data.learn_rate
        w_type = data.w_types
        op = data.optimizer
        metric = data.metric

        self.__training_log = Logger('Training log @ node-{}'.format(
            com.Node_Id),
                                     log_to_file=True)

        if com.Node_Id != Parameter_Server:

            self.client_logger.log_message('Request data samples.')
            # initialize dataset
            com.send_one(Initialization_Server, Init.Samples)
            data = acquire(com)
            # restore
            assert isinstance(data, Reply.data_sample_package)

            train_x, train_y, eval_x, eval_y = data.restore()

            self.__running_thread = PSGDTraining_Client(
                model_init=layers,
                loss=loss_t,
                codec_type=codec,
                sync_class=sgd,
                com=com,
                w_types=w_type,
                tags=build_tags(node_id=com.Node_Id),
                train_x=train_x,
                train_y=train_y,
                eval_x=eval_x,
                eval_y=eval_y,
                optimizer=op,
                batch_size=GlobalSettings.get_default().batch.batch_size,
                epochs=epoch,
                logger=self.__training_log,
                learn_rate=learn_rate,
                target_acc=target_acc,
                metrics=metric)
        else:
            self.__running_thread = PSGDTraining_Parameter_Server(
                model_init=layers,
                ps_codec=codec,
                ps_sgd_type=sgd,
                com=com,
                w_types=w_type,
                logger=self.__training_log)

        self.client_logger.log_message(
            'Submit stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Submit stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))
        return True

    def do_training(self, com: Communication_Controller):
        self.client_logger.log_message('Prepare to start training process.')
        # check
        assert isinstance(self.__running_thread, Thread)
        assert isinstance(self.__training_log, Logger)

        ready_state = {}
        self.client_logger.log_message('Synchronize timeline with cluster.')

        len_ready = len(com.available_clients())
        time_count = 0
        # check ready states
        while len(ready_state) != len_ready:
            # require
            n, d = com.get_one(False)
            if isinstance(d, Ready_Type):
                ready_state[n] = True
                time_count = 0
            if len(com.available_clients()) < len_ready:
                raise OSError('Minimal number of clients cannot be satisfied.')
            if time_count > PSGD_Worker.Training_TimeOut_Limit:
                raise AssertionError(
                    'Maximum waiting time exceeded, give up waiting and reset the environment.'
                )
            for node_id in com.available_clients():
                com.send_one(node_id, Ready_Type())
            time.sleep(1)
            time_count += 1

        try:
            self.client_logger.log_message('Execution process started.')
            data_sent_mark = com.Com.bytes_sent
            data_recv_mark = com.Com.bytes_read
            begin = time.time()
            self.__running_thread.start()
            self.__running_thread.join()
            end = time.time()

            self.__training_log.log_message(
                'Execution complete, time:{}'.format(end - begin))
            self.__training_log.log_message(
                'Bytes sent: {}'.format(com.Com.bytes_sent - data_sent_mark))
            self.__training_log.log_message(
                'Bytes read: {}'.format(com.Com.bytes_read - data_recv_mark))

            self.client_logger.log_message(
                'Execution complete, time:{}'.format(end - begin))
            self.client_logger.log_message(
                'Training stage complete, Total bytes sent: {}'.format(
                    com.Com.bytes_sent))
            self.client_logger.log_message(
                'Training stage complete, Total bytes read: {}'.format(
                    com.Com.bytes_read))

            if isinstance(self.__running_thread, PSGDTraining_Client):
                train_csv = Binary_File_Package(
                    self.__running_thread.Trace_Train)
                eval_csv = Binary_File_Package(
                    self.__running_thread.Trace_Eval)

                self.client_logger.log_message('Post training log.')
                com.send_one(Initialization_Server, train_csv)
                com.send_one(Initialization_Server, eval_csv)

        except Exception as error:
            self.client_logger.log_error(
                'Error encountered while executing : {}'.format(error))
            self.__training_log.log_error(
                'Error encountered while executing : {}'.format(error))

        self.client_logger.log_message('Training process exited.')
        log_file = Binary_File_Package(self.__training_log.File_Name)
        com.send_one(Initialization_Server, log_file)
Example #5
class Coordinator:
    def __init__(self, hyper_model: IServerModel, logger=None):
        self.__com = None
        self.__model = hyper_model
        if logger is None:
            self.__log = Logger(title_info='Coordinator-{}'.format(get_repr()),
                                log_to_file=True)
        else:
            self.__log = logger

    def set_workers(self, works: list, nodes_required) -> bool:
        """
            Set worker list.
        :param works: list of tuples
                        like: [ (rule1, address1), (rule2, address2), ... ]
        :return: None, raise exceptions if two workers with same id are assigned.
        """
        pkg = IPA()
        uuid_for_this_task = str(random.randint(0, 0x7fffffff))
        current_node_id_assigned = 0
        # set all address
        for rule, addr in works:
            # Stop connecting once the required number of worker nodes is satisfied.
            if current_node_id_assigned >= nodes_required and rule == "Worker":
                self.__log.log_message('Number of nodes satisfied.')
                break
            if rule == "PS":
                _id = Parameter_Server
            else:
                _id = current_node_id_assigned
                current_node_id_assigned += 1
            pkg.put(_id, uuid_for_this_task, addr)
            self.__log.log_message(
                'Add worker (Rule: {}, Id: {}, Address: {}).'.format(
                    rule, _id, addr))

        self.__log.log_message('Try connecting to the cluster.')
        self.__com = NET(pkg)
        self.__com = Communication_Controller(self.__com)
        self.__com.establish_communication()
        self.__log.log_message('Connection with cluster established.')

        return True

    def resources_dispatch(self):
        """
            Reply to worker's requirements, prepare for the job
        :return:
        """

        # assertion
        assert isinstance(self.__com, Communication_Controller)
        assert isinstance(self.__model, IServerModel)

        total_node_count = len(self.__com.available_clients())
        node_ready = set()
        key_interrupted_before = False

        while not self.__com.is_closed():
            try:
                id_from, data = self.__com.get_one()

                if isinstance(data, Init):
                    if data == Init.GlobalSettings:
                        reply = Reply.global_setting_package(
                            GlobalSettings.get_default())

                    elif data == Init.Weights_And_Layers:
                        reply = Reply.weights_and_layers_package(
                            self.__model.getWeightsInit())

                    elif data == Init.Codec_And_SGD_Type:
                        if id_from != Parameter_Server:
                            reply = Reply.codec_and_sgd_package(
                                self.__model.codec_ctrl(),
                                self.__model.psgd_type())
                        else:
                            reply = Reply.codec_and_sgd_package(
                                self.__model.psgd_server_codec(),
                                self.__model.psgd_server_type())

                    elif data == Init.Samples:
                        reply = Reply.data_sample_package(
                            *self.__model.train_data(),
                            *self.__model.eval_data())

                    elif data == Init.MISC:
                        reply = Reply.misc_package(
                            self.__model.epoches(), self.__model.loss_type(),
                            self.__model.learn_rate(),
                            self.__model.target_acc(),
                            self.__model.weights_types(),
                            self.__model.optimizer_type(),
                            self.__model.metric())

                    else:
                        reply = None

                    self.__log.log_message(
                        'Reply requirements to node({}), type({}).'.format(
                            id_from, reply.__class__.__name__))
                    self.__com.send_one(id_from, reply)

                elif isinstance(data, Ready_Type):
                    self.__com.send_one(id_from, Ready_Type())
                    if id_from in node_ready:
                        continue
                    node_ready.add(id_from)
                    self.__log.log_message(
                        'Node({}) is ready, {} nodes total, {} is ready.'.
                        format(id_from, total_node_count, node_ready))

                elif isinstance(data, Binary_File_Package):
                    self.__log.log_message(
                        'Restoring data ({}) from {}.'.format(
                            data.filename, id_from))
                    data.restore()

            except KeyboardInterrupt:
                if len(node_ready) < total_node_count:
                    self.__log.log_error(
                        'Some workers are not ready, close anyway?')
                    self.__log.log_message(
                        'Press Ctrl+C again to shutdown immediately.')
                    key_interrupted_before = True
                if key_interrupted_before or len(
                        node_ready) >= total_node_count:
                    self.__log.log_error('Coordinator closed by user.')
                    break

        self.__com.close()
        self.__log.log_message('Dispatcher closed.')

    def require_client_log(self):
        """
            Request the client log file from every worker and save the returned files.
        :return: None
        """
        assert isinstance(self.__com, Communication_Controller)
        # self.__log.log_message('Acquire log file from each worker.')
        # take all ACK
        for id in self.__com.available_clients():
            _, _ = self.__com.get_one()

        # send request
        for id in self.__com.available_clients():
            self.__com.send_one(id, Reply.I_Need_Your_Working_Log)

        try:
            # get result
            for id in self.__com.available_clients():
                self.__log.log_message(
                    'Acquire log file from worker({}).'.format(id))
                log = None
                while not isinstance(log, Done_Type):
                    _, log = self.__com.get_one()
                    if isinstance(log, Binary_File_Package):
                        log.restore()
                        self.__log.log_message(
                            'Save log file for worker({}).'.format(id))
        except Exception:
            self.__log.log_error('Connection lost.')

        self.__com.close()
        self.__log.log_message('Done.')

        return
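
A rough submission sketch for this Coordinator, assuming an IServerModel implementation named hyper_model and a worker list in the documented (rule, address) form; the concrete address format is defined by the network layer and only guessed here:

workers = [("PS", "192.168.1.2:15387"),        # hypothetical addresses
           ("Worker", "192.168.1.3:15387")]

coordinator = Coordinator(hyper_model)          # hyper_model: an IServerModel implementation
coordinator.set_workers(workers, nodes_required=1)
coordinator.resources_dispatch()                # serve Init requests until the job is closed
# in a retrieval-only run, reclaim the worker logs instead:
# coordinator.require_client_log()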
Example #6
class Coordinator:

    def __init__(self, com: ICommunication_Controller, estimate_bandwidth: int = 10, logger: IPrinter = None):
        """
            Coordinator
        :param com: Communication Thread
        :param estimate_bandwidth: bandwidth estimation, Bytes per second
        :param logger: IPrinter
        """
        self.__com = com
        if logger is None:
            self.__log = Logger(title_info='Coordinator', log_to_file=True)
        else:
            self.__log = logger
        self.__estimate_bandwidth = estimate_bandwidth
        self.__group_allocated = set()
        self.__global_allocated = set()
        self.__log.log_message("Coordinator version: {}.".format(VERSION))

    @property
    def allocated_nodes(self):
        return self.__global_allocated | self.__group_allocated

    def resources_dispatch(self, dispatch_map: Callable[[int, object], IReplyPackage]):
        """
            Reply to workers' requests and prepare for the job.
        :param dispatch_map: Callable object; receives the requesting node id and the content of an
                            IRequestPackage, and returns an IReplyPackage instance for the reply.
        :return:
        """
        # dispatch to certain group
        node_ready = set()

        while node_ready != self.allocated_nodes:

            try:
                id_from, data = self.__com.get_one()
                reply = None

                if isinstance(data, IRequestPackage):
                    reply = dispatch_map(id_from, data.content())

                    self.__log.log_message(
                        'Reply requirements to node({}), type({}).'.format(id_from, reply.__class__.__name__))

                elif isinstance(data, ReadyType):
                    reply = ReadyType(node_ready)

                    if id_from in node_ready:
                        continue

                    node_ready.add(id_from)
                    self.__log.log_message('Node({}) is ready, {} is ready.'.format(id_from, node_ready))

                elif isinstance(data, Version):
                    reply = Version(Initialization_Server)

                    self.__log.log_message("{}".format(data))

                self.__com.send_one(id_from, reply)

            except KeyboardInterrupt:
                if len(node_ready) < len(self.allocated_nodes):
                    self.__log.log_error('Some workers are not ready.')
                self.__log.log_error('Coordinator closed by user.')
                break

        self.__log.log_message('Dispatch complete.')

    def join(self) -> Dict[int, object]:
        """
            Join all workers and wait until every task is complete.
            :return: A dict mapping each worker's node id to whatever its executor returned.
        """
        # Join all nodes.
        node_ready = set()
        # Collect result.
        results: Dict[int, object] = {}

        self.__log.log_message("Waiting for ({}) ...".format(self.allocated_nodes))

        while node_ready != self.allocated_nodes:

            id_from, data = self.__com.get_one()

            if isinstance(data, IReplyPackage):
                data.restore()
                self.__log.log_message('Restoring data ({}) from {}.'.format(data, id_from))

            if isinstance(data, DoneType):
                file_format = "\n\t\t--> ".join([filename for filename in data.file_list])
                self.__log.log_message('Save file for {}.\n\tList:\n\t\t--> {}'.format(id_from, file_format))

                node_ready.add(id_from)
                self.__log.log_message('Node({}) is done, {} is done.'.format(id_from, node_ready))

                results[id_from] = data.result

        self.__log.log_message("All task is complete.")
        return results

    def submit_group(self, worker_executor: Type[IExecutor], working_group: Iterable[int] = None, package_size: int = 1e9):
        """
            Submit a job to a specified worker group.
            Nodes inside this group will wait for each other and synchronize start time.
            The group also waits until all individually submitted (global) nodes are ready.
        :param worker_executor: executor class, implementation of IExecutor
        :param working_group: Worker group list, iterable object, contains id of each worker in the group.
        :param package_size: Package size in transmission. Potentially required by executor, and provided by dispatch.
        :return: None
        """
        # set work group
        if working_group is None:
            working_group = set(self.__com.available_clients)
        if not isinstance(working_group, set):
            working_group = set(working_group)
        # check for duplication
        assert len(self.__group_allocated & working_group) == 0, "Cannot submit a task to node which already has a job."
        # calculate data size
        dataset_ett = self.__com.available_clients_count * package_size / self.__estimate_bandwidth + 1
        # send request
        for _id in working_group:
            self.__com.send_one(_id, SubmitJob(working_group | self.__global_allocated, dataset_ett, worker_executor))

        self.__group_allocated = self.__group_allocated | working_group
        self.__log.log_message("Group submission complete ({}).".format(working_group))

    def submit_single(self, worker_executor: Type[IExecutor], worker_id: int, package_size: int = 1e9):
        """
            Submit a job to a specified node.
            This global node starts execution as soon as it is ready.
        :param worker_executor: executor class, implementation of IExecutor
        :param worker_id: Worker id.
        :param package_size: Package size in transmission. Potentially required by executor, and provided by dispatch.
        :return:
        """
        # check for duplication
        assert worker_id not in self.__global_allocated, "Cannot submit a task to node which already has a job."
        # calculate data size
        dataset_ett = self.__com.available_clients_count * package_size / self.__estimate_bandwidth + 0.6
        # send request
        self.__com.send_one(worker_id, SubmitJob({worker_id}, dataset_ett, worker_executor))

        self.__global_allocated.add(worker_id)
        self.__log.log_message("Single node submission complete.")
Example #7
class ParallelSGD:
    """
        P-SGD master controller class.
        P-SGD RPC Controller.
    """

    def __init__(self, model: Model, data: AbsDataset, transform: ITransformer):
        """
            Initialize a P-SGD controller object.
        :param model: Model to be trained with data parallelism.
        :param data: Dataset used for the parallel job.
        :param transform: Dataset transformation strategy. The dataset is visible on every node;
                          transform performs stateful conversions on it, and all dataset processing
                          is executed locally. Transformation strategies are organized as a linked
                          list and run as a pipeline: the dataset passes through each transform in
                          turn and is finally scheduled by BatchIter.
        """
        self.__model = model
        self.__data = data
        self.__transform = transform
        self.__log = Logger(title_info="P-SGD Submit", log_to_file=True)

    def parallel(self,
                 nodes: NodeAssignment,
                 redundancy: int = 1,
                 block_size: int = 64,
                 epoch: int = 10,
                 assignment_type: Type[AbsBlockAssignment] = IIDBlockAssignment,
                 sync_type: Type[IParallelSGD] = SynchronizedSGD,
                 op_type: Type[IOptimizer] = PSGDOptimizer,
                 gd_type: Type[IGradientDescent] = ADAMOptimizer,
                 codec: Union[Dict[int, Type[Codec]], Type[Codec]] = None,
                 gd_params: Tuple[object] = (),
                 ps_codec: Union[Dict[int, Type[Codec]], Type[Codec], None] = None,
                 network_bandwidth: int = 1048576,
                 mission_title: str = "P-SGD",
                 ssgd_timeout_limit: int = 10000):
        """
            Run the parallel training job.
        :param ssgd_timeout_limit: Timeout limit for Sync-SGD waiting, in milliseconds (integer).
        :param network_bandwidth: Available network bandwidth, used to estimate transfer time and to
                                set the pre_commit timeout timer.
        :param mission_title:   Job title, used as the log file name of this job.
        :param nodes:           NodeAssignment interface provided by the network module; describes the
                                nodes involved in this parallel run. The parameter server's node id is
                                given by utils.constant.Parameter_Server, and worker node ids increase
                                from 0 (as integers).
        :param redundancy:      Redundancy setting, applicable to codecs and block assignments that can
                                handle redundancy. Subclasses of AbsBlockAssignment can always handle
                                submissions with redundancy parameters; how a codec handles redundancy
                                is defined by the codec itself.
        :param block_size:      Batch size at node granularity. The codec controls the concrete update
                                strategy, so block size and batch size have no fixed correspondence.
                                If the codec yields a result after every block, block size and batch
                                size are effectively equal; if the codec always waits for all blocks to
                                finish training before synchronizing, the effective batch size equals
                                block size times the total number of blocks.
        :param epoch:           Number of training epochs. The codec and sync type together decide the
                                synchronization strategy within an epoch; when a parameter server is
                                used, it also takes part in maintaining the synchronization state.
                                If the codec forbids asynchronous execution, all nodes finish an epoch
                                at the same moment; if the codec or sync type allows cross-batch
                                execution, nodes finish according to their own computing power.
        :param assignment_type: Sample assignment strategy, usually combined with redundant assignment;
                                a type implementing the profiles.ISetting interface.
                                Originally intended for redundant dataset assignment, it can now also
                                provide static data-volume assignment.
        :param sync_type:       Synchronization mode, synchronous or asynchronous; a type implementing
                                the psgd.sync.IParallelSGD interface.
                                In synchronous mode, each worker processes received data only when it
                                calls get_weights() to fetch the weights.
                                In asynchronous mode, each worker processes data and updates the result
                                set as soon as the data arrives.
                                The concrete processing flow and result-set update strategy are defined
                                by the codec.
        :param gd_type:         Gradient descent strategy, a type implementing the
                                nn.gradient_descent.IGradientDescent interface; generates the update
                                increments to be processed.
        :param op_type:         Optimizer strategy, a type implementing the nn.IOptimizer interface;
                                responsible for the gradient update strategy.
        :param codec:           Codec type, implementing the codec.interface.Codec interface.
        :param gd_params:       Parameters passed to the gradient descent constructor.
        :param ps_codec:        Codec type, implementing the codec.interface.Codec interface; used for
                                data processing on the parameter server.
        :return:
        """
        # initialize suitable codecs
        if codec is None:
            codec = dict()
        if ps_codec is None:
            ps_codec = dict()

        # default codec fallback
        default_codec = DummyCodec
        default_ps_codec = DummyCodec
        # if a concrete codec class was passed in, use it as the default for every variable
        if isinstance(codec, type):
            default_codec = codec
            codec = dict()
        if isinstance(ps_codec, type):
            default_ps_codec = ps_codec
            ps_codec = dict()

        # count all valid worker (slave) nodes
        node_count = 0
        has_ps = False
        for _id, _ in nodes:
            if _id >= 0:
                node_count += 1
            else:
                has_ps = True

        # task assignment strategy
        assignment: ISetting = assignment_type(node_count, redundancy)
        # assignment settings instance
        setting: net_setting = net_setting(assignment_type, node_count, redundancy)
        # model instance
        model: net_model = net_model(self.__model, BatchIter(block_size, assignment.block_count))
        # optimizer instance
        optimizer: net_optimizer = net_optimizer(op_type, gd_type, op_params=gd_params)
        # trainable variable ids
        var_ids = [var.id for var in self.__model.trainable_variables()]
        # codec table keyed by variable id
        var_codec = {var_id: (sync_type, codec.get(var_id, default_codec)) for var_id in var_ids}
        # worker transfer instance
        transfer_worker: net_transfer = net_transfer(var_codec)
        # parameter-server codec table keyed by variable id
        var_ps_codec = {var_id: (AsynchronizedSGD, ps_codec.get(var_id, default_ps_codec)) for var_id in var_ids}
        # parameter-server transfer instance
        transfer_ps: [net_transfer] = net_transfer(var_ps_codec) if has_ps else None
        # miscellaneous job information
        misc: misc_package = misc_package(mission_title, epoch, None, ssgd_timeout_limit)

        replies = {
            Req.Model: model,
            Req.Setting: setting,
            Req.Optimizer: optimizer,
            Req.Transfer: transfer_worker,
            Req.Transfer_PS: transfer_ps,
            Req.Other_Stuff: misc,
            Req.Data_Package: data_package(self.__data, self.__transform),
            Req.Data_Content: data_content(self.__data, self.__transform)
        }

        req = Request()
        self.__log.log_message("Start job.")
        self.__log.log_message("Workers: {}".format(nodes))

        with req.request(nodes) as com:
            coordinator = Coordinator(com, estimate_bandwidth=network_bandwidth, logger=self.__log)
            if has_ps:
                coordinator.submit_single(PSGDPSExecutor, Parameter_Server, self.__data.estimate_size())
            coordinator.submit_group(PSGDWorkerExecutor, assignment.nodes, self.__data.estimate_size())

            coordinator.resources_dispatch(lambda _id, x: replies[x])
            return coordinator.join()
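
A hedged end-to-end submission sketch for ParallelSGD. Building the Model, AbsDataset, ITransformer and NodeAssignment instances is not shown in these snippets, so they are assumed to exist:

# assumed to exist: model (Model), dataset (AbsDataset), transform (ITransformer)
# and nodes (a NodeAssignment from the network module)
job = ParallelSGD(model, dataset, transform)
results = job.parallel(nodes,
                       block_size=64,
                       epoch=10,
                       codec=DummyCodec,        # one codec class used for every variable
                       ps_codec=DummyCodec,
                       mission_title="P-SGD demo")
# results maps each node id to whatever its executor returned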
Example #8
    parse.add_argument("--workers",
                       type=str,
                       default='worker.json',
                       help='Worker list file, json type')

    parse.add_argument("--network-bandwidth",
                       dest='bandwidth',
                       type=int,
                       default=100,
                       help='Network bandwidth in Mbps.')

    arg = parse.parse_args()

    logger = Logger(title_info='P-SGD User Submit', log_to_file=True)

    logger.log_message('Initializing with parameters: ')
    logger.log_message('\t --model <file name {}>'.format(arg.model_file))
    logger.log_message('\t --node_count <node count {}>'.format(arg.n))
    logger.log_message('\t --batch_size <batch size {}>'.format(arg.b))
    logger.log_message('\t --redundancy <r {}>'.format(arg.r))
    logger.log_message(
        '\t --codec <communication codec and protocol {}>'.format(arg.codec))
    logger.log_message(
        '\t --optimizer <optimizer for model training {}>'.format(arg.op))
    logger.log_message('\t --epochs <training epochs {}>'.format(arg.epochs))
    logger.log_message('\t --dataset <using {}>'.format(arg.dataset))
    logger.log_message('\t --non-iid <{}>'.format(arg.make_iid_dataset))

    logger.log_message('\t --gradient-descent <Gradient method {}>'.format(
        arg.gd))
    logger.log_message(
Example #9
class Worker:
    def __init__(self):
        self.client_logger = Logger(title_info='Worker-{}'.format(get_repr()),
                                    log_to_file=True)
        self.client_logger.log_message('Worker version: {}.'.format(VERSION))
        self.__job_executor: [IExecutor] = None

    def slave_forever(self):
        # set up listening port
        listener = Serve(net_type='fcnet')
        try:
            while True:
                self.client_logger.log_message(
                    'Worker started with network type \'FCNet\'.')
                try:
                    with listener.acquire() as com:
                        self.client_logger.log_message(
                            'Job submission received. Node assigned node_id({})'
                            .format(com.Node_Id))

                        self.dispatch(com)

                        self.client_logger.log_message(
                            'Current session closed, node_id({}).'.format(
                                com.Node_Id))
                        self.client_logger.log_message('Worker restarting...')
                        time.sleep(1)
                except (OSError, ConnectionResetError):
                    self.client_logger.log_message(
                        "Initialization server exited without report.")

        except KeyboardInterrupt:
            self.client_logger.log_error('Worker shutdown by interruption.')
            listener.close()

    @staticmethod
    def __recv_pack(com: ICommunication_Controller, timeout: int = 100):
        data = None
        id_from = None
        time_out_end = time.time() + timeout
        # requests with timeout check
        while data is None:
            id_from, data = com.get_one(blocking=False)
            time.sleep(0.01)
            # assertion: the initialization server must still be connected
            assert Initialization_Server in com.available_clients, "Initialization server exited without finishing the initialization."
            assert time.time() < time_out_end, "Maximum waiting time exceeded."
        return id_from, data

    def dispatch(self, com: ICommunication_Controller):
        """
            Get first package and find out what to do.
            All exceptions are handled here, and traceback information is
            recorded to client_logger.
            Use job_submit.py --retrieve to fetch the traceback log.
        :param com:
        :return:
        """
        results = None
        try:
            id_from = com.Node_Id
            req = None
            while id_from != Initialization_Server:
                id_from, req = Worker.__recv_pack(
                    com, Init_Job_Submission_Timeout_Limit_Sec)

            if isinstance(req, SubmitJob):
                self.client_logger.log_message('ACK job submission.')
                if self.initialize(com, req):
                    results = self.do_training(com)

            if isinstance(req, RequestWorkingLog):
                self.client_logger.log_message('ACK logfile reclaim.')

        except Exception as e:
            # print DEBUG message
            import sys
            import traceback
            exc_type, exc_value, exc_tb = sys.exc_info()
            exc_tb = traceback.format_exception(exc_type, exc_value, exc_tb)
            exc_format = "".join(exc_tb)
            self.client_logger.log_error('Exception occurred: {}\n\t{}'.format(
                e, exc_format))
            # print DEBUG message

        self.post_log(com, results)

    def post_log(self, com: ICommunication_Controller, other_contents: object):
        """
            Post worker log file to coordinator.
        :param other_contents: other content to be attached to the DoneType package
        :param com:
        :return:
        """
        posting_files = [self.client_logger.File_Name]
        if isinstance(self.__job_executor, AbsExecutor):
            for filename in self.__job_executor.trace_files():
                posting_files.append(filename)

        # Post files
        com.send_one(Initialization_Server,
                     DoneType(com.Node_Id, posting_files, other_contents))

    def initialize(self, com: ICommunication_Controller,
                   job_info: SubmitJob) -> bool:
        """
            Initialize execution environment
        :param com: Communication process
        :param job_info: job info
        :return:
        """
        # restoring data
        job_info.restore()
        # get info
        ready_state = set()
        total_nodes = job_info.work_group
        eta_waiting_time = job_info.waiting_time

        self.__job_executor: AbsExecutor = job_info.executioner(
            com.Node_Id, job_info.work_group)

        # Report Version
        com.send_one(Initialization_Server, Version(node_id=com.Node_Id))
        # Acknowledge requests
        requests = self.__job_executor.requests()
        replies = []
        # Ask for replies
        for req in requests:
            com.send_one(Initialization_Server, RequestPackage(req))

        req_format = "\tRequests List:\n\t\t--> {}".format("\n\t\t--> ".join(
            [str(req) for req in requests]))
        self.client_logger.log_message('Request data: ({})\n{}'.format(
            len(requests), req_format))
        self.client_logger.log_message('ETA: ({})'.format(eta_waiting_time))
        # Set job executor to ready state
        while not self.__job_executor.ready():

            id_from, data = Worker.__recv_pack(com, eta_waiting_time)

            self.client_logger.log_message('Ack package, type: ({})'.format(
                data.__class__.__name__))
            # restoring data
            if isinstance(data, IReplyPackage):
                data.restore()
                replies.append(data)

                if len(replies) == len(requests):
                    requests = self.__job_executor.satisfy(replies)
                    for req in requests:
                        com.send_one(Initialization_Server,
                                     RequestPackage(req))
                    self.client_logger.log_message(
                        'Request data: ({}).'.format(requests))
                    self.client_logger.log_message(
                        'ETA: ({})'.format(eta_waiting_time))
                    replies.clear()

            # pass to sync
            elif isinstance(data, ReadyType):
                ready_state = ready_state | data.current_ready()

        self.client_logger.log_message(
            'Submit stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Submit stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))

        self.client_logger.log_message('Synchronize timeline with cluster.')

        Worker.synchronize(com, ready_state, total_nodes, eta_waiting_time)

        return True

    @staticmethod
    def synchronize(com: ICommunication_Controller, ready_state: set,
                    total_nodes: set, timeout: int):
        """
            Synchronize timeline with cluster.
            Make sure all nodes exit this method at roughly the same time.
        :param com: communication controller
        :param ready_state: set of nodes that are currently ready
        :param total_nodes: set of nodes required for the job
        :param timeout: timeout limit in seconds (approximate)
        :return:
        """
        dead_line = time.time() + timeout

        ready_state.add(com.Node_Id)
        for id in com.available_clients:
            com.send_one(id, ReadyType(ready_state))

        while ready_state & total_nodes != total_nodes:
            assert time.time() < dead_line, "Maximum waiting time exceeded."

            current_active = set(com.available_clients) | {com.Node_Id}
            assert current_active & total_nodes == total_nodes, \
                "Current nodes: {}, required nodes: {}.".format(current_active, total_nodes)
            # inc time clock
            time.sleep(0.01)

            # check ready state
            id_from, data = com.get_one(blocking=False)
            if isinstance(data, ReadyType):
                ready_state = ready_state | data.current_ready()

    def do_training(self, com: ICommunication_Controller) -> object:
        """
            Execute job.
        """
        self.client_logger.log_message('Execution process started.')
        begin = time.time()
        result = self.__job_executor.start(com)
        end = time.time()

        self.client_logger.log_message(
            'Execution complete, time:{}'.format(end - begin))
        self.client_logger.log_message(
            'Execution stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Execution stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))
        self.client_logger.log_message('Execution process exited.')

        return result
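
A worker process built on this class is essentially a one-liner; slave_forever() blocks and serves submitted jobs until interrupted:

if __name__ == '__main__':
    Worker().slave_forever()    # listens via 'fcnet' and serves jobs until Ctrl+C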