class Reclaimer:
    """Reclaims log files from every worker in the cluster after a job ends."""

    def __init__(self, com: ICommunication_Controller, logger: Logger = None):
        """
        :param com: established communication controller used to reach workers.
        :param logger: optional logger; a file-backed 'Retrieve' logger is
                       created when omitted.
        """
        self.__com = com
        if logger is None:
            self.__log = Logger(title_info='Retrieve', log_to_file=True)
        else:
            self.__log = logger

    def require_client_log(self):
        """
        Require client_log file from all workers.

        Sends a RequestWorkingLog to each available client, then blocks until
        every client has answered with a DoneType package; messages of any
        other type are silently discarded.
        :return: None
        """
        # send request (renamed loop variable: `id` shadowed the builtin)
        for node_id in self.__com.available_clients:
            self.__com.send_one(node_id, RequestWorkingLog())
            self.__log.log_message('Acquire log file from worker({}).'.format(node_id))

        try:
            nodes_ready = set()
            total_nodes = set(self.__com.available_clients)
            while nodes_ready != total_nodes:
                id_from, log = self.__com.get_one()
                # only DoneType carries reclaimed files; anything else is dropped
                if isinstance(log, DoneType):
                    log.restore()
                    file_format = "\n\t\t--> ".join(log.file_list)
                    self.__log.log_message('Save file for {}.\n\tList:\n\t\t--> {}'.format(id_from, file_format))
                    nodes_ready.add(id_from)
                    self.__log.log_message('Node({}) is done, {} is done.'.format(id_from, nodes_ready))
        except Exception as e:
            # print DEBUG message — format_exc() produces the same text as the
            # former sys.exc_info() / traceback.format_exception / join dance.
            import traceback
            exc_format = traceback.format_exc()
            self.__log.log_error('Exception occurred: {}\n\t{}'.format(e, exc_format))
            # print DEBUG message

        self.__log.log_message('Done.')
class PSGDPSExecutor(AbsExecutor):
    """Executor that runs on the parameter-server node of a P-SGD job."""

    def __init__(self, node_id, offset):
        super().__init__(node_id, offset)
        # Fix: original used 'ParaServer'.format(node_id) — a no-op because the
        # string has no placeholder, so the node id was silently dropped.
        # Mirror the worker's 'Fit-{}' naming convention instead.
        self.__log = Logger('ParaServer-{}'.format(node_id), log_to_file=True)
        # Fixed annotations: `[bool]` / `[ITransfer]` were list literals, not types.
        self.__done: bool = False
        self.__transfer: ITransfer = None

    def requests(self):
        """Resources required from the coordinator: settings + PS transfer."""
        return [Req.Setting, Req.Transfer_PS]

    def satisfy(self, reply: list) -> list:
        """
        Consume reply packages: installs the global settings and the transfer.
        :param reply: list of reply objects from the coordinator.
        :return: empty list — the parameter server never issues follow-ups.
        """
        # check list
        for obj in reply:
            if isinstance(obj, net_setting):
                GlobalSettings.deprecated_default_settings = obj.setting()
            if isinstance(obj, ITransfer):
                self.__transfer = obj
                self.__log.log_message('Transfer thread is ready.')
        return []

    def ready(self) -> bool:
        """True once both the transfer object and the global settings arrived."""
        return self.__transfer is not None \
               and GlobalSettings.deprecated_default_settings is not None

    def start(self, com: ICommunication_Controller) -> None:
        """
        Serve parameter synchronization until every worker disconnects.
        :param com: communication controller of this node.
        """
        data_send_start = com.Com.bytes_sent
        data_recv_start = com.Com.bytes_read
        GlobalSettings.deprecated_global_logger = self.__log
        self.__transfer.start_transfer(com, printer=self.__log, group_offset=0)
        from utils.constants import Initialization_Server
        # Poll until only the initialization server remains connected
        # (non-empty set is truthy; equivalent to the former `!= set()`).
        while set(com.available_clients) - {Initialization_Server}:
            sleep(7)
        data_sent_end = com.Com.bytes_sent
        data_recv_end = com.Com.bytes_read
        self.__log.log_message(
            'Execution complete, Total bytes sent: {}.'.format(
                data_sent_end - data_send_start))
        self.__log.log_message(
            'Execution complete, Total bytes read: {}.'.format(
                data_recv_end - data_recv_start))

    def trace_files(self) -> list:
        """Files to reclaim from this node (only its own log file)."""
        return [self.__log.File_Name]

    def done(self) -> bool:
        # NOTE(review): __done is never set True in this class — the PS exits
        # start() when all workers leave; confirm callers don't rely on done().
        return self.__done
class PSGDWorkerExecutor(AbsExecutor):
    """Executor that runs the actual P-SGD training loop on a worker node.

    Collects the model, optimizer, transfer, dataset and misc settings from the
    coordinator, trains for the configured number of epochs, then writes the
    training trace, evaluation trace and final model to disk for reclaim.
    """

    def __init__(self, node_id, offset):
        super().__init__(node_id, offset)
        self.__log = Logger('Fit-{}'.format(node_id), log_to_file=True)
        # files to hand back via trace_files(); starts with this node's log
        self.__trace_filename = [self.__log.File_Name]
        # waiting for those (all filled in by satisfy())
        self.__model: [Model] = None
        self.__optimizer: [IPSGDOpContainer] = None
        self.__batch_iter: [IBatchIter] = None
        self.__trans: [ITransfer] = None
        self.__data: [IDataset] = None
        self.__misc: [misc_package] = None
        self.__done: bool = False

    def requests(self) -> List[object]:
        """Everything a worker needs before training can start."""
        return [Req.Setting, Req.Model, Req.Optimizer, Req.Transfer, Req.Data_Package, Req.Other_Stuff]

    def satisfy(self, reply: list) -> list:
        """
        Dispatch reply packages into the matching slots.
        :param reply: reply objects received from the coordinator.
        :return: follow-up requests (Data_Content when the local dataset check
                 fails, i.e. the data must be transmitted in full).
        """
        unsatisfied = []
        # check list — isinstance dispatch, one package may match several slots
        for obj in reply:
            if isinstance(obj, net_setting):
                GlobalSettings.deprecated_default_settings = obj.setting()
            if isinstance(obj, net_model):
                self.__model = obj.model
                self.__batch_iter = obj.batch_iter
            if isinstance(obj, IPSGDOpContainer):
                self.__optimizer = obj
            if isinstance(obj, ITransfer):
                self.__trans = obj
            if isinstance(obj, misc_package):
                self.__misc = obj
            if isinstance(obj, IDataset):
                if not obj.check():
                    # dataset not present locally — ask for the raw content
                    unsatisfied.append(Requests(Req.Data_Content))
                else:
                    self.__data = obj
        return unsatisfied

    def ready(self) -> bool:
        """True when every dependency collected by __check() is present."""
        return self.__check()[0]

    def __check(self) -> Tuple[bool, List[str]]:
        """Build a human-readable readiness report.

        :return: (all_ready, per-dependency "name:OK/ABSENT" strings).
        """
        status = []
        s1 = isinstance(self.__optimizer, IPSGDOpContainer)
        status.append("Optimizer:{}".format("OK" if s1 else "ABSENT"))
        s2 = isinstance(self.__model, IModel)
        status.append("Model:{}".format("OK" if s2 else "ABSENT"))
        s3 = isinstance(self.__data, IDataset)
        status.append("Dataset:{}".format("OK" if s3 else "ABSENT"))
        s4 = isinstance(self.__misc, misc_package)
        status.append("Others:{}".format("OK" if s4 else "ABSENT"))
        s5 = isinstance(self.__trans, ITransfer)
        status.append("Transfer:{}".format("OK" if s5 else "ABSENT"))
        s6 = isinstance(self.__batch_iter, IBatchIter)
        status.append("Batch Iterator:{}".format("OK" if s6 else "ABSENT"))
        s7 = isinstance(GlobalSettings.deprecated_default_settings, ISetting)
        status.append("Settings:{}".format("OK" if s7 else "ABSENT"))
        return s1 and s2 and s3 and s4 and s5 and s6 and s7, status

    def done(self) -> bool:
        """True after start() has completed a full run."""
        return self.__done

    def start(self, com: ICommunication_Controller) -> None:
        """Run the full training session; blocks until all epochs finish.

        :param com: communication controller of this node.
        :return: last evaluation result dict (despite the -> None annotation;
                 NOTE(review): annotation and actual return disagree).
        """
        state, report = self.__check()
        self.__log.log_message("Ready:{} \n\t Check List:\n\t\t--> {}".format(state, "\n\t\t--> ".join(report)))
        # get dataset
        train_x, train_y, test_x, test_y = self.__data.load()
        self.__log.log_message('Dataset is ready, type: ({})'.format(self.__data))
        # build data feeder — only the sample blocks assigned to this node
        block_ids = GlobalSettings.get_default().node_2_block[com.Node_Id]
        feeder = PSGDBlockDataFeeder(train_x, train_y, batch_iter=self.__batch_iter, block_ids=block_ids)
        # assemble optimizer (binds transfer + feeder for gradient exchange)
        self.__optimizer.assemble(transfer=self.__trans, block_mgr=feeder)
        # compile model
        self.__model.compile(self.__optimizer)
        # summary
        summary = self.__model.summary()
        self.__log.log_message(summary)
        trace_head = '{}-N({})'.format(self.__misc.mission_title, self.node_id)
        self.__log.log_message('Model set to ready.')
        log_head = self.__log.Title
        # start ! — transfer threads must run before fit() exchanges gradients
        GlobalSettings.deprecated_global_logger = self.__log
        self.__trans.start_transfer(com, group_offset=list(self.group)[0], printer=self.__log)
        # record data (baseline counters for the final byte/time report)
        time_start = time.time()
        data_send_start = com.Com.bytes_sent
        data_recv_start = com.Com.bytes_read
        evaluation_history = []
        title = []
        r = {}
        # do until reach the target accuracy
        for i in range(self.__misc.epoch):
            # change title so log lines carry the current epoch number
            self.__log.Title = log_head + "-Epo-{}".format(i + 1)
            history = self.__model.fit(feeder, epoch=1, printer=self.__log)
            # do tests
            r = self.__model.evaluate(test_x, test_y)
            title = r.keys()
            row = r.values()
            self.__log.log_message('Evaluate result: {}'.format(r))
            evaluation_history.append(row)
            if self.__misc.target_acc is not None:
                # only one metric in model metrics list.
                # evaluation[0] refers to loss
                # evaluation[1] refers to accuracy.
                # NOTE(review): r is the dict returned by evaluate(); indexing
                # it with the integer key 1 only works if evaluate() keys its
                # metrics by position — confirm against the model API.
                if r[1] > self.__misc.target_acc:
                    break
        # record data
        time_end = time.time()
        data_sent_end = com.Com.bytes_sent
        data_recv_end = com.Com.bytes_read
        training_history = self.__model.fit_history()
        # save training history data
        training_name = "TR-" + trace_head + ".csv"
        training_trace = pd.DataFrame(training_history.history, columns=training_history.title)
        training_trace.to_csv(training_name, index=False)
        # save evaluation history data (columns = metric names of last eval)
        evaluation_name = "EV-" + trace_head + ".csv"
        evaluation_trace = pd.DataFrame(evaluation_history, columns=title)
        evaluation_trace.to_csv(evaluation_name, index=False)
        # save model — recompiled with a plain SGD optimizer so the serialized
        # model does not drag the distributed optimizer/transfer along
        model_name = "MODEL-" + trace_head + ".model"
        self.__model.compile(nn.gradient_descent.SGDOptimizer(learn_rate=1e-5))
        self.__model.save(model_name)
        self.__trace_filename.append(training_name)
        self.__trace_filename.append(evaluation_name)
        self.__trace_filename.append(model_name)
        self.__log.log_message('Execution complete, time: {}.'.format(time_end - time_start))
        self.__log.log_message('Execution complete, Total bytes sent: {}.'.format(data_sent_end - data_send_start))
        self.__log.log_message('Execution complete, Total bytes read: {}.'.format(data_recv_end - data_recv_start))
        self.__log.log_message('Trace file has been saved to {}.'.format(trace_head))
        # set marker
        self.__done = True
        # dispose
        self.__model.clear()
        del train_x, train_y, test_x, test_y
        # return last evaluation result
        return r

    def trace_files(self) -> list:
        """Log + trace + model files produced by this run, for reclaim."""
        return self.__trace_filename
class PSGD_Worker:
    """Long-running worker daemon: accepts job submissions over the network,
    initializes a P-SGD training session (worker or parameter-server role)
    and runs it, posting logs back to the initialization server."""

    # seconds to wait for cluster ready-sync before giving up
    Training_TimeOut_Limit = 180

    def __init__(self):
        self.__running_thread = None
        self.client_logger = Logger(title_info='Worker-{}'.format(get_repr()), log_to_file=True)
        self.__training_log = None
        self.client_logger.log_message(
            'Working started and ready for job submission.')

    def slave_forever(self):
        """Accept and serve job submissions in an endless loop.

        Each iteration builds a fresh communication session; any exception is
        logged and the session is torn down. Only Ctrl+C breaks the loop.
        """
        # set up listening port
        constructor = Worker_Communication_Constructor(
            '0.0.0.0', STAR_NET_WORKING_PORTS, worker_register=CLZ_WORKER_REGISTER())
        while True:
            com = None
            try:
                self.client_logger.log_message(
                    'Worker started, prepare for connection...')
                register = constructor.buildCom()
                com = Communication_Controller(CLZ_COM_PROCESS(register))
                com.establish_communication()
                self.client_logger.log_message(
                    'Job submission received. Node assigned node_id({})'.
                    format(com.Node_Id))
                if self.init_PSGD(com):
                    self.do_training(com)
                # reset globals so the next session starts clean
                GlobalSettings.clear_default()
                self.client_logger.log_message(
                    'Current session closed, node_id({}).'.format(com.Node_Id))
            except Exception as e:
                self.client_logger.log_error(
                    'Exception occurred: {}'.format(e))
                # print DEBUG message
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                exc_tb = traceback.format_exception(exc_type, exc_value, exc_tb)
                for line in exc_tb:
                    self.client_logger.log_message(line)
                # print DEBUG message
            except KeyboardInterrupt:
                self.client_logger.log_error(
                    'Worker shutdown by interruption.')
                constructor.close()
                break
            finally:
                # wait for safe closure before tearing the session down
                time.sleep(10)
                if isinstance(com, Communication_Controller):
                    com.close()
                self.client_logger.log_message('Worker restarting...')

    def init_PSGD(self, com: Communication_Controller) -> bool:
        """Run the initialization handshake with the initialization server.

        Requests (in order): global settings, codec/sgd types, model layers,
        misc parameters and — for non-PS nodes — the data samples; builds the
        training thread accordingly.
        :param com: established communication controller for this session.
        :return: True when a training thread was set up; False when the server
                 only wanted the working logs (logs are posted, nothing to run).
        """
        self.client_logger.log_message(
            'ACK job submission and request global settings.')

        # ignore other data — keep reading until the init server answers
        def acquire(com):
            id_from, data = com.get_one()
            while id_from != Initialization_Server:
                id_from, data = com.get_one()
            return data

        # initialize global settings
        com.send_one(Initialization_Server, Init.GlobalSettings)
        # get data
        data = acquire(com)
        # restore global settings
        if not isinstance(data, Reply.global_setting_package):
            # the server may instead ask for our logs — post them and bail out
            if data == Reply.I_Need_Your_Working_Log:
                self.client_logger.log_message(
                    'Nothing needs to be done, send back logfile and exit process.'
                )
                com.send_one(Initialization_Server,
                             Binary_File_Package(self.client_logger.File_Name))
                if isinstance(self.__training_log, Logger):
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__training_log.File_Name))
                if isinstance(self.__running_thread, PSGDTraining_Client):
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__running_thread.Trace_Eval))
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__running_thread.Trace_Train))
                com.send_one(Initialization_Server, Done_Type())
            return False
        data.restore()

        self.client_logger.log_message('Request codec and sgd class.')
        # initialize codec and sgd type
        com.send_one(Initialization_Server, Init.Codec_And_SGD_Type)
        data = acquire(com)
        assert isinstance(data, Reply.codec_and_sgd_package)
        codec, sgd = data.restore()

        self.client_logger.log_message('Request weights and layer type.')
        # initialize weights and layer
        com.send_one(Initialization_Server, Init.Weights_And_Layers)
        data = acquire(com)
        assert isinstance(data, Reply.weights_and_layers_package)
        layers = data.restore()

        self.client_logger.log_message('Request other stuff.')
        # others
        com.send_one(Initialization_Server, Init.MISC)
        data = acquire(com)
        assert isinstance(data, Reply.misc_package)
        loss_t = data.loss_type
        target_acc = data.target_acc
        epoch = data.epoch
        learn_rate = data.learn_rate
        w_type = data.w_types
        op = data.optimizer
        metric = data.metric

        self.__training_log = Logger('Training log @ node-{}'.format(
            com.Node_Id), log_to_file=True)

        # role split: negative/reserved id means parameter server
        if com.Node_Id != Parameter_Server:
            self.client_logger.log_message('Request data samples.')
            # initialize dataset
            com.send_one(Initialization_Server, Init.Samples)
            data = acquire(com)
            # restore
            assert isinstance(data, Reply.data_sample_package)
            train_x, train_y, eval_x, eval_y = data.restore()
            self.__running_thread = PSGDTraining_Client(
                model_init=layers,
                loss=loss_t,
                codec_type=codec,
                sync_class=sgd,
                com=com,
                w_types=w_type,
                tags=build_tags(node_id=com.Node_Id),
                train_x=train_x,
                train_y=train_y,
                eval_x=eval_x,
                eval_y=eval_y,
                optimizer=op,
                batch_size=GlobalSettings.get_default().batch.batch_size,
                epochs=epoch,
                logger=self.__training_log,
                learn_rate=learn_rate,
                target_acc=target_acc,
                metrics=metric)
        else:
            self.__running_thread = PSGDTraining_Parameter_Server(
                model_init=layers,
                ps_codec=codec,
                ps_sgd_type=sgd,
                com=com,
                w_types=w_type,
                logger=self.__training_log)

        self.client_logger.log_message(
            'Submit stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Submit stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))
        return True

    def do_training(self, com: Communication_Controller):
        """Synchronize the start with all peers, run the training thread and
        post the resulting trace/log files to the initialization server.

        :param com: established communication controller for this session.
        :raises OSError: when a peer disconnects before everyone is ready.
        :raises AssertionError: when the ready-sync exceeds
                Training_TimeOut_Limit seconds without progress.
        """
        self.client_logger.log_message('Prepare to start training process.')
        # check
        assert isinstance(self.__running_thread, Thread)
        assert isinstance(self.__training_log, Logger)

        ready_state = {}
        self.client_logger.log_message('Synchronize timeline with cluster.')
        len_ready = len(com.available_clients())
        time_count = 0
        # check ready states — broadcast Ready_Type every second until every
        # peer has reported ready (receiving one resets the timeout counter)
        while len(ready_state) != len_ready:
            # require
            n, d = com.get_one(False)
            if isinstance(d, Ready_Type):
                ready_state[n] = True
                time_count = 0
            if len(com.available_clients()) < len_ready:
                raise OSError('Minimal number of clients cannot be satisfied.')
            if time_count > PSGD_Worker.Training_TimeOut_Limit:
                raise AssertionError(
                    'Maximal waiting time exceed, give up waiting and reset environment.'
                )
            for node_id in com.available_clients():
                com.send_one(node_id, Ready_Type())
            time.sleep(1)
            time_count += 1

        try:
            self.client_logger.log_message('Execution process started.')
            data_sent_mark = com.Com.bytes_sent
            data_recv_mark = com.Com.bytes_read
            begin = time.time()
            self.__running_thread.start()
            self.__running_thread.join()
            end = time.time()
            self.__training_log.log_message(
                'Execution complete, time:{}'.format(end - begin))
            self.__training_log.log_message(
                'Bytes sent: {}'.format(com.Com.bytes_sent - data_sent_mark))
            self.__training_log.log_message(
                'Bytes read: {}'.format(com.Com.bytes_read - data_recv_mark))
            self.client_logger.log_message(
                'Execution complete, time:{}'.format(end - begin))
            self.client_logger.log_message(
                'Training stage complete, Total bytes sent: {}'.format(
                    com.Com.bytes_sent))
            self.client_logger.log_message(
                'Training stage complete, Total bytes read: {}'.format(
                    com.Com.bytes_read))
            # only worker clients produce trace CSVs worth posting
            if isinstance(self.__running_thread, PSGDTraining_Client):
                train_csv = Binary_File_Package(
                    self.__running_thread.Trace_Train)
                eval_csv = Binary_File_Package(
                    self.__running_thread.Trace_Eval)
                self.client_logger.log_message('Post training log.')
                com.send_one(Initialization_Server, train_csv)
                com.send_one(Initialization_Server, eval_csv)
        except Exception as error:
            self.client_logger.log_error(
                'Error encountered while executing : {}'.format(error))
            self.__training_log.log_error(
                'Error encountered while executing : {}'.format(error))

        self.client_logger.log_message('Training process exited.')
        # always hand back the training log, even after a failure
        log_file = Binary_File_Package(self.__training_log.File_Name)
        com.send_one(Initialization_Server, log_file)
class Coordinator:
    """Job-submission side coordinator: connects the worker cluster, answers
    their initialization requests and collects their log files."""

    def __init__(self, hyper_model: IServerModel, logger=None):
        # communication controller, set up later by set_workers()
        self.__com = None
        self.__model = hyper_model
        if logger is None:
            self.__log = Logger(title_info='Coordinator-{}'.format(get_repr()),
                                log_to_file=True)
        else:
            self.__log = logger

    def set_workers(self, works: list, nodes_required) -> bool:
        """
        Set worker list.
        :param works: list of tuples like:
                        [ (rule1, address1), (rule2, address2), ... ]
        :param nodes_required: number of "Worker"-rule nodes to connect; extra
               worker entries beyond this count are skipped.
        :return: None, raise exceptions if two workers with same id are assigned.
        """
        pkg = IPA()
        # random task uuid shared by all nodes of this submission
        uuid_for_this_task = str(random.randint(0, 0x7fffffff))
        current_node_id_assigned = 0
        # set all address
        for rule, addr in works:
            # Stop connecting, if required nodes count were satisfied.
            if current_node_id_assigned >= nodes_required and rule == "Worker":
                self.__log.log_message('Number of nodes satisfied.')
                break
            # "PS" gets the reserved Parameter_Server id; workers count up from 0
            if rule == "PS":
                _id = Parameter_Server
            else:
                _id = current_node_id_assigned
                current_node_id_assigned += 1
            pkg.put(_id, uuid_for_this_task, addr)
            self.__log.log_message(
                'Add worker (Rule: {}, Id: {}, Address: {}).'.format(
                    rule, _id, addr))
        self.__log.log_message('Try connecting to the cluster.')
        self.__com = NET(pkg)
        self.__com = Communication_Controller(self.__com)
        self.__com.establish_communication()
        self.__log.log_message('Connection with cluster established.')
        return True

    def resources_dispatch(self):
        """
        Reply to worker's requirements, prepare for the job
        :return:
        """
        # assertion
        assert isinstance(self.__com, Communication_Controller)
        assert isinstance(self.__model, IServerModel)

        total_node_count = len(self.__com.available_clients())
        node_ready = set()
        key_interrupted_before = False
        while not self.__com.is_closed():
            try:
                id_from, data = self.__com.get_one()
                # Init requests: answer with the matching package
                if isinstance(data, Init):
                    if data == Init.GlobalSettings:
                        reply = Reply.global_setting_package(
                            GlobalSettings.get_default())
                    elif data == Init.Weights_And_Layers:
                        reply = Reply.weights_and_layers_package(
                            self.__model.getWeightsInit())
                    elif data == Init.Codec_And_SGD_Type:
                        # PS gets the server-side codec/sgd pair
                        if id_from != Parameter_Server:
                            reply = Reply.codec_and_sgd_package(
                                self.__model.codec_ctrl(),
                                self.__model.psgd_type())
                        else:
                            reply = Reply.codec_and_sgd_package(
                                self.__model.psgd_server_codec(),
                                self.__model.psgd_server_type())
                    elif data == Init.Samples:
                        reply = Reply.data_sample_package(
                            *self.__model.train_data(),
                            *self.__model.eval_data())
                    elif data == Init.MISC:
                        reply = Reply.misc_package(
                            self.__model.epoches(), self.__model.loss_type(),
                            self.__model.learn_rate(),
                            self.__model.target_acc(),
                            self.__model.weights_types(),
                            self.__model.optimizer_type(),
                            self.__model.metric())
                    else:
                        # unknown Init value: reply None (still sent below)
                        reply = None
                    self.__log.log_message(
                        'Reply requirements to node({}), type({}).'.format(
                            id_from, reply.__class__.__name__))
                    self.__com.send_one(id_from, reply)
                # Ready handshake: echo and record the node once
                elif isinstance(data, Ready_Type):
                    self.__com.send_one(id_from, Ready_Type())
                    if id_from in node_ready:
                        continue
                    node_ready.add(id_from)
                    self.__log.log_message(
                        'Node({}) is ready, {} nodes total, {} is ready.'.
                        format(id_from, total_node_count, node_ready))
                # file uploads from workers: restore to local disk
                elif isinstance(data, Binary_File_Package):
                    self.__log.log_message(
                        'Restoring data ({}) from {}.'.format(
                            data.filename, id_from))
                    data.restore()
            except KeyboardInterrupt:
                if len(node_ready) < total_node_count:
                    self.__log.log_error(
                        'Some of workers is not ready, close anyway?')
                    self.__log.log_message(
                        'Press Ctrl+C again to shutdown immediately.')
                    key_interrupted_before = True
                # NOTE(review): the flag is set just above and then tested here,
                # so the first Ctrl+C already satisfies this condition and
                # breaks immediately — the "press again" message appears to be
                # dead advice; confirm intended behavior.
                if key_interrupted_before or len(
                        node_ready) >= total_node_count:
                    self.__log.log_error('Coordinator closed by user.')
                    break
        self.__com.close()
        self.__log.log_message('Dispatcher closed.')

    def require_client_log(self):
        """
        Require client_log file from all workers.
        :return: None
        """
        assert isinstance(self.__com, Communication_Controller)
        # self.__log.log_message('Acquire log file from each worker.')
        # take all ACK (drain one pending message per client)
        for id in self.__com.available_clients():
            _, _ = self.__com.get_one()
        # send request
        for id in self.__com.available_clients():
            self.__com.send_one(id, Reply.I_Need_Your_Working_Log)
        try:
            # get result — per client, consume file packages until Done_Type
            for id in self.__com.available_clients():
                self.__log.log_message(
                    'Acquire log file from worker({}).'.format(id))
                log = None
                while not isinstance(log, Done_Type):
                    _, log = self.__com.get_one()
                    if isinstance(log, Binary_File_Package):
                        log.restore()
                        self.__log.log_message(
                            'Save log file for worker({}).'.format(id))
        # NOTE(review): bare except deliberately treats any failure as a lost
        # connection; it also swallows KeyboardInterrupt/SystemExit.
        except:
            self.__log.log_error('Connection lost.')
        self.__com.close()
        self.__log.log_message('Done.')
        return
class Coordinator:
    """Submission-side controller: dispatches resources to allocated workers,
    submits group/single jobs and joins their results."""

    def __init__(self,
                 com: ICommunication_Controller,
                 estimate_bandwidth: int = 10,
                 logger: IPrinter = None):
        """
        Coordinator
        :param com: Communication Thread
        :param estimate_bandwidth: bandwidth estimation, Bytes per second
        :param logger: IPrinter
        """
        self.__com = com
        if logger is None:
            self.__log = Logger(title_info='Coordinator', log_to_file=True)
        else:
            self.__log = logger
        self.__estimate_bandwidth = estimate_bandwidth
        # nodes submitted via submit_group() / submit_single() respectively
        self.__group_allocated = set()
        self.__global_allocated = set()
        self.__log.log_message("Coordinator version: {}.".format(VERSION))

    @property
    def allocated_nodes(self):
        # every node that has been given a job, regardless of submission kind
        return self.__global_allocated | self.__group_allocated

    def resources_dispatch(self, dispatch_map: Callable[[int, object], IReplyPackage]):
        """
        Reply to worker's requirements, prepare for the job
        :param dispatch_map: Callable object, receive a IRequestPackage instance and returns
                             IReplyPackage instance for reply.
        :return:
        """
        # dispatch to certain group
        node_ready = set()
        while node_ready != self.allocated_nodes:
            try:
                id_from, data = self.__com.get_one()
                reply = None

                if isinstance(data, IRequestPackage):
                    # delegate to the caller-provided request->reply mapping
                    reply = dispatch_map(id_from, data.content())
                    self.__log.log_message(
                        'Reply requirements to node({}), type({}).'.format(id_from, reply.__class__.__name__))

                elif isinstance(data, ReadyType):
                    reply = ReadyType(node_ready)
                    # duplicate ready report: skip send and re-log (continue)
                    if id_from in node_ready:
                        continue
                    node_ready.add(id_from)
                    self.__log.log_message('Node({}) is ready, {} is ready.'.format(id_from, node_ready))

                elif isinstance(data, Version):
                    reply = Version(Initialization_Server)
                    self.__log.log_message("{}".format(data))

                self.__com.send_one(id_from, reply)
            except KeyboardInterrupt:
                # NOTE(review): no break here — the loop keeps running until
                # all allocated nodes report ready; confirm that is intended.
                if len(node_ready) < len(self.allocated_nodes):
                    self.__log.log_error('Some workers are not ready.')
                self.__log.log_error('Coordinator closed by user.')

        self.__log.log_message('Dispatch complete.')

    def join(self) -> Dict[int, object]:
        """
        Join all workers, wait for all task.
        :return: Returns a dict, indicates what has been returned from
                 executor on each worker.
        """
        # Join all nodes.
        node_ready = set()
        # Collect result.
        results: Dict[int, object] = {}

        self.__log.log_message("Waiting for ({}) ...".format(self.allocated_nodes))

        while node_ready != self.allocated_nodes:

            id_from, data = self.__com.get_one()

            if isinstance(data, IReplyPackage):
                data.restore()
                self.__log.log_message('Restoring data ({}) from {}.'.format(data, id_from))

                # DoneType marks a finished worker: record its files and result
                if isinstance(data, DoneType):
                    file_format = "\n\t\t--> ".join([filename for filename in data.file_list])
                    self.__log.log_message('Save file for {}.\n\tList:\n\t\t--> {}'.format(id_from, file_format))

                    node_ready.add(id_from)
                    self.__log.log_message('Node({}) is done, {} is done.'.format(id_from, node_ready))

                    results[id_from] = data.result

        self.__log.log_message("All task is complete.")
        return results

    def submit_group(self, worker_executor: Type[IExecutor], working_group: Iterable[int] = None,
                     package_size: int = 1e9):
        """
        Submit a job to a specified worker group.
        Nodes inside this group will wait for each other and synchronize start time.
        Group will also wait for all single nodes were ready.
        :param worker_executor: executor class, implementation of IExecutor
        :param working_group: Worker group list, iterable object, contains id of each worker in the group.
        :param package_size: Package size in transmission. Potentially required by executor, and provided
                             by dispatch.
        :return: None
        """
        # set work group
        if working_group is None:
            working_group = set(self.__com.available_clients)
        if not isinstance(working_group, set):
            working_group = set(working_group)
        # check for duplication
        assert len(self.__group_allocated & working_group) == 0, "Cannot submit a task to node which already has a job."
        # calculate data size — rough ETA for transmission, used as a timeout
        dataset_ett = self.__com.available_clients_count * package_size / self.__estimate_bandwidth + 1
        # send request
        for _id in working_group:
            self.__com.send_one(_id, SubmitJob(working_group | self.__global_allocated, dataset_ett, worker_executor))

        self.__group_allocated = self.__group_allocated | working_group
        self.__log.log_message("Group submission complete ({}).".format(working_group))

    def submit_single(self, worker_executor: Type[IExecutor], worker_id: int, package_size: int = 1e9):
        """
        Submit a job to a specified node.
        This global node will start execution immediately when itself was ready.
        :param worker_executor: executor class, implementation of IExecutor
        :param worker_id: Worker id.
        :param package_size: Package size in transmission. Potentially required by executor, and provided
                             by dispatch.
        :return:
        """
        # check for duplication
        assert worker_id not in self.__global_allocated, "Cannot submit a task to node which already has a job."
        # calculate data size — rough ETA for transmission, used as a timeout
        dataset_ett = self.__com.available_clients_count * package_size / self.__estimate_bandwidth + 0.6
        # send request
        self.__com.send_one(worker_id, SubmitJob({worker_id}, dataset_ett, worker_executor))

        self.__global_allocated.add(worker_id)
        self.__log.log_message("Single node submission complete.")
class ParallelSGD:
    """
    P-SGD master-dispatch class
    P-SGD RPC Controller
    """

    def __init__(self, model: Model, data: AbsDataset, transform: ITransformer):
        """
        Initialize a P-SGD dispatch object.
        :param model: model to be data-parallelized.
        :param data: dataset to be parallelized.
        :param transform: dataset transformation strategy. The dataset is visible on
                          every node; `transform` performs the stateful conversions
                          locally. Transform strategies are organized as a linked
                          list and run as a pipeline: the dataset flows through each
                          transform in turn and is finally scheduled by BatchIter.
        """
        self.__model = model
        self.__data = data
        self.__transform = transform
        self.__log = Logger(title_info="P-SGD Submit", log_to_file=True)

    def parallel(self,
                 nodes: NodeAssignment,
                 redundancy: int = 1,
                 block_size: int = 64,
                 epoch: int = 10,
                 assignment_type: Type[AbsBlockAssignment] = IIDBlockAssignment,
                 sync_type: Type[IParallelSGD] = SynchronizedSGD,
                 op_type: Type[IOptimizer] = PSGDOptimizer,
                 gd_type: Type[IGradientDescent] = ADAMOptimizer,
                 codec: Union[Dict[int, Type[Codec]], Type[Codec]] = None,
                 gd_params: Tuple[object] = (),
                 ps_codec: Union[Dict[int, Type[Codec]], Type[Codec], None] = None,
                 network_bandwidth: int = 1048576,
                 mission_title: str = "P-SGD",
                 ssgd_timeout_limit: int = 10000):
        """
        Execute the parallelization.
        :param ssgd_timeout_limit: Sync-SGD wait-timeout limit in milliseconds (integer).
        :param network_bandwidth: available network bandwidth, used to estimate the
                                  transmission time and set the pre_commit timeout timer.
        :param mission_title: mission title, used as the log file name of this job.
        :param nodes: NodeAssignment interface provided by the network module; states how
                      many nodes take part in this parallel run. The parameter server's
                      node id is given by utils.constant.Parameter_Server; the remaining
                      worker ids count up from 0 (integers).
        :param redundancy: redundancy setting, for codecs and block assignments that can
                           handle redundancy. AbsBlockAssignment subclasses can always
                           handle redundant submissions; redundancy handling inside the
                           codec is up to the codec itself.
        :param block_size: per-node Batch size. The concrete update policy is controlled
                           by the codec, so block size and batch size have no fixed
                           relationship: if the codec reports a result after every Block,
                           Block size and Batch size are effectively equal; if the codec
                           always waits for all Blocks to finish training before syncing,
                           Batch size effectively equals Block size times the Block count.
        :param epoch: number of training epochs. The in-epoch synchronization policy is
                      decided jointly by the codec and the sync type; when a parameter
                      server is used, it also takes part in maintaining the sync state.
                      If the codec disallows asynchronous execution, all nodes finish an
                      epoch at the same moment; if the codec or sync type allows
                      cross-batch execution, nodes finish according to their own speed.
        :param assignment_type: sample assignment strategy, usually combined with
                                redundancy; must implement profiles.ISetting. Originally
                                meant for redundant dataset assignment, it can now also
                                provide static data-volume assignment.
        :param sync_type: synchronization mode, synchronous or asynchronous; must
                          implement psgd.sync.IParallelSGD. In synchronous mode each
                          worker only processes received data when it calls
                          get_weights(); in asynchronous mode each worker processes data
                          and updates the result set as soon as it arrives. The concrete
                          data handling flow and result-update policy are defined by the
                          codec.
        :param gd_type: gradient handling strategy type implementing nn.IOptimizer;
                        responsible for the gradient update policy.
        :param op_type: gradient generation strategy implementing
                        nn.gradient_descent.IGradientDescent; responsible for producing
                        the update increments to process.
        :param codec: codec type implementing codec.interface.Codec.
        :param gd_params: parameters for the gradient generator.
        :param ps_codec: codec type implementing codec.interface.Codec, used by the
                         parameter server for data processing.
        :return:
        """
        # initialize suitable codecs
        if codec is None:
            codec = dict()
        if ps_codec is None:
            ps_codec = dict()

        # default codec fallbacks
        default_codec = DummyCodec
        default_ps_codec = DummyCodec
        # if a concrete codec class was passed in, use it as the default for all vars
        if isinstance(codec, type):
            default_codec = codec
            codec = dict()
        if isinstance(ps_codec, type):
            default_ps_codec = ps_codec
            ps_codec = dict()

        # collect all valid slaves (id >= 0 is a worker; otherwise a PS)
        node_count = 0
        has_ps = False
        for _id, _ in nodes:
            if _id >= 0:
                node_count += 1
            else:
                has_ps = True

        # task assignment strategy
        assignment: ISetting = assignment_type(node_count, redundancy)
        # assignment strategy instance
        setting: net_setting = net_setting(assignment_type, node_count, redundancy)
        # model instance
        model: net_model = net_model(self.__model, BatchIter(block_size, assignment.block_count))
        # optimizer instance
        optimizer: net_optimizer = net_optimizer(op_type, gd_type, op_params=gd_params)
        # trainable variable ids
        var_ids = [var.id for var in self.__model.trainable_variables()]
        # per-variable codec dictionary
        var_codec = {var_id: (sync_type, codec.get(var_id, default_codec)) for var_id in var_ids}
        # Transfer instance
        transfer_worker: net_transfer = net_transfer(var_codec)
        # per-variable PS codec dictionary
        var_ps_codec = {var_id: (AsynchronizedSGD, ps_codec.get(var_id, default_ps_codec)) for var_id in var_ids}
        # PS Transfer instance (only when a parameter server is present)
        transfer_ps: [net_transfer] = net_transfer(var_ps_codec) if has_ps else None
        # misc information
        misc: misc_package = misc_package(mission_title, epoch, None, ssgd_timeout_limit)

        # request -> reply lookup table consumed by resources_dispatch()
        replies = {
            Req.Model: model,
            Req.Setting: setting,
            Req.Optimizer: optimizer,
            Req.Transfer: transfer_worker,
            Req.Transfer_PS: transfer_ps,
            Req.Other_Stuff: misc,
            Req.Data_Package: data_package(self.__data, self.__transform),
            Req.Data_Content: data_content(self.__data, self.__transform)
        }

        req = Request()
        self.__log.log_message("Start job.")
        self.__log.log_message("Workers: {}".format(nodes))

        with req.request(nodes) as com:
            coordinator = Coordinator(com, estimate_bandwidth=network_bandwidth, logger=self.__log)
            if has_ps:
                coordinator.submit_single(PSGDPSExecutor, Parameter_Server, self.__data.estimate_size())
            coordinator.submit_group(PSGDWorkerExecutor, assignment.nodes, self.__data.estimate_size())
            coordinator.resources_dispatch(lambda _id, x: replies[x])
            return coordinator.join()
parse.add_argument("--workers", type=str, default='worker.json', help='Worker list file, json type') parse.add_argument("--network-bandwidth", dest='bandwidth', type=int, default=100, help='Network bandwidth in Mbps.') arg = parse.parse_args() logger = Logger(title_info='P-SGD User Submit', log_to_file=True) logger.log_message('Initializing with parameters: ') logger.log_message('\t --model <file name {}>'.format(arg.model_file)) logger.log_message('\t --node_count <node count {}>'.format(arg.n)) logger.log_message('\t --batch_size <batch size {}>'.format(arg.b)) logger.log_message('\t --redundancy <r {}>'.format(arg.r)) logger.log_message( '\t --codec <communication codec and protocol {}>'.format(arg.codec)) logger.log_message( '\t --optimizer <optimizer for model training {}>'.format(arg.op)) logger.log_message('\t --epochs <training epochs {}>'.format(arg.epochs)) logger.log_message('\t --dataset <using {}>'.format(arg.dataset)) logger.log_message('\t --non-iid <{}>'.format(arg.make_iid_dataset)) logger.log_message('\t --gradient-descent <Gradient method {}>'.format( arg.gd)) logger.log_message(
class Worker:
    """A slave node: waits for job submissions, runs them, and reports logs back.

    Lifecycle: ``slave_forever`` listens for sessions; each session is handled by
    ``dispatch``, which either runs a submitted job (``initialize`` + ``do_training``)
    or answers a log-retrieval request, and always posts log files back via
    ``post_log`` before the session closes.
    """

    def __init__(self):
        self.client_logger = Logger(title_info='Worker-{}'.format(get_repr()),
                                    log_to_file=True)
        self.client_logger.log_message('Worker version: {}.'.format(VERSION))
        # Current job executor; None until a job is accepted in initialize().
        # (Project convention: "[T]" annotation means "optional T".)
        self.__job_executor: [IExecutor] = None

    def slave_forever(self):
        """Serve job sessions forever; exit (and close the listener) on Ctrl-C."""
        # set up listening port
        listener = Serve(net_type='fcnet')
        try:
            while True:
                self.client_logger.log_message(
                    'Worker started with network type \'FCNet\'.')
                try:
                    with listener.acquire() as com:
                        self.client_logger.log_message(
                            'Job submission received. Node assigned node_id({})'
                            .format(com.Node_Id))
                        self.dispatch(com)
                        self.client_logger.log_message(
                            'Current session closed, node_id({}).'.format(
                                com.Node_Id))
                        self.client_logger.log_message('Worker restarting...')
                    time.sleep(1)
                # ConnectionResetError is a subclass of OSError, so a single
                # handler covers both (the original second handler was
                # unreachable dead code with an identical body).
                except OSError:
                    self.client_logger.log_message(
                        "Initialization server exited without report.")
        except KeyboardInterrupt:
            self.client_logger.log_error('Worker shutdown by interruption.')
        listener.close()

    @staticmethod
    def __recv_pack(com: ICommunication_Controller, timeout: int = 100):
        """Poll *com* until one package arrives or *timeout* seconds elapse.

        :param com: communication controller to poll (non-blocking reads).
        :param timeout: maximum waiting time in seconds.
        :return: tuple ``(sender_id, package)``.
        :raises AssertionError: if the initialization server disappears or the
            deadline passes (callers catch this via broad ``except Exception``).
        """
        data = None
        id_from = None
        time_out_end = time.time() + timeout
        # requests with timeout check
        while data is None:
            id_from, data = com.get_one(blocking=False)
            time.sleep(0.01)
            # Assertion, this node count as one
            assert Initialization_Server in com.available_clients, \
                "Initialization server exited without finishing the initialization."
            assert time.time() < time_out_end, "Maximum waiting time exceed."
        return id_from, data

    def dispatch(self, com: ICommunication_Controller):
        """Receive the first package from the initialization server and act on it.

        ``SubmitJob`` starts a training job; ``RequestWorkingLog`` just triggers a
        log post-back. All exceptions are caught here and their tracebacks are
        recorded to ``client_logger`` (retrieve with ``job_submit.py --retrieve``).
        Log files are always posted back, whatever happened.

        :param com: communication controller for this session.
        :return: None
        """
        results = None
        try:
            id_from = com.Node_Id
            req = None
            # Wait until a package from the initialization server arrives.
            while id_from != Initialization_Server:
                id_from, req = Worker.__recv_pack(
                    com, Init_Job_Submission_Timeout_Limit_Sec)

            if isinstance(req, SubmitJob):
                self.client_logger.log_message('ACK job submission.')
                if self.initialize(com, req):
                    results = self.do_training(com)

            if isinstance(req, RequestWorkingLog):
                # Nothing to do beyond the unconditional post_log() below.
                self.client_logger.log_message('ACK logfile reclaim.')

        except Exception as e:
            # Record full traceback for remote debugging.
            import traceback
            exc_format = traceback.format_exc()
            self.client_logger.log_error('Exception occurred: {}\n\t{}'.format(
                e, exc_format))

        # Always report logs back, even after a failure.
        self.post_log(com, results)

    def post_log(self, com: ICommunication_Controller, other_contents: object):
        """
        Post worker log file to coordinator.
        :param other_contents: other content can be attached
        :param com: communication controller for this session.
        :return: None
        """
        posting_files = [self.client_logger.File_Name]
        # Attach the executor's own trace files when a job actually ran.
        if isinstance(self.__job_executor, AbsExecutor):
            posting_files.extend(self.__job_executor.trace_files())
        # Post files
        com.send_one(Initialization_Server,
                     DoneType(com.Node_Id, posting_files, other_contents))

    def initialize(self, com: ICommunication_Controller,
                   job_info: SubmitJob) -> bool:
        """Build the execution environment described by *job_info*.

        Requests the executor's required packages from the initialization server,
        feeds replies to the executor until it reports ready, then synchronizes
        the start time with the whole cluster.

        :param com: communication controller for this session.
        :param job_info: job description received from the initialization server.
        :return: True when initialization completed (raises on failure).
        """
        # restoring data
        job_info.restore()
        # get info
        ready_state = set()
        total_nodes = job_info.work_group
        eta_waiting_time = job_info.waiting_time
        self.__job_executor: AbsExecutor = job_info.executioner(
            com.Node_Id, job_info.work_group)

        # Report Version
        com.send_one(Initialization_Server, Version(node_id=com.Node_Id))
        # Acknowledge requests
        requests = self.__job_executor.requests()
        replies = []
        # Ask for replies
        for req in requests:
            com.send_one(Initialization_Server, RequestPackage(req))

        req_format = "\tRequests List:\n\t\t--> {}".format("\n\t\t--> ".join(
            [str(req) for req in requests]))
        self.client_logger.log_message('Request data: ({})\n{}'.format(
            len(requests), req_format))
        self.client_logger.log_message('ETA: ({})'.format(eta_waiting_time))

        # Feed replies to the executor until it is ready.
        while not self.__job_executor.ready():
            id_from, data = Worker.__recv_pack(com, eta_waiting_time)
            self.client_logger.log_message('Ack package, type: ({})'.format(
                data.__class__.__name__))
            # restoring data
            if isinstance(data, IReplyPackage):
                data.restore()
                replies.append(data)
                # One full round of replies collected: ask the executor what
                # (if anything) it still needs.
                if len(replies) == len(requests):
                    requests = self.__job_executor.satisfy(replies)
                    for req in requests:
                        com.send_one(Initialization_Server, RequestPackage(req))
                    self.client_logger.log_message(
                        'Request data: ({}).'.format(requests))
                    self.client_logger.log_message(
                        'ETA: ({})'.format(eta_waiting_time))
                    replies.clear()
            # pass to sync
            elif isinstance(data, ReadyType):
                ready_state = ready_state | data.current_ready()

        self.client_logger.log_message(
            'Submit stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Submit stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))

        self.client_logger.log_message('Synchronize timeline with cluster.')
        Worker.synchronize(com, ready_state, total_nodes, eta_waiting_time)
        return True

    @staticmethod
    def synchronize(com: ICommunication_Controller, ready_state: set,
                    total_nodes: set, timeout: int):
        """
        Synchronize timeline with cluster.
        Make sure all nodes exit this method at (roughly) the same time.
        :param com: communication controller
        :param ready_state: set of node ids known to be ready so far
        :param total_nodes: set of node ids required for the job
        :param timeout: timeout limit in seconds, vaguely accurate
        :return: None
        :raises AssertionError: on timeout or when a required node disappears.
        """
        dead_line = time.time() + timeout
        ready_state.add(com.Node_Id)
        # Broadcast our ready state to every reachable node.
        for node_id in com.available_clients:
            com.send_one(node_id, ReadyType(ready_state))

        while ready_state & total_nodes != total_nodes:
            assert time.time() < dead_line, "Maximum waiting time exceed."
            current_active = set(com.available_clients) | {com.Node_Id}
            assert current_active & total_nodes == total_nodes, \
                "Current nodes: {}, required nodes: {}.".format(current_active, total_nodes)
            # inc time clock
            time.sleep(0.01)
            # check ready state
            id_from, data = com.get_one(blocking=False)
            if isinstance(data, ReadyType):
                ready_state = ready_state | data.current_ready()

    def do_training(self, com: ICommunication_Controller) -> object:
        """Execute the prepared job and return the executor's result.

        :param com: communication controller for this session.
        :return: whatever the job executor's ``start`` returns.
        """
        self.client_logger.log_message('Execution process started.')

        begin = time.time()
        result = self.__job_executor.start(com)
        end = time.time()

        self.client_logger.log_message(
            'Execution complete, time:{}'.format(end - begin))
        self.client_logger.log_message(
            'Execution stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Execution stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))
        self.client_logger.log_message('Execution process exited.')
        return result