def init_worker(self):
    """
    `init_worker` has several tasks to do before training:
    first, wait for all parameter servers to launch completely.
    second, run the executor to initialize the startup program.
    third, wait for all workers to finish initialization.

    Returns:
        None
    """
    # if MPISymetricRoleMaker is defined,
    # we assume the user wants to submit the job on an MPI cluster
    if isinstance(self._role_maker, MPISymetricRoleMaker):
        # check whether the parameter servers have been initialized
        from paddle.fluid.transpiler.details.checkport import wait_server_ready
        wait_server_ready(fleet.server_endpoints(to_string=False))

    if not self._transpile_config.sync_mode:
        if self._transpile_config.geo_sgd_mode:
            # GEO-SGD: the communicator needs the vars to push, the number of
            # trainers, and how many local steps to accumulate before pushing
            self._communicator = Communicator(
                self.main_program, self.vars_info,
                fleet.worker_num(),
                self._transpile_config.geo_sgd_need_push_nums)
        else:
            self._communicator = Communicator(self.main_program)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")
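
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): the branch above is driven by
# the transpile config handed to fleet.distributed_optimizer(). Assuming the
# DistributeTranspilerConfig flags of the same Paddle release, a GEO-SGD setup
# would look roughly like this; the concrete values are illustrative only.
# ---------------------------------------------------------------------------
import paddle.fluid as fluid

config = fluid.DistributeTranspilerConfig()
config.sync_mode = False              # async path above creates a Communicator
config.geo_sgd_mode = True            # take the GEO-SGD branch
config.geo_sgd_need_push_nums = 400   # local steps before pushing deltas
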
def init_worker(self):
    """
    `init_worker` has several tasks to do before training:
    first, wait for all parameter servers to launch completely.
    second, run the executor to initialize the startup program.
    third, wait for all workers to finish initialization.

    Returns:
        None
    """
    # if MPISymetricRoleMaker is defined,
    # we assume the user wants to submit the job on an MPI cluster
    if isinstance(self._role_maker, MPISymetricRoleMaker):
        # check whether the parameter servers have been initialized
        from paddle.fluid.transpiler.details.checkport import wait_server_ready
        wait_server_ready(fleet.server_endpoints(to_string=False))

    program_config = self._transpile_config.get_program_config()
    trainer_communicator_config = self._transpile_config.get_trainer_runtime_config()
    print(trainer_communicator_config)

    # create the communicator in the mode that matches the distributed strategy
    if isinstance(self._transpile_config, GeoStrategy):
        kwargs = {}
        kwargs["push_vars"] = self.vars_info
        kwargs["trainers"] = fleet.worker_num()
        kwargs["push_nums"] = program_config.geo_sgd_need_push_nums
        self._communicator = Communicator(
            self.main_program, DistributedMode.GEO, kwargs,
            trainer_communicator_config.get_communicator_flags())
    elif isinstance(self._transpile_config, AsyncStrategy):
        self._communicator = Communicator(
            self.main_program, DistributedMode.ASYNC, None,
            trainer_communicator_config.get_communicator_flags())
    elif isinstance(self._transpile_config, HalfAsyncStrategy):
        self._communicator = Communicator(
            self.main_program, DistributedMode.HALF_ASYNC, None,
            trainer_communicator_config.get_communicator_flags())
    elif isinstance(self._transpile_config, SyncStrategy):
        kwargs = {}
        kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
        kwargs["trainer_id"] = self._role_maker.worker_index()
        self._communicator = Communicator(
            self.main_program, DistributedMode.SYNC, kwargs,
            trainer_communicator_config.get_communicator_flags())
    else:
        raise TypeError("Training mode is not supported")

    if not self._communicator.is_running():
        self._communicator.start()
    else:
        warnings.warn("communicator has been initialized, skip")
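
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): the isinstance() dispatch
# above expects one of the strategy classes produced by StrategyFactory in the
# fluid incubate fleet API. Assuming that factory's method names, the four
# modes are created roughly like this; update_frequency is illustrative.
# ---------------------------------------------------------------------------
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

sync_strategy = StrategyFactory.create_sync_strategy()              # -> DistributedMode.SYNC
async_strategy = StrategyFactory.create_async_strategy()            # -> DistributedMode.ASYNC
half_async_strategy = StrategyFactory.create_half_async_strategy()  # -> DistributedMode.HALF_ASYNC
geo_strategy = StrategyFactory.create_geo_strategy(update_frequency=400)  # -> DistributedMode.GEO
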
def _init_transpiler_worker(self):
    """
    `init_worker` has several tasks to do before training:
    first, wait for all parameter servers to launch completely.
    second, run the executor to initialize the startup program.
    third, wait for all workers to finish initialization.

    Returns:
        None
    """

    def sync_strategy_envs():
        kwargs = {}
        kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
        kwargs["trainer_id"] = self._role_maker.worker_index()
        return kwargs

    def geo_strategy_envs():
        def get_sparse_attrs():
            # map each initializer op to the attributes that must be replayed
            # on the parameter server side
            opt_init_map = {}
            opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
            opt_init_map["fill_constant"] = ["value"]
            opt_init_map["uniform_random"] = ["seed", "min", "max"]
            opt_init_map["truncated_gaussian_random"] = ["seed", "mean", "std"]

            dist_varnames = get_sparse_tablenames(self._origin_main_program, True)
            sparse_varnames = get_sparse_tablenames(self._origin_main_program, False)

            if len(dist_varnames) != 0:
                raise ValueError(
                    "GeoStrategy can not support large scale embedding now, "
                    "please use fluid.layers.embedding")

            # encode every sparse table as "name:shape:init_op&attr&..." and
            # join the tables with "#"
            init_attrs = []
            for value_name in sparse_varnames:
                value_var = self._origin_main_program.global_block().vars[value_name]
                value_attr = [
                    value_name,
                    ",".join([str(dim) for dim in value_var.shape])
                ]
                for op in self._origin_startup_program.global_block().ops:
                    if op.type in opt_init_map.keys() and value_name == op.output("Out")[0]:
                        init_attr = [op.type]
                        for attr in opt_init_map[op.type]:
                            init_attr.append(str(op.attr(attr)))
                        value_attr.append("&".join(init_attr))
                        init_attrs.append(":".join(value_attr))
                        break
            return "#".join(init_attrs)

        kwargs = {}
        kwargs["trainers"] = self.worker_num()
        kwargs["sparse_attrs"] = get_sparse_attrs()
        return kwargs

    # if MPISymetricRoleMaker is defined,
    # we assume the user wants to submit the job on an MPI cluster
    if isinstance(self._role_maker, MPISymetricRoleMaker):
        # check whether the parameter servers have been initialized
        wait_server_ready(self.server_endpoints(to_string=False))

    trainer_config = self._strategy.get_trainer_runtime_config()
    print(trainer_config)

    # tell the communicator whether a global step counter is needed
    lrs = _has_global_step(_get_lr_ops(self._origin_main_program))
    if lrs > 0:
        kwargs = {"need_global_step": "1"}
    else:
        kwargs = {"need_global_step": "0"}

    if isinstance(self._strategy, GeoStrategy):
        geo_kwargs = geo_strategy_envs()
        kwargs.update(geo_kwargs)
    if isinstance(self._strategy, SyncStrategy):
        sync_kwargs = sync_strategy_envs()
        kwargs.update(sync_kwargs)

    kwargs = kwargs if kwargs else None

    # build the send/recv contexts the communicator will run with
    send_ctx = fleet.compiled_config.get_communicator_send_context()
    if self.compiled_config.is_geo_mode():
        recv_ctx = fleet.compiled_config.get_communicator_recv_context(recv_type=4)
    else:
        recv_ctx = fleet.compiled_config.get_communicator_recv_context(recv_type=1)

    from paddle.fluid.communicator import Communicator
    self._communicator = Communicator(
        trainer_config.mode, kwargs,
        trainer_config.get_communicator_flags())
    self._communicator.init_with_ctx(send_ctx, recv_ctx)

    if not self._communicator.is_running():
        self._communicator.start()
    else:
        raise ValueError(
            "Communicator can only be initialized once, please check")
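
# ---------------------------------------------------------------------------
# Trainer-side sketch (not part of the original file): roughly how the worker
# init above is reached from a user script through the fluid incubate fleet
# API. The network, role maker and strategy choice are assumptions for
# illustration; fleet.init_worker()/stop_worker() are the entry points into
# the code above.
# ---------------------------------------------------------------------------
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory


def train():
    # toy regression network, purely illustrative
    x = fluid.data(name="x", shape=[None, 13], dtype="float32")
    y = fluid.data(name="y", shape=[None, 1], dtype="float32")
    prediction = fluid.layers.fc(input=x, size=1)
    avg_cost = fluid.layers.mean(fluid.layers.square_error_cost(prediction, y))

    fleet.init(PaddleCloudRoleMaker())
    strategy = StrategyFactory.create_async_strategy()

    optimizer = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fleet.startup_program)
        fleet.init_worker()  # waits for pservers and starts the Communicator
        # ... training loop: exe.run(fleet.main_program, feed=..., fetch_list=[avg_cost])
        fleet.stop_worker()


if __name__ == "__main__":
    train()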