def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    return program
def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    def _add_lr_var(main_program, compiled_config):
        # Todo: hard code for pe
        lr_var = compiled_config.origin_main_program.global_block().vars[
            "learning_rate_0"]
        main_program.global_block().create_var(
            name=lr_var.name,
            shape=lr_var.shape,
            dtype=lr_var.dtype,
            type=lr_var.type,
            lod_level=lr_var.lod_level,
            persistable=True)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    if hasattr(config.origin_main_program, 'lr_sheduler'):
        _add_lr_var(program, config)

    return program
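The pass above keeps a variable only if it appears in some optimizer op's op_role_var attribute (i.e. it is a parameter or a gradient); every other optimizer input (learning rate, moments, accumulators, ...) is deleted together with the optimizer ops themselves. Below is a minimal standalone sketch of that set logic, using a namedtuple instead of real Paddle operators; all names in it are invented for illustration.

from collections import namedtuple

# FakeOp stands in for a Paddle operator: its input names plus op_role_var.
FakeOp = namedtuple("FakeOp", ["input_arg_names", "op_role_var"])

def vars_to_delete(optimize_ops):
    all_inputs, role_vars = set(), set()
    for op in optimize_ops:
        all_inputs.update(op.input_arg_names)
        role_vars.update(op.op_role_var)
    # everything the optimizer reads that is not part of a param/grad pair
    return all_inputs - role_vars

ops = [FakeOp(["fc_0.w_0", "fc_0.w_0@GRAD", "learning_rate_0", "momentum_0"],
              ["fc_0.w_0", "fc_0.w_0@GRAD"])]
print(sorted(vars_to_delete(ops)))  # ['learning_rate_0', 'momentum_0']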
def _init_worker(self):
    def sync_strategy_envs():
        kwargs = {}
        kwargs["pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
        kwargs["trainer_id"] = self.role_maker._worker_index()
        return kwargs

    def geo_strategy_envs():
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

        def get_sparse_attrs():
            opt_init_map = {}
            opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
            opt_init_map["fill_constant"] = ["value"]
            opt_init_map["uniform_random"] = ["seed", "min", "max"]
            opt_init_map["truncated_gaussian_random"] = [
                "seed", "mean", "std"
            ]

            dist_varnames = get_sparse_tablenames(self.origin_main_program,
                                                  True)
            sparse_varnames = get_sparse_tablenames(self.origin_main_program,
                                                    False)

            if len(dist_varnames) != 0:
                raise ValueError(
                    "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                )

            init_attrs = []
            for value_name in sparse_varnames:
                value_var = self.origin_main_program.global_block().vars[
                    value_name]
                value_attr = [
                    value_name,
                    ",".join([str(dim) for dim in value_var.shape])
                ]
                for op in self.origin_startup_program.global_block().ops:
                    if op.type in opt_init_map.keys() and \
                            value_name == op.output("Out")[0]:
                        init_attr = [op.type]
                        for attr in opt_init_map[op.type]:
                            init_attr.append(str(op.attr(attr)))
                        value_attr.append("&".join(init_attr))
                        init_attrs.append(":".join(value_attr))
                        break
            return "#".join(init_attrs)

        kwargs = {}
        kwargs["trainers"] = self.role_maker._worker_num()
        kwargs["sparse_attrs"] = get_sparse_attrs()
        return kwargs

    from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
        SyncStrategy, GeoStrategy

    trainer_config = self.async_strategy.get_trainer_runtime_config()
    print(trainer_config)

    dist_strategy = self.context["valid_strategy"]
    launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
    if launch_barrier:
        # for trainer wait server ready
        wait_server_ready(self.role_maker._get_pserver_endpoints())

        # for ps-heter mode, wait heter worker ready
        if self.role_maker._is_heter_parameter_server_mode and \
                self.role_maker._is_worker():
            wait_server_ready(self.role_maker._get_heter_worker_endpoints())

    lrs = _has_global_step(_get_lr_ops(self.origin_main_program))

    if lrs:
        kwargs = {"need_global_step": "1"}
    else:
        kwargs = {"need_global_step": "0"}

    if isinstance(self.async_strategy, GeoStrategy):
        geo_kwargs = geo_strategy_envs()
        kwargs.update(geo_kwargs)
    if isinstance(self.async_strategy, SyncStrategy):
        sync_kwargs = sync_strategy_envs()
        kwargs.update(sync_kwargs)

    kwargs = kwargs if kwargs else None

    send_ctx = self.compiled_strategy.get_communicator_send_context()

    if self.compiled_strategy.is_geo_mode():
        recv_ctx = self.compiled_strategy.get_communicator_recv_context(
            recv_type=4)
    else:
        recv_ctx = self.compiled_strategy.get_communicator_recv_context(
            recv_type=1)

    from paddle.fluid.communicator import Communicator
    self._communicator = Communicator(
        trainer_config.mode, kwargs, trainer_config.get_communicator_flags())
    self._communicator.init_with_ctx(send_ctx, recv_ctx)

    if not self._communicator.is_running():
        self._communicator.start()
    else:
        warnings.warn("communicator has been initialized, skip")
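geo_strategy_envs serializes every sparse table into one string: entries are joined with "#", the fields of a single entry (variable name, shape, initializer) with ":", the shape dims with ",", and the initializer op type plus its attribute values with "&". The round-trip sketch below only illustrates that encoding; the variable name and attribute values are invented, not taken from a real program.

def parse_sparse_attrs(sparse_attrs):
    # inverse of the "#"/":"/","/"&" joins built by get_sparse_attrs()
    tables = {}
    for entry in sparse_attrs.split("#"):
        name, shape, init = entry.split(":")
        tables[name] = {
            "shape": [int(dim) for dim in shape.split(",")],
            "init": init.split("&"),  # [op_type, attr values...]
        }
    return tables

example = "emb.w_0:10000,64:uniform_random&0&-0.5&0.5"
print(parse_sparse_attrs(example))
# {'emb.w_0': {'shape': [10000, 64],
#              'init': ['uniform_random', '0', '-0.5', '0.5']}}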
def add_optimizer_pass(program, config):
    def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                                       endpoint, grad_to_block_id):
        trainers = config.get_trainers()

        program = optimize_block.program
        pserver_block = program.global_block()
        grad_block = None

        for g in config.param_grad_ep_mapping[endpoint]["grads"]:
            if _orig_varname(g.name) == \
                    _orig_varname(grad_varname_for_block):
                grad_block = g
                break

        if not grad_block:
            # do not append this op if current endpoint
            # is not dealing with this grad block
            return None

        orig_varname, block_name, trainer_name = _get_varname_parts(
            grad_block.name)

        if block_name:
            merged_var_name = '.'.join([orig_varname, block_name])
        else:
            merged_var_name = orig_varname

        merged_var = pserver_block.create_var(
            name=grad_block.name,
            persistable=True,
            type=grad_block.type,
            dtype=grad_block.dtype,
            shape=grad_block.shape)

        grad_to_block_id.append(merged_var.name + ":" +
                                str(optimize_block.idx))

        if config.is_sync_mode() and trainers > 1:
            vars2merge = []
            for i in range(trainers):
                per_trainer_name = "%s.trainer_%d" % (merged_var_name, i)
                per_trainer_var = pserver_block.create_var(
                    name=per_trainer_name,
                    persistable=False,
                    type=grad_block.type,
                    dtype=grad_block.dtype,
                    shape=grad_block.shape)
                vars2merge.append(per_trainer_var)

            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
                outputs={"Out": merged_var},
                attrs={"use_mkldnn": False})
            optimize_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(trainers)})
        return merged_var

    origin_program = config.get_origin_main_program()
    origin_program = origin_program.clone()
    ps_endpoint = config.get_ps_endpoint()

    opt_op_on_pserver = []
    # Iterate through the ops, and if an op and the optimize ops
    # which located on current pserver are in one set, then
    # append it into the sub program.
    global_ops = []
    # sparse grad name to param name
    sparse_grad_to_param = []

    def _is_opt_op_on_pserver(endpoint, op):
        param_names = [
            p.name for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        unmerged_varnames = []
        merged_varnames = []
        merged_ordernames = []

        for name in param_names:
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if merged_p.merged_var.name == \
                            merged_p.ordered_vars[0].name:
                        unmerged_varnames.append(
                            merged_p.ordered_vars[0].name)
                    else:
                        merged_varnames.append(merged_p.merged_var.name)
                        merged_ordernames.append(
                            merged_p.ordered_vars[0].name)
                    break

        param = op.input("Param")[0]

        if param in unmerged_varnames:
            return True

        for i in range(len(merged_ordernames)):
            if param == merged_ordernames[i]:
                merged_p = merged_varnames[i]
                merged_g = "{}@GRAD".format(merged_varnames[i])
                op._set_attr(OP_ROLE_VAR_ATTR_NAME, [merged_p, merged_g])
                return True
        return False

    def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
                               lr_ops):
        if _is_optimizer_op(op):
            _append_pserver_ops(block, op, ps_endpoint, grad_to_block_id,
                                origin_program, merged_var,
                                sparse_grad_to_param, config)
        elif op not in lr_ops:
            _append_pserver_non_opt_ops(block, op, origin_program, config)

    optimize_ops = _get_optimize_ops(origin_program)
    for _, op in enumerate(optimize_ops):
        if _is_optimizer_op(op) and _is_opt_op_on_pserver(ps_endpoint, op):
            opt_op_on_pserver.append(op)

    # append lr decay ops to the child block if exists
    lr_ops = _get_lr_ops(origin_program)
    has_lr_decay = True if len(lr_ops) > 0 else False
    lr_decay_block_id = -1
    optimize_blocks = []

    if has_lr_decay:
        counter_increment_idx = -1
        for idx, op in enumerate(lr_ops):
            if op.type != 'increment':
                continue
            counter = op.input("X")[0]
            if counter == LEARNING_RATE_DECAY_COUNTER:
                counter_increment_idx = idx
                break

        if counter_increment_idx != -1:
            lr_ops.pop(counter_increment_idx)

        lr_decay_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(lr_decay_block)
        for op in lr_ops:
            cloned_op = _append_pserver_non_opt_ops(lr_decay_block, op,
                                                    origin_program, config)
            # append sub blocks to pserver_program in lr_decay_op
            # todo(tangwei12): __clone_lr_op_sub_block__
        lr_decay_block_id = lr_decay_block.idx

    # append op to the current block
    grad_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for idx, opt_op in enumerate(opt_op_on_pserver):
        per_opt_block = program._create_block(pre_block_idx)
        optimize_blocks.append(per_opt_block)
        optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
        # append grad merging ops before clip and weight decay,
        # e.g. merge grad -> L2Decay op -> clip op -> optimize
        merged_var = None

        for _, op in enumerate(optimize_ops):
            # find the origin grad var before clipping / L2Decay,
            # merged_var should be the input var name of L2Decay
            grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
            if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == \
                    optimize_target_param_name:
                merged_var = _append_pserver_grad_merge_ops(
                    per_opt_block, grad_varname_for_block, ps_endpoint,
                    grad_to_block_id)
                if merged_var:
                    break  # append optimize op once then append other ops.
        if merged_var:
            for _, op in enumerate(optimize_ops):
                # optimizer is connected to itself
                if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
                        op not in global_ops:
                    __append_optimize_op__(op, per_opt_block,
                                           grad_to_block_id, merged_var,
                                           lr_ops)

    # dedup grad to ids list
    grad_to_block_id = list(set(grad_to_block_id))
    # append global ops
    if global_ops:
        opt_state_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(opt_state_block)
        for glb_op in global_ops:
            __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id,
                                   None, lr_ops)

    if len(optimize_blocks) == 0:
        pre_block_idx = program.num_blocks - 1
        empty_block = program._create_block(pre_block_idx)
        optimize_blocks.append(empty_block)

    op = get_op_by_type(program.global_block(), "listen_and_serv")
    op._set_attr("optimize_blocks", optimize_blocks)
    op._set_attr("grad_to_block_id", grad_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)
    op._set_attr("lr_decay_block_id", lr_decay_block_id)

    return program
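In sync mode, _append_pserver_grad_merge_ops emits a sum op over the per-trainer gradient copies (name.trainer_0, name.trainer_1, ...) followed by a scale op with factor 1/trainers, so the parameter server effectively optimizes with the averaged gradient. The plain-Python sketch below mirrors that two-step reduction with toy values only; it is not Paddle code.

def merge_grads(per_trainer_grads):
    # sum -> scale(1 / trainers), matching the two ops appended above
    trainers = len(per_trainer_grads)
    summed = [sum(vals) for vals in zip(*per_trainer_grads)]
    return [v / float(trainers) for v in summed]

print(merge_grads([[0.5, 1.0], [1.5, 3.0]]))  # [1.0, 2.0], the element-wise mean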
def _init_transpiler_worker(self):
    """
    `init_worker` has many tasks to do before training:
    first, wait for all parameter servers to launch completely;
    second, run the executor to initialize the startup program;
    third, wait for all workers to initialize completely.

    Returns:
        None
    """

    def sync_strategy_envs():
        kwargs = {}
        kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
        kwargs["trainer_id"] = self._role_maker.worker_index()
        return kwargs

    def geo_strategy_envs():
        def get_sparse_attrs():
            opt_init_map = {}
            opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
            opt_init_map["fill_constant"] = ["value"]
            opt_init_map["uniform_random"] = ["seed", "min", "max"]
            opt_init_map["truncated_gaussian_random"] = [
                "seed", "mean", "std"
            ]

            dist_varnames = get_sparse_tablenames(self._origin_main_program,
                                                  True)
            sparse_varnames = get_sparse_tablenames(self._origin_main_program,
                                                    False)

            if len(dist_varnames) != 0:
                raise ValueError(
                    "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                )

            init_attrs = []
            for value_name in sparse_varnames:
                value_var = self._origin_main_program.global_block().vars[
                    value_name]
                value_attr = [
                    value_name,
                    ",".join([str(dim) for dim in value_var.shape])
                ]
                for op in self._origin_startup_program.global_block().ops:
                    if op.type in opt_init_map.keys() and \
                            value_name == op.output("Out")[0]:
                        init_attr = [op.type]
                        for attr in opt_init_map[op.type]:
                            init_attr.append(str(op.attr(attr)))
                        value_attr.append("&".join(init_attr))
                        init_attrs.append(":".join(value_attr))
                        break
            return "#".join(init_attrs)

        kwargs = {}
        kwargs["trainers"] = self.worker_num()
        kwargs["sparse_attrs"] = get_sparse_attrs()
        return kwargs

    # if MPISymetricRoleMaker is defined,
    # we suppose a user wants to submit a job on an mpi cluster
    if isinstance(self._role_maker, MPISymetricRoleMaker):
        # check whether server has been initialized
        wait_server_ready(self.server_endpoints(to_string=False))

    trainer_config = self._strategy.get_trainer_runtime_config()
    print(trainer_config)

    lrs = _has_global_step(_get_lr_ops(self._origin_main_program))

    if lrs:
        kwargs = {"need_global_step": "1"}
    else:
        kwargs = {"need_global_step": "0"}

    if isinstance(self._strategy, GeoStrategy):
        geo_kwargs = geo_strategy_envs()
        kwargs.update(geo_kwargs)
    if isinstance(self._strategy, SyncStrategy):
        sync_kwargs = sync_strategy_envs()
        kwargs.update(sync_kwargs)

    kwargs = kwargs if kwargs else None

    send_ctx = fleet.compiled_config.get_communicator_send_context()

    if self.compiled_config.is_geo_mode():
        recv_ctx = fleet.compiled_config.get_communicator_recv_context(
            recv_type=4)
    else:
        recv_ctx = fleet.compiled_config.get_communicator_recv_context(
            recv_type=1)

    from paddle.fluid.communicator import Communicator
    self._communicator = Communicator(
        trainer_config.mode, kwargs, trainer_config.get_communicator_flags())
    self._communicator.init_with_ctx(send_ctx, recv_ctx)

    if not self._communicator.is_running():
        self._communicator.start()
    else:
        raise ValueError(
            "Communicator can only be inited once, please check")
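Both init-worker variants assemble the same kind of kwargs dict for the Communicator: a need_global_step flag plus, depending on the strategy, the geo fields (trainers, sparse_attrs) or the sync fields (pserver_endpoints, trainer_id). The sketch below only shows what the merged dict could look like for a sync job; the endpoints and trainer id are invented values, not defaults of the library.

sync_kwargs = {
    "need_global_step": "0",  # no learning-rate-decay global step in the program
    "pserver_endpoints": ["127.0.0.1:6170", "127.0.0.1:6171"],
    "trainer_id": 0,
}
# A Communicator built as above would then receive this dict together with the
# send/recv contexts derived from the compiled strategy.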