from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
from tensorflow.python.client import session
from npu_bridge.estimator import npu_ops
from hccl.manage.api import get_rank_size, get_local_rank_size, get_rank_id, get_local_rank_id
# `util` is the adapter's global-dict helper providing global_dict_init()/set_value().


def init_resource(config=None):
    """Initialize NPU resource and return (session, shutdown_op)."""
    if not isinstance(config, config_pb2.ConfigProto):  # isinstance already accepts subclasses
        config = config_pb2.ConfigProto()
    # Make sure the NpuOptimizer custom graph optimizer is registered exactly once.
    npu_optimizer = None
    for custom_optimizer in config.graph_options.rewrite_options.custom_optimizers:
        if custom_optimizer.name == 'NpuOptimizer':
            npu_optimizer = custom_optimizer
            break
    if not npu_optimizer:
        npu_optimizer = config.graph_options.rewrite_options.custom_optimizers.add()
        npu_optimizer.name = 'NpuOptimizer'
    # Disable graph rewrites that conflict with NPU offloading.
    config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
    config.graph_options.optimizer_options.global_jit_level = config_pb2.OptimizerOptions.OFF
    config.allow_soft_placement = True
    config.log_device_placement = False
    util.global_dict_init()
    npu_init = npu_ops.initialize_system()
    npu_shutdown = npu_ops.shutdown_system()
    sess = session.Session(config=config)
    sess.run(npu_init)
    # Cache the HCCL rank topology for later lookup.
    util.set_value("npu_rank_id", get_rank_id())
    util.set_value("npu_local_rank_id", get_local_rank_id())
    util.set_value("npu_rank_size", get_rank_size())
    util.set_value("npu_local_rank_size", get_local_rank_size())
    return sess, npu_shutdown
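A minimal lifecycle sketch for init_resource() above: everything except the placeholder training comment comes from the snippet itself, and the returned shutdown op is run before the session is closed.

sess, npu_shutdown = init_resource()
try:
    # ... build the graph and run training steps with `sess` here ...
    pass
finally:
    sess.run(npu_shutdown)  # shuts the NPU system down
    sess.close()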
def open(device_id=None):
    """Initialize and return an NPU device handle."""
    if device_id is None:
        device_id = int(os.getenv("ASCEND_DEVICE_ID", '0'))
    with _npu_ctx_lock:
        if not isinstance(context.context(), _ContextWithDefaultDevice):
            ctx = _ContextWithDefaultDevice()
            ctx.ensure_initialized()
            context._set_context(ctx)
            _npu_device_instances.clear()  # Global context has changed since the last NPU init
        if device_id in _npu_device_instances:
            logging.info('Npu instance on device %s already created', str(device_id))
            return _npu_device_instances.get(device_id)
        if _npu_device_instances:
            raise RuntimeError('Failed to create npu instance on device {}: an instance already '
                               'exists on {}'.format(device_id, list(_npu_device_instances.keys())))
        global_kw_options = global_options().as_dict()
        workers_num = int(os.getenv('RANK_SIZE', '1'))
        if workers_num > 1:
            env_rank_table = os.getenv("RANK_TABLE_FILE")
            env_worker_id = os.getenv('RANK_ID')
            if not env_rank_table:
                raise RuntimeError('You must specify a rank table file by setting the env '
                                   'RANK_TABLE_FILE in distributed mode')
            if not env_worker_id:
                raise RuntimeError('You must specify the rank id by setting the env RANK_ID '
                                   'in distributed mode')
            global_kw_options['_distribute.rank_table'] = env_rank_table
            global_kw_options['_distribute.rank_id'] = env_worker_id
        device_options = {}
        error_message = _npu_device_backends.Open(context.context()._handle, NPU, device_id,
                                                  global_kw_options, device_options)
        if error_message:
            raise RuntimeError("Failed to open npu device %s: %s" % (str(device_id), error_message))
        if workers_num > 1:
            from hccl.manage.api import get_rank_id
            worker_id = get_rank_id()
        else:
            worker_id = 0
        _npu_device_instances[device_id] = NpuDeviceHandle(context.context(), device_id,
                                                           device_options, workers_num, worker_id)
        return _npu_device_instances[device_id]
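A usage sketch for open(): the environment variables are the ones the function reads, and the second call illustrates the per-device caching shown above; the distributed block is commented out and its rank-table path is illustrative only.

import os

os.environ.setdefault("ASCEND_DEVICE_ID", "0")  # single-device fallback used by open()
npu = open()          # creates the handle and opens device 0
npu_again = open(0)   # returns the cached instance for device 0
assert npu is npu_again

# Distributed runs (RANK_SIZE > 1) additionally require:
# os.environ["RANK_SIZE"] = "8"
# os.environ["RANK_TABLE_FILE"] = "/path/to/rank_table.json"  # illustrative path
# os.environ["RANK_ID"] = "0"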
def _init_distributed_setting(self):
    if not self.distributed:
        return
    self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
    self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
    self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
def _init_distributed_setting(self):
    if not self.distributed:
        return
    if zeus.is_npu_device():
        # Bring up the NPU collective system and keep the shutdown op for teardown.
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
    self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
    self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
def init_resource():
    """Initialize NPU resource and return (session, shutdown_op)."""
    util.global_dict_init()
    npu_init = npu_ops.initialize_system()
    npu_shutdown = npu_ops.shutdown_system()
    config = config_pb2.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
    custom_op.name = "NpuOptimizer"
    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
    sess = session.Session(config=config)
    sess.run(npu_init)
    # Cache the HCCL rank topology for later lookup.
    util.set_value("npu_rank_id", get_rank_id())
    util.set_value("npu_local_rank_id", get_local_rank_id())
    util.set_value("npu_rank_size", get_rank_size())
    return sess, npu_shutdown
def rdma_remote_register(remote_var_list):
    """Register remote RDMA addresses.

    remote_var_list: embedding and optimizer variable list; each variable holds
        (rank_id, base_addr, var_size) entries.
    """
    if not isinstance(remote_var_list, (tuple, list)):
        raise ValueError('{} should be tuple or list'.format(remote_var_list))
    var_addr_list = []
    local_rank_size = get_local_rank_size()
    rank_id = get_rank_id()
    server_id = rank_id // local_rank_size
    for var in remote_var_list:
        for line in var:
            # Keep only the entries that belong to this server.
            var_server_id = line[0] // local_rank_size
            if server_id == var_server_id:
                host_var_info = tf_adapter.HostVarInfo()
                host_var_info.base_addr = line[1]
                host_var_info.var_size = line[2]
                var_addr_list.append(host_var_info)
    res = tf_adapter.RegistRdmaRemoteAddr(var_addr_list)
    if res != 0:
        raise RuntimeError('rdma remote register failed')
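The indexing in rdma_remote_register() above implies each variable is a list of (rank_id, base_addr, var_size) entries; the sketch below uses made-up addresses and sizes purely to show the expected shape.

embedding_var = [
    (0, 0x7f0000000000, 4 * 1024 * 1024),  # rank 0: placeholder address and size
    (1, 0x7f0000400000, 4 * 1024 * 1024),  # rank 1: placeholder address and size
]
optimizer_var = [
    (0, 0x7f0000800000, 4 * 1024 * 1024),
    (1, 0x7f0000c00000, 4 * 1024 * 1024),
]
# Only the entries whose rank maps to the local server are registered.
rdma_remote_register([embedding_var, optimizer_var])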
def _init_distributed_setting(self):
    if not self.distributed:
        return
    if zeus.is_npu_device():
        from npu_bridge.estimator import npu_ops
        # Bring up the NPU collective system and keep the shutdown op for teardown.
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    if zeus.is_gpu_device():
        # Import horovod lazily so NPU-only hosts do not need it installed.
        import horovod.tensorflow as hvd
        self._world_size = hvd.size()
        self._rank_id = hvd.rank()
        self._local_rank_id = hvd.local_rank()
    elif zeus.is_npu_device():
        from hccl.manage.api import get_local_rank_id, get_rank_id, get_rank_size
        self._world_size = get_rank_size()
        self._rank_id = get_rank_id()
        self._local_rank_id = get_local_rank_id()
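The trainers above stash self.npu_shutdown but never run it; below is a hedged sketch of the matching teardown, assuming the trainer keeps self.sess, self.distributed, and the attributes set above (the method name _shutdown_distributed_setting is illustrative, not part of the original code).

def _shutdown_distributed_setting(self):
    if not self.distributed:
        return
    if zeus.is_npu_device() and getattr(self, 'npu_shutdown', None) is not None:
        self.sess.run(self.npu_shutdown)  # runs npu_ops.shutdown_system()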
def rdma_remote_init(remote_var_list, mem_size):
    """Initialize the RDMA memory pool and register remote addresses.

    remote_var_list: embedding and optimizer variable list; each variable is
        indexed by server id and holds (id, base_addr, var_size) entries.
    mem_size: RDMA memory pool size to be allocated (int).
    """
    if not isinstance(remote_var_list, (tuple, list)):
        raise ValueError('{} should be tuple or list'.format(remote_var_list))
    if not isinstance(mem_size, int):
        raise ValueError('{} should be int'.format(mem_size))
    var_addr_list = []
    local_rank_size = get_local_rank_size()
    rank_id = get_rank_id()
    server_id = rank_id // local_rank_size
    for var in remote_var_list:
        # Pick the slice of this variable that lives on the local server.
        server_var = var[server_id]
        host_var_info = tf_adapter.HostVarInfo()
        host_var_info.base_addr = server_var[1]
        host_var_info.var_size = server_var[2]
        var_addr_list.append(host_var_info)
    res = tf_adapter.RdmaInitAndRegister(var_addr_list, mem_size)
    if res != 0:
        raise RuntimeError('rdma init and register failed')
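A matching sketch for rdma_remote_init(): here each variable is indexed by server id rather than rank id, and mem_size is passed straight through to the backend; all numbers are placeholders.

embedding_var = [
    (0, 0x7f1000000000, 8 * 1024 * 1024),  # server 0: placeholder address and size
    (1, 0x7f1000800000, 8 * 1024 * 1024),  # server 1: placeholder address and size
]
rdma_remote_init([embedding_var], mem_size=256 * 1024 * 1024)  # placeholder pool size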
def _experimental_distribute_dataset(self, dataset):
    return dataset.shard(get_rank_size(), get_rank_id())
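A small sketch of what the sharding above does to an ordinary tf.data pipeline; rank size 8 and rank id 3 stand in for the values get_rank_size() and get_rank_id() would return on one worker.

import tensorflow as tf

dataset = tf.data.Dataset.range(100)
shard = dataset.shard(num_shards=8, index=3)  # this worker sees elements 3, 11, 19, ...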