def _read_job_info(self, file_name):
    """Read the job information.

    Args:
        file_name: path of a JSON file that contains the job info from CSA.

    Returns:
        The job information, or None if it cannot be built.
    """
    try:
        with open(file_name, 'r', encoding='UTF-8') as f:
            content = f.read()
            data = json.loads(content)
            # 1. Get the device_info and check it.
            device_info = data.get('device_info')
            util_lib.check_not_none(device_info, 'device_info')
            index = device_info.get('Index', None)
            util_lib.check_nonnegative_integer(index, 'Index')
            # 2. Get the rank_table_file and check it.
            rank_table_file = data.get('rank_table_file', None)
            util_lib.check_not_none(rank_table_file, 'rank_table_file')
            # 3. Get the rank_size and check it.
            rank_size = data.get('rank_size', None)
            util_lib.check_positive_integer(rank_size, 'rank_size')
            # 4. Get the local_checkpoint_dir and check it.
            local_checkpoint_dir = data.get('local_checkpoint_dir', None)
            # 5. Init the JobInfo.
            device_info = DeviceInfo(index=str(index))
            job_info = JobInfo(device_info=device_info,
                               rank_table_file=rank_table_file,
                               local_checkpoint_dir=local_checkpoint_dir,
                               rank_size=rank_size)
        return job_info
    except IOError:
        logging.warning('Warning: job config file does not exist')
        job_id = os.getenv('JOB_ID', "")
        if job_id == "":
            logging.error('Error: can not get job config from env')
            return None
        heartbeat = os.getenv('HEARTBEAT', "")
        rank_table_file = os.getenv('RANK_TABLE_FILE', "")
        identity = os.getenv('POD_NAME', "")
        if identity == "":
            identity = os.getenv('RANK_ID', "")
        checkpoint_dir = os.getenv('LOCAL_CHECKPOINT_DIR', "")
        # Can not get rank_size from env, fall back to the default "1".
        rank_size = os.getenv('RANK_SIZE', "1")
        if not rank_size.isdigit():
            print("set rank_size to default 1")
            rank_size = 1
        device_info = DeviceInfo(index=str(identity))
        job_info = JobInfo(job_id=job_id,
                           heartbeat_time=heartbeat,
                           device_info=device_info,
                           rank_table_file=rank_table_file,
                           local_checkpoint_dir=checkpoint_dir,
                           rank_size=int(rank_size))
        return job_info
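# A minimal sketch (hypothetical file name and values) of the JSON job file
# that _read_job_info parses above. Only the keys mirror the parsing logic:
# device_info.Index, rank_table_file, rank_size and the optional
# local_checkpoint_dir.
import json

sample_job_info = {
    "device_info": {"Index": 0},                  # non-negative device index
    "rank_table_file": "/path/to/rank_table.json",
    "rank_size": 8,                               # positive number of ranks
    "local_checkpoint_dir": "/tmp/checkpoints",   # optional
}

with open("job_config.json", "w", encoding="UTF-8") as f:
    json.dump(sample_job_info, f)

# job_info = trainer._read_job_info("job_config.json")  # assumes `trainer` owns the method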
def __init__(self,
             iterations_per_loop=1, profiling_config=None, model_dir=None,
             tf_random_seed=None, save_summary_steps=0,
             save_checkpoints_steps=None, save_checkpoints_secs=None,
             session_config=None, keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000, log_step_count_steps=100,
             distribute=None, enable_data_pre_proc=True, precision_mode=None,
             enable_reduce_precision=False, variable_format_optimize=True,
             mix_compile_mode=False, hcom_parallel=False,
             graph_memory_max_size=None, variable_memory_max_size=None,
             auto_tune_mode=None, dump_config=None,
             stream_max_parallel_num=None, is_tailing_optimization=False,
             horovod_mode=False, graph_run_mode=1, op_debug_level=0,
             enable_scope_fusion_passes=None, enable_exception_dump=0,
             op_select_implmode=None, optypelist_for_implmode=None,
             dynamic_input_config=None, mstune_mode=None, work_path=None,
             buffer_optimize="l2_optimize", enable_small_channel=0,
             fusion_switch_file=None, enable_compress_weight=False,
             compress_weight_conf=None, op_compiler_cache_mode=None,
             op_compiler_cache_dir=None, debug_dir=None,
             hcom_multi_mode=False, dynamic_input=False,
             dynamic_graph_execute_mode="dynamic_execute",
             dynamic_inputs_shape_range=None, train_distribute=None,
             eval_distribute=None, local_rank_id=None, local_device_list=None,
             session_device_id=None, distribute_config=None,
             op_tune_mode=None):
    """Constructs a NPURunConfig.

    Args:
        iterations_per_loop: The number of train steps running on the NPU
            before returning to the CPU host for each `Session.run`. This
            means the global step is increased `iterations_per_loop` times in
            one `Session.run`. It is recommended to set it to the number of
            global steps until the next checkpoint.
        profiling_config: The profiling configuration.
        model_dir: Directory where model parameters, graph, etc. are saved.
            If a `PathLike` object, the path will be resolved. If `None`, a
            default value set by the Estimator is used.
        tf_random_seed: Random seed for TensorFlow initializers. Setting this
            value allows consistency between reruns.
        save_summary_steps: Save summaries every this many steps.
        save_checkpoints_steps: Save checkpoints every this many steps.
            Cannot be specified together with `save_checkpoints_secs`.
        save_checkpoints_secs: Save checkpoints every this many seconds.
            Cannot be specified together with `save_checkpoints_steps`.
            Defaults to 600 seconds if neither `save_checkpoints_steps` nor
            `save_checkpoints_secs` is set in the constructor. If both are
            `None`, checkpoints are disabled.
        session_config: A ConfigProto used to set session parameters, or None.
        keep_checkpoint_max: The maximum number of recent checkpoint files to
            keep. As new files are created, older files are deleted. If None
            or 0, all checkpoint files are kept. Defaults to 5 (that is, the
            5 most recent checkpoint files are kept).
        keep_checkpoint_every_n_hours: Number of hours between each checkpoint
            to be saved. The default value of 10,000 hours effectively
            disables the feature.
        log_step_count_steps: The frequency, in number of global steps, that
            the global step/sec and the loss are logged during training.
        enable_data_pre_proc: The switch of data preprocessing.
        precision_mode: For training the default is allow_fp32_to_fp16; for
            inference the default is force_fp16.
        variable_format_optimize: Enable or disable variable format
            optimization during the graph engine optimization process.
        mix_compile_mode: The switch of mix_compile_mode. When the value is
            False, all graphs run on device. Otherwise, some graphs run on
            host.
        hcom_parallel: The switch of hcom parallel. When the value is True,
            hcom executes in parallel mode. Otherwise, hcom executes in
            serialized mode.
        graph_memory_max_size: The max size of the ge graph memory.
        variable_memory_max_size: The max size of the ge variable memory.
        auto_tune_mode: None, or `GA`, or `RL`, or `GA|RL`.
        dump_config: The dump configuration.
        stream_max_parallel_num: Specify the degree of parallelism of the
            AICPU / AICORE engines to achieve parallel execution between
            AICPU / AICORE operators.
        op_select_implmode: Select whether the operator is implemented with
            high precision or high performance.
        optypelist_for_implmode: Operator list.
        dynamic_input_config: Dynamic dims configuration.
        mstune_mode: Optimization task type. "1": model tune; "2": optune;
            "3": model tune & optune; "4": gradient split tune.
        work_path: Stores temporary files generated during optimization.
        buffer_optimize: Whether to enable buffer optimization.
        enable_small_channel: Whether to enable small channel optimization.
        fusion_switch_file: Fusion switch configuration file path.
        enable_compress_weight: Whether to enable global weight compression.
        compress_weight_conf: Path and file name of the node list
            configuration file to be compressed.
        dynamic_input: Whether the input is dynamic.
        dynamic_graph_execute_mode: Dynamic graph execute mode,
            lazy_recompile or dynamic_execute.
        dynamic_inputs_shape_range: Inputs shape range.
        local_rank_id: Local sequence number of the device in a group.
        local_device_list: Available devices.
        distribute_config: Specify the NCA configuration file path.
        op_tune_mode: None, or `GA`, or `RL`, or `GA|RL`; used with
            mstune_mode.
    """
    # Check iterations_per_loop.
    util.check_positive_integer(iterations_per_loop, "iterations_per_loop")
    if not isinstance(mix_compile_mode, bool):
        raise ValueError('"mix_compile_mode" type must be bool')
    if mix_compile_mode is True and iterations_per_loop != 1:
        raise ValueError(
            '"iterations_per_loop" must be 1 when "mix_compile_mode" is True')
    tf_config = json.loads(
        os.environ.get(run_config_lib._TF_CONFIG_ENV, '{}'))
    tmp_cluster_spec = server_lib.ClusterSpec(
        tf_config.get(run_config_lib._CLUSTER_KEY, {}))
    if ((tmp_cluster_spec
         and not isinstance(distribute, ParameterServerStrategy))
            or (not tmp_cluster_spec
                and isinstance(distribute, ParameterServerStrategy))):
        raise ValueError(
            '"cluster" and "distribute" must all be set in ps mode')
    if tmp_cluster_spec and mix_compile_mode is False:
        raise ValueError(
            '"mix_compile_mode" can only be True when "cluster" is set')
    self.iterations_per_loop = iterations_per_loop
    self.mix_compile_mode = mix_compile_mode
    self.enable_data_pre_proc = enable_data_pre_proc
    self.is_tailing_optimization = is_tailing_optimization
    if save_checkpoints_secs is None and save_checkpoints_steps is None:
        save_checkpoints_steps = 100
    self._profiling_config = profiling_config
    # Mixed precision configuration.
    self._precision_mode = precision_mode
    self._enable_reduce_precision = enable_reduce_precision
    self._variable_format_optimize = variable_format_optimize
    self._hcom_parallel = hcom_parallel
    self._graph_memory_max_size = graph_memory_max_size
    self._variable_memory_max_size = variable_memory_max_size
    self._auto_tune_mode = auto_tune_mode
    if dump_config is not None and not isinstance(dump_config, DumpConfig):
        raise ValueError(
            '`dump_config` must be provided with type `DumpConfig`')
    self._dump_config = dump_config
    self._stream_max_parallel_num = stream_max_parallel_num
    if not isinstance(horovod_mode, bool):
        raise ValueError('"horovod_mode" type must be bool')
    self.horovod_mode = horovod_mode
    util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
    if graph_run_mode > 1:
        raise ValueError('"graph_run_mode" value must be 0 or 1')
    self.graph_run_mode = graph_run_mode
    self.op_debug_level = op_debug_level
    self.enable_scope_fusion_passes = enable_scope_fusion_passes
    experimental_distribute = None
    if tmp_cluster_spec and isinstance(distribute, ParameterServerStrategy):
        experimental_distribute = DistributeConfig(distribute, distribute, None)
    util.check_nonnegative_integer(enable_exception_dump,
                                   "enable_exception_dump")
    self.enable_exception_dump = enable_exception_dump
    self._op_select_implmode = op_select_implmode
    self._optypelist_for_implmode = optypelist_for_implmode
    if dynamic_input_config is not None and not isinstance(
            dynamic_input_config, DynamicInputConfig):
        raise ValueError(
            'dynamic_input_config must be provided with type DynamicInputConfig')
    self._dynamic_input_config = dynamic_input_config
    self._mstune_mode = mstune_mode
    self._work_path = work_path
    self._buffer_optimize = buffer_optimize
    self._enable_small_channel = enable_small_channel
    self._fusion_switch_file = fusion_switch_file
    self._enable_compress_weight = enable_compress_weight
    self._compress_weight_conf = compress_weight_conf
    self._op_compiler_cache_mode = op_compiler_cache_mode
    self._op_compiler_cache_dir = op_compiler_cache_dir
    self._debug_dir = debug_dir
    self._hcom_multi_mode = hcom_multi_mode
    self._dynamic_input = dynamic_input
    self._dynamic_graph_execute_mode = dynamic_graph_execute_mode
    self._dynamic_inputs_shape_range = dynamic_inputs_shape_range
    self._local_rank_id = local_rank_id
    self._local_device_list = local_device_list
    self._session_device_id = session_device_id
    self._distribute_config = distribute_config
    self._op_tune_mode = op_tune_mode
    super(NPURunConfig, self).__init__(
        model_dir=model_dir,
        tf_random_seed=tf_random_seed,
        save_summary_steps=save_summary_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=save_checkpoints_secs,
        session_config=session_config,
        keep_checkpoint_max=keep_checkpoint_max,
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        log_step_count_steps=log_step_count_steps,
        experimental_distribute=experimental_distribute,
        train_distribute=train_distribute,
        eval_distribute=eval_distribute)
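# Usage sketch (illustrative, not from the original source): constructing an
# NPURunConfig with a few of the options documented above. The commented import
# path is an assumption; the constraint that iterations_per_loop must be 1
# whenever mix_compile_mode is True comes from the check at the top of __init__.
# from npu_bridge.estimator.npu.npu_config import NPURunConfig  # assumed path

run_config = NPURunConfig(
    iterations_per_loop=100,              # train steps per Session.run on the NPU
    save_checkpoints_steps=1000,
    precision_mode="allow_fp32_to_fp16",  # training default named in the docstring
    mix_compile_mode=False,               # must stay False while iterations_per_loop != 1
    model_dir="/tmp/npu_model")           # illustrative directory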
def __init__(self,
             iterations_per_loop=1, profiling_config=None, model_dir=None,
             tf_random_seed=None, save_summary_steps=0,
             save_checkpoints_steps=None, save_checkpoints_secs=None,
             session_config=None, keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000, log_step_count_steps=100,
             distribute=None, enable_data_pre_proc=True, precision_mode=None,
             enable_reduce_precision=False, variable_format_optimize=True,
             mix_compile_mode=False, hcom_parallel=False,
             graph_memory_max_size=None, variable_memory_max_size=None,
             auto_tune_mode=None, dump_config=None,
             stream_max_parallel_num=None, is_tailing_optimization=False,
             horovod_mode=False, graph_run_mode=1, op_debug_level=0,
             enable_scope_fusion_passes=None, enable_exception_dump=0,
             op_select_implmode=None, optypelist_for_implmode=None,
             dynamic_input_config=None, aoe_mode=None, work_path=None,
             buffer_optimize="l2_optimize", enable_small_channel=0,
             fusion_switch_file=None, enable_compress_weight=False,
             compress_weight_conf=None, op_compiler_cache_mode=None,
             op_compiler_cache_dir=None, debug_dir=None,
             hcom_multi_mode=False, dynamic_input=False,
             dynamic_graph_execute_mode="dynamic_execute",
             dynamic_inputs_shape_range=None, train_distribute=None,
             eval_distribute=None, local_rank_id=None, local_device_list=None,
             session_device_id=None, distribute_config=None,
             modify_mixlist=None, op_precision_mode=None,
             device_type="default_device_type", soc_config=None,
             hccl_timeout=None, op_wait_timeout=None, op_execute_timeout=None,
             HCCL_algorithm=None, customize_dtypes=None, op_debug_config=None,
             memory_config=None):
    """Constructs a NPURunConfig.

    Args:
        iterations_per_loop: The number of train steps running on the NPU
            before returning to the CPU host for each `Session.run`. This
            means the global step is increased `iterations_per_loop` times in
            one `Session.run`. It is recommended to set it to the number of
            global steps until the next checkpoint.
        profiling_config: The profiling configuration.
        model_dir: Refer to tensorflow tf.estimator.RunConfig model_dir.
        tf_random_seed: Refer to tensorflow tf.estimator.RunConfig
            tf_random_seed.
        save_summary_steps: Refer to tensorflow tf.estimator.RunConfig
            save_summary_steps.
        save_checkpoints_steps: Refer to tensorflow tf.estimator.RunConfig
            save_checkpoints_steps.
        save_checkpoints_secs: Refer to tensorflow tf.estimator.RunConfig
            save_checkpoints_secs.
        session_config: Refer to tensorflow tf.estimator.RunConfig
            session_config.
        keep_checkpoint_max: Refer to tensorflow tf.estimator.RunConfig
            keep_checkpoint_max.
        keep_checkpoint_every_n_hours: Refer to tensorflow
            tf.estimator.RunConfig keep_checkpoint_every_n_hours.
        log_step_count_steps: Refer to tensorflow tf.estimator.RunConfig
            log_step_count_steps.
        enable_data_pre_proc: The switch of data preprocessing.
        precision_mode: For training the default is allow_fp32_to_fp16; for
            inference the default is force_fp16.
        variable_format_optimize: Enable or disable variable format
            optimization during the graph engine optimization process.
        mix_compile_mode: The switch of mix_compile_mode. When the value is
            False, all graphs run on device. Otherwise, some graphs run on
            host.
        hcom_parallel: The switch of hcom parallel. When the value is True,
            hcom executes in parallel mode. Otherwise, hcom executes in
            serialized mode.
        graph_memory_max_size: The max size of the ge graph memory.
        variable_memory_max_size: The max size of the ge variable memory.
        auto_tune_mode: None, or `GA`, or `RL`, or `GA|RL`.
        dump_config: The dump configuration.
        stream_max_parallel_num: Specify the degree of parallelism of the
            AICPU / AICORE engines to achieve parallel execution between
            AICPU / AICORE operators.
        op_select_implmode: Select whether the operator is implemented with
            high_precision, high_performance, high_precision_for_all or
            high_performance_for_all.
        optypelist_for_implmode: Operator list.
        dynamic_input_config: Dynamic dims configuration.
        aoe_mode: Optimization task type. "1": model tune; "2": optune;
            "3": model tune & optune; "4": gradient split tune.
        work_path: Stores temporary files generated during optimization;
            defaults to the current path.
        buffer_optimize: Whether to enable buffer optimization.
        enable_small_channel: Whether to enable small channel optimization.
        fusion_switch_file: Fusion switch configuration file path.
        enable_compress_weight: Whether to enable global weight compression.
        compress_weight_conf: Path and file name of the node list
            configuration file to be compressed.
        dynamic_input: Whether the input is dynamic.
        dynamic_graph_execute_mode: Dynamic graph execute mode,
            lazy_recompile or dynamic_execute.
        dynamic_inputs_shape_range: Inputs shape range.
        local_rank_id: Local sequence number of the device in a group.
        local_device_list: Available devices.
        distribute_config: Specify the NCA configuration file path.
        modify_mixlist: Path of the operator mixed precision configuration
            file.
        op_precision_mode: Path of the operator precision mode configuration
            file (.ini).
    """
    # Check iterations_per_loop.
    util.check_positive_integer(iterations_per_loop, "iterations_per_loop")
    if not isinstance(mix_compile_mode, bool):
        raise ValueError('"mix_compile_mode" type must be bool')
    if mix_compile_mode is True and iterations_per_loop != 1:
        raise ValueError(
            '"iterations_per_loop" must be 1 when "mix_compile_mode" is True')
    tf_config = json.loads(
        os.environ.get(run_config_lib._TF_CONFIG_ENV, '{}'))
    tmp_cluster_spec = server_lib.ClusterSpec(
        tf_config.get(run_config_lib._CLUSTER_KEY, {}))
    if ((tmp_cluster_spec
         and not isinstance(distribute, ParameterServerStrategy))
            or (not tmp_cluster_spec
                and isinstance(distribute, ParameterServerStrategy))):
        raise ValueError(
            '"cluster" and "distribute" must all be set in ps mode')
    if tmp_cluster_spec and mix_compile_mode is False:
        raise ValueError(
            '"mix_compile_mode" can only be True when "cluster" is set')
    if aoe_mode is None and os.getenv("AOE_MODE") is None:
        self.iterations_per_loop = iterations_per_loop
    else:
        self.iterations_per_loop = 1
    self.mix_compile_mode = mix_compile_mode
    self.enable_data_pre_proc = enable_data_pre_proc
    self.is_tailing_optimization = is_tailing_optimization
    save_checkpoints_steps = self._get_save_checkpoints_steps(
        save_checkpoints_secs, save_checkpoints_steps)
    self._profiling_config = profiling_config
    # Mixed precision configuration.
    self._precision_mode = precision_mode
    self._enable_reduce_precision = enable_reduce_precision
    self._variable_format_optimize = variable_format_optimize
    self._hcom_parallel = hcom_parallel
    self._graph_memory_max_size = graph_memory_max_size
    self._variable_memory_max_size = variable_memory_max_size
    self._auto_tune_mode = auto_tune_mode
    self._dump_config = self._get_dump_config(dump_config)
    self._stream_max_parallel_num = stream_max_parallel_num
    self.horovod_mode = self._get_horovod_mode(horovod_mode)
    util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
    self.graph_run_mode = self._get_graph_run_mode(graph_run_mode)
    self.op_debug_level = op_debug_level
    self.enable_scope_fusion_passes = enable_scope_fusion_passes
    experimental_distribute = self._get_experimental_distribute(
        tmp_cluster_spec, distribute)
    util.check_nonnegative_integer(enable_exception_dump,
                                   "enable_exception_dump")
    self.enable_exception_dump = enable_exception_dump
    self._op_select_implmode = op_select_implmode
    self._optypelist_for_implmode = optypelist_for_implmode
    self._dynamic_input_config = self._get_dynamic_input_config(
        dynamic_input_config)
    self._aoe_mode = aoe_mode
    self._work_path = work_path
    self._buffer_optimize = buffer_optimize
    self._enable_small_channel = enable_small_channel
    self._fusion_switch_file = fusion_switch_file
    self._enable_compress_weight = enable_compress_weight
    self._compress_weight_conf = compress_weight_conf
    self._op_compiler_cache_mode = op_compiler_cache_mode
    self._op_compiler_cache_dir = op_compiler_cache_dir
    self._debug_dir = debug_dir
    self._hcom_multi_mode = hcom_multi_mode
    self._dynamic_input = dynamic_input
    self._dynamic_graph_execute_mode = dynamic_graph_execute_mode
    self._dynamic_inputs_shape_range = dynamic_inputs_shape_range
    self._local_rank_id = local_rank_id
    self._local_device_list = local_device_list
    self._session_device_id = session_device_id
    self._distribute_config = distribute_config
    self._modify_mixlist = modify_mixlist
    self._op_precision_mode = op_precision_mode
    self._device_type = device_type
    self._soc_config = soc_config
    self._hccl_timeout = hccl_timeout
    self._op_wait_timeout = op_wait_timeout
    self._op_execute_timeout = op_execute_timeout
    self._HCCL_algorithm = HCCL_algorithm
    self._customize_dtypes = customize_dtypes
    self._op_debug_config = op_debug_config
    self._memory_config = memory_config
    super(NPURunConfig, self).__init__(
        model_dir=model_dir,
        tf_random_seed=tf_random_seed,
        save_summary_steps=save_summary_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=save_checkpoints_secs,
        session_config=session_config,
        keep_checkpoint_max=keep_checkpoint_max,
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        log_step_count_steps=log_step_count_steps,
        experimental_distribute=experimental_distribute,
        train_distribute=train_distribute,
        eval_distribute=eval_distribute)
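# Usage sketch (illustrative, not from the original source): with this newer
# constructor, setting aoe_mode (or the AOE_MODE environment variable) forces
# iterations_per_loop back to 1, per the check at the top of __init__ above.
# The file paths below are hypothetical.
aoe_config = NPURunConfig(
    iterations_per_loop=100,                 # overridden to 1 because aoe_mode is set
    aoe_mode="2",                            # "2": optune, per the docstring
    work_path="./aoe_tmp",                   # temporary tuning files; defaults to the current path
    modify_mixlist="./ops_mixlist.json",     # operator mixed precision configuration file
    op_precision_mode="./op_precision.ini")  # operator precision mode configuration file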