Example #1
    def _read_job_info(self, file_name):
        """Read the job information.
        Args:
            file_name: it's a json file which contains the job info from CSA.
        Returns:
            The job information.
        """
        try:
            with open(file_name, 'r', encoding='UTF-8') as f:
                content = f.read()
                data = json.loads(content)  # the "encoding" keyword was removed from json.loads in Python 3.9

                # 1. Get the device_info and check it.
                device_info = data.get('device_info')
                util_lib.check_not_none(device_info, 'device_info')

                index = device_info.get('Index', None)
                util_lib.check_nonnegative_integer(index, 'Index')

                # 2. Get the rank_table_file and check it.
                rank_table_file = data.get('rank_table_file', None)
                util_lib.check_not_none(rank_table_file, 'rank_table_file')

                # 3. Get the rank_size and check it.
                rank_size = data.get('rank_size', None)
                util_lib.check_positive_integer(rank_size, 'rank_size')

                # 4. Get the local_checkpoint_dir and check it.
                local_checkpoint_dir = data.get('local_checkpoint_dir', None)

                # 5. Init the JobInfo.
                device_info = DeviceInfo(index=str(index))
                job_info = JobInfo(device_info=device_info, rank_table_file=rank_table_file,
                                   local_checkpoint_dir=local_checkpoint_dir, rank_size=rank_size)
                return job_info
        except IOError:
            logging.warning('Job config file does not exist, falling back to environment variables')

            job_id = os.getenv('JOB_ID', "")
            if not job_id:
                logging.error('Cannot get job config from environment variables')
                return None

            heartbeat = os.getenv('HEARTBEAT', "")

            rank_table_file = os.getenv('RANK_TABLE_FILE', "")

            # Prefer POD_NAME as the device identity; fall back to RANK_ID.
            identity = os.getenv('POD_NAME', "")
            if not identity:
                identity = os.getenv('RANK_ID', "")

            checkpoint_dir = os.getenv('LOCAL_CHECKPOINT_DIR', "")

            # If RANK_SIZE cannot be read from the env, fall back to the default of 1.
            rank_size = os.getenv('RANK_SIZE', '1')
            if not rank_size.isdigit():
                logging.warning('RANK_SIZE is not a valid integer, setting rank_size to default 1')
                rank_size = '1'

            device_info = DeviceInfo(index=str(identity))
            job_info = JobInfo(job_id=job_id,
                               heartbeat_time=heartbeat,
                               device_info=device_info,
                               rank_table_file=rank_table_file,
                               local_checkpoint_dir=checkpoint_dir,
                               rank_size=int(rank_size)
                               )
        return job_info
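
For reference, below is a minimal sketch of the job-info JSON this reader would accept. The field names are taken from the data.get(...) calls in the method above; the concrete values and the file name job_config.json are hypothetical.

import json

# Hypothetical job-info payload; field names follow the reader above, values are placeholders.
job_info_example = {
    "device_info": {"Index": 0},                     # non-negative device index
    "rank_table_file": "/path/to/rank_table.json",   # required, must not be None
    "rank_size": 8,                                  # positive integer
    "local_checkpoint_dir": "/tmp/checkpoints",      # optional
}

with open("job_config.json", "w", encoding="UTF-8") as f:
    json.dump(job_info_example, f, indent=2)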
Example #2
    def __init__(self,
                 iterations_per_loop=1,
                 profiling_config=None,
                 model_dir=None,
                 tf_random_seed=None,
                 save_summary_steps=0,
                 save_checkpoints_steps=None,
                 save_checkpoints_secs=None,
                 session_config=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 distribute=None,
                 enable_data_pre_proc=True,
                 precision_mode=None,
                 enable_reduce_precision=False,
                 variable_format_optimize=True,
                 mix_compile_mode=False,
                 hcom_parallel=False,
                 graph_memory_max_size=None,
                 variable_memory_max_size=None,
                 auto_tune_mode=None,
                 dump_config=None,
                 stream_max_parallel_num=None,
                 is_tailing_optimization=False,
                 horovod_mode=False,
                 graph_run_mode=1,
                 op_debug_level=0,
                 enable_scope_fusion_passes=None,
                 enable_exception_dump=0,
                 op_select_implmode=None,
                 optypelist_for_implmode=None,
                 dynamic_input_config=None,
                 mstune_mode=None,
                 work_path=None,
                 buffer_optimize="l2_optimize",
                 enable_small_channel=0,
                 fusion_switch_file=None,
                 enable_compress_weight=False,
                 compress_weight_conf=None,
                 op_compiler_cache_mode=None,
                 op_compiler_cache_dir=None,
                 debug_dir=None,
                 hcom_multi_mode=False,
                 dynamic_input=False,
                 dynamic_graph_execute_mode="dynamic_execute",
                 dynamic_inputs_shape_range=None,
                 train_distribute=None,
                 eval_distribute=None,
                 local_rank_id=None,
                 local_device_list=None,
                 session_device_id=None,
                 distribute_config=None,
                 op_tune_mode=None):
        """
        Constructs an NPURunConfig.

        Args:
        iterations_per_loop: This is the number of train steps running in NPU
            system before returning to CPU host for each `Session.run`. This means
            global step is increased `iterations_per_loop` times in one `Session.run`.
            It is recommended to be set as number of global steps for next checkpoint.
        profiling_config: The profiling configuration.
        model_dir: Directory where model parameters, graph, etc are saved. If
            `PathLike` object, the path will be resolved. If `None`, will use a
            default value set by the Estimator.
        tf_random_seed: Random seed for TensorFlow initializers.
            Setting this value allows consistency between reruns.
        save_summary_steps: Save summaries every this many steps.
        save_checkpoints_steps: Save checkpoints every this many steps. Can not be
            specified with `save_checkpoints_secs`.
        save_checkpoints_secs: Save checkpoints every this many seconds. Can not
            be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
            both `save_checkpoints_steps` and `save_checkpoints_secs` are not set
            in constructor.  If both `save_checkpoints_steps` and
            `save_checkpoints_secs` are None, then checkpoints are disabled.
        session_config: A ConfigProto used to set session parameters, or None.
        keep_checkpoint_max: The maximum number of recent checkpoint files to
            keep. As new files are created, older files are deleted. If None or 0,
            all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
            checkpoint files are kept.)
        keep_checkpoint_every_n_hours: Number of hours between each checkpoint
            to be saved. The default value of 10,000 hours effectively disables
            the feature.
        log_step_count_steps: The frequency, in number of global steps, that the
            global step/sec and the loss will be logged during training.
        enable_data_pre_proc: This is the switch for data preprocessing.
        precision_mode: For training the default is allow_fp32_to_fp16; for inference the default is force_fp16.
        variable_format_optimize: Enable or disable variable format optimization during the
            graph engine optimization process.
        mix_compile_mode: This is the switch for mix_compile_mode. When the value is
            False, all graphs run on the device. Otherwise, some graphs run on the host.
        hcom_parallel: This is the switch for hcom parallelism. When the value is True,
            hcom executes in parallel mode. Otherwise, hcom executes in
            serialized mode.
        graph_memory_max_size: The maximum size of the GE graph memory.
        variable_memory_max_size: The maximum size of the GE variable memory.
        auto_tune_mode: None, `GA`, `RL`, or `GA|RL`.
        dump_config: The dump configuration.
        stream_max_parallel_num: Specify the degree of parallelism of the AICPU / AICORE engine
                                 to achieve parallel execution between AICPU / AICORE operators.
        op_select_implmode: Select whether the operator is implemented with high precision
                            or high performance.
        optypelist_for_implmode: Operator list.
        dynamic_input_config: Dynamic dims configuration.
        mstune_mode: Optimization task type. "1": model tune; "2": optune;
                     "3": model tune & optune; "4": gradient split tune.
        work_path: Stores temporary files generated during optimization.
        buffer_optimize: Whether to enable buffer optimization.
        enable_small_channel: Whether to enable small channel optimization.
        fusion_switch_file: Fusion switch configuration file path.
        enable_compress_weight: Whether to enable global weight compression.
        compress_weight_conf: Path and file name of the node list configuration file to be compressed.
        dynamic_input: Whether the input is dynamic.
        dynamic_graph_execute_mode: Dynamic graph execute mode, either lazy_recompile or dynamic_execute.
        dynamic_inputs_shape_range: Input shape range.
        local_rank_id: Local sequence number of the device in a group.
        local_device_list: Available devices.
        distribute_config: Specify the NCA configuration file path.
        op_tune_mode: None, `GA`, `RL`, or `GA|RL`; used together with mstune_mode.
        """

        # Check iterations_per_loop.
        util.check_positive_integer(iterations_per_loop, "iterations_per_loop")
        if not isinstance(mix_compile_mode, bool):
            raise ValueError('"mix_compile_mode" type must be bool')
        if mix_compile_mode is True and iterations_per_loop != 1:
            raise ValueError(
                '"iterations_per_loop" must be 1 with "mix_compile_mode" is True'
            )
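        # In parameter-server mode, the TF_CONFIG cluster spec and a
        # ParameterServerStrategy "distribute" must be set together, and
        # mix_compile_mode must be enabled.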
        tf_config = json.loads(
            os.environ.get(run_config_lib._TF_CONFIG_ENV, '{}'))
        tmp_cluster_spec = server_lib.ClusterSpec(
            tf_config.get(run_config_lib._CLUSTER_KEY, {}))
        if ((tmp_cluster_spec
             and not isinstance(distribute, ParameterServerStrategy))
                or (not tmp_cluster_spec
                    and isinstance(distribute, ParameterServerStrategy))):
            raise ValueError(
                '"cluster" and "distribute" must all be set in ps mode')
        if tmp_cluster_spec and mix_compile_mode is False:
            raise ValueError(
                '"mix_compile_mode" can only be True with "cluster" is set')

        self.iterations_per_loop = iterations_per_loop
        self.mix_compile_mode = mix_compile_mode
        self.enable_data_pre_proc = enable_data_pre_proc
        self.is_tailing_optimization = is_tailing_optimization
        if save_checkpoints_secs is None and save_checkpoints_steps is None:
            save_checkpoints_steps = 100

        self._profiling_config = profiling_config

        # mix precision configuration
        self._precision_mode = precision_mode
        self._enable_reduce_precision = enable_reduce_precision
        self._variable_format_optimize = variable_format_optimize
        self._hcom_parallel = hcom_parallel
        self._graph_memory_max_size = graph_memory_max_size
        self._variable_memory_max_size = variable_memory_max_size

        self._auto_tune_mode = auto_tune_mode

        if dump_config is not None and not isinstance(dump_config, DumpConfig):
            raise ValueError(
                '`dump_config` must be provided with type `DumpConfig`')
        self._dump_config = dump_config
        self._stream_max_parallel_num = stream_max_parallel_num

        if not isinstance(horovod_mode, bool):
            raise ValueError('"horovod_mode" type must be bool')
        self.horovod_mode = horovod_mode
        util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
        if graph_run_mode > 1:
            raise ValueError('"graph_run_mode" value must be 0 or 1')
        self.graph_run_mode = graph_run_mode
        self.op_debug_level = op_debug_level
        self.enable_scope_fusion_passes = enable_scope_fusion_passes
        experimental_distribute = None
        if tmp_cluster_spec and isinstance(distribute,
                                           ParameterServerStrategy):
            experimental_distribute = DistributeConfig(distribute, distribute,
                                                       None)
        util.check_nonnegative_integer(enable_exception_dump,
                                       "enable_exception_dump")
        self.enable_exception_dump = enable_exception_dump
        self._op_select_implmode = op_select_implmode
        self._optypelist_for_implmode = optypelist_for_implmode
        if dynamic_input_config is not None and not isinstance(
                dynamic_input_config, DynamicInputConfig):
            raise ValueError(
                'dynamic_input_config must be provided with type DynamicInputConfig'
            )
        self._dynamic_input_config = dynamic_input_config
        self._mstune_mode = mstune_mode
        self._work_path = work_path
        self._buffer_optimize = buffer_optimize
        self._enable_small_channel = enable_small_channel
        self._fusion_switch_file = fusion_switch_file
        self._enable_compress_weight = enable_compress_weight
        self._compress_weight_conf = compress_weight_conf
        self._op_compiler_cache_mode = op_compiler_cache_mode
        self._op_compiler_cache_dir = op_compiler_cache_dir
        self._debug_dir = debug_dir
        self._hcom_multi_mode = hcom_multi_mode
        self._dynamic_input = dynamic_input
        self._dynamic_graph_execute_mode = dynamic_graph_execute_mode
        self._dynamic_inputs_shape_range = dynamic_inputs_shape_range
        self._local_rank_id = local_rank_id
        self._local_device_list = local_device_list
        self._session_device_id = session_device_id
        self._distribute_config = distribute_config
        self._op_tune_mode = op_tune_mode

        super(NPURunConfig, self).__init__(
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=save_summary_steps,
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=save_checkpoints_secs,
            session_config=session_config,
            keep_checkpoint_max=keep_checkpoint_max,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            log_step_count_steps=log_step_count_steps,
            experimental_distribute=experimental_distribute,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute)
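
A hedged usage sketch of this constructor: the import path below is the one commonly used for the NPU TensorFlow adapter but may differ between releases, and the directories and step counts are placeholder values. It follows the docstring's recommendation to align iterations_per_loop with the checkpoint interval.

# Hypothetical usage sketch; import path, paths, and step counts are assumptions.
from npu_bridge.estimator.npu.npu_config import NPURunConfig

run_config = NPURunConfig(
    model_dir="./model_ckpt",             # placeholder checkpoint directory
    iterations_per_loop=100,              # train steps per Session.run on the NPU
    save_checkpoints_steps=100,           # aligned with iterations_per_loop, per the docstring
    precision_mode="allow_fp32_to_fp16",  # training default named in the docstring
    enable_data_pre_proc=True,
    hcom_parallel=True,
)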
Example #3
    def __init__(self,
                 iterations_per_loop=1,
                 profiling_config=None,
                 model_dir=None,
                 tf_random_seed=None,
                 save_summary_steps=0,
                 save_checkpoints_steps=None,
                 save_checkpoints_secs=None,
                 session_config=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 distribute=None,
                 enable_data_pre_proc=True,
                 precision_mode=None,
                 enable_reduce_precision=False,
                 variable_format_optimize=True,
                 mix_compile_mode=False,
                 hcom_parallel=False,
                 graph_memory_max_size=None,
                 variable_memory_max_size=None,
                 auto_tune_mode=None,
                 dump_config=None,
                 stream_max_parallel_num=None,
                 is_tailing_optimization=False,
                 horovod_mode=False,
                 graph_run_mode=1,
                 op_debug_level=0,
                 enable_scope_fusion_passes=None,
                 enable_exception_dump=0,
                 op_select_implmode=None,
                 optypelist_for_implmode=None,
                 dynamic_input_config=None,
                 aoe_mode=None,
                 work_path=None,
                 buffer_optimize="l2_optimize",
                 enable_small_channel=0,
                 fusion_switch_file=None,
                 enable_compress_weight=False,
                 compress_weight_conf=None,
                 op_compiler_cache_mode=None,
                 op_compiler_cache_dir=None,
                 debug_dir=None,
                 hcom_multi_mode=False,
                 dynamic_input=False,
                 dynamic_graph_execute_mode="dynamic_execute",
                 dynamic_inputs_shape_range=None,
                 train_distribute=None,
                 eval_distribute=None,
                 local_rank_id=None,
                 local_device_list=None,
                 session_device_id=None,
                 distribute_config=None,
                 modify_mixlist=None,
                 op_precision_mode=None,
                 device_type="default_device_type",
                 soc_config=None,
                 hccl_timeout=None,
                 op_wait_timeout=None,
                 op_execute_timeout=None,
                 HCCL_algorithm=None,
                 customize_dtypes=None,
                 op_debug_config=None,
                 memory_config=None):
        """
        Constructs an NPURunConfig.

        Args:
        iterations_per_loop: This is the number of train steps running in NPU
            system before returning to CPU host for each `Session.run`. This means
            global step is increased `iterations_per_loop` times in one `Session.run`.
            It is recommended to be set as number of global steps for next checkpoint.
        profiling_config: The profiling configuration.
        model_dir: Reference tensorflow tf.estimator.RunConfig model_dir.
        tf_random_seed: Reference tensorflow tf.estimator.RunConfig tf_random_seed.
        save_summary_steps: Reference tensorflow tf.estimator.RunConfig save_summary_steps.
        save_checkpoints_steps: Reference tensorflow tf.estimator.RunConfig save_checkpoints_steps.
        save_checkpoints_secs: Reference tensorflow tf.estimator.RunConfig save_checkpoints_secs.
        session_config: Reference tensorflow tf.estimator.RunConfig session_config.
        keep_checkpoint_max: Reference tensorflow tf.estimator.RunConfig keep_checkpoint_max.
        keep_checkpoint_every_n_hours: Reference tensorflow tf.estimator.RunConfig keep_checkpoint_every_n_hours.
        log_step_count_steps: Reference tensorflow tf.estimator.RunConfig log_step_count_steps.
        enable_data_pre_proc: This is the switch for data preprocessing.
        precision_mode: For training the default is allow_fp32_to_fp16; for inference the default is force_fp16.
        variable_format_optimize: Enable or disable variable format optimization during the
            graph engine optimization process.
        mix_compile_mode: This is the switch for mix_compile_mode. When the value is
            False, all graphs run on the device. Otherwise, some graphs run on the host.
        hcom_parallel: This is the switch for hcom parallelism. When the value is True,
            hcom executes in parallel mode. Otherwise, hcom executes in
            serialized mode.
        graph_memory_max_size: The maximum size of the GE graph memory.
        variable_memory_max_size: The maximum size of the GE variable memory.
        auto_tune_mode: None, `GA`, `RL`, or `GA|RL`.
        dump_config: The dump configuration.
        stream_max_parallel_num: Specify the degree of parallelism of the AICPU / AICORE engine
                                 to achieve parallel execution between AICPU / AICORE operators.
        op_select_implmode: Select whether the operator is implemented with high_precision,
                            high_performance, high_precision_for_all, or high_performance_for_all.
        optypelist_for_implmode: Operator list.
        dynamic_input_config: Dynamic dims configuration.
        aoe_mode: Optimization task type. "1": model tune; "2": optune;
                     "3": model tune & optune; "4": gradient split tune.
        work_path: Stores temporary files generated during optimization, default is current path.
        buffer_optimize: Whether to enable buffer optimization.
        enable_small_channel: Whether to enable small channel optimization.
        fusion_switch_file: Fusion switch configuration file path.
        enable_compress_weight: Whether to enable global weight compression.
        compress_weight_conf: Path and file name of the node list configuration file to be compressed.
        dynamic_input: Whether the input is dynamic.
        dynamic_graph_execute_mode: Dynamic graph execute mode, either lazy_recompile or dynamic_execute.
        dynamic_inputs_shape_range: Input shape range.
        local_rank_id: Local sequence number of the device in a group.
        local_device_list: Available devices.
        distribute_config: Specify the NCA configuration file path.
        modify_mixlist: Set the path of the operator mixed precision configuration file.
        op_precision_mode: Set the path of the operator precision mode configuration file (.ini).
        """

        # Check iterations_per_loop.
        util.check_positive_integer(iterations_per_loop, "iterations_per_loop")
        if not isinstance(mix_compile_mode, bool):
            raise ValueError('"mix_compile_mode" type must be bool')
        if mix_compile_mode is True and iterations_per_loop != 1:
            raise ValueError(
                '"iterations_per_loop" must be 1 with "mix_compile_mode" is True'
            )
        tf_config = json.loads(
            os.environ.get(run_config_lib._TF_CONFIG_ENV, '{}'))
        tmp_cluster_spec = server_lib.ClusterSpec(
            tf_config.get(run_config_lib._CLUSTER_KEY, {}))
        if ((tmp_cluster_spec
             and not isinstance(distribute, ParameterServerStrategy))
                or (not tmp_cluster_spec
                    and isinstance(distribute, ParameterServerStrategy))):
            raise ValueError(
                '"cluster" and "distribute" must all be set in ps mode')
        if tmp_cluster_spec and mix_compile_mode is False:
            raise ValueError(
                '"mix_compile_mode" can only be True with "cluster" is set')
        if aoe_mode is None and os.getenv("AOE_MODE") is None:
            self.iterations_per_loop = iterations_per_loop
        else:
            self.iterations_per_loop = 1
        self.mix_compile_mode = mix_compile_mode
        self.enable_data_pre_proc = enable_data_pre_proc
        self.is_tailing_optimization = is_tailing_optimization
        save_checkpoints_steps = self._get_save_checkpoints_steps(
            save_checkpoints_secs, save_checkpoints_steps)
        self._profiling_config = profiling_config

        # mix precision configuration
        self._precision_mode = precision_mode
        self._enable_reduce_precision = enable_reduce_precision
        self._variable_format_optimize = variable_format_optimize
        self._hcom_parallel = hcom_parallel
        self._graph_memory_max_size = graph_memory_max_size
        self._variable_memory_max_size = variable_memory_max_size

        self._auto_tune_mode = auto_tune_mode
        self._dump_config = self._get_dump_config(dump_config)
        self._stream_max_parallel_num = stream_max_parallel_num

        self.horovod_mode = self._get_horovod_mode(horovod_mode)
        util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
        self.graph_run_mode = self._get_graph_run_mode(graph_run_mode)
        self.op_debug_level = op_debug_level
        self.enable_scope_fusion_passes = enable_scope_fusion_passes
        experimental_distribute = self._get_experimental_distribute(
            tmp_cluster_spec, distribute)
        util.check_nonnegative_integer(enable_exception_dump,
                                       "enable_exception_dump")
        self.enable_exception_dump = enable_exception_dump
        self._op_select_implmode = op_select_implmode
        self._optypelist_for_implmode = optypelist_for_implmode

        self._dynamic_input_config = self._get_dynamic_input_config(
            dynamic_input_config)
        self._aoe_mode = aoe_mode
        self._work_path = work_path
        self._buffer_optimize = buffer_optimize
        self._enable_small_channel = enable_small_channel
        self._fusion_switch_file = fusion_switch_file
        self._enable_compress_weight = enable_compress_weight
        self._compress_weight_conf = compress_weight_conf
        self._op_compiler_cache_mode = op_compiler_cache_mode
        self._op_compiler_cache_dir = op_compiler_cache_dir
        self._debug_dir = debug_dir
        self._hcom_multi_mode = hcom_multi_mode
        self._dynamic_input = dynamic_input
        self._dynamic_graph_execute_mode = dynamic_graph_execute_mode
        self._dynamic_inputs_shape_range = dynamic_inputs_shape_range
        self._local_rank_id = local_rank_id
        self._local_device_list = local_device_list
        self._session_device_id = session_device_id
        self._distribute_config = distribute_config
        self._modify_mixlist = modify_mixlist
        self._op_precision_mode = op_precision_mode
        self._device_type = device_type
        self._soc_config = soc_config
        self._hccl_timeout = hccl_timeout
        self._op_wait_timeout = op_wait_timeout
        self._op_execute_timeout = op_execute_timeout
        self._HCCL_algorithm = HCCL_algorithm
        self._customize_dtypes = customize_dtypes
        self._op_debug_config = op_debug_config
        self._memory_config = memory_config

        super(NPURunConfig, self).__init__(
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=save_summary_steps,
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=save_checkpoints_secs,
            session_config=session_config,
            keep_checkpoint_max=keep_checkpoint_max,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            log_step_count_steps=log_step_count_steps,
            experimental_distribute=experimental_distribute,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute)
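
This later variant replaces mstune_mode/op_tune_mode with aoe_mode and adds options such as modify_mixlist, op_precision_mode, and hccl_timeout. A hedged sketch under the same import-path assumption; note that when aoe_mode (or the AOE_MODE environment variable) is set, the constructor forces iterations_per_loop to 1.

# Hypothetical usage sketch for the newer constructor; all values are placeholders.
from npu_bridge.estimator.npu.npu_config import NPURunConfig

run_config = NPURunConfig(
    iterations_per_loop=10,                   # forced to 1 because aoe_mode is set
    aoe_mode="2",                             # "2": optune, per the docstring
    work_path="./aoe_work",                   # placeholder directory for tuning artifacts
    modify_mixlist="./ops_mixlist.json",      # placeholder mixed-precision config file
    op_precision_mode="./op_precision.ini",   # placeholder operator precision-mode .ini
    hccl_timeout=600,                         # placeholder HCCL timeout in seconds
)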