示例#1
0
    def _initialize_writers(self, only_initialize_if_missing=False) -> None:
        """Create the ``FileWriter`` for the current step if this process should write.

        Function is overridden in smdebug/tensorflow/base_hook.py.

        No writer is created when any of the following holds:
          * ``only_initialize_if_missing`` is set and ``self.writer`` is already truthy,
          * ``self.dry_run`` is set,
          * ``self.first_process`` is already ``False``,
          * ``self.first_process`` is undecided (``None``), a single worker is
            reported, and ``is_first_process(self.out_dir)`` returns a falsy value,
          * ``self.save_all_workers`` is ``False`` and this worker is not the
            chief worker.

        When ``self.first_process`` is still ``None`` and ``_get_num_workers()``
        returns 1, the result of ``is_first_process`` is cached in
        ``self.first_process`` so the check is not repeated on later calls.

        Args:
            only_initialize_if_missing: If True, keep an existing writer
                instead of replacing it with a new one.
        """
        if only_initialize_if_missing and self.writer:
            return
        if self.dry_run:
            return
        if self.first_process is False:
            return

        # Only decide "first process" once, and only when a single worker is
        # reported; with more workers the decision is left to the chief-worker
        # check below.
        if self.first_process is None and self._get_num_workers() == 1:
            if not is_first_process(self.out_dir):
                # Build the message from adjacent literals: a backslash
                # continuation inside the string would embed the source
                # indentation into the logged text.
                self.logger.warning(
                    "Unsupported Distributed Training Strategy Detected. "
                    "Sagemaker-Debugger will only write from one process. "
                    f"The process with pid: {os.getpid()} will not be writing any data. \n"
                )
                self.first_process = False
                return
            self.first_process = True
            self.logger.info(
                f"Hook is writing from the hook with pid: {os.getpid()}\n"
            )

        if self.save_all_workers is False and self.worker != self.chief_worker:
            return

        self.writer = FileWriter(
            trial_dir=self.out_dir,
            step=self.step,
            worker=self.worker,
        )
def test_is_first_process(dir):
    """Check ``is_first_process`` on an S3 path, then run the local-path helper repeatedly."""
    # S3 case: the call must return a truthy value for this path.
    remote_location = "s3://this/is/a/valid/path"
    assert is_first_process(remote_location)

    # Local case: run the helper against the provided directory ten times.
    repetitions = 10
    for _attempt in range(repetitions):
        helper_test_is_first_process(dir)
    def write_tf_dataloader_flag(self, flag_filename):
        """If dataloader metrics collection is enabled, then write a .tmp file with the provided flag_filename such
        that it has this path: <local_path>/<node_id>/<flag_filename>. We simply create the file but never close the
        writer, since we don't want the file to be uploaded to s3.

        If flag_filename is TF_DATALOADER_START_FLAG_FILENAME, we are signaling that dataloader metrics should be
        collected now. If flag_filename is TF_DATALOADER_END_FLAG_FILENAME, we are signaling that dataloader metrics
        should not be collected anymore. In AWS TF, we will collect dataloader metrics when only
        TF_DATALOADER_START_FLAG_FILENAME exists and not collect dataloader metrics when neither or both flags exist.

        Return True if writing the flag was successful, False if unsuccessful. False is also returned, without
        touching the filesystem, when profiling or dataloader profiling is disabled.
        """
        if not self.profiling_enabled or not self.config.dataloader_profiling_config.is_enabled():
            # Nothing is written in this case; report False explicitly so the
            # return value is always a boolean as documented (was None).
            return False

        tf_dataloader_flag_path = os.path.join(
            self.config.local_path, get_node_id_from_resource_config(), flag_filename
        )
        # is_first_process is used here to create the flag file at the given
        # path; its result is what gets reported on success.
        success = is_first_process(tf_dataloader_flag_path, is_dir=False)

        # Verify the flag actually exists on disk before reporting the result.
        if not os.path.isfile(tf_dataloader_flag_path):
            self.logger.error(f"Could not write flag to: {tf_dataloader_flag_path}!")
            return False

        return success