Example #1
    def _has_event_file_been_skipped(self, missing_event_file_name: str) -> bool:
        """
        Check whether a missing event file will ever be downloaded.
            if the event file is present --> return False
            if the worker has written a later event file --> return True
            if neither of the above --> return False
        :param missing_event_file_name: name of the event file that was not found
        :return: True if the file should be skipped, False otherwise
        """
        self.logger.info(f"Index Reader: Event File {missing_event_file_name} not found.")
        missing_worker = parse_worker_name_from_file(missing_event_file_name)
        missing_step = IndexFileLocationUtils.parse_step_from_index_file_name(
            missing_event_file_name
        )
        event_files = self.list_event_files(missing_event_file_name)
        for event_file in event_files:
            if missing_worker == parse_worker_name_from_file(event_file):
                step = IndexFileLocationUtils.parse_step_from_index_file_name(event_file)
                if missing_step == step:
                    # The missing file may have been written to disk after we
                    # first looked but before the list operation above.
                    return False
                self.logger.warning(
                    f"Index Reader: Event File {missing_event_file_name} was written but not found. "
                    f"However, Event File {event_file} was found."
                )
                self.logger.warning(f"Index Reader: Skipping {missing_event_file_name}")
                return True
        return False
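
The rule above hinges entirely on the filename convention: every index/event file encodes a step number and a worker name, so seeing a later file from the same worker proves the missing one will never appear. Below is a minimal, self-contained sketch of that rule, with hypothetical parse_step/parse_worker helpers standing in for smdebug's parse_worker_name_from_file and IndexFileLocationUtils:

import re
from typing import List

def parse_step(file_name: str) -> int:
    # Hypothetical parser: assumes names like "000000012_worker_0.json"
    return int(re.search(r"(\d+)_", file_name).group(1))

def parse_worker(file_name: str) -> str:
    # Hypothetical parser: the part between the step prefix and ".json"
    return re.search(r"\d+_(.+)\.json", file_name).group(1)

def has_been_skipped(missing: str, listed: List[str]) -> bool:
    """Return True if the worker owning `missing` has already moved on."""
    worker, step = parse_worker(missing), parse_step(missing)
    for file_name in listed:
        if parse_worker(file_name) == worker:
            if parse_step(file_name) == step:
                return False  # the file appeared between the two checks
            return True  # same worker, different step: file was skipped
    return False  # no evidence either way; keep waiting

# The step 3 file never arrived, but worker_0 has written step 4:
print(has_been_skipped("000000003_worker_0.json", ["000000004_worker_0.json"]))  # True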
Example #2
    def _update_last_index_token(self, new_index_token: str) -> None:
        """
        This function updates the last_index_token in the following scenarios:
            1. last_complete_step >= last_index_token_step :
                this means that the token isn't pointing to the latest completed step
            2. number of steps available ( complete or incomplete ) - (last_completed_step+1) > window_size_limit:
                we maintain a window to stop querying for older steps that have not completed.
                if the total number of steps, we are querying for completion is greater than our window_size_limit
                we update the last_index_token and last_complete_step by (window_size_limit // 2)
        :param new_index_token:
        :return:None
        """
        if self.last_index_token is None:
            last_index_token_step = 0
        else:
            last_index_token_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)

        # Case 1: This case is not satisfied until all workers in a
        # distributed training job have written the step
        if self.last_complete_step >= last_index_token_step:
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            # sort lexicographically and select the last worker
            last_worker = sorted(list(self.worker_set))[-1]
            # below converts worker_name to serialized workerName
            # if it's a tf device, else no effect
            last_worker_serialized = serialize_tf_device(last_worker)
            self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
                prefix, self.last_complete_step, last_worker_serialized)
            self.logger.debug(
                f"Updated last index token to:{self.last_index_token}")

        # Case 2: This case is satisfied if the number of incomplete steps
        # is greater than the INCOMPLETE_STEP_WAIT_WINDOW
        available_step = self._global_to_mode.keys()
        if (len(available_step) - (self.last_complete_step + 1) >
                self._incomplete_wait_for_step_window):
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            last_worker = sorted(list(self.worker_set))[-1]
            # below converts worker_name to serialized workerName
            # if it's a tf device, else no effect
            last_worker_serialized = serialize_tf_device(last_worker)
            self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
                prefix,
                self.last_complete_step +
                (self._incomplete_wait_for_step_window // 2),
                last_worker_serialized,
            )
            self.last_complete_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)
            self.logger.info(
                f"Waiting for: {len(available_step) - (self.last_complete_step + 1)} Steps. \n"
                f"INCOMPLETE_STEP_WAIT_WINDOW: {self._incomplete_wait_for_step_window}. \n"
                f"Marking the last {self._incomplete_wait_for_step_window // 2} incomplete steps as complete"
                f"Updating last_index_token to: {self.last_index_token}. \n"
                f"Updating last_complete_step to: {self.last_complete_step}. ")
    def read_index_files(
        self, start_after_key: str, range_steps=None
    ) -> Tuple[List[bytes], list, str, List[str]]:
        """
        Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.
        :param start_after_key: key of the last index file read; reading resumes after it
        :param range_steps: optional range of steps to restrict the read to
        :return: Tuple(responses, steps, start_after_key, workers)
        """
        index_files = self.list_index_files()
        steps = []
        workers = []
        responses = []
        if start_after_key is not None:
            start_after_index = bisect_left(index_files, start_after_key)
        else:
            start_after_index = 0
        index_files = index_files[start_after_index:]  # ignore files we have already read
        for index_file in index_files:
            if self.index_file_cache.has_not_read(index_file):
                step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
                if range_steps is None or step_in_range(range_steps, step):
                    steps.append(step)
                    workers.append(parse_worker_name_from_file(index_file))
                    self.logger.debug(
                        f"Sagemaker-Debugger: Read {os.path.getsize(index_file)} bytes from file {index_file}"
                    )
                    with open(index_file) as f:
                        responses.append(f.read().encode())
                self.index_file_cache.add(index_file, start_after_key)
        if len(index_files) > 0:
            start_after_key = index_files[-1]  # last file that we have read
        return responses, steps, start_after_key, workers
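
Because list_index_files returns names in lexicographic order, bisect_left locates the resume point in O(log n) instead of scanning the whole listing. One subtlety: bisect_left returns the position of start_after_key itself, so the slice still contains the already-read file, and it is the index_file_cache that keeps it from being parsed twice. A small illustration with made-up names:

from bisect import bisect_left

# Made-up, lexicographically sorted index file names
index_files = [
    "index/000/000000000_worker_0.json",
    "index/000/000000001_worker_0.json",
    "index/000/000000002_worker_0.json",
]
start_after_key = index_files[1]  # last file read on the previous call

resume = bisect_left(index_files, start_after_key)
# The slice still begins at start_after_key itself; the read cache is
# what prevents that file from being parsed a second time.
print(index_files[resume:])
# ['index/000/000000001_worker_0.json', 'index/000/000000002_worker_0.json']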
    def read_index_files(
        self, start_after_key: str, range_steps=None
    ) -> Tuple[List[bytes], list, str, List[str]]:
        """
        Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.
        :param start_after_key: S3 key of the last index file read; listing resumes after it
        :param range_steps: optional range of steps to restrict the read to
        :return: Tuple(responses, steps, start_after_key, workers)
        """
        object_requests = []
        steps = []
        workers = []
        index_files, start_after_key = self.list_index_files(start_after_key)
        self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
        for index_file in index_files:
            if self.index_file_cache.has_not_read(index_file):
                step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
                if range_steps is None or step_in_range(range_steps, step):
                    steps.append(step)
                    workers.append(parse_worker_name_from_file(index_file))
                    object_requests.append(
                        ReadObjectRequest(f"s3://{self.bucket_name}/{index_file}")
                    )
                self.index_file_cache.add(index_file, start_after_key)

        responses = self.s3_handler.get_objects(object_requests)
        return responses, steps, start_after_key, workers
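
Unlike the local variant, this S3 variant never blocks inside the loop: it collects all the ReadObjectRequest objects first and hands them to s3_handler.get_objects in one batch, which lets the handler fetch many small JSON objects concurrently. A rough, hypothetical equivalent of that batching using plain boto3 and a thread pool (a sketch, not smdebug's actual handler):

from concurrent.futures import ThreadPoolExecutor
from typing import List

import boto3

def get_objects(bucket: str, keys: List[str]) -> List[bytes]:
    """Fetch many small S3 objects concurrently, preserving input order."""
    s3 = boto3.client("s3")

    def fetch(key: str) -> bytes:
        return s3.get_object(Bucket=bucket, Key=key)["Body"].read()

    # Index files are small JSON blobs, so wall-clock time is dominated
    # by request latency; a thread pool hides most of it.
    with ThreadPoolExecutor(max_workers=16) as pool:
        return list(pool.map(fetch, keys))

# Hypothetical bucket and key, for illustration only:
# responses = get_objects("my-training-bucket", ["trial_x/index/000/000000000_worker_0.json"])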