class BaseHook:
    __metaclass__ = ABCMeta

    def __init__(
        self,
        collection_manager: CollectionManager,
        default_include_collections: List[str],
        profiler_config_parser: ProfilerConfigParser,
        init_step: int = 0,
        out_dir: Optional[str] = None,
        export_tensorboard: bool = False,
        tensorboard_dir: Optional[str] = None,
        dry_run: bool = False,
        reduction_config: Optional[ReductionConfig] = None,
        save_config: Optional[Union[SaveConfig, Dict[ModeKeys, SaveConfigMode]]] = None,
        include_regex: Optional[List[str]] = None,
        include_collections: Optional[List[str]] = None,
        save_all: bool = False,
        include_workers: str = "one",
    ):
        """
        A class used to represent the hook which gets attached to the training
        process. It takes the form appropriate for the framework, such as
        tf.train.SessionRunHook for TF sessions or a Callback for Keras.

        Attributes
        ----------
        out_dir : str
            Path into which outputs will be written. The hook raises an error
            if 'out_dir' already exists: merging the tensors generated by the
            current job with tensors from a previous job is not supported,
            so ensure that 'out_dir' does not exist before the job starts.
        dry_run : bool
            When dry_run is set, the intended behavior is only described in the
            log file; tensors are not actually saved.
        save_config : SaveConfig object
            Save config object applied as the default for all included tensors.
            A collection can optionally have its own save config object, which
            overrides this one for its tensors.
        reduction_config : ReductionConfig object
            If passed, this reduction config object is used as the default for
            all included tensors. A collection can optionally have its own
            reduction config object, which overrides this one for its tensors.
            If this is not passed, tensors are saved in full.
        include_regex : list of str
            List of strings representing regular expressions. Tensors whose
            names match these regular expressions will be saved and will be
            available as part of the `default` collection.
        include_collections : list of str
            Names of the collections that should be saved. If this is empty,
            it defaults to including all collections from code.
        save_all : bool
            A shortcut for saving all tensors in the model. They are all saved
            in the collection `all`.
        include_workers : str
            If set to "all", the hook saves data from all workers.
        profiler_config_parser : ProfilerConfigParser object
            If passed, use this profiler configuration. By default a new
            profiler configuration is set up here.
        """
        error_handling_agent.set_hook(self)  # This should be the first line in the constructor.
        self.out_dir = verify_and_get_out_dir(out_dir)
        self.tensorboard_dir = get_tensorboard_dir(
            export_tensorboard=export_tensorboard,
            tensorboard_dir=tensorboard_dir,
            out_dir=self.out_dir,
        )
        self.dry_run = dry_run
        self.worker = None
        # When smdebug is used during an unsupported distributed training process,
        # we write data only from the process that has self.first_process set to True.
        self.first_process = None
        self.save_all_workers = True if include_workers == "all" else False
        self.chief_worker = DEFAULT_WORKER_NAME

        if include_collections is None:
            include_collections = default_include_collections
        else:
            include_collections = flatten(include_collections)
        self.include_collections = list(
            set(include_collections).union(set(default_include_collections))
        )

        self.save_all = save_all
        self.save_config = SaveConfig.parse(save_config)
        if reduction_config is None:
            reduction_config = ReductionConfig(save_raw_tensor=True)
        self.reduction_config = reduction_config
        self.include_regex = include_regex
        self.collection_manager = collection_manager
        self.init_step = init_step

        # The written_tensor_name_for_step dictionary stores
        # the names of each tensor saved for every step.
        # This is to detect name clashes.
        # If a name clash is detected, it is avoided by appending
        # an index to the tensor name.
        self.written_tensor_name_for_step = defaultdict(int)

        self.logger = logger

        if self.tensorboard_dir is None:
            self.logger.info(
                "tensorboard_dir has not been set for the hook. "
                "SMDebug will not be exporting tensorboard summaries."
            )

        if include_regex is not None:
            collection_manager.get(CollectionKeys.DEFAULT).include(include_regex)
            if CollectionKeys.DEFAULT not in self.include_collections:
                self.include_collections.append(CollectionKeys.DEFAULT)

        if self.save_all:
            collection_manager.get(CollectionKeys.ALL).include(".*")
            if CollectionKeys.ALL not in self.include_collections:
                self.include_collections.append(CollectionKeys.ALL)

        if (
            CollectionKeys.DEFAULT not in self.include_collections
            and collection_manager.get(CollectionKeys.DEFAULT).include_regex
        ):
            self.logger.warning(
                "The `default` collection was not passed to "
                "include_collections. So it is not being saved"
            )

        self._collections_to_save = set()
        self._collections_to_save_for_step = None
        self.prepared_collections = False
        self.tensor_to_collections = {}

        self.step = init_step
        self.last_saved_step = None
        self.mode = ModeKeys.GLOBAL
        self.mode_steps = {ModeKeys.GLOBAL: init_step}
        self.writer = None

        self.profiler_config_parser = profiler_config_parser
        self.profiler_config_parser.load_config()

        self.timeline_writer = TimelineFileWriter(profiler_config_parser=profiler_config_parser)
        self.hvd_reader = None
        self.is_smdataparallel_profiling = False

        if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
            self.metrics_writer = SageMakerFileMetricsWriter()
        else:
            self.metrics_writer = None

        # Maps ModeKeys to FileWriter objects
        self.tb_writers = {}

        # Cache scalars that are being saved through save_scalar() calls
        self.scalar_cache = []

        self.logger.info("Saving to {}".format(self.out_dir))
        atexit.register(self._cleanup)

        # Check if there is any last saved state. Initialize the hook based on the last saved state.
        self.training_run = 0
        self._initialize_to_last_saved_state()
        self.custom_tensors_to_save = dict()
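    # Illustrative sketch (not executed here): how a concrete framework hook built on
    # BaseHook is typically constructed. The PyTorch import path is one example of a
    # framework subclass; adjust for your framework. The mode-wise dict shows how
    # `save_config` accepts either a SaveConfig or a {ModeKeys: SaveConfigMode} mapping.
    #
    #   import smdebug.pytorch as smd
    #
    #   hook = smd.Hook(
    #       out_dir="/tmp/smdebug_run",  # must not already exist
    #       save_config={
    #           smd.modes.TRAIN: smd.SaveConfigMode(save_interval=100),
    #           smd.modes.EVAL: smd.SaveConfigMode(save_interval=10),
    #       },
    #       reduction_config=smd.ReductionConfig(norms=["l2"]),  # save l2 norms, not full tensors
    #       include_collections=["losses", "gradients"],
    #   )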
    # This will avoid pickling of the BaseHook object
    def __getstate__(self):
        return {}

    def _initialize_to_last_saved_state(self):
        self.state_store = StateStore()
        last_state = self.state_store.get_last_saved_state()
        if last_state is not None:
            self.last_saved_step = last_state[LATEST_GLOBAL_STEP_SAVED]
            self.init_step = last_state[LATEST_GLOBAL_STEP_SEEN]
            self.training_run = 1 + last_state[TRAINING_RUN]
            for (mode, step) in last_state[LATEST_MODE_STEP].items():
                self.mode_steps[ModeKeys[mode]] = step
            self.mode_steps[ModeKeys.GLOBAL] = self.init_step
            self.step = self.init_step
            self.logger.info(
                f"Initialized the hook with the last saved state: "
                f"last_saved_step={self.last_saved_step}, init_step={self.init_step}, "
                f"step={self.step}, mode_steps={str(self.mode_steps)}"
            )

    def __repr__(self):
        return (
            f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}>:(\n"
            f"    out_dir={self.out_dir},\n"
            f"    tensorboard_dir={self.tensorboard_dir},\n"
            f"    step={self.step},\n"
            f"    mode={self.mode},\n"
            f"    mode_steps={self.mode_steps},\n"
            f"    include_collections={self.include_collections},\n"
            f"    writer={self.writer},\n"
            f"    save_config={str(self.save_config)[:200]} ...>,\n"
            f"    reduction_config={str(self.reduction_config)},\n"
            f"    save_all={self.save_all},\n"
            f"    dry_run={self.dry_run},\n"
            f")"
        )

    @classmethod
    def create_from_json_file(cls, json_file_path=None):
        """Relies on the existence of a JSON file.

        First, check json_file_path. If it's not None and the file exists, use
        it; if the file does not exist, throw an error.
        Otherwise, check the file path set by a SageMaker environment variable.
        If that file exists, use it. Otherwise, return None.
        """
        return create_hook_from_json_config(cls, json_config_path=json_file_path)
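    # Illustrative sketch: on SageMaker, the JSON config path is exposed through an
    # environment variable, so no argument is needed. Outside SageMaker, a local JSON
    # file path can be passed explicitly. `MyFrameworkHook` is a placeholder for a
    # concrete subclass such as smdebug.pytorch.Hook.
    #
    #   hook = MyFrameworkHook.create_from_json_file()  # inside a SageMaker job
    #   hook = MyFrameworkHook.create_from_json_file("/path/to/smdebug_config.json")
    #   if hook is None:
    #       pass  # no config found; run without debugging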
    @abstractmethod
    def _get_worker_name(self):
        pass

    @abstractmethod
    def _get_num_workers(self):
        pass

    @abstractmethod
    def _is_not_supported(self):
        pass

    #### Save Manager methods ####

    def _should_collection_be_saved(self, coll_name: str) -> bool:
        return coll_name in self.include_collections

    def _assert_prep(self):
        assert self.prepared_collections, "Collections have not been prepared yet"

    def _get_all_collections_to_save(self) -> Set["Collection"]:
        self._assert_prep()
        return self._collections_to_save

    @error_handling_agent.catch_smdebug_errors(default_return_val=False)
    def _is_collection_being_saved_for_step(self, name):
        # if saving all, all collections will be part of colls_for_step
        colls_for_step = self._get_collections_to_save_for_step()
        return self.collection_manager.get(name) in colls_for_step

    def _get_collections_to_save_for_step(self) -> Set["Collection"]:
        if self._collections_to_save_for_step is None:
            self._assert_prep()
            self._collections_to_save_for_step = set()
            for coll in self._get_all_collections_to_save():
                if self.mode in [ModeKeys.EVAL, ModeKeys.PREDICT]:
                    if coll.name in [CollectionKeys.GRADIENTS, CollectionKeys.OPTIMIZER_VARIABLES]:
                        continue
                if coll.save_config.should_save_step(self.mode, self.mode_steps[self.mode]):
                    self._collections_to_save_for_step.add(coll)

            if self._collections_to_save_for_step:
                if self.mode == ModeKeys.GLOBAL:
                    step_str = f"for step {self.step}"
                else:
                    step_str = f"for step {self.mode_steps[self.mode]} of mode {self.mode.name}"
                self.logger.debug(
                    f"Saving the collections "
                    f"{', '.join([x.name for x in self._collections_to_save_for_step])} {step_str}"
                )
        return self._collections_to_save_for_step

    def _is_tensor_saved_for_step(self, tensor_name):
        collections_to_save = self._get_collections_to_save_for_step()
        for c in collections_to_save:
            if match_inc(tensor_name, c.include_regex):
                return True
        return False

    def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]:
        self._assert_prep()
        # for tf this will be prepopulated in check_and_add_tensor
        if tensor_name not in self.tensor_to_collections:
            # for mxnet it is computed and then cached
            matched_colls = set()
            for coll in self._collections_to_save:
                if tensor_name in coll.tensor_names:
                    # if being matched as reduction,
                    # it must be in reduction_tensor_name, not with regex
                    matched_colls.add(coll)
                elif match_inc(tensor_name, coll.include_regex):
                    coll.add_tensor_name(tensor_name)
                    matched_colls.add(coll)
            self.tensor_to_collections[tensor_name] = matched_colls
        return self.tensor_to_collections[tensor_name]

    @abstractmethod
    def _get_default_collections(self):
        pass
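    # Illustrative sketch (assumptions flagged inline): the minimal abstract surface a
    # framework subclass must implement. The Horovod calls below are one plausible way
    # to report workers in a distributed job; the real framework hooks derive these
    # from their own distribution strategy.
    #
    #   class MyFrameworkHook(BaseHook):
    #       def _get_worker_name(self):
    #           import horovod.torch as hvd  # assumption: Horovod-based training
    #           return f"worker_{hvd.rank()}"
    #
    #       def _get_num_workers(self):
    #           import horovod.torch as hvd
    #           return hvd.size()
    #
    #       def _is_not_supported(self):
    #           return False  # assume a supported training setup
    #
    #       def _get_default_collections(self):
    #           return [CollectionKeys.LOSSES]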
    def has_default_hook_configuration(self, default_saved_collections=DEFAULT_SAVED_COLLECTIONS):
        # Used in the internal framework forks to determine if the hook
        # is using the default hook configuration
        if not self.prepared_collections:
            self._prepare_collections()
        collections_being_saved = [x.name for x in self._collections_to_save]
        return set(collections_being_saved) == set(default_saved_collections)

    def _has_default_profiler_configuration(self):
        return self.profiler_config_parser.config is None

    def has_default_configuration(self):
        return self.has_default_hook_configuration() and self._has_default_profiler_configuration()

    def _prepare_collections(self):
        """Populate collections_to_save and ensure every collection has
        a save_config and reduction_config."""
        for c_name, c in self.collection_manager.get_collections().items():
            if c_name not in self._get_default_collections():
                if not c.include_regex and not c.tensor_names:
                    raise InvalidCollectionConfiguration(c_name)
            if c in self._collections_to_save:
                continue
            elif self._should_collection_be_saved(CollectionKeys.ALL):
                self._collections_to_save.add(c)
            elif self._should_collection_be_saved(c_name):
                self._collections_to_save.add(c)

        self.logger.info(
            f'Monitoring the collections: {", ".join([x.name for x in self._collections_to_save])}'
        )

        # Populate configs_for_collections and reduction_config
        for c_name, c in self.collection_manager.get_collections().items():
            if c_name in NON_HISTOGRAM_COLLECTIONS:
                c.save_histogram = False

            if c.save_config is None:
                # Set to the default if None
                c.save_config = self.save_config
            elif isinstance(c.save_config, SaveConfig):
                # Otherwise, set missing modes to the defaults
                c.save_config.merge_default_save_config(self.save_config)
            else:
                raise TypeError(f"save_config={c.save_config} must be None or SaveConfig")

            if c_name in NON_REDUCTION_COLLECTIONS:
                c.reduction_config = ReductionConfig(save_raw_tensor=True)
            elif c.reduction_config is None:
                c.reduction_config = self.reduction_config

        self.prepared_collections = True

    #### End of Save Manager methods ####

    @staticmethod
    def _close_given_writer_map(writer_dict):
        # Delete all the dist training writers
        to_delete_writers = []
        for key, writer in writer_dict.items():
            # close calls flush
            writer.close()
            to_delete_writers.append(key)
        for key in to_delete_writers:
            del writer_dict[key]

    def _close_writers(self) -> None:
        if self.dry_run:
            return

        # flush out sm_metric scalars to the metrics file
        self._write_scalars()

        if self.writer is not None:
            self.writer.flush()
            self.writer.close()
            self.writer = None

        self._close_given_writer_map(self.tb_writers)

    def _initialize_writers(self, only_initialize_if_missing=False) -> None:
        # Function is overridden in smdebug/tensorflow/base_hook.py
        if only_initialize_if_missing and self.writer:
            return
        if self.dry_run:
            return
        if self.first_process is False:
            return
        elif self.first_process is None:
            if self._get_num_workers() == 1:
                if is_first_process(self.out_dir):
                    self.first_process = True
                    self.logger.info(f"Hook is writing from the hook with pid: {os.getpid()}\n")
                else:
                    if self.first_process is None:
                        self.logger.warning(
                            "Unsupported Distributed Training Strategy Detected. "
                            "Sagemaker-Debugger will only write from one process. "
                            f"The process with pid: {os.getpid()} will not be writing any data.\n"
                        )
                    self.first_process = False
                    return

        if self.save_all_workers is False:
            if self.worker != self.chief_worker:
                return

        self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)

    def _get_main_writer(self) -> List[FileWriter]:
        return [self.writer] if self.writer else []

    def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
        """
        :param tensor_name:
        :param tensor_ref: used by TF
        :return: List[FileWriter]
        """
        if self.save_all_workers is False and self.worker != self.chief_worker:
            return []
        return self._get_main_writer()

    def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
        """Returns a FileWriter object if `hook.tensorboard_dir` has been
        specified, else None.

        Creates a writer if one does not exist.
        """
        if not self.tensorboard_dir:
            return None

        if self.mode in self.tb_writers:
            assert self.tb_writers[self.mode] is not None
            # would be there if set_mode was called
            return self.tb_writers[self.mode]
        else:
            self.tb_writers[self.mode] = FileWriter(
                trial_dir=self.tensorboard_dir,
                step=self.step,
                worker=get_tb_worker(),
                write_checksum=True,
                wtype="tensorboard",
                mode=self.mode,
            )
            return self.tb_writers[self.mode]

    def _close_tb_writer(self):
        if self.dry_run:
            return

        if self.mode in self.tb_writers:
            self.tb_writers[self.mode].close()
            del self.tb_writers[self.mode]

    def close(self):
        self._cleanup()

    def log_outstanding_timeline_metrics(self):
        pass

    def _cleanup(self):
        self._close_writers()

        if self.metrics_writer:
            self.metrics_writer.close()

        self.log_outstanding_timeline_metrics()
        self.timeline_writer.close()

        # close the Horovod file reader thread if it has been enabled
        if self.hvd_reader and self.hvd_reader.enabled:
            self.hvd_reader.close()

        training_has_ended(self.out_dir)
        if self.first_process is True:
            remove_claim_file(self.out_dir)

    def _increment_step(self):
        # Update the last_state to the last step number that was saved or seen
        self._write_state()

        self.step += 1
        self.mode_steps[self.mode] += 1
        self.written_tensor_name_for_step.clear()

        # Increment the global step number irrespective of what mode it is
        if self.mode != ModeKeys.GLOBAL:
            self.mode_steps[ModeKeys.GLOBAL] = self.step
        self._collections_to_save_for_step = None
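    # Illustrative sketch: a user-defined collection registered before training starts.
    # _prepare_collections() later fills in the missing save_config/reduction_config
    # from the hook defaults; a non-default collection with neither an include regex
    # nor tensor names raises InvalidCollectionConfiguration. `hook` is a concrete
    # subclass instance as in the earlier sketch, and the names are hypothetical.
    #
    #   custom = hook.get_collection("my_activations")     # created on first access
    #   custom.include("relu_output_.*")                   # regex over tensor names
    #   custom.save_config = SaveConfig(save_interval=50)  # overrides the hook default
    #   # The collection is only saved if "my_activations" is in include_collections.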
    # Called in the internal AWS codebase to determine
    # if a particular tensor value should be saved
    @error_handling_agent.catch_smdebug_errors()
    def should_save_tensor_or_collection(self, tensor_name: str, collection_name: str) -> bool:
        if self.prepared_collections is False:
            # Always return False if an attempt to save a tensor is made
            # before the collections are prepared. This can happen if the
            # function is called before the callbacks are initialized.
            self.logger.warning(
                "Tensors cannot be saved with smdebug before callbacks are initialized."
            )
            return False

        if collection_name == "gradients":
            layer_name = tensor_name.split(":")[0]
            tensor_name = "gradients/" + layer_name + "Grad"

        if self._is_collection_being_saved_for_step(collection_name):
            c = self.collection_manager.get(collection_name)
            return match_inc(tensor_name, c.include_regex) or c.include_regex == []

        return self._is_tensor_saved_for_step(tensor_name)

    def _write_state(self):
        if self.state_store.is_checkpoint_updated():
            current_state = dict()
            current_state[TRAINING_RUN] = self.training_run
            current_state[LATEST_GLOBAL_STEP_SAVED] = self.last_saved_step
            current_state[LATEST_GLOBAL_STEP_SEEN] = self.step
            mode_step = dict()
            for (mode, step) in self.mode_steps.items():
                mode_step[mode.name] = step
            current_state[LATEST_MODE_STEP] = mode_step
            self.state_store.update_state(current_state)

    def save_tensor(self, tensor_name, tensor_value, collections_to_write=CollectionKeys.DEFAULT):
        if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False:
            self.logger.warning("The tensor value could not be converted into a numpy value")
            return
        if isinstance(collections_to_write, str):
            collections_to_write = [collections_to_write]
        for collection in collections_to_write:
            self.custom_tensors_to_save[tensor_name] = (tensor_value, collection)

    def _save_custom_tensors_post_step(self):
        for tensor_name in self.custom_tensors_to_save:
            tensor_value, collection_name = self.custom_tensors_to_save[tensor_name]
            c = self.collection_manager.get(collection_name, create=True)
            c.add_tensor_name(tensor_name)
            self._write_raw_tensor(tensor_name, tensor_value, [c])
        self.custom_tensors_to_save.clear()

    def set_mode(self, mode):
        if mode in ALLOWED_MODES:
            self.mode = mode
        else:
            raise ValueError(
                "Invalid mode {}. Valid modes are {}.".format(mode, ",".join(ALLOWED_MODE_NAMES))
            )

        if mode not in self.mode_steps:
            self.mode_steps[mode] = self.init_step

        self._collections_to_save_for_step = None
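    # Illustrative sketch: saving a custom value from a training loop. The tensor is
    # queued here and flushed by _save_custom_tensors_post_step() at the end of the
    # step. The names and the `entropy_value` variable are hypothetical.
    #
    #   from smdebug.core.modes import ModeKeys
    #
    #   hook.set_mode(ModeKeys.TRAIN)
    #   hook.save_tensor("attention_entropy", entropy_value, collections_to_write="custom")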
\n" ) return if self.save_all_workers is False: if self.chief_worker != self.worker: return num_workers = 1 # Override self.collection_manager.set_num_workers(num_workers) collection_file_name = f"{self.worker}_collections.json" self.collection_manager.export(self.out_dir, collection_file_name) def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=True) def _write_reduction(self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None): reduction_tensor_name = self._get_reduction_tensor_name( tensor_name, reduction_name, abs) try: tensor_data = self._get_reduction_of_data(reduction_name, tensor_value, tensor_name, abs) self._write_raw_tensor_simple(reduction_tensor_name, tensor_data, tensor_ref=tensor_ref) except ValueError as e: self.logger.warning( f"Could not compute reduction {reduction_name} of {tensor_name} due to {e}" ) def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ref=None): reductions_saved = set() for s_col in save_collections: if s_col.name in SCALAR_COLLECTIONS: continue reduction_config = s_col.reduction_config for reduction_list in (reduction_config.reductions, reduction_config.norms): for reduction in reduction_list: if (reduction, False) not in reductions_saved: self._write_reduction(tensor_name, tensor_value, reduction, abs=False, tensor_ref=tensor_ref) reductions_saved.add((reduction, False)) for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms): for reduction in reduction_list: if (reduction, True) not in reductions_saved: self._write_reduction(tensor_name, tensor_value, reduction, abs=True, tensor_ref=tensor_ref) reductions_saved.add((reduction, True)) def _write_scalar_summary(self, tensor_name, tensor_value, save_colls): """ Maybe write to TensorBoard. """ tb_writer = self._maybe_get_tb_writer() if tb_writer: for s_col in save_colls: if s_col.name in SCALAR_COLLECTIONS: np_val = self._make_numpy_array(tensor_value) if self.dry_run: return if np_val.squeeze().ndim == 0: self.logger.debug( f"Saving scalar summary {tensor_name} for global step {self.step}" ) tb_writer.write_scalar_summary(tensor_name, np_val, self.step) else: self.logger.debug( f"Value of {tensor_name} is not scalar, " f"so scalar summary could not be created") break def _write_histogram_summary(self, tensor_name, tensor_value, save_collections): """ Maybe write to TensorBoard. """ tb_writer = self._maybe_get_tb_writer() if tb_writer: for s_col in save_collections: if s_col.name in NON_HISTOGRAM_COLLECTIONS: continue elif s_col.save_histogram is True: np_value = self._make_numpy_array(tensor_value) if self.dry_run or np_value.dtype == np.bool or np_value.nbytes == 0: return hist_name = f"{s_col.name}/{tensor_name}" self.logger.debug( f"Saving {hist_name} for global step {self.step}") tb_writer.write_histogram_summary(tdata=np_value, tname=hist_name, global_step=self.step) break @error_handling_agent.catch_smdebug_errors() def record_trace_events(self, timestamp, training_phase="", op_name="", phase="X", duration=1, **kwargs): """ Write trace events to the timeline. 
    @error_handling_agent.catch_smdebug_errors()
    def record_trace_events(
        self, timestamp, training_phase="", op_name="", phase="X", duration=1, **kwargs
    ):
        """
        Write trace events to the timeline.

        :param training_phase: strings like data_iterating, forward, backward, operations, etc.
        :param op_name: more details about the phase, such as whether it is a dataset or an iterator
        :param phase: defaults to 'X'
        :param timestamp: start time for the event (in seconds)
        :param duration: any manually computed duration (in seconds)
        :param kwargs: can be process id and thread id
        """
        self.timeline_writer.write_trace_events(
            training_phase=training_phase,
            op_name=op_name,
            phase=phase,
            timestamp=timestamp,
            duration=duration,
            **kwargs,
        )

    def _write_scalars(self):
        """
        Writes all the scalar values saved in the scalar_cache to file.
        If sm_metric is set to True for certain scalars, those scalars are
        written to SageMaker as well. By default, loss values are sm_metric.
        """
        if self._is_not_supported():
            # Do not log scalars if the smdebug hook is not supported,
            # e.g. with TFDistributionStrategy.UNSUPPORTED
            self.scalar_cache = []
            return
        for scalar_obj in self.scalar_cache:
            scalar_name = scalar_obj.name
            scalar_val = scalar_obj.value
            scalar_mode = scalar_obj.mode
            sm_metric = scalar_obj.sm_metric
            write_tb = scalar_obj.write_tb
            write_event = scalar_obj.write_event
            timestamp = scalar_obj.timestamp
            if self.metrics_writer and sm_metric:
                self.metrics_writer.log_metric(
                    scalar_name + "_" + scalar_mode.name,
                    scalar_val,
                    timestamp=timestamp,
                    iteration_number=self.mode_steps[scalar_mode],
                )
            if write_tb:
                tb_writer = self._maybe_get_tb_writer()
                if tb_writer:
                    tb_writer.write_scalar_summary(
                        scalar_name, scalar_val, self.step, timestamp=timestamp
                    )
            if write_event:
                self._initialize_writers(only_initialize_if_missing=True)
                self._write_raw_tensor_simple(scalar_name, scalar_val, timestamp=timestamp)

        self.scalar_cache = []

    # TODO: fix the step number for saving scalars and tensors
    def save_scalar(self, name, value, sm_metric=False, timestamp: float = None):
        """
        Call save_scalar at any point in the training script to log a scalar
        value, such as a metric or any other value.

        :param name: Name of the scalar. The prefix 'scalar/' will be added to it.
        :param value: Scalar value.
        :param sm_metric: True/False. If set to True, the scalar value will be
            written to SageMaker.
        """
        name = CallbackHook.SCALAR_PREFIX + name
        val = self._make_numpy_array(value)
        if val.size != 1:
            raise TypeError(f"{name} has non scalar value of type: {type(value)}")
        scalar_obj = ScalarCache(
            name, val, self.mode, sm_metric, write_tb=True, write_event=True, timestamp=timestamp
        )
        self.scalar_cache.append(scalar_obj)
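    # Illustrative sketch: logging a metric and a timeline event from a training
    # script. save_scalar prefixes the name with 'scalar/'; with sm_metric=True the
    # value also goes to SageMaker metrics. The values shown are hypothetical.
    #
    #   import time
    #
    #   hook.save_scalar("val_accuracy", 0.93, sm_metric=True)
    #   hook.record_trace_events(
    #       timestamp=time.time(),  # start time in seconds
    #       training_phase="forward",
    #       op_name="conv_block_1",
    #       duration=0.012,
    #   )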
    def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ref=None):
        for s_col in save_collections:
            reduction_config = s_col.reduction_config
            if reduction_config.save_raw_tensor is True:
                self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref)
                break

    def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None):
        writers = self._get_writers(tensor_name, tensor_ref=tensor_ref)
        for s_col in save_collections:
            reduction_config = s_col.reduction_config
            if self.dry_run is False and reduction_config.save_shape is True:
                numpy_tensor_value = self._make_numpy_array(tensor_value)
                this_size, this_shape = size_and_shape(numpy_tensor_value)
                # In TF Keras, and for Variables in all interfaces of TF, we sometimes
                # output tensors with more meaningful names than the original name.
                # In such cases we output both the smdebug-given name and the original name.
                if tensor_ref is not None and tensor_ref.tf_obj is not None:
                    original_name = tensor_ref.tf_obj.name
                else:
                    original_name = None

                for writer in writers:
                    writer.write_shape(
                        tensor_name,
                        this_shape,
                        self.mode,
                        self.mode_steps[self.mode],
                        original_name=original_name,
                    )
                break

    def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None):
        # tensor_ref is used by TF
        # TODO: if fp16, check the perf of saving as fp16 in proto vs as fp32
        numpy_tensor_value = self._make_numpy_array(tensor_value)
        this_size, this_shape = size_and_shape(numpy_tensor_value)
        if self.dry_run is False and this_size > 0:
            writers = self._get_writers(tensor_name, tensor_ref=tensor_ref)
            for writer in writers:
                writer.write_tensor(
                    tdata=numpy_tensor_value,
                    tname=tensor_name,
                    mode=self.mode,
                    mode_step=self.mode_steps[self.mode],
                    timestamp=timestamp,
                )

    def _save_for_tensor(self, tensor_name, tensor_value, check_before_write=True):
        """
        Identifies if this tensor should be saved for this step, based on the
        save configs of the collections it belongs to. If this tensor is to be
        saved, calls _write_for_tensor.

        This check can be disabled by passing check_before_write=False.
        Disabling the check is cleaner for TF, as for TF this method is never
        called if the tensor should not be saved for this step.

        :param tensor_name: str
            The name of the tensor. In TensorFlow's case, this is the graph
            name of the tensor, which will be converted to the internal name
            in _write_for_tensor.
        :param tensor_value: dtype is the tensor class of the corresponding framework
            Value of the tensor to be saved.
        :param check_before_write: bool
            Whether to check if the tensor should be saved for this step.
        :return:
        """
        save_collections = self._get_collections_with_tensor(tensor_name)
        save_collections_for_tensor = save_collections.intersection(
            self._get_collections_to_save_for_step()
        )
        if check_before_write and bool(save_collections_for_tensor) is False:
            return
        elif not check_before_write:
            # If not checking before write, we want to write regardless of
            # whether the collection should be written for this step
            save_collections_for_tensor = save_collections

        self._write_for_tensor(tensor_name, tensor_value, save_collections_for_tensor)
        for s_col in save_collections_for_tensor:
            if s_col.name in SM_METRIC_COLLECTIONS:
                np_val = self._make_numpy_array(tensor_value)
                # Always log loss to SageMaker
                tensor_val = np.mean(np_val)
                scalar_obj = ScalarCache(
                    tensor_name,
                    tensor_val,
                    self.mode,
                    sm_metric=True,
                    write_tb=False,
                    write_event=False,
                )
                self.scalar_cache.append(scalar_obj)

    def _log_save(self, tensor_name, save_collections):
        coll_str = ", ".join([x.name for x in save_collections])
        many_colls = len(save_collections) > 1
        if self.mode != ModeKeys.GLOBAL:
            step_str = f"for step {self.mode_steps[self.mode]} of mode {self.mode.name}"
        else:
            step_str = f"for step: {self.step}"
        base_str = f"Saving {tensor_name} from {'collections' if many_colls else 'collection'}"
        self.logger.debug(f"{base_str} {coll_str} {step_str}")
    def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ref=None):
        """
        Write all data that we might want to for this tensor.

        :param tensor_name: name of the tensor
        :param tensor_value: value (could be in framework tensor dtype)
        :param save_collections: list of collections which are being saved for this step
        """
        self._log_save(tensor_name, save_collections)

        self._write_shape(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref)

        # Write reductions defined for the collections this tensor may be part of
        self._write_reductions(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref)

        # Write a histogram for this tensor if any collection this tensor
        # is part of has save_histogram set to True
        self._write_histogram_summary(tensor_name, tensor_value, save_collections)

        # Write the raw tensor if save_raw_tensor in the reduction config is True
        self._write_raw_tensor(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref)

        # Write a scalar summary if this value is a scalar (or 1x1 array)
        self._write_scalar_summary(tensor_name, tensor_value, save_collections)

    @staticmethod
    @abstractmethod
    def _get_reduction_of_data(reduction_name, tensor_value, tensor_name, abs):
        """
        Returns the reduction of the given tensor.

        :param reduction_name: str
            Type of reduction.
        :param tensor_value: tensor_data_type
            The original tensor value on which the reduction is performed.
        :param tensor_name: str
            Name of the original tensor.
        :param abs: bool
            Whether to take the absolute value of the tensor before performing
            the reduction.
        :return:
        """

    @staticmethod
    @abstractmethod
    def _make_numpy_array(tensor_value):
        """
        Converts the tensor value into a numpy array.

        :param tensor_value: mx.nd.NDArray, torch.Tensor, etc.
        :return: numpy ndarray
        """

    def get_collection(self, name, create=True):
        return self.collection_manager.get(name, create=create)

    def get_collections(self):
        return self.collection_manager.get_collections()

    def add_collection(self, collection):
        if not isinstance(collection, Collection):
            raise TypeError(
                f"collection must be an instance of the Collection class. "
                f"A value of type {collection.__class__} is not supported"
            )
        self.collection_manager.add(collection)
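# Illustrative sketch (comment only, not executed): how a framework subclass might
# implement the two abstract static methods using numpy, assuming tensors arrive as
# numpy arrays. Real subclasses map reductions to framework ops (mx.nd, torch, tf).
#
#   class NumpyHook(BaseHook):
#       @staticmethod
#       def _make_numpy_array(tensor_value):
#           return np.asarray(tensor_value)
#
#       @staticmethod
#       def _get_reduction_of_data(reduction_name, tensor_value, tensor_name, abs):
#           data = np.abs(tensor_value) if abs else tensor_value
#           if reduction_name in ("l1", "l2"):
#               ord_ = 1 if reduction_name == "l1" else 2
#               return np.linalg.norm(data.ravel(), ord=ord_)
#           return getattr(np, reduction_name)(data)  # e.g. np.mean, np.max, np.min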