def _get_matching_collections(
    self, mode, tensor, tensor_type, ts_name, is_input_to_model=False, is_output_of_model=False
):
    colls_with_tensor = set()
    if tensor_type == "weight":
        if match_inc(
            tensor.name, self.collection_manager.get(CollectionKeys.BIASES).include_regex
        ):
            colls_with_tensor.add(self.collection_manager.get(CollectionKeys.BIASES))
        else:
            colls_with_tensor.add(self.collection_manager.get(CollectionKeys.WEIGHTS))
    elif is_input_to_model:
        colls_with_tensor.add(self.collection_manager.get(CollectionKeys.INPUTS))
    elif is_output_of_model:
        colls_with_tensor.add(self.collection_manager.get(CollectionKeys.OUTPUTS))

    for current_coll in self.collection_manager.get_collections().values():
        if current_coll.name in [CollectionKeys.WEIGHTS, CollectionKeys.BIASES]:
            # don't match regex for these as these are added specially above
            # we also don't want users to make mistakes configuring these collections
            continue
        if match_inc(ts_name, current_coll.include_regex):
            if not current_coll.has_tensor(tensor):
                # tensor will be added to this coll below
                colls_with_tensor.add(current_coll)
            # don't recommend adding tensors externally as
            # they will have different internal name
            # but regardless, in such case we only use that tensor name to save data
            # instead of the keras-style-internal-names
    return colls_with_tensor
def _get_matching_collections(
    self, mode, tensor, tensor_type, ts_name, is_input_to_model=False, is_output_of_model=False
):
    colls_with_tensor = set()
    if tensor_type == "weight":
        if match_inc(
            tensor.name, self.collection_manager.get(CollectionKeys.BIASES).include_regex
        ):
            colls_with_tensor.add(self.collection_manager.get(CollectionKeys.BIASES))
        else:
            colls_with_tensor.add(self.collection_manager.get(CollectionKeys.WEIGHTS))
    elif is_input_to_model:
        colls_with_tensor.add(self.collection_manager.get(CollectionKeys.INPUTS))
    elif is_output_of_model:
        colls_with_tensor.add(self.collection_manager.get(CollectionKeys.OUTPUTS))

    for current_coll in self.collection_manager.get_collections().values():
        if current_coll.name in [CollectionKeys.WEIGHTS, CollectionKeys.BIASES]:
            # don't match regex for these as these are added specially above
            # we also don't want users to make mistakes configuring these collections
            continue
        if match_inc(ts_name, current_coll.include_regex):
            # In TF 2.x eager mode, we can't put tensors in a set/dictionary as
            # tensor.__hash__() is no longer available. tensor.experimental_ref()
            # returns a hashable reference object to this Tensor.
            if is_tf_version_2x() and tf.executing_eagerly():
                # tensor.experimental_ref is an experimental API
                # and can be changed or removed.
                # Ref: https://www.tensorflow.org/api_docs/python/tf/Tensor#experimental_ref
                tensor = tensor.experimental_ref()
            if not current_coll.has_tensor(tensor):
                # tensor will be added to this coll below
                colls_with_tensor.add(current_coll)
            # don't recommend adding tensors externally as
            # they will have different internal name
            # but regardless, in such case we only use that tensor name to save data
            # instead of the keras-style-internal-names
    return colls_with_tensor
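# Every snippet in this section funnels through the `match_inc` helper. Below is a
# minimal sketch of the matching semantics assumed throughout (search-anywhere regex
# over a list of include patterns); it is not necessarily the exact smdebug
# implementation.
import re


def match_inc(tname, include):
    """Return True if any regex in `include` matches anywhere in `tname`."""
    for inc in include:
        if re.search(inc, tname):
            return True
    return False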
def _save_tensor_to_file(self, tensor_name, tensor_value, collections):
    if isinstance(collections, set) is False:
        collections = {collections}
    # Since this function modifies the set, there is a possibility
    # of bugs if calling functions attempt to re-use the set passed
    # to this function
    collections_to_write = collections.copy()
    collections_to_save = self._get_collections_to_save_for_step()
    for c in collections_to_save:
        if match_inc(tensor_name, c.include_regex):
            collections_to_write.add(c)
    self._initialize_writers(only_initialize_if_missing=True)
    tensor_refs = []
    if isinstance(tensor_value, values.PerReplica):
        for t in tensor_value._values:
            tensor_ref = TensorRef.from_non_graph_var(tensor_name)
            tensor_refs.append((tensor_ref, t))
    else:
        tensor_ref = TensorRef.from_non_graph_var(tensor_name)
        tensor_refs.append((tensor_ref, tensor_value))

    for tensor_ref, t in tensor_refs:
        for collection in collections_to_write:
            if isinstance(collection, str):
                collection = self.get_collection(collection)
            collection.set_tensor_ref(tensor_ref)
        self._save_for_tensor(tensor_name, t, check_before_write=True)
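# Why _save_tensor_to_file copies the incoming set before extending it: without the
# copy, regex-matched collections would leak into the caller's set and get reused on
# the next call. A standalone illustration of that defensive-copy pattern, with plain
# strings standing in for Collection objects (all names here are hypothetical):
def _merge_matched(collections, matched_for_step):
    collections_to_write = collections.copy()  # protect the caller's set
    collections_to_write.update(matched_for_step)
    return collections_to_write


user_collections = {"my_custom_collection"}
_merge_matched(user_collections, {"losses"})
assert user_collections == {"my_custom_collection"}  # unchanged for the caller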
def _add_tensor_to_matching_collections(self, tensor):
    """
    Finds which collections to add this tensor to, and adds tensor to them
    """
    colls_with_tensor = set()
    for coll in sorted(self._get_all_collections_to_save(), key=lambda x: x.name):
        variable_collections_with_tensor, processed = self._process_tensor_from_variable_read_op(
            tensor
        )
        if processed:
            colls_with_tensor.update(variable_collections_with_tensor)
            # processed=True means this tensor was either a variable read tensor,
            # or a tensor with same name as variable
            # former will be added to collections such as weights, biases, opt_variables
            # latter will be skipped as they refer to the same tensor
        else:
            # some collections are added automatically, don't match regex for these
            if coll.name not in [
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.OPTIMIZER_VARIABLES,
                CollectionKeys.TENSORFLOW_SUMMARIES,
            ] and match_inc(tensor.name, coll.include_regex):
                coll.add(tensor)

        if coll.has_tensor(tensor.name):
            # it must have been added when collection was added to
            # from user(custom_coll)/library(losses, weights, grads)
            tensor_ref = coll.get_tensor(tensor.name)
            tensor_ref.tf_obj = tensor
            colls_with_tensor.add(coll)

    # create entry in hook's tensor_to_collections map for this tensor
    self._create_tensors_for_matching_collections(tensor, colls_with_tensor)
def _prepare_tensors_available_post_step(self):
    # for gradients, optimizer_variables
    custom_collections, _ = self._get_custom_and_default_collections()
    for coll in [
        self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES),
        self.get_collection(name=CollectionKeys.GRADIENTS),
        self.get_collection(name=CollectionKeys.OUTPUTS),
        self.get_collection(name=CollectionKeys.INPUTS),
    ]:
        collection_values = coll.get_tensors()
        for tensor_ref in collection_values:
            if tensor_ref.name not in self.tensor_to_collections:
                self.tensor_to_collections[tensor_ref.name] = {coll}
            elif coll not in self.tensor_to_collections[tensor_ref.name]:
                self.tensor_to_collections[tensor_ref.name].add(coll)

            # Add tensor to custom collections
            for custom_coll in custom_collections:
                if (
                    match_inc(tensor_ref.name, custom_coll.include_regex)
                    and tensor_ref.tf_obj is not None
                ):
                    custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
                    if custom_coll not in self.tensor_to_collections[tensor_ref.name]:
                        self.tensor_to_collections[tensor_ref.name].add(custom_coll)
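# The bookkeeping pattern above, isolated: tensor_to_collections maps a tensor name to
# the set of collections that will save it, creating the set on first sight and only
# extending it afterwards. Collection objects are replaced by plain strings here, and
# the names are hypothetical.
tensor_to_collections = {}
for coll, name in [("gradients", "grad:0"), ("my_custom", "grad:0")]:
    if name not in tensor_to_collections:
        tensor_to_collections[name] = {coll}
    elif coll not in tensor_to_collections[name]:
        tensor_to_collections[name].add(coll)
assert tensor_to_collections == {"grad:0": {"gradients", "my_custom"}}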
def _tensors_matching_regex(self, regex_list) -> set:
    matched_tensornames = set()
    if not isinstance(regex_list, list):
        regex_list = [regex_list]
    regex_list = flatten(regex_list)
    for tensorname in self._tensors.keys():
        if match_inc(tensorname, regex_list):
            matched_tensornames.add(tensorname)
    return matched_tensornames
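# Quick check of the _tensors_matching_regex semantics, using the match_inc sketch
# above and hypothetical tensor names: each pattern may match anywhere in the name.
_tensors = {"dense/kernel:0": None, "dense/bias:0": None, "loss:0": None}
matched = {name for name in _tensors if match_inc(name, ["bias", "^loss"])}
assert matched == {"dense/bias:0", "loss:0"}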
def _add_weights_and_biases(self):
    wts = tf.trainable_variables()  # variable such as <tf.Var w1:0>
    for w in wts:
        if match_inc(w.name, self.collection_manager.get(CollectionKeys.BIASES).include_regex):
            self.collection_manager.get(CollectionKeys.BIASES).add(w)
        else:
            # adds a tensor_ref with name `w1/read:0` and export_name `w1:0`
            self.collection_manager.get(CollectionKeys.WEIGHTS).add(w)
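# The weights/biases split above, reduced to its regex decision. The include_regex of
# the BIASES collection is assumed here to be something like ["bias"]; the real
# default may differ, and the variable names are hypothetical.
variables = ["dense/kernel:0", "dense/bias:0"]
biases = [v for v in variables if match_inc(v, ["bias"])]
weights = [v for v in variables if not match_inc(v, ["bias"])]
assert biases == ["dense/bias:0"] and weights == ["dense/kernel:0"]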
def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]:
    self._assert_prep()
    # for tf this will be prepopulated in check_and_add_tensor
    if tensor_name not in self.tensor_to_collections:
        # for mxnet it is computed and then cached
        matched_colls = set()
        for coll in self._collections_to_save:
            if tensor_name in coll.tensor_names:
                # if being matched as reduction,
                # it must be in reduction_tensor_name, not with regex
                matched_colls.add(coll)
            elif match_inc(tensor_name, coll.include_regex):
                coll.add_tensor_name(tensor_name)
                matched_colls.add(coll)
        self.tensor_to_collections[tensor_name] = matched_colls
    return self.tensor_to_collections[tensor_name]
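# The compute-then-cache behavior of _get_collections_with_tensor, isolated with
# hypothetical data: the regex walk happens once per tensor name, and later lookups
# hit the cache directly.
cache = {}


def lookup(name, colls):
    if name not in cache:
        cache[name] = {c for c, regexes in colls.items() if match_inc(name, regexes)}
    return cache[name]


colls = {"weights": ["kernel"], "custom": ["relu"]}
assert lookup("dense/kernel:0", colls) == {"weights"}
assert "dense/kernel:0" in cache  # a second lookup skips the regex matching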
def should_save_tensor_or_collection(self, tensor_name: str, collection_name: str) -> bool:
    if self.prepared_collections is False:
        # always return false if an attempt to save a
        # tensor is made before the collections are prepared.
        # this can happen if the fn is called before callbacks are init.
        self.logger.warning(
            "Tensors cannot be saved with smdebug before callbacks are initialized."
        )
        return False
    if collection_name == "gradients":
        layer_name = tensor_name.split(":")[0]
        tensor_name = "gradients/" + layer_name + "Grad"
    if self._is_collection_being_saved_for_step(collection_name):
        c = self.collection_manager.get(collection_name)
        return match_inc(tensor_name, c.include_regex) or c.include_regex == []
    return self._is_tensor_saved_for_step(tensor_name)
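# Worked example of the gradient-name rewrite above (the tensor name is hypothetical):
# a layer tensor name is mapped onto the internal gradient naming scheme before the
# include_regex check runs.
tensor_name = "dense/kernel:0"
layer_name = tensor_name.split(":")[0]
assert "gradients/" + layer_name + "Grad" == "gradients/dense/kernelGrad"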
def _prepare_non_layer_tensors(self):
    # for gradients, optimizer_variables
    custom_collections, _ = self._get_custom_and_default_collections()
    for coll in [
        self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES),
        self.get_collection(name=CollectionKeys.GRADIENTS),
    ]:
        for tensor_ref in coll.get_tensors():
            if tensor_ref.name not in self.tensor_to_collections:
                self.tensor_to_collections[tensor_ref.name] = {coll}
            elif coll not in self.tensor_to_collections[tensor_ref.name]:
                self.tensor_to_collections[tensor_ref.name].add(coll)

            # Add tensor to custom collections
            for custom_coll in custom_collections:
                if match_inc(tensor_ref.name, custom_coll.include_regex):
                    custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
                    if custom_coll not in self.tensor_to_collections[tensor_ref.name]:
                        self.tensor_to_collections[tensor_ref.name].add(custom_coll)
def _prepare_non_layer_tensors(self):
    # for gradients, optimizer_variables
    custom_collections, default_tf_collection = self._get_custom_and_default_collections()
    for default_coll in default_tf_collection:
        for tensor_ref in default_coll.get_tensors():
            if tensor_ref.name not in self.tensor_to_collections:
                self.tensor_to_collections[tensor_ref.name] = {default_coll}
            elif default_coll not in self.tensor_to_collections[tensor_ref.name]:
                self.tensor_to_collections[tensor_ref.name].add(default_coll)

            # Add tensor to custom collections
            for custom_coll in custom_collections:
                if match_inc(tensor_ref.name, custom_coll.include_regex):
                    custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
                    if custom_coll not in self.tensor_to_collections[tensor_ref.name]:
                        self.tensor_to_collections[tensor_ref.name].add(custom_coll)
def _get_dataloader_profiler_data_by_time(
    self, start_time_us, end_time_us, cache_metrics=False, selected_framework_metrics=[]
):
    """
    Get metrics data within a time interval.
    :param start_time_us: Start of the interval in microseconds
    :param end_time_us: End of the interval in microseconds
    :param cache_metrics: If True, collect and return all metrics requested so far,
        else return only the metrics for this request
    :param selected_framework_metrics: list of framework metrics. If not empty,
        function will only return framework events that are part of this list.
    :return: Framework metrics DataFrame
    """
    # get framework metrics
    framework_metrics = []

    # only fetch a subset of data to avoid out of memory issues
    if end_time_us - start_time_us > self.interval:
        current_time_us = start_time_us + self.interval
    else:
        current_time_us = end_time_us

    while start_time_us < end_time_us:
        # get all framework metrics from last to current timestamp
        self.framework_metrics_reader.refresh_event_file_list()
        events = self.framework_metrics_reader.get_events(start_time_us, current_time_us)

        # append new events to existing list
        for event in events:
            if len(selected_framework_metrics) > 0 and not (
                match_inc(event.event_name, selected_framework_metrics)
                or match_inc(event.event_phase, selected_framework_metrics)
            ):
                continue
            if event.event_args is not None and "step_num" in event.event_args:
                step = int(event.event_args["step_num"])
            else:
                step = -1
            if event.event_args is not None and "layer_name" in event.event_args:
                name = event.event_args["layer_name"]
            elif event.event_args is not None and "name" in event.event_args:
                name = event.event_args["name"]
            else:
                name = event.event_name
            if event.event_args is not None and "worker_id" in event.event_args:
                worker_id = event.event_args["worker_id"]
            else:
                worker_id = -1
            if event.event_args is not None and "num_workers" in event.event_args:
                num_workers = event.event_args["num_workers"]
            else:
                num_workers = -1
            if event.event_args is not None and "pin_memory" in event.event_args:
                pin_memory = "True" if event.event_args["pin_memory"] is True else "False"
            else:
                pin_memory = "NA"

            framework_metrics.append(
                [
                    us_since_epoch_to_human_readable_time(event.start_time),
                    us_since_epoch_to_human_readable_time(event.end_time),
                    event.start_time,
                    event.end_time,
                    event.duration,
                    event.pid,
                    name,
                    step,
                    worker_id,
                    num_workers,
                    pin_memory,
                    event.event_phase,
                    event.node_id,
                ]
            )
        # read the next chunk of data
        start_time_us = current_time_us
        if current_time_us + self.interval < end_time_us:
            current_time_us = current_time_us + self.interval
        else:
            current_time_us = end_time_us

    if cache_metrics is True:
        self.framework_metrics.extend(framework_metrics)
        framework_metrics = self.framework_metrics

    # create data frame for framework metrics
    framework_metrics_df = pd.DataFrame(
        framework_metrics,
        columns=[
            "start_time",
            "end_time",
            "start_time_us",
            "end_time_us",
            "duration_us",
            "pid",
            "framework_metric",
            "step",
            "worker_id",
            "num_workers",
            "pin_memory",
            "process",
            "node_id",
        ],
    )
    framework_metrics_df["start_time_us"] = framework_metrics_df["start_time_us"] - self.start_time
    framework_metrics_df["end_time_us"] = framework_metrics_df["end_time_us"] - self.start_time
    return framework_metrics_df[framework_metrics_df.duplicated() == False]
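# Standalone sketch of the chunked scan above: the [start, end] window is walked in
# fixed-size intervals so only one chunk of events is held in memory at a time. The
# interval value below is an arbitrary example, not the class default.
def iter_chunks(start_us, end_us, interval_us):
    current_us = min(start_us + interval_us, end_us)
    while start_us < end_us:
        yield start_us, current_us
        start_us = current_us
        current_us = min(current_us + interval_us, end_us)


assert list(iter_chunks(0, 25, 10)) == [(0, 10), (10, 20), (20, 25)]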
def _is_tensor_saved_for_step(self, tensor_name):
    collections_to_save = self._get_collections_to_save_for_step()
    for c in collections_to_save:
        if match_inc(tensor_name, c.include_regex):
            return True
    return False
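# End-to-end flavor of the per-step decision (hypothetical collections and names):
# a tensor is saved for a step iff some collection being saved this step includes it
# by regex.
class _Coll:
    def __init__(self, include_regex):
        self.include_regex = include_regex


collections_to_save = [_Coll(["relu"]), _Coll(["bias"])]
assert any(match_inc("conv1/relu:0", c.include_regex) for c in collections_to_save)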