示例#1
0
    def _get_matching_collections(
        self, mode, tensor, tensor_type, ts_name, is_input_to_model=False, is_output_of_model=False
    ):
        """Return the set of collections this tensor should belong to.

        Weights/biases are classified explicitly by tensor_type; model
        inputs/outputs go to their dedicated collections; every other
        collection is matched by its include_regex against ts_name.
        """
        manager = self.collection_manager
        matched = set()

        if tensor_type == "weight":
            bias_coll = manager.get(CollectionKeys.BIASES)
            # a trainable variable whose name matches the biases regex goes
            # to BIASES, everything else trainable goes to WEIGHTS
            if match_inc(tensor.name, bias_coll.include_regex):
                matched.add(bias_coll)
            else:
                matched.add(manager.get(CollectionKeys.WEIGHTS))
        elif is_input_to_model:
            matched.add(manager.get(CollectionKeys.INPUTS))
        elif is_output_of_model:
            matched.add(manager.get(CollectionKeys.OUTPUTS))

        for current_coll in manager.get_collections().values():
            if current_coll.name in [CollectionKeys.WEIGHTS, CollectionKeys.BIASES]:
                # these were handled explicitly above; skipping their regex
                # also shields users from misconfiguring these collections
                continue

            if match_inc(ts_name, current_coll.include_regex) and not current_coll.has_tensor(tensor):
                # tensor will be added to this collection by the caller
                matched.add(current_coll)
            # tensors added externally would have a different internal name;
            # in that case only the given tensor name is used to save data
            # instead of the keras-style-internal-names
        return matched
示例#2
0
    def _get_matching_collections(self,
                                  mode,
                                  tensor,
                                  tensor_type,
                                  ts_name,
                                  is_input_to_model=False,
                                  is_output_of_model=False):
        """Return the set of collections this tensor should belong to.

        Weights/biases are classified explicitly by tensor_type; model
        inputs/outputs go to their dedicated collections; every other
        collection is matched by its include_regex against ts_name.
        """
        manager = self.collection_manager
        matched = set()

        if tensor_type == "weight":
            bias_coll = manager.get(CollectionKeys.BIASES)
            if match_inc(tensor.name, bias_coll.include_regex):
                matched.add(bias_coll)
            else:
                matched.add(manager.get(CollectionKeys.WEIGHTS))
        elif is_input_to_model:
            matched.add(manager.get(CollectionKeys.INPUTS))
        elif is_output_of_model:
            matched.add(manager.get(CollectionKeys.OUTPUTS))

        for current_coll in manager.get_collections().values():
            if current_coll.name in (CollectionKeys.WEIGHTS,
                                     CollectionKeys.BIASES):
                # these were handled explicitly above; skipping their regex
                # also shields users from misconfiguring these collections
                continue
            if not match_inc(ts_name, current_coll.include_regex):
                continue
            # In TF 2.x eager mode tensors cannot live in a set/dict because
            # tensor.__hash__() is gone; experimental_ref() returns a hashable
            # reference object instead.
            # NOTE: experimental_ref is an experimental API and may be
            # changed or removed.
            # Ref: https://www.tensorflow.org/api_docs/python/tf/Tensor#experimental_ref
            if is_tf_version_2x() and tf.executing_eagerly():
                tensor = tensor.experimental_ref()
            if not current_coll.has_tensor(tensor):
                # tensor will be added to this collection by the caller
                matched.add(current_coll)
            # tensors added externally would have a different internal name;
            # in that case only the given tensor name is used to save data
            # instead of the keras-style-internal-names
        return matched
示例#3
0
    def _save_tensor_to_file(self, tensor_name, tensor_value, collections):
        """Write `tensor_value` under `tensor_name` and register it with the
        given collections plus any step-collections whose regex matches.

        :param tensor_name: export name for the tensor
        :param tensor_value: value to save; may be a tf PerReplica holding
            one value per device
        :param collections: a collection, collection name, or set of either
        """
        if not isinstance(collections, set):
            collections = {collections}
        # Work on a copy: this function adds step-collections to the set, and
        # mutating the caller's set would cause bugs if the caller re-uses it.
        collections_to_write = collections.copy()
        for c in self._get_collections_to_save_for_step():
            if match_inc(tensor_name, c.include_regex):
                collections_to_write.add(c)
        self._initialize_writers(only_initialize_if_missing=True)

        # A PerReplica value carries one tensor per device; save each
        # replica's value under its own TensorRef.
        if isinstance(tensor_value, values.PerReplica):
            replica_values = tensor_value._values  # NOTE: private TF attribute — verify on TF upgrades
        else:
            replica_values = [tensor_value]
        tensor_refs = [(TensorRef.from_non_graph_var(tensor_name), t) for t in replica_values]

        for tensor_ref, t in tensor_refs:
            for collection in collections_to_write:
                # collections may be passed by name; resolve to objects
                if isinstance(collection, str):
                    collection = self.get_collection(collection)
                collection.set_tensor_ref(tensor_ref)
            self._save_for_tensor(tensor_name, t, check_before_write=True)
示例#4
0
    def _add_tensor_to_matching_collections(self, tensor):
        """
        Finds which collections to add this tensor to, and adds tensor to them.

        Note: the variable-read-op check depends only on the tensor, not on
        the collection being inspected, so it is performed once instead of
        once per collection (the original called it inside the loop).
        """
        colls_with_tensor = set()
        variable_collections_with_tensor, processed = self._process_tensor_from_variable_read_op(
            tensor)
        if processed:
            # processed=True means this tensor was either a variable read tensor,
            # or a tensor with same name as variable
            # former will be added to collections such as weights, biases, opt_variables
            # latter will be skipped as they refer to the same tensor
            colls_with_tensor.update(variable_collections_with_tensor)
        else:
            # deterministic order so matching behavior is reproducible
            for coll in sorted(self._get_all_collections_to_save(),
                               key=lambda x: x.name):
                # some collections are added automatically, don't match regex for these
                if coll.name not in [
                        CollectionKeys.WEIGHTS,
                        CollectionKeys.BIASES,
                        CollectionKeys.OPTIMIZER_VARIABLES,
                        CollectionKeys.TENSORFLOW_SUMMARIES,
                ] and match_inc(tensor.name, coll.include_regex):
                    coll.add(tensor)

                if coll.has_tensor(tensor.name):
                    # it must have been added when collection was added to
                    # from user(custom_coll)/library(losses, weights, grads)
                    tensor_ref = coll.get_tensor(tensor.name)
                    tensor_ref.tf_obj = tensor
                    colls_with_tensor.add(coll)

        # create entry in hook's tensor_to_collections map for this tensor
        self._create_tensors_for_matching_collections(tensor,
                                                      colls_with_tensor)
示例#5
0
    def _prepare_tensors_available_post_step(self):
        """Register post-step tensors (gradients, optimizer variables, model
        inputs/outputs) in tensor_to_collections and matching custom colls."""
        custom_collections, _ = self._get_custom_and_default_collections()
        default_colls = (
            self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES),
            self.get_collection(name=CollectionKeys.GRADIENTS),
            self.get_collection(name=CollectionKeys.OUTPUTS),
            self.get_collection(name=CollectionKeys.INPUTS),
        )
        for coll in default_colls:
            for tensor_ref in coll.get_tensors():
                name = tensor_ref.name
                # setdefault creates the entry on first sight; add() is
                # idempotent, so no membership check is needed
                entry = self.tensor_to_collections.setdefault(name, set())
                entry.add(coll)

                # also register the tensor with any matching custom collection
                for custom_coll in custom_collections:
                    if (
                        match_inc(name, custom_coll.include_regex)
                        and tensor_ref.tf_obj is not None
                    ):
                        custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
                        entry.add(custom_coll)
示例#6
0
 def _tensors_matching_regex(self, regex_list) -> set:
     """Return the names of tracked tensors matching any regex in regex_list.

     :param regex_list: a regex string, a list of them, or a nested list
         (flattened before matching)
     :return: set of matching tensor names
     """
     if not isinstance(regex_list, list):
         regex_list = [regex_list]
     regex_list = flatten(regex_list)
     # iterate the dict directly (no .keys()) and build the set in one pass
     return {name for name in self._tensors if match_inc(name, regex_list)}
示例#7
0
 def _add_weights_and_biases(self):
     """Sort trainable variables into the BIASES/WEIGHTS collections."""
     biases = self.collection_manager.get(CollectionKeys.BIASES)
     weights = self.collection_manager.get(CollectionKeys.WEIGHTS)
     # trainable variables, e.g. <tf.Var w1:0>
     for variable in tf.trainable_variables():
         if match_inc(variable.name, biases.include_regex):
             biases.add(variable)
         else:
             # adds a tensor_ref with name `w1/read:0` and export_name `w1:0`
             weights.add(variable)
示例#8
0
 def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]:
     """Return (caching the result) the saved collections containing tensor_name."""
     self._assert_prep()
     # for tf the cache is prepopulated in check_and_add_tensor;
     # for mxnet it is computed here on first use and then cached
     cached = self.tensor_to_collections.get(tensor_name)
     if cached is None:
         matched = set()
         for coll in self._collections_to_save:
             if tensor_name in coll.tensor_names:
                 # if being matched as reduction,
                 # it must be in reduction_tensor_name, not with regex
                 matched.add(coll)
             elif match_inc(tensor_name, coll.include_regex):
                 coll.add_tensor_name(tensor_name)
                 matched.add(coll)
         self.tensor_to_collections[tensor_name] = matched
         cached = matched
     return cached
示例#9
0
 def should_save_tensor_or_collection(self, tensor_name: str,
                                      collection_name: str) -> bool:
     """Return True if this tensor (or its collection) should be saved now."""
     if self.prepared_collections is False:
         # a save attempt before collections are prepared can happen if
         # this fn is called before callbacks are initialized; refuse it
         self.logger.warning(
             "Tensors cannot be saved with smdebug before callbacks are initialized."
         )
         return False
     if collection_name == "gradients":
         # gradients are stored under the "gradients/<layer>Grad" name
         layer_name = tensor_name.split(":")[0]
         tensor_name = "gradients/" + layer_name + "Grad"
     if not self._is_collection_being_saved_for_step(collection_name):
         return self._is_tensor_saved_for_step(tensor_name)
     coll = self.collection_manager.get(collection_name)
     # an empty include_regex means the collection saves everything
     return coll.include_regex == [] or match_inc(tensor_name,
                                                  coll.include_regex)
示例#10
0
    def _prepare_non_layer_tensors(self):
        """Register non-layer tensors (gradients, optimizer variables) in
        tensor_to_collections and in any matching custom collections."""
        custom_collections, _ = self._get_custom_and_default_collections()
        for coll in [
            self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES),
            self.get_collection(name=CollectionKeys.GRADIENTS),
        ]:
            for tensor_ref in coll.get_tensors():
                if tensor_ref.name not in self.tensor_to_collections:
                    self.tensor_to_collections[tensor_ref.name] = {coll}
                elif coll not in self.tensor_to_collections[tensor_ref.name]:
                    self.tensor_to_collections[tensor_ref.name].add(coll)

                # Add tensor to custom collections.
                for custom_coll in custom_collections:
                    # Guard against refs without a concrete tf object:
                    # add_for_mode(None, ...) would register a useless entry
                    # (the sibling post-step variant of this method carries
                    # the same guard).
                    if (
                        match_inc(tensor_ref.name, custom_coll.include_regex)
                        and tensor_ref.tf_obj is not None
                    ):
                        custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
                        if custom_coll not in self.tensor_to_collections[tensor_ref.name]:
                            self.tensor_to_collections[tensor_ref.name].add(custom_coll)
示例#11
0
    def _prepare_non_layer_tensors(self):
        """Register non-layer tensors (gradients, optimizer variables) in
        tensor_to_collections and in any matching custom collections."""
        custom_collections, default_tf_collections = (
            self._get_custom_and_default_collections())
        for default_coll in default_tf_collections:
            for tensor_ref in default_coll.get_tensors():
                name = tensor_ref.name
                if name not in self.tensor_to_collections:
                    self.tensor_to_collections[name] = {default_coll}
                else:
                    # set.add is idempotent, so no membership check needed
                    self.tensor_to_collections[name].add(default_coll)

                # also register the tensor with matching custom collections
                for custom_coll in custom_collections:
                    if match_inc(name, custom_coll.include_regex):
                        custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
                        self.tensor_to_collections[name].add(custom_coll)
示例#12
0
    def _get_dataloader_profiler_data_by_time(self,
                                              start_time_us,
                                              end_time_us,
                                              cache_metrics=False,
                                              selected_framework_metrics=None):
        """
        Get framework metrics data within a time interval.

        :param start_time_us: Start of the interval in microseconds
        :param end_time_us: End of the interval in microseconds
        :param cache_metrics: If True, accumulate into and return all metrics
            requested so far; otherwise only this interval's metrics
        :param selected_framework_metrics: list of framework metric
            names/phases. If non-empty, only events matching one of these
            are returned.
        :return: Framework metrics DataFrame with duplicate rows dropped
        """
        # None instead of a mutable [] default: a shared default list is the
        # classic mutable-default-argument pitfall.
        if selected_framework_metrics is None:
            selected_framework_metrics = []
        framework_metrics = []
        # only fetch a subset of data at a time to avoid out-of-memory issues
        current_time_us = min(start_time_us + self.interval, end_time_us)

        while start_time_us < end_time_us:
            # get all framework metrics from last to current timestamp
            self.framework_metrics_reader.refresh_event_file_list()
            events = self.framework_metrics_reader.get_events(
                start_time_us, current_time_us)

            # convert this chunk's events to rows (None = filtered out)
            new_rows = []
            for event in events:
                row = self._dataloader_event_to_row(event,
                                                    selected_framework_metrics)
                if row is not None:
                    new_rows.append(row)

            if cache_metrics is True:
                # extend the instance cache with only the NEW rows.
                # (Previously the cache was re-extended with an alias of
                # itself each chunk, duplicating every cached row; the
                # duplicates were only masked by the final dedup below.)
                self.framework_metrics.extend(new_rows)
                framework_metrics = self.framework_metrics
            else:
                framework_metrics.extend(new_rows)

            # advance to the next chunk
            start_time_us = current_time_us
            current_time_us = min(current_time_us + self.interval, end_time_us)

        # create data frame for framework metrics
        framework_metrics_df = pd.DataFrame(
            framework_metrics,
            columns=[
                "start_time",
                "end_time",
                "start_time_us",
                "end_time_us",
                "duration_us",
                "pid",
                "framework_metric",
                "step",
                "worker_id",
                "num_workers",
                "pin_memory",
                "process",
                "node_id",
            ],
        )
        # shift absolute timestamps to be relative to profiling start
        framework_metrics_df["start_time_us"] -= self.start_time
        framework_metrics_df["end_time_us"] -= self.start_time
        # drop duplicate rows (the cache may contain already-returned events)
        return framework_metrics_df[~framework_metrics_df.duplicated()]

    def _dataloader_event_to_row(self, event, selected_framework_metrics):
        """Convert one profiler event to a metrics row, or None if the event
        does not match the selected metrics filter."""
        if selected_framework_metrics and not (
                match_inc(event.event_name, selected_framework_metrics)
                or match_inc(event.event_phase, selected_framework_metrics)):
            return None
        args = event.event_args if event.event_args is not None else {}
        step = int(args["step_num"]) if "step_num" in args else -1
        if "layer_name" in args:
            name = args["layer_name"]
        elif "name" in args:
            name = args["name"]
        else:
            name = event.event_name
        worker_id = args.get("worker_id", -1)
        num_workers = args.get("num_workers", -1)
        if "pin_memory" in args:
            pin_memory = "True" if args["pin_memory"] is True else "False"
        else:
            pin_memory = "NA"
        return [
            us_since_epoch_to_human_readable_time(event.start_time),
            us_since_epoch_to_human_readable_time(event.end_time),
            event.start_time,
            event.end_time,
            event.duration,
            event.pid,
            name,
            step,
            worker_id,
            num_workers,
            pin_memory,
            event.event_phase,
            event.node_id,
        ]
示例#13
0
 def _is_tensor_saved_for_step(self, tensor_name):
     """True if any collection saved this step matches tensor_name by regex."""
     return any(
         match_inc(tensor_name, coll.include_regex)
         for coll in self._get_collections_to_save_for_step()
     )