def on_value_event(self, event):
    """Records a health-pill summary value received from the debugger.

    The event is dropped (nothing is written, no callback fires) when:
      * its summary has no values (a warning is logged);
      * the watch key of its first value does not end with
        `constants.DEBUG_NUMERIC_SUMMARY_SUFFIX` (dropped silently);
      * its tensor is not 1-D or has fewer than
        `constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH` elements (a
        warning is logged);
      * the watch key, minus the suffix, does not look like
        `<node name>:<output slot>` (a warning is logged).

    Otherwise the event's `step` is overwritten, the event is handed to the
    events writer manager, and the numerics-alert callback (if one is set) is
    invoked when an alert can be extracted from the event.

    NOTE(review): any handling of disk-write failures lives in
    `self._events_writer_manager.write_event`, which is not visible here.

    Args:
      event: The Event proto to be processed. Its `step` field is mutated.
    """
    if not event.summary.value:
        logger.warning("The summary of the event lacks a value.")
        return

    summary_value = event.summary.value[0]

    # The node name property is actually a watch key, which is a concatenation
    # of several pieces of data.
    watch_key = summary_value.node_name
    if not watch_key.endswith(constants.DEBUG_NUMERIC_SUMMARY_SUFFIX):
        # Ignore events that lack a DebugNumericSummary.
        # NOTE(@chihuahua): We may later handle other types of debug ops.
        return

    # We remove the constants.DEBUG_NUMERIC_SUMMARY_SUFFIX from the end of the
    # watch name because it is not distinguishing: every health pill entry ends
    # with it.
    node_name_and_output_slot = watch_key[
        : -len(constants.DEBUG_NUMERIC_SUMMARY_SUFFIX)
    ]

    shape = tensor_util.make_ndarray(summary_value.tensor).shape
    if (
        len(shape) != 1
        or shape[0] < constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH
    ):
        # `shape` is a tuple, so it must be passed as a logging argument.
        # Formatting it eagerly with `"%s" % shape` raises TypeError whenever
        # the tuple does not have exactly one element, which is precisely the
        # `len(shape) != 1` case this warning exists to report.
        logger.warning(
            "Health-pill tensor either lacks a dimension or is "
            "shaped incorrectly: %s",
            shape,
        )
        return

    # Only the presence of a "<name>:<digits>" match matters here; the captured
    # groups are not used.
    if not re.match(r"^(.*):(\d+)$", node_name_and_output_slot):
        logger.warning(
            "A event with a health pill has an invalid node name and output "
            "slot combination, (i.e., an unexpected debug op): %r",
            node_name_and_output_slot,
        )
        return

    if self._session_run_index >= 0:
        event.step = self._session_run_index
    else:
        # Data from parameter servers (or any graphs without a master) do not
        # contain core metadata. So the session run count is missing. Set its
        # value to a microsecond epoch timestamp.
        event.step = int(time.time() * 1e6)

    # Write this event to the events file designated for data from the
    # debugger.
    self._events_writer_manager.write_event(event)

    alert = numerics_alert.extract_numerics_alert(event)
    if self._numerics_alert_callback and alert:
        self._numerics_alert_callback(alert)
def on_value_event(self, event):
    """Records a health-pill summary value received from the debugger.

    The event is dropped (nothing is written, no callback fires) when:
      * its summary has no values (a warning is logged);
      * the watch key of its first value does not end with
        `constants.DEBUG_NUMERIC_SUMMARY_SUFFIX` (dropped silently);
      * its tensor is not 1-D or has fewer than
        `constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH` elements (a
        warning is logged);
      * the watch key, minus the suffix, does not look like
        `<node name>:<output slot>` (a warning is logged).

    Otherwise the event's `step` is overwritten, the event is handed to the
    events writer manager, and the numerics-alert callback (if one is set) is
    invoked when an alert can be extracted from the event.

    NOTE(review): any handling of disk-write failures lives in
    `self._events_writer_manager.write_event`, which is not visible here.

    Args:
      event: The Event proto to be processed. Its `step` field is mutated.
    """
    if not event.summary.value:
        # Use `warning` like the other log calls in this method, rather than
        # the `warn` alias.
        tf.logging.warning("The summary of the event lacks a value.")
        return

    summary_value = event.summary.value[0]

    # The node name property is actually a watch key, which is a concatenation
    # of several pieces of data.
    watch_key = summary_value.node_name
    if not watch_key.endswith(constants.DEBUG_NUMERIC_SUMMARY_SUFFIX):
        # Ignore events that lack a DebugNumericSummary.
        # NOTE(@chihuahua): We may later handle other types of debug ops.
        return

    # We remove the constants.DEBUG_NUMERIC_SUMMARY_SUFFIX from the end of the
    # watch name because it is not distinguishing: every health pill entry ends
    # with it.
    node_name_and_output_slot = watch_key[
        : -len(constants.DEBUG_NUMERIC_SUMMARY_SUFFIX)
    ]

    shape = tf.make_ndarray(summary_value.tensor).shape
    if (
        len(shape) != 1
        or shape[0] < constants.MIN_DEBUG_NUMERIC_SUMMARY_TENSOR_LENGTH
    ):
        # `shape` is a tuple, so it must be passed as a logging argument.
        # Formatting it eagerly with `"%s" % shape` raises TypeError whenever
        # the tuple does not have exactly one element, which is precisely the
        # `len(shape) != 1` case this warning exists to report.
        tf.logging.warning(
            "Health-pill tensor either lacks a dimension or is "
            "shaped incorrectly: %s",
            shape,
        )
        return

    # Only the presence of a "<name>:<digits>" match matters here; the captured
    # groups are not used.
    if not re.match(r"^(.*):(\d+)$", node_name_and_output_slot):
        tf.logging.warning(
            "A event with a health pill has an invalid node name and output "
            "slot combination, (i.e., an unexpected debug op): %r",
            node_name_and_output_slot,
        )
        return

    if self._session_run_index >= 0:
        event.step = self._session_run_index
    else:
        # Data from parameter servers (or any graphs without a master) do not
        # contain core metadata. So the session run count is missing. Set its
        # value to a microsecond epoch timestamp.
        event.step = int(time.time() * 1e6)

    # Write this event to the events file designated for data from the
    # debugger.
    self._events_writer_manager.write_event(event)

    alert = numerics_alert.extract_numerics_alert(event)
    if self._numerics_alert_callback and alert:
        self._numerics_alert_callback(alert)