def after_run(self, run_context, run_values):
    """Write per-feature importance summaries for the step that just ran.

    Reads ``global_step``, ``feature_names``, ``feature_usage_counts`` and
    ``feature_gains`` from ``run_values.results``. For every feature, four
    scalar summaries (raw and normalized usage count, raw and normalized
    gain) are written to a writer rooted at ``<model_dir>/<feature name>``.

    Args:
        run_context: Unused.
        run_values: Object whose ``results`` mapping holds the tensors above.

    Raises:
        RuntimeError: If the names, usage counts and gains differ in length.
    """
    del run_context  # Unused by feature importance summary saver hook.

    results = run_values.results
    global_step = results["global_step"]
    names = results["feature_names"]
    usage_counts = results["feature_usage_counts"]
    gains = results["feature_gains"]

    # Skip this step if the configured interval has not elapsed yet.
    previous_step = self._last_triggered_step
    if (previous_step is not None and
            global_step < previous_step + self._every_n_steps):
        return

    # All three sequences must describe the same set of features.
    if not (len(names) == len(usage_counts) and len(names) == len(gains)):
        raise RuntimeError(
            "Feature names and importance measures have inconsistent lengths."
        )

    # Normalization factors; fall back to 1.0 when a total is zero.
    usage_total = sum(usage_counts, 0.0)
    usage_scale = 1.0 / usage_total if usage_total else 1.0
    gain_total = sum(gains, 0.0)
    gain_scale = 1.0 / gain_total if gain_total else 1.0

    self._last_triggered_step = global_step
    for raw_name, count, gain in zip(names, usage_counts, gains):
        # Each feature gets its own writer directory under the model dir.
        writer = SummaryWriterCache.get(
            os.path.join(self._model_dir, raw_name.decode("utf-8")))
        scalars = (
            ("feature_importance/usage_counts", count),
            ("feature_importance/usage_fraction", count * usage_scale),
            ("feature_importance/gains", gain),
            ("feature_importance/gains_fraction", gain * gain_scale),
        )
        for tag, number in scalars:
            writer.add_summary(
                Summary(value=[Summary.Value(tag=tag, simple_value=number)]),
                global_step)
def tf_scalar_summary(vals):
    """Build a ``Summary`` proto with one scalar value per item of ``vals``.

    Args:
        vals: Mapping from tag to the scalar stored as ``simple_value``.

    Returns:
        A ``Summary`` whose values follow the iteration order of ``vals``.
    """
    # pylint: disable=import-error,no-name-in-module
    from tensorflow.core.framework.summary_pb2 import Summary

    scalar_values = []
    for tag, number in vals.items():
        scalar_values.append(Summary.Value(tag=tag, simple_value=number))
    return Summary(value=scalar_values)
def log_image(self, step, tag, val):
    '''
    Write an image event.

    :param int step: Time step (x-axis in TensorBoard graphs)
    :param str tag: Label for this value
    :param numpy.ndarray val: Image in RGB format with values from
        0 to 255; a 3-D array with index order (row, column, channel).
        `val.shape[-1] == 3`
    :raises ValueError: if `val` is not 3-D or its last dimension is not 3
    '''
    # TODO: support floating-point tensors, 4-D tensors, grayscale
    shape = val.shape
    if len(shape) != 3:
        raise ValueError(
            '`log_image` value should be a 3-D tensor, instead got shape %s' %
            (shape, ))
    rows, cols, channels = shape
    if channels != 3:
        raise ValueError(
            'Last dimension of `log_image` value should be 3 (RGB), '
            'instead got shape %s' % (shape, ))

    # Encode as PNG into an in-memory buffer; the writer takes one flat row
    # of interleaved channel values per image row.
    buffer = StringIO()
    flat_rows = val.reshape(rows, cols * channels)
    png.Writer(size=(cols, rows)).write(buffer, flat_rows)
    png_bytes = buffer.getvalue()

    # https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/core/framework/summary.proto
    RGB = 3
    image_proto = Summary.Image(height=rows,
                                width=cols,
                                colorspace=RGB,
                                encoded_image_string=png_bytes)
    self._add_event(
        step, Summary(value=[Summary.Value(tag=tag, image=image_proto)]))
def image(tag, tensor):
    """Outputs a `Summary` protocol buffer holding a single image.

    The image is built from `tensor`, which must be 3-D with shape
    `[height, width, channels]` and where `channels` can be:

    *  1: `tensor` is interpreted as Grayscale.
    *  3: `tensor` is interpreted as RGB.
    *  4: `tensor` is interpreted as RGBA.

    Args:
      tag: A name for the summary value; it is passed through `_clean_tag`
        and serves as the series name in TensorBoard.
      tensor: A 3-D array of shape `[height, width, channels]`. Anything
        that is not already a `numpy.ndarray` is converted to a `float32`
        array first.

    Returns:
      A `Summary` protocol buffer containing one `Summary.Value` with the
      image produced by `make_image`.
    """
    tag = _clean_tag(tag)
    if not isinstance(tensor, np.ndarray):
        # Convert the *data*. `np.ndarray(x, ...)` would interpret `x` as a
        # shape and allocate an uninitialized array instead of converting.
        tensor = np.asarray(tensor, dtype=np.float32)
    shape = tensor.shape
    height, width, channel = shape[0], shape[1], shape[2]
    if channel == 1:
        # Workaround for PIL's dimension handling: drop the trailing
        # single-channel axis so the image is plain 2-D.
        tensor = np.reshape(tensor, (height, width))
    image = make_image(tensor, height, width, channel)
    return Summary(value=[Summary.Value(tag=tag, image=image)])
def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
    """Report the step rate to the summary writer (if any) and to the log.

    Args:
        elapsed_steps: Steps run since the previous report.
        elapsed_time: Time spent on those steps; used as the divisor.
        global_step: Step under which the summary is recorded.
    """
    rate = elapsed_steps / elapsed_time
    writer = self._summary_writer
    if writer is not None:
        rate_value = Summary.Value(tag=self._summary_tag, simple_value=rate)
        writer.add_summary(Summary(value=[rate_value]), global_step)
    logging.info("%s: %g", self._summary_tag, rate)
def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
    """Report throughput in images per second.

    The step rate is scaled by ``FLAGS.batch_size`` and ``hvd.size()``. The
    result is written under the ``'images/sec'`` tag when a summary writer
    is configured, and is always logged.

    Args:
        elapsed_steps: Steps run since the previous report.
        elapsed_time: Time spent on those steps; used as the divisor.
        global_step: Step under which the summary is recorded.
    """
    summary_tag = 'images/sec'
    images_per_sec = elapsed_steps / elapsed_time * FLAGS.batch_size * hvd.size()
    writer = self._summary_writer
    if writer is not None:
        rate_value = Summary.Value(tag=summary_tag, simple_value=images_per_sec)
        writer.add_summary(Summary(value=[rate_value]), global_step)
    logging.info("%s: %g", summary_tag, images_per_sec)
def hyper_log(rmse, job_dir):
    """Write ``rmse`` as the hyperparameter-tuning metric summary.

    The value is stored under the tag ``'training/hptuning/metric'`` in an
    event file created in ``<job_dir>/eval``.

    Args:
        rmse: Scalar metric value to record.
        job_dir: Directory under which the ``eval`` sub-directory is used.
    """
    log = Summary(value=[
        Summary.Value(tag='training/hptuning/metric', simple_value=rmse)
    ])
    logpath = os.path.join(job_dir, 'eval')
    writer = tf.summary.FileWriter(logpath)
    try:
        writer.add_summary(log)
        writer.flush()
    finally:
        # Release the event file instead of leaking the open writer.
        writer.close()
def after_run(self, run_context, run_values):
    """Count the step and, when the timer triggers, report per-episode stats.

    ``run_values.results`` is used as the current episode number. On a
    trigger, the steps-per-second rate and the number of steps counted since
    the last reset are written to the summary writer (if any) and logged.

    NOTE(review): the source's indentation was lost; the nesting of the
    ``logging.info`` call and of the ``self._num_steps = 0`` reset below is
    the most plausible reading. Confirm against the original file.
    """
    global_episode = run_values.results
    # Every call to this hook counts as one step.
    self._num_steps += 1
    if self._timer.should_trigger_for_episode(global_episode):
        elapsed_time, elapsed_steps = self._timer.update_last_triggered_episode(global_episode)
        # elapsed_time may be None; no rate can be computed in that case.
        if elapsed_time is not None:
            steps_per_sec = elapsed_steps / elapsed_time
            if self._summary_writer is not None:
                summary = Summary(value=[
                    Summary.Value(tag=self._summary_sec_tag, simple_value=steps_per_sec),
                    Summary.Value(tag=self._summary_steps_tag, simple_value=self._num_steps),
                ])
                self._summary_writer.add_summary(summary, global_episode)
            logging.info("%s: %g, %s: %d", self._summary_sec_tag, steps_per_sec, self._summary_steps_tag, self._num_steps)
        # Start counting steps afresh after a trigger.
        self._num_steps = 0
def _log_and_record(self, step):
    """Record evaluation throughput for ``step``.

    Does nothing unless both a summary writer and a non-zero total batch
    size are set. Otherwise computes images per second from the total batch
    size and the run's begin/end timestamps, derives its reciprocal scaled
    by 1000, logs both values and writes them as a single summary.

    Args:
        step: Step under which the summary is recorded.
    """
    if self._summary_writer is None or not self._total_batch_size:
        return

    img_per_sec_tag = 'eval/img_per_sec'
    sec_per_img_tag = 'eval/sec_per_img'
    run_duration = self._run_end - self._run_begin
    img_per_sec_tag_value = self._total_batch_size / run_duration
    # Reciprocal scaled by 1000; the log line labels this value as ms.
    sec_per_img_tag_value = 1 / img_per_sec_tag_value * 1000

    scalar_values = [
        Summary.Value(tag=img_per_sec_tag, simple_value=img_per_sec_tag_value),
        Summary.Value(tag=sec_per_img_tag, simple_value=sec_per_img_tag_value),
    ]
    summary = Summary(value=scalar_values)
    logging.info("%s: %g, %s: %g ms, step: %g",
                 img_per_sec_tag, img_per_sec_tag_value,
                 sec_per_img_tag, sec_per_img_tag_value, step)
    self._summary_writer.add_summary(summary, step)
def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
    """Report throughput in examples per second.

    The rate is ``self.batch_size * elapsed_steps / elapsed_time``. It is
    written under the ``'examples/sec'`` tag when a summary writer is
    configured, and is always logged.

    Args:
        elapsed_steps: Steps run since the previous report.
        elapsed_time: Time spent on those steps; used as the divisor.
        global_step: Step under which the summary is recorded.
    """
    example_summary_tag = 'examples/sec'
    examples_per_sec = self.batch_size * elapsed_steps / elapsed_time
    writer = self._summary_writer
    if writer is not None:
        rate_value = Summary.Value(tag=example_summary_tag,
                                   simple_value=examples_per_sec)
        writer.add_summary(Summary(value=[rate_value]), global_step)
    logging.info("%s: %g", example_summary_tag, examples_per_sec)
def log_scalar(self, step, tag, val):
    '''
    Write a scalar event.

    :param int step: Time step (x-axis in TensorBoard graphs)
    :param str tag: Label for this value
    :param float val: Scalar to graph at this time step (y-axis); it is
        rounded through ``numpy.float32`` before being stored
    '''
    single_precision = float(np.float32(val))
    scalar_value = Summary.Value(tag=tag, simple_value=single_precision)
    self._add_event(step, Summary(value=[scalar_value]))
def write_hptuning_metric(args, metric):
    """Write ``metric`` as the hyperparameter-tuning metric summary.

    The value is stored under the tag ``'training/hptuning/metric'`` in an
    event file created in ``<args.output_dir>/eval``.

    Args:
        args: Object exposing an ``output_dir`` attribute.
        metric: Scalar metric value to record.
    """
    summary = Summary(value=[
        Summary.Value(tag='training/hptuning/metric', simple_value=metric)
    ])
    eval_path = os.path.join(args.output_dir, 'eval')
    summary_writer = tf.summary.FileWriter(eval_path)
    try:
        summary_writer.add_summary(summary)
        summary_writer.flush()
    finally:
        # Release the event file instead of leaking the open writer.
        summary_writer.close()
def every_n_step_end(self, current_step, outputs):
    """Record the step rate since the previous report.

    A summary is written only when a previous report time exists and a
    summary writer is set. The last reported step and time are refreshed on
    every call.

    Args:
        current_step: Step at which this callback fires.
        outputs: Unused.
    """
    now = time.time()
    if self._last_reported_time is not None and self._summary_writer:
        step_delta = current_step - self._last_reported_step
        time_delta = now - self._last_reported_time
        rate = step_delta / time_delta
        rate_value = Summary.Value(tag=self._summary_tag, simple_value=rate)
        self._summary_writer.add_summary(Summary(value=[rate_value]),
                                         current_step)
    self._last_reported_step = current_step
    self._last_reported_time = now
def add_summary(self, summary_tag, summary_value, global_step):
    """ Records one scalar summary at the given step and flushes the writer.

    Args:
        summary_tag: A string, the name of the summary.
        summary_value: The value of the summary at current step.
        global_step: The step.
    """
    scalar_value = Summary.Value(tag=summary_tag, simple_value=summary_value)
    writer = self._summary_writer
    writer.add_summary(Summary(value=[scalar_value]), global_step)
    writer.flush()
def log_histogram(self, step, tag, val):
    '''
    Write a histogram event.

    :param int step: Time step (x-axis in TensorBoard graphs)
    :param str tag: Label for this value
    :param numpy.ndarray val: Arbitrary-dimensional array containing
        values to be aggregated in the resulting histogram.
    '''
    accumulator = Histogram()
    accumulator.add(val)
    histo_value = Summary.Value(tag=tag, histo=accumulator.encode_to_proto())
    self._add_event(step, Summary(value=[histo_value]))
def _log_and_record(self, global_step, learning_rate, total_loss, mlm_loss, nsp_loss):
    """Log training progress and record throughput and total loss.

    Throughput is ``self.global_batch_size`` divided by the average time per
    step (``self.elapsed_secs / self.count``). All arguments are included in
    the log line; only throughput and total loss are written to the summary
    writer, and only when one is set.

    Args:
        global_step: Step number, logged and used as the summary step.
        learning_rate: Learning rate to log.
        total_loss: Total loss to log and record.
        mlm_loss: Masked-LM loss to log.
        nsp_loss: Next-sentence loss to log.
    """
    time_per_step = self.elapsed_secs / self.count
    throughput = self.global_batch_size / time_per_step

    # The label for mlm_loss previously read "mlm_oss" (typo).
    fields = (
        'Step = %6i' % (global_step),
        'throughput = %6.1f' % (throughput),
        'total_loss = %6.3f' % (total_loss),
        'mlm_loss = %6.4e' % (mlm_loss),
        'nsp_loss = %6.4e' % (nsp_loss),
        'learning_rate = %6.4e' % (learning_rate),
    )
    log_string = ' ' + ', '.join(fields)
    tf.compat.v1.logging.info(log_string)

    if self.summary_writer is not None:
        throughput_summary = Summary(value=[
            Summary.Value(tag='throughput', simple_value=throughput)
        ])
        self.summary_writer.add_summary(throughput_summary, global_step)
        total_loss_summary = Summary(value=[
            Summary.Value(tag='total_loss', simple_value=total_loss)
        ])
        self.summary_writer.add_summary(total_loss_summary, global_step)
def after_run(self, run_context, run_values):
    """Report steps per second when the timer triggers for this step.

    ``run_values.results`` is used as the global step. When the timer yields
    an elapsed time, the rate is written to the summary writer (if any) and
    logged.

    Args:
        run_context: Unused.
        run_values: Object whose ``results`` is the current global step.
    """
    _ = run_context
    global_step = run_values.results
    if not self._timer.should_trigger_for_step(global_step):
        return

    elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
        global_step)
    if elapsed_time is None:
        return

    rate = elapsed_steps / elapsed_time
    writer = self._summary_writer
    if writer is not None:
        rate_value = Summary.Value(tag=self._summary_tag, simple_value=rate)
        writer.add_summary(Summary(value=[rate_value]), global_step)
    logging.info("%s: %g", self._summary_tag, rate)
def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
    """Record the step rate and, when a total batch size is set, images/sec.

    Nothing is logged or written unless a summary writer is set.

    Args:
        elapsed_steps: Steps run since the previous report.
        elapsed_time: Time spent on those steps; used as the divisor.
        global_step: Step under which the summary is recorded.
    """
    steps_per_sec = elapsed_steps / elapsed_time
    if self._summary_writer is None:
        return

    scalar_values = [
        Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
    ]
    if self._total_batch_size:
        image_tag = 'images_sec'
        image_count = float(steps_per_sec) * self._total_batch_size
        scalar_values.append(
            Summary.Value(tag=image_tag, simple_value=image_count))
        logging.info("%s: %g, %s: %g, step: %g", self._summary_tag,
                     steps_per_sec, image_tag, image_count, global_step)
    else:
        logging.info("%s: %g, step: %g", self._summary_tag, steps_per_sec,
                     global_step)
    self._summary_writer.add_summary(Summary(value=scalar_values),
                                     global_step)
def write_hptuning_metric(args, metric):
    """
    Write a summary containing the tuning loss metric, as required by hyperparam tuning.

    Args:
        args: Mapping with an ``'output_dir'`` entry.
        metric: Scalar metric value to record.
    """
    summary = Summary(value=[Summary.Value(tag='training/hptuning/metric', simple_value=metric)])

    # for hyperparam tuning, we write a summary log to a directory 'eval' below the job directory
    eval_path = os.path.join(args['output_dir'], 'eval')
    summary_writer = tf.summary.FileWriter(eval_path)
    try:
        # Note: adding the summary to the writer is enough for hyperparam tuning.
        # The ml engine system is looking for any summary added with the hyperparam metric tag.
        summary_writer.add_summary(summary)
        summary_writer.flush()
    finally:
        # Release the event file instead of leaking the open writer.
        summary_writer.close()
def image(self, tag, image, step):
    """Encode ``image`` as PNG and write it as an image summary event.

    A 2-D input gains a trailing channel axis, and a single-channel input is
    repeated to three channels before encoding.

    Args:
        tag: Tag for the summary value.
        image: Array-like image data.
        step: Step passed on to ``self._write_event``.
    """
    pixels = np.asarray(image)
    if pixels.ndim == 2:
        pixels = pixels[:, :, None]
    if pixels.shape[-1] == 1:
        pixels = np.repeat(pixels, 3, axis=-1)

    buffer = io.BytesIO()
    PIL.Image.fromarray(pixels).save(buffer, 'PNG')

    height, width = pixels.shape[0], pixels.shape[1]
    image_proto = Summary.Image(encoded_image_string=buffer.getvalue(),
                                colorspace=3,
                                height=height,
                                width=width)
    self._write_event(Summary.Value(tag=tag, image=image_proto), step)
def _log_statistics(self, elapsed_steps, elapsed_time, global_step):
    """
    Collect and store all summary values.

    Averages of the collected per-GPU statistics that were requested for
    logging are written as one summary when a summary writer is set. Unless
    stdout output is suppressed, the raw collected values of every statistic
    are also emitted at DEBUG level, prefixed by the device name.

    Arguments:
        elapsed_steps (int): The number of steps between the current trigger
            event and the last one. Not used here.
        elapsed_time (float): The number of seconds between the current
            trigger event and the last one. Not used here.
        global_step: Step under which the summary is recorded.
    """
    if self._summary_writer is not None:
        averages = []
        for gpu_id, per_gpu in self._gpu_statistics.items():
            for statistic, samples in per_gpu.items():
                # Skip statistics that were not requested or have no data.
                if statistic not in self._statistics_to_log:
                    continue
                if not samples:
                    continue
                tag = '{}/{}:{}'.format(self._group_tag, gpu_id, statistic)
                mean = sum(samples) / len(samples)
                averages.append(Summary.Value(tag=tag, simple_value=mean))

        # All averages go out together as plain scalar values.
        self._summary_writer.add_summary(Summary(value=averages),
                                         global_step)

    if not self._suppress_stdout:
        for gpu_id, per_gpu in self._gpu_statistics.items():
            # Resolve the device name through its NVML handle.
            handle = nvml.nvmlDeviceGetHandleByIndex(gpu_id)
            name = nvml.nvmlDeviceGetName(handle).decode('utf-8')
            for statistic, samples in per_gpu.items():
                logging.debug("%s: %s", name,
                              '{}: {}'.format(statistic, samples))
def log_event(self, event, phase):
    """Logs the event ``<event>_<phase>`` to the summary directory.

    Without a summary writer, a warning is logged and nothing is written.
    """
    event_name = '_'.join((event, phase))
    writer = self._summary_writer
    if writer is None:
        logging.warning(
            'profile_logger: cannot log event "%s" '
            ' because of no summary directory', event_name)
        return

    # Only the event's timestamp matters, so a constant value is stored.
    marker = Summary(value=[Summary.Value(tag=event_name, simple_value=0.0)])
    writer.add_summary(marker)
    writer.flush()
    logging.info('profile_logger: log event "%s"', event_name)
def add_entry(self, index, tag, value, **kwargs):
    """Add one summary entry (scalar or image) to the training writer.

    When the keyword ``image`` is present and ``value`` is not ``None``,
    ``value`` is JPEG-encoded and stored as an image; otherwise it is stored
    as a scalar ``simple_value``.

    Args:
        index: Step passed to the writer's ``add_summary``.
        tag: Tag of the summary value.
        value: Scalar, or an image whose ``shape`` gives
            (height, width, channels).
        **kwargs: Only the presence of the key ``image`` is inspected.
    """
    if "image" in kwargs and value is not None:
        # NOTE(review): encode_jpeg returns a tensor, while the proto field
        # is a bytes field - confirm this runs where the result is usable
        # as bytes.
        image_string = tf.image.encode_jpeg(value, optimize_size=True, quality=80)
        image = Summary.Image(width=value.shape[1],
                              height=value.shape[0],
                              colorspace=value.shape[2],
                              encoded_image_string=image_string)
        # `Summary.value` holds `Summary.Value` messages, so the image must
        # be wrapped (previously the bare Image was passed and the tag lost).
        summary_value = Summary.Value(tag=tag, image=image)
    else:
        summary_value = Summary.Value(tag=tag, simple_value=value)

    entry = Summary(value=[summary_value])
    self._train_writer.add_summary(entry, index)
def run_loop(self):
    """Measure steps per second since the previous call and report it.

    The rate is written to the supervisor's summary writer when one is set,
    and logged via ``logging.log_first_n``.
    """
    # Steps completed since the last invocation.
    current_step = training_util.global_step(self._sess, self._sv.global_step)
    step_delta = current_step - self._last_step
    self._last_step = current_step

    # Wall-clock time since the last invocation.
    now = time.time()
    time_delta = now - self._last_time
    self._last_time = now

    rate = step_delta / time_delta
    rate_value = Summary.Value(tag=self._summary_tag, simple_value=rate)
    summary = Summary(value=[rate_value])
    if self._sv.summary_writer:
        self._sv.summary_writer.add_summary(summary, current_step)
    logging.log_first_n(logging.INFO, "%s: %g", 10, self._summary_tag, rate)
def confusion_matrix_summary(tag, cm, classes, normalize=False, recall=True,
                             title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Plot the confusion matrix and return it as an image `Summary`.

    Normalization can be applied by setting `normalize=True`.

    Args:
        tag: Tag of the resulting summary value.
        cm: 2-D confusion matrix; rows are labelled 'True label' and columns
            'Predicted label' in the plot.
        classes: Class names used as tick labels on both axes.
        normalize: When true, divide the entries by row or column totals.
        recall: With `normalize`, divide each row by its row sum when true,
            otherwise divide each column by its column sum.
        title: Plot title.
        cmap: Colormap passed to `plt.imshow`.

    Returns:
        A `Summary` with one image value produced by `plt_to_image_summary`.
    """
    if normalize:
        eps = np.finfo(np.float32).eps  # avoids division by zero
        if recall:
            # Row totals as a column vector: each row is divided by its sum.
            s = cm.sum(axis=1)[:, np.newaxis] + eps
        else:
            # Column totals as a row vector: each column is divided by its
            # sum. (A column vector here would divide row i by column-sum i.)
            s = cm.sum(axis=0)[np.newaxis, :] + eps
        cm = cm.astype('float') / s

    plt.close('all')
    f_size = max(5, int(0.6 * len(classes)))
    plt.figure(figsize=(f_size, f_size))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; use white text on cells above half the maximum.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    image = plt_to_image_summary(plt)
    return Summary(value=[Summary.Value(tag=tag, image=image)])
def after_run(self, run_context, run_values):
    """Report steps per second and warn when the global step stops moving.

    ``run_values.results`` is treated as a possibly stale global step. When
    the timer would trigger for the following step, the current value is
    re-read from the session and, if the timer triggers for it, the rate is
    written to the summary writer (if any) and logged. Independently, the
    stale value is compared with the one seen on the previous call; every
    20th consecutive call without a change logs a warning.
    """
    _ = run_context
    stale_global_step = run_values.results
    if self._timer.should_trigger_for_step(stale_global_step + 1):
        # get the real value after train op.
        global_step = run_context.session.run(self._global_step_tensor)
        if self._timer.should_trigger_for_step(global_step):
            elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
                global_step)
            # elapsed_time may be None; no rate can be computed in that case.
            if elapsed_time is not None:
                steps_per_sec = elapsed_steps / elapsed_time
                if self._summary_writer is not None:
                    summary = Summary(value=[
                        Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
                    ])
                    self._summary_writer.add_summary(summary, global_step)
                logging.info("%s: %g", self._summary_tag, steps_per_sec)

    # Check whether the global step has been increased. Here, we do not use the
    # timer.last_triggered_step as the timer might record a different global
    # step value such that the comparison could be unreliable. For simplicity,
    # we just compare the stale_global_step with previously recorded version.
    if stale_global_step == self._last_global_step:
        # Here, we use a counter to count how many times we have observed that the
        # global step has not been increased. For some Optimizers, the global step
        # is not increased each time by design. For example, SyncReplicaOptimizer
        # doesn't increase the global step in worker's main train step.
        self._global_step_check_count += 1
        if self._global_step_check_count % 20 == 0:
            self._global_step_check_count = 0
            logging.warning(
                "It seems that global step (tf.train.get_global_step) has not "
                "been increased. Current value (could be stable): %s vs previous "
                "value: %s. You could increase the global step by passing "
                "tf.train.get_global_step() to Optimizer.apply_gradients or "
                "Optimizer.minimize.", stale_global_step, self._last_global_step)
    else:
        # Whenever we observe the increment, reset the counter.
        self._global_step_check_count = 0

    # Remember this call's value for the comparison on the next call.
    self._last_global_step = stale_global_step
def after_run(self, run_context, run_values):
    """Report training speed and periodically run the evaluation summaries.

    ``run_values.results`` is treated as a possibly stale local step. When
    the timer triggers, the global and local steps are re-read from the
    session, a rate based on the global-step delta, ``self._scale`` and the
    elapsed time is logged, and the training summaries plus that rate are
    written. Every ``self._test_every_n_steps``-th write also runs the
    evaluator and writes the test summaries.

    NOTE(review): the source's indentation was lost; the nesting below is a
    best-effort reconstruction and must be confirmed against the original
    file. The placement of the statements after the summary-writer check is
    especially uncertain.
    """
    _ = run_context
    stale_local_step = run_values.results
    if stale_local_step > 0:
        if self._timer.should_trigger_for_step(stale_local_step + 1):
            # get the real value after train op.
            global_step, local_step = run_context.session.run(
                [self._global_step_tensor, self._local_step_tensor])
            if self._timer.should_trigger_for_step(local_step):
                elapsed_time, _ = self._timer.update_last_triggered_step(
                    local_step)
                if elapsed_time is not None:
                    # Rate from the global-step delta, scaled by self._scale.
                    steps_per_sec = (global_step - self._last_global_step
                                     ) * self._scale / elapsed_time
                    logging.info("Speech %s: %g", self._summary_tag, steps_per_sec)
                    if self._summary_writer is not None:
                        aggregated_summary = run_context.session.run(
                            self._summary_train_op)
                        self._summary_writer.add_summary(
                            aggregated_summary, global_step)
                        summary = Summary(value=[
                            Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
                        ])
                        self._summary_writer.add_summary(
                            summary, global_step)
                        self._exec_count += 1
                        if (self._test_every_n_steps is not None) and (
                                self._exec_count % self._test_every_n_steps) == 0:
                            logging.info("Evaluate model start")
                            self._summary_evaluator(run_context.session)
                            aggregated_summary = run_context.session.run(
                                self._summary_test_op)
                            self._summary_writer.add_summary(
                                aggregated_summary, global_step)
                            logging.info("Evaluate model end")
                            # Timer is updated again after the evaluation -
                            # presumably so evaluation time is not counted
                            # in the next rate; verify.
                            self._timer.update_last_triggered_step(local_step)
                self._last_global_step = global_step
    self._last_local_step = stale_local_step
def scalar(name, scalar):
    """Outputs a `Summary` protocol buffer containing a single scalar value.

    Args:
      name: A name for the summary value; it is passed through `_clean_tag`
        and serves as the series name in TensorBoard.
      scalar: The value to record. Anything that is not already a `float`
        is converted with `float()`; a failed conversion propagates to the
        caller.

    Returns:
      A `Summary` protobuf holding one `simple_value` entry.
    """
    tag = _clean_tag(name)
    # try conversion, if failed then need handle by user.
    number = scalar if isinstance(scalar, float) else float(scalar)
    return Summary(value=[Summary.Value(tag=tag, simple_value=number)])
def histogram(name, values):
    # pylint: disable=line-too-long
    """Outputs a `Summary` protocol buffer with a histogram.

    The generated
    [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
    has one summary value containing a histogram for `values`.

    Args:
      name: A name for the summary value; it is passed through `_clean_tag`
        and serves as the series name in TensorBoard.
      values: An array exposing `astype`; it is cast to `float` and handed
        to `make_histogram`.

    Returns:
      A `Summary` protobuf holding one histogram entry.
    """
    tag = _clean_tag(name)
    as_float = values.astype(float)
    histo_value = Summary.Value(tag=tag, histo=make_histogram(as_float))
    return Summary(value=[histo_value])
def write_hptuning_metric(args, metric):
    """
    Output a metric measuring the success of the model.

    This metric will be used by hypertuning to find the best performing model.

    Args:
        args: object exposing an `output_dir` attribute
        metric: the metric (e.g., test rmse) to be recorded
    """
    summary = Summary(value=[Summary.Value(tag='training/hptuning/metric', simple_value=metric)])

    # for hyperparam tuning, we write a summary log to a directory 'eval' below the job directory
    eval_path = os.path.join(args.output_dir, 'eval')
    summary_writer = tf.summary.FileWriter(eval_path)
    try:
        # Note: adding the summary to the writer is enough for hyperparam tuning.
        # The ml engine system is looking for any summary added with the hyperparam metric tag.
        summary_writer.add_summary(summary)
        summary_writer.flush()
    finally:
        # Release the event file instead of leaking the open writer.
        summary_writer.close()