def add_onnx_graph(self, graph, walltime=None):
    """Adds a `Graph` protocol buffer to the event file.

    Args:
        graph: A `Graph` protocol buffer.
        walltime (float, optional): Optional walltime to override the
            default (current) walltime (from time.time()), in seconds
            after epoch.
    """
    event = event_pb2.Event(graph_def=graph.SerializeToString())
    self.add_event(event, None, walltime)
def test_wall_time_precision(self):
    """Wall times must be converted to nanoseconds without float error."""
    # Test a wall time that is exactly representable in float64 but has
    # enough digits to incur error if converted to nanoseconds the naive
    # way (* 1e9).
    event1 = event_pb2.Event(step=1, wall_time=1567808404.765432119)
    event1.summary.value.add(tag="foo", simple_value=1.0)
    # Test a wall time where, as a float64, the fractional part on its own
    # will introduce error if truncated to 9 decimal places instead of
    # rounded.
    event2 = event_pb2.Event(step=2, wall_time=1.000000002)
    event2.summary.value.add(tag="foo", simple_value=2.0)
    run_proto = write_service_pb2.WriteScalarRequest.Run()
    self._populate_run_from_events(run_proto, [event1, event2])
    self.assertEqual(
        test_util.timestamp_pb(1567808404765432119),
        run_proto.tags[0].points[0].wall_time,
    )
    self.assertEqual(
        test_util.timestamp_pb(1000000002),
        run_proto.tags[0].points[1].wall_time,
    )
def testFirstEventTimestamp(self):
    """FirstEventTimestamp() should report the first event's wall_time."""
    generator = _EventGenerator(self)
    accumulator = ea.EventAccumulator(generator)
    generator.AddEvent(
        event_pb2.Event(wall_time=10, step=20, file_version="brain.Event:2"))
    generator.AddScalarTensor("s1", wall_time=30, step=40, value=20)
    self.assertEqual(accumulator.FirstEventTimestamp(), 10)
def test_no_room_for_single_point(self):
    """A run name exceeding the request limit cannot make progress."""
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=1.0)
    # A run name this long can never fit into a single request.
    oversized_run = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
    with self.assertRaises(RuntimeError) as cm:
        request_builder = uploader_lib._RequestBuilder("123")
        list(request_builder.build_requests({oversized_run: [event]}))
    self.assertEqual(
        str(cm.exception), "Could not make progress uploading data")
def test_no_budget_for_experiment_id(self):
    """An oversized experiment ID alone should exhaust the byte budget."""
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=1.0)
    run_to_events = {"run_name": [event]}
    # The experiment ID consumes the entire request byte budget by itself.
    oversized_experiment_id = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
    with self.assertRaises(RuntimeError) as cm:
        request_builder = uploader_lib._RequestBuilder(oversized_experiment_id)
        list(request_builder.build_requests(run_to_events))
    self.assertEqual(
        str(cm.exception), "Byte budget too small for experiment ID")
def testFirstEventTimestampLoadsEvent(self):
    """FirstEventTimestamp() must not discard the event it loads."""
    generator = _EventGenerator(self)
    accumulator = ea.EventAccumulator(generator)
    generator.AddEvent(
        event_pb2.Event(wall_time=1, step=2, file_version="brain.Event:2"))
    self.assertEqual(accumulator.FirstEventTimestamp(), 1)
    # The file-version event peeked at above must still be processed here.
    accumulator.Reload()
    self.assertEqual(accumulator.file_version, 2.0)
def test_prunes_tags_and_runs(self):
    """An out-of-space point must not leave partial runs/tags in a request."""
    event_1 = event_pb2.Event(step=1)
    event_1.summary.value.add(tag="foo", simple_value=1.0)
    event_2 = event_pb2.Event(step=2)
    event_2.summary.value.add(tag="bar", simple_value=-2.0)
    run_to_events = collections.OrderedDict([
        ("train", [event_1]),
        ("test", [event_2]),
    ])
    real_create_point = uploader_lib._RequestBuilder._create_point
    # Mutable cell so the closure below can count calls.
    create_point_call_count_box = [0]

    def mock_create_point(uploader_self, *args, **kwargs):
        # Simulate out-of-space error the first time that we try to store
        # the second point.
        create_point_call_count_box[0] += 1
        if create_point_call_count_box[0] == 2:
            raise uploader_lib._OutOfSpaceError()
        return real_create_point(uploader_self, *args, **kwargs)

    with mock.patch.object(uploader_lib._RequestBuilder, "_create_point",
                           mock_create_point):
        builder = uploader_lib._RequestBuilder("123")
        requests = list(builder.build_requests(run_to_events))
    for request in requests:
        _clear_wall_times(request)

    # The simulated error should force the second point into a fresh
    # request; neither request may contain an empty run or tag stub.
    expected = [
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
    ]
    (expected[0].runs.add(name="train").tags.add(
        name="foo",
        metadata=test_util.scalar_metadata("foo")).points.add(step=1,
                                                              value=1.0))
    (expected[1].runs.add(name="test").tags.add(
        name="bar",
        metadata=test_util.scalar_metadata("bar")).points.add(step=2,
                                                              value=-2.0))
    self.assertEqual(expected, requests)
def test_no_room_for_single_point(self):
    """A run name exceeding the request limit should fail the flush."""
    mock_client = _create_mock_client()
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=1.0)
    # A run name this long can never fit within a single request.
    oversized_run = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
    with self.assertRaises(RuntimeError) as cm:
        sender = uploader_lib._BatchedRequestSender(
            "123", mock_client, util.RateLimiter(0))
        sender.send_requests({oversized_run: [event]})
    self.assertEqual(str(cm.exception), "add_event failed despite flush")
def testReadOneEvent(self):
    """A single stored event is read back once, then the reader is empty."""
    id_ = db.RUN_ROWID.create(1, 1)
    event = event_pb2.Event(step=123)
    path = self._save_records('events.out.tfevents.0.localhost',
                              [event.SerializeToString()])
    with self.connect_db() as db_conn, self.EventLog(path) as log:
        with loader.RunReader(id_, 'doodle') as run:
            run.add_event_log(db_conn, log)
            self.assertEqual(event, run.get_next_event())
            self.assertIsNone(run.get_next_event())
def testSentinelStepValueAssignedWhenExecutorStepCountKeyIsMissing(self):
    """A sentinel (positive) step is used when metadata has no step count."""
    events_written = []
    # Core metadata whose JSON payload is empty (no step-count key).
    metadata_event = event_pb2.Event()
    metadata_event.log_message.message = json.dumps({})
    handler = debugger_server_lib.DebuggerDataStreamHandler(
        events_writer_manager=FakeEventsWriterManager(events_written))
    handler.on_core_metadata_event(metadata_event)
    health_pill = self._create_event_with_float_tensor(
        "MatMul", 0, "DebugNumericSummary", list(range(1, 15)))
    handler.on_value_event(health_pill)
    self.assertGreater(events_written[0].step, 0)
def testSentinelStepValueAssignedWhenMetadataJsonIsInvalid(self):
    """A sentinel (positive) step is used when metadata JSON won't parse."""
    events_written = []
    # Core metadata whose payload is not valid JSON at all.
    metadata_event = event_pb2.Event()
    metadata_event.log_message.message = "some invalid JSON string"
    handler = debugger_server_lib.DebuggerDataStreamHandler(
        events_writer_manager=FakeEventsWriterManager(events_written))
    handler.on_core_metadata_event(metadata_event)
    health_pill = self._create_event_with_float_tensor(
        "MatMul", 0, "DebugNumericSummary", list(range(1, 15)))
    handler.on_value_event(health_pill)
    self.assertGreater(events_written[0].step, 0)
def testMarkWithShrinkingBatchSize_raisesValueError(self):
    """mark() must reject a batch smaller than the previously marked one."""
    id_ = db.RUN_ROWID.create(1, 1)
    event1 = event_pb2.Event(step=123)
    event2 = event_pb2.Event(step=456)
    path1 = self._save_records('events.out.tfevents.1.localhost',
                               [event1.SerializeToString()])
    path2 = self._save_records('events.out.tfevents.2.localhost',
                               [event2.SerializeToString()])
    with self.connect_db() as db_conn:
        with self.EventLog(path1) as log1, self.EventLog(path2) as log2:
            with loader.RunReader(id_, 'doodle') as run:
                run.add_event_log(db_conn, log1)
                run.add_event_log(db_conn, log2)
                run.mark()
                # Consume both events under the current mark.
                self.assertEqual(event1, run.get_next_event())
                self.assertEqual(event2, run.get_next_event())
                self.assertIsNone(run.get_next_event())
                run.reset()
                # Replay only ONE of the two events, then try to mark:
                # the new batch is smaller than the last marked batch,
                # which must fail the monotonicity check.
                self.assertEqual(event1, run.get_next_event())
                with six.assertRaisesRegex(self, ValueError, r'monotonic'):
                    run.mark()
def add_summary(self, summary, global_step=None, walltime=None):
    r"""Wraps a `Summary` protocol buffer in an event and records it.

    Args:
        summary: A `Summary` protocol buffer.
        global_step (int, optional): Optional global step value for the
            training process to record with the summary.
        walltime (float, optional): Optional walltime to override the
            default (current) walltime (from time.time()), in seconds
            after epoch.
    """
    self.add_event(event_pb2.Event(summary=summary), global_step, walltime)
def testNonValueEvents(self):
    """Non-value events in the generator don't cause early exits."""
    generator = _EventGenerator(self)
    accumulator = ea.EventAccumulator(generator)
    generator.AddScalarTensor("s1", wall_time=1, step=10, value=20)
    # An event with an unrecognized file_version sits between the scalars.
    generator.AddEvent(
        event_pb2.Event(wall_time=2, step=20, file_version="nots2"))
    generator.AddScalarTensor("s3", wall_time=3, step=100, value=1)
    accumulator.Reload()
    # Both scalar tensors must still have been accumulated.
    self.assertTagsEqual(accumulator.Tags(), {ea.TENSORS: ["s1", "s3"]})
def _migrate_event(self, old_event, initial_metadata=None):
    """Like `migrate_event`, but performs some sanity checks.

    Args:
        old_event: The `event_pb2.Event` to migrate.
        initial_metadata: Optional dict passed through to
            `dataclass_compat.migrate_event`; defaults to an empty dict.

    Returns:
        The sequence of migrated events.
    """
    if initial_metadata is None:
        initial_metadata = {}
    # Snapshot the event BEFORE migration so the checks below compare
    # against its original field values even if migration mutates it.
    # (Previously this copy was created but never used, and the checks
    # compared against the possibly-mutated `old_event`.)
    old_event_copy = event_pb2.Event()
    old_event_copy.CopyFrom(old_event)
    new_events = dataclass_compat.migrate_event(
        old_event, initial_metadata=initial_metadata)
    for event in new_events:
        # Ensure that wall time and step are preserved by migration.
        self.assertEqual(event.wall_time, old_event_copy.wall_time)
        self.assertEqual(event.step, old_event_copy.step)
    return new_events
def _migrate_tagged_run_metadata_event(old_event):
    """Converts a tagged-run-metadata event into a blob-sequence summary."""
    trm = old_event.tagged_run_metadata
    new_event = event_pb2.Event()
    new_event.wall_time = old_event.wall_time
    new_event.step = old_event.step
    summary_value = new_event.summary.value.add(tag=trm.tag)
    summary_value.tensor.CopyFrom(
        tensor_util.make_tensor_proto([trm.run_metadata]))
    summary_value.metadata.plugin_data.plugin_name = (
        graphs_metadata.PLUGIN_NAME_TAGGED_RUN_METADATA)
    # `summary_value.metadata.plugin_data.content` is intentionally empty.
    summary_value.metadata.data_class = summary_pb2.DATA_CLASS_BLOB_SEQUENCE
    return (new_event,)
def add_summary(self, summary, global_step=None, walltime=None):
    """Adds a `Summary` protocol buffer to the event file.

    The summary is wrapped in an `Event` protocol buffer, which is then
    appended to the event file.

    Args:
        summary: A `Summary` protocol buffer.
        global_step: Number. Optional global step value for the training
            process to record with the summary.
        walltime: float. Optional walltime to override the default
            (current) walltime (from time.time()), in seconds after epoch.
    """
    wrapped = event_pb2.Event(summary=summary)
    self.add_event(wrapped, global_step, walltime)
def test_break_at_run_boundary(self): mock_client = _create_mock_client() # Choose run name sizes such that one run fits, but not two. long_run_1 = "A" * 768 long_run_2 = "B" * 768 event_1 = event_pb2.Event(step=1) event_1.summary.value.add(tag="foo", simple_value=1.0) event_2 = event_pb2.Event(step=2) event_2.summary.value.add(tag="bar", simple_value=-2.0) run_to_events = collections.OrderedDict( [(long_run_1, [event_1]), (long_run_2, [event_2])] ) builder = uploader_lib._BatchedRequestSender( "123", mock_client, util.RateLimiter(0) ) builder.send_requests(run_to_events) requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list] for request in requests: _clear_wall_times(request) expected = [ write_service_pb2.WriteScalarRequest(experiment_id="123"), write_service_pb2.WriteScalarRequest(experiment_id="123"), ] ( expected[0] .runs.add(name=long_run_1) .tags.add(name="foo", metadata=test_util.scalar_metadata("foo")) .points.add(step=1, value=1.0) ) ( expected[1] .runs.add(name=long_run_2) .tags.add(name="bar", metadata=test_util.scalar_metadata("bar")) .points.add(step=2, value=-2.0) ) self.assertEqual(requests, expected)
def testMarkReset_acrossFiles(self):
    """mark()/reset() replay correctly across multiple event-log files."""
    id_ = db.RUN_ROWID.create(1, 1)
    event1 = event_pb2.Event(step=123)
    event2 = event_pb2.Event(step=456)
    path1 = self._save_records('events.out.tfevents.1.localhost',
                               [event1.SerializeToString()])
    path2 = self._save_records('events.out.tfevents.2.localhost',
                               [event2.SerializeToString()])
    with self.connect_db() as db_conn:
        with self.EventLog(path1) as log1, self.EventLog(path2) as log2:
            with loader.RunReader(id_, 'doodle') as run:
                run.add_event_log(db_conn, log1)
                run.add_event_log(db_conn, log2)
                run.mark()
                # First pass: read both events (one from each file).
                self.assertEqual(event1, run.get_next_event())
                self.assertEqual(event2, run.get_next_event())
                self.assertIsNone(run.get_next_event())
                run.reset()
                # After reset, the same events replay from the mark.
                self.assertEqual(event1, run.get_next_event())
                self.assertEqual(event2, run.get_next_event())
                self.assertIsNone(run.get_next_event())
                # Marking after a full replay is legal (batch did not shrink).
                run.mark()
def _migrate_graph_event(old_event):
    """Converts a GraphDef event into a blob-sequence summary event."""
    new_event = event_pb2.Event()
    new_event.wall_time = old_event.wall_time
    new_event.step = old_event.step
    summary_value = new_event.summary.value.add(
        tag=graphs_metadata.RUN_GRAPH_NAME)
    serialized_graph = old_event.graph_def
    summary_value.tensor.CopyFrom(
        tensor_util.make_tensor_proto([serialized_graph]))
    summary_value.metadata.plugin_data.plugin_name = (
        graphs_metadata.PLUGIN_NAME)
    # `summary_value.metadata.plugin_data.content` stays the empty proto.
    summary_value.metadata.data_class = summary_pb2.DATA_CLASS_BLOB_SEQUENCE
    # In the short term, keep both the old event and the new event to
    # maintain compatibility.
    return (old_event, new_event)
def test_no_budget_for_experiment_id(self):
    """An oversized experiment ID alone should exhaust the byte budget."""
    # Create the mock client once (the original code redundantly created
    # it a second time just before use).
    mock_client = _create_mock_client()
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=1.0)
    run_to_events = {"run_name": [event]}
    long_experiment_id = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
    with self.assertRaises(RuntimeError) as cm:
        builder = uploader_lib._BatchedRequestSender(
            long_experiment_id, mock_client, util.RateLimiter(0))
        builder.send_requests(run_to_events)
    self.assertEqual(
        str(cm.exception), "Byte budget too small for experiment ID")
def add_graph(self, model, input_to_model=None, verbose=False, **kwargs):
    # prohibit second call?
    # no, let tensorboard handle it and show its warning message.
    """Add graph data to summary.

    Dispatches on the model type: objects with a `forward` method are
    treated as PyTorch modules (traced via `graph(...)`); everything else
    is assumed to be a Caffe2 net/proto/model-helper.

    Args:
        model (torch.nn.Module): Model to draw.
        input_to_model (torch.Tensor or list of torch.Tensor): A variable
            or a tuple of variables to be fed.
        verbose (bool): Whether to print graph structure in console.
        omit_useless_nodes (bool): Default to ``true``, which eliminates
            unused nodes.  (Passed via ``**kwargs``.)
        operator_export_type (string): One of: ``"ONNX"``, ``"RAW"``. This
            determines the optimization level of the graph. If error
            happens during exporting the graph, using ``"RAW"`` might
            help.  (Passed via ``**kwargs``.)
    """
    if hasattr(model, 'forward'):
        # A valid PyTorch model should have a 'forward' method
        import torch
        from distutils.version import LooseVersion
        # Graph tracing only works on PyTorch >= 0.3.1; older releases
        # are told to use alternatives instead of silently failing.
        if LooseVersion(torch.__version__) >= LooseVersion("0.3.1"):
            pass
        else:
            if LooseVersion(torch.__version__) >= LooseVersion("0.3.0"):
                print('You are using PyTorch==0.3.0, use add_onnx_graph()')
                return
            if not hasattr(torch.autograd.Variable, 'grad_fn'):
                print('add_graph() only supports PyTorch v0.2.')
                return
        self._get_file_writer().add_graph(
            graph(model, input_to_model, verbose, **kwargs))
    else:
        # Caffe2 models do not have the 'forward' method
        from caffe2.proto import caffe2_pb2
        from caffe2.python import core
        from ._caffe2_graph import (
            model_to_graph_def, nets_to_graph_def, protos_to_graph_def
        )
        if isinstance(model, list):
            if isinstance(model[0], core.Net):
                current_graph = nets_to_graph_def(
                    model, **kwargs)
            elif isinstance(model[0], caffe2_pb2.NetDef):
                current_graph = protos_to_graph_def(
                    model, **kwargs)
        else:
            # Handles cnn.CNNModelHelper, model_helper.ModelHelper
            current_graph = model_to_graph_def(
                model, **kwargs)
        event = event_pb2.Event(
            graph_def=current_graph.SerializeToString())
        self._get_file_writer().add_event(event)
def test_v2_summary(self):
    """A V2 scalar summary populates the run proto with one tag/point."""
    event = event_pb2.Event(
        step=1, wall_time=123.456, summary=scalar_v2.scalar_pb("foo", 5.0)
    )
    actual_run = write_service_pb2.WriteScalarRequest.Run()
    self._populate_run_from_events(actual_run, [event])

    expected_run = write_service_pb2.WriteScalarRequest.Run()
    tag = expected_run.tags.add()
    tag.name = "foo"
    tag.metadata.plugin_data.plugin_name = "scalars"
    tag.points.add(
        step=1, wall_time=test_util.timestamp_pb(123456000000), value=5.0
    )
    self.assertProtoEquals(actual_run, expected_run)
def test_v1_summary_single_value(self):
    """A V1 simple_value summary populates tag metadata and one point."""
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=5.0)
    actual_run = write_service_pb2.WriteScalarRequest.Run()
    self._populate_run_from_events(actual_run, [event])

    expected_run = write_service_pb2.WriteScalarRequest.Run()
    tag = expected_run.tags.add()
    tag.name = "foo"
    tag.metadata.display_name = "foo"
    tag.metadata.plugin_data.plugin_name = "scalars"
    tag.points.add(
        step=1, wall_time=test_util.timestamp_pb(123456000000), value=5.0
    )
    self.assertProtoEquals(actual_run, expected_run)
def migrate_event(event):
    """Returns an event with all summary values migrated.

    The input event is returned unchanged (same object) when it has no
    summary or when migration left every value untouched.
    """
    if not event.HasField("summary"):
        return event
    old_values = event.summary.value
    new_values = [migrate_value(value) for value in old_values]
    # Optimization: don't create a new event if there were no changes.
    unchanged = len(old_values) == len(new_values) and all(
        new is old for (old, new) in zip(old_values, new_values))
    if unchanged:
        return event
    migrated = event_pb2.Event()
    migrated.CopyFrom(event)
    del migrated.summary.value[:]
    migrated.summary.value.extend(new_values)
    return migrated
def testReloadPopulatesFirstEventTimestamp(self):
    """Test that Reload() means FirstEventTimestamp() won't load events."""
    generator = _EventGenerator(self)
    accumulator = ea.EventAccumulator(generator)
    generator.AddEvent(
        event_pb2.Event(wall_time=1, step=2, file_version="brain.Event:2"))
    accumulator.Reload()

    def _Die(*args, **kwargs):  # pylint: disable=unused-argument
        raise RuntimeError("Load() should not be called")

    # Any further Load() on the generator now blows up, so the timestamp
    # below must come from the already-reloaded state.
    self.stubs.Set(generator, "Load", _Die)
    self.assertEqual(accumulator.FirstEventTimestamp(), 1)
def test_already_newstyle_summary_passes_through(self):
    # ...even when it's from a known plugin and would otherwise be migrated.
    old_event = event_pb2.Event()
    serialized_summary = scalar_summary.pb(
        "foo", 1.25, display_name="bar",
        description="baz").SerializeToString()
    old_event.summary.ParseFromString(serialized_summary)
    # Mark the value as already carrying a data class (note: TENSOR, not
    # SCALAR) so migration should leave it alone.
    old_event.summary.value[0].metadata.data_class = (
        summary_pb2.DATA_CLASS_TENSOR)
    new_events = self._migrate_event(old_event)
    self.assertLen(new_events, 1)
    self.assertIs(new_events[0], old_event)
def AddScalarTensor(self, tag, wall_time=0, step=0, value=0):
    """Add a rank-0 tensor event.

    Note: This is not related to the scalar plugin; it's just a convenience
    function to add an event whose contents aren't important.
    """
    tensor_proto = tensor_util.make_tensor_proto(float(value))
    summary = summary_pb2.Summary(
        value=[summary_pb2.Summary.Value(tag=tag, tensor=tensor_proto)])
    self.AddEvent(
        event_pb2.Event(wall_time=wall_time, step=step, summary=summary))
def test_start_uploading_graphs(self):
    """Graph events are uploaded as blobs, rate-limited on the blob path."""
    mock_client = _create_mock_client()
    mock_rate_limiter = mock.create_autospec(util.RateLimiter)
    mock_blob_rate_limiter = mock.create_autospec(util.RateLimiter)
    uploader = _create_uploader(
        mock_client,
        "/logs/foo",
        rpc_rate_limiter=mock_rate_limiter,
        blob_rpc_rate_limiter=mock_blob_rate_limiter,
        allowed_plugins=[
            scalars_metadata.PLUGIN_NAME,
            graphs_metadata.PLUGIN_NAME,
        ],
    )
    uploader.create_experiment()
    # Of course a real Event stream will never produce the same Event twice,
    # but in this test context it's fine to reuse this one.
    graph_event = event_pb2.Event(
        graph_def=_create_example_graph_bytes(950))
    mock_logdir_loader = mock.create_autospec(logdir_loader.LogdirLoader)
    # Two polling cycles' worth of runs, then abort the upload loop.
    mock_logdir_loader.get_run_events.side_effect = [
        {
            "run 1": [graph_event, graph_event],
            "run 2": [graph_event, graph_event],
        },
        {
            "run 3": [graph_event, graph_event],
            "run 4": [graph_event, graph_event],
            "run 5": [graph_event, graph_event],
        },
        AbortUploadError,
    ]
    with mock.patch.object(
            uploader, "_logdir_loader", mock_logdir_loader), \
            self.assertRaises(AbortUploadError):
        uploader.start_uploading()
    self.assertEqual(1, mock_client.CreateExperiment.call_count)
    # 5 runs x 2 graph events each = 10 blob writes.
    self.assertEqual(10, mock_client.WriteBlob.call_count)
    for (i, call) in enumerate(mock_client.WriteBlob.call_args_list):
        requests = list(call[0][0])
        # Each streamed blob, reassembled, must equal the graph bytes.
        data = b"".join(r.data for r in requests)
        self.assertEqual(data, graph_event.graph_def)
        # All chunks of one blob share a single blob sequence ID.
        self.assertEqual(
            set(r.blob_sequence_id for r in requests),
            {"blob%d" % i},
        )
    # Only the blob-RPC rate limiter should have ticked.
    self.assertEqual(0, mock_rate_limiter.tick.call_count)
    self.assertEqual(10, mock_blob_rate_limiter.tick.call_count)
def testAlertingEventCallback(self):
    """The alert callback fires only for NaN/-Inf/+Inf health-pill counts."""
    numerics_alert_callback = tf.test.mock.Mock()
    stream_handler = debugger_server_lib.DebuggerDataStreamHandler(
        events_writer_manager=FakeEventsWriterManager(
            self.events_written),
        numerics_alert_callback=numerics_alert_callback)
    stream_handler.on_core_metadata_event(event_pb2.Event())

    # The stream handler receives 1 good event and 1 with an NaN value.
    stream_handler.on_value_event(
        self._create_event_with_float_tensor("Add", 0, "DebugNumericSummary",
                                             [0] * 14))
    stream_handler.on_value_event(
        self._create_event_with_float_tensor("Add", 0, "DebugNumericSummary", [
            0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]))
    # The second event should have triggered the callback.
    numerics_alert_callback.assert_called_once_with(
        numerics_alert.NumericsAlert("/job:localhost/replica:0/task:0/cpu:0",
                                     "Add:0", 0, 1, 0, 0))

    # The stream handler receives an event with a -Inf value.
    numerics_alert_callback.reset_mock()
    stream_handler.on_value_event(
        self._create_event_with_float_tensor("Add", 0, "DebugNumericSummary", [
            0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]))
    numerics_alert_callback.assert_called_once_with(
        numerics_alert.NumericsAlert("/job:localhost/replica:0/task:0/cpu:0",
                                     "Add:0", 0, 0, 1, 0))

    # The stream handler receives an event with a +Inf value.
    numerics_alert_callback.reset_mock()
    stream_handler.on_value_event(
        self._create_event_with_float_tensor("Add", 0, "DebugNumericSummary", [
            0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0
        ]))
    numerics_alert_callback.assert_called_once_with(
        numerics_alert.NumericsAlert("/job:localhost/replica:0/task:0/cpu:0",
                                     "Add:0", 0, 0, 0, 1))

    # The stream handler receives an event without any pathological values.
    numerics_alert_callback.reset_mock()
    stream_handler.on_value_event(
        self._create_event_with_float_tensor("Add", 0, "DebugNumericSummary", [
            0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0
        ]))
    # assert_not_called is not available in Python 3.4.
    self.assertFalse(numerics_alert_callback.called)