Example #1
def _create_uploader(
    writer_client=_USE_DEFAULT,
    logdir=None,
    allowed_plugins=_USE_DEFAULT,
    logdir_poll_rate_limiter=_USE_DEFAULT,
    rpc_rate_limiter=_USE_DEFAULT,
    blob_rpc_rate_limiter=_USE_DEFAULT,
    name=None,
    description=None,
):
    if writer_client is _USE_DEFAULT:
        writer_client = _create_mock_client()
    if allowed_plugins is _USE_DEFAULT:
        allowed_plugins = _SCALARS_ONLY
    if logdir_poll_rate_limiter is _USE_DEFAULT:
        logdir_poll_rate_limiter = util.RateLimiter(0)
    if rpc_rate_limiter is _USE_DEFAULT:
        rpc_rate_limiter = util.RateLimiter(0)
    if blob_rpc_rate_limiter is _USE_DEFAULT:
        blob_rpc_rate_limiter = util.RateLimiter(0)
    return uploader_lib.TensorBoardUploader(
        writer_client,
        logdir,
        allowed_plugins=allowed_plugins,
        logdir_poll_rate_limiter=logdir_poll_rate_limiter,
        rpc_rate_limiter=rpc_rate_limiter,
        blob_rpc_rate_limiter=blob_rpc_rate_limiter,
        name=name,
        description=description,
    )
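The `_USE_DEFAULT` markers above are sentinel objects: they let the helper tell an argument the caller omitted apart from an explicit `None` (note that `logdir=None` must survive as `None`). A minimal, self-contained sketch of the pattern, assuming the test module defines the sentinel as a plain unique object (the definition itself is not shown above):

_USE_DEFAULT = object()  # unique marker; no caller-supplied value can be it

def make_limiter(rate_limiter=_USE_DEFAULT):
    if rate_limiter is _USE_DEFAULT:  # true only when the argument was omitted
        rate_limiter = "default-limiter"
    return rate_limiter

assert make_limiter() == "default-limiter"      # omitted: default applied
assert make_limiter(rate_limiter=None) is None  # explicit None is preserved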
Example #2
def _create_request_sender(
    experiment_id=None,
    api=None,
    allowed_plugins=_USE_DEFAULT,
    max_blob_size=_USE_DEFAULT,
    rpc_rate_limiter=_USE_DEFAULT,
    blob_rpc_rate_limiter=_USE_DEFAULT,
):
    if api is _USE_DEFAULT:
        api = _create_mock_client()
    if allowed_plugins is _USE_DEFAULT:
        allowed_plugins = _SCALARS_ONLY
    if max_blob_size is _USE_DEFAULT:
        max_blob_size = 12345
    if rpc_rate_limiter is _USE_DEFAULT:
        rpc_rate_limiter = util.RateLimiter(0)
    if blob_rpc_rate_limiter is _USE_DEFAULT:
        blob_rpc_rate_limiter = util.RateLimiter(0)
    return uploader_lib._BatchedRequestSender(
        experiment_id=experiment_id,
        api=api,
        allowed_plugins=allowed_plugins,
        max_blob_size=max_blob_size,
        rpc_rate_limiter=rpc_rate_limiter,
        blob_rpc_rate_limiter=blob_rpc_rate_limiter,
    )
Example #3
    def __init__(self, writer_client, logdir, rate_limiter=None):
        """Constructs a TensorBoardUploader.

        Args:
          writer_client: a TensorBoardWriterService stub instance
          logdir: path of the log directory to upload
          rate_limiter: a `RateLimiter` to use to limit upload cycle frequency
        """
        self._api = writer_client
        self._logdir = logdir
        self._request_builder = None
        if rate_limiter is None:
            self._rate_limiter = util.RateLimiter(
                _MIN_UPLOAD_CYCLE_DURATION_SECS)
        else:
            self._rate_limiter = rate_limiter
        active_filter = (
            lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time())
        directory_loader_factory = functools.partial(
            directory_loader.DirectoryLoader,
            loader_factory=event_file_loader.TimestampedEventFileLoader,
            path_filter=io_wrapper.IsTensorFlowEventsFile,
            active_filter=active_filter,
        )
        self._logdir_loader = logdir_loader.LogdirLoader(
            self._logdir, directory_loader_factory)
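The `active_filter` above keeps an event file "active" only while its latest event timestamp is within `_EVENT_FILE_INACTIVE_SECS` of the current time, so the loader eventually stops polling stale files. A runnable sketch of the predicate, with an assumed value for the constant (the real value is defined elsewhere in the module):

import time

_EVENT_FILE_INACTIVE_SECS = 4000  # assumed value, for illustration only

active_filter = lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time()

now = time.time()
assert active_filter(now)  # a freshly written file is still active
assert not active_filter(now - _EVENT_FILE_INACTIVE_SECS - 1)  # stale file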
Example #4
    def test_break_at_tag_boundary(self):
        mock_client = _create_mock_client()
        # Choose tag name sizes such that one tag fits, but not two. Note
        # that tag names appear in both `Tag.name` and the summary metadata.
        long_tag_1 = "a" * 384
        long_tag_2 = "b" * 384
        event = event_pb2.Event(step=1)
        event.summary.value.add(tag=long_tag_1, simple_value=1.0)
        event.summary.value.add(tag=long_tag_2, simple_value=2.0)
        run_to_events = {"train": [event]}

        builder = uploader_lib._BatchedRequestSender("123", mock_client,
                                                     util.RateLimiter(0))
        builder.send_requests(run_to_events)
        requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
        for request in requests:
            _clear_wall_times(request)

        expected = [
            write_service_pb2.WriteScalarRequest(experiment_id="123"),
            write_service_pb2.WriteScalarRequest(experiment_id="123"),
        ]
        (expected[0].runs.add(name="train").tags.add(
            name=long_tag_1,
            metadata=test_util.scalar_metadata(long_tag_1)).points.add(
                step=1, value=1.0))
        (expected[1].runs.add(name="train").tags.add(
            name=long_tag_2,
            metadata=test_util.scalar_metadata(long_tag_2)).points.add(
                step=1, value=2.0))
        self.assertEqual(requests, expected)
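This test (and the ones below) calls a `_clear_wall_times` helper that is not shown here; it strips the nondeterministic wall times so request protos can be compared with `assertEqual`. A plausible sketch, assuming the `runs -> tags -> points` structure visible in the expected protos above:

def _clear_wall_times(request):
    """Clears wall_time on every point so assertions are deterministic."""
    for run in request.runs:
        for tag in run.tags:
            for point in tag.points:
                point.ClearField("wall_time")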
Example #5
    def test_break_at_run_boundary(self):
        mock_client = _create_mock_client()
        # Choose run name sizes such that one run fits, but not two.
        long_run_1 = "A" * 768
        long_run_2 = "B" * 768
        event_1 = event_pb2.Event(step=1)
        event_1.summary.value.add(tag="foo", simple_value=1.0)
        event_2 = event_pb2.Event(step=2)
        event_2.summary.value.add(tag="bar", simple_value=-2.0)
        run_to_events = collections.OrderedDict([(long_run_1, [event_1]),
                                                 (long_run_2, [event_2])])

        builder = uploader_lib._BatchedRequestSender("123", mock_client,
                                                     util.RateLimiter(0))
        builder.send_requests(run_to_events)
        requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]

        for request in requests:
            _clear_wall_times(request)

        expected = [
            write_service_pb2.WriteScalarRequest(experiment_id="123"),
            write_service_pb2.WriteScalarRequest(experiment_id="123"),
        ]
        (expected[0].runs.add(name=long_run_1).tags.add(
            name="foo",
            metadata=test_util.scalar_metadata("foo")).points.add(step=1,
                                                                  value=1.0))
        (expected[1].runs.add(name=long_run_2).tags.add(
            name="bar",
            metadata=test_util.scalar_metadata("bar")).points.add(step=2,
                                                                  value=-2.0))
        self.assertEqual(requests, expected)
Example #6
    def test_prunes_tags_and_runs(self):
        mock_client = _create_mock_client()
        event_1 = event_pb2.Event(step=1)
        event_1.summary.value.add(tag="foo", simple_value=1.0)
        event_2 = event_pb2.Event(step=2)
        event_2.summary.value.add(tag="bar", simple_value=-2.0)
        run_to_events = collections.OrderedDict(
            [("train", [event_1]), ("test", [event_2])]
        )

        real_create_point = (
            uploader_lib._ScalarBatchedRequestSender._create_point
        )

        create_point_call_count_box = [0]

        def mock_create_point(uploader_self, *args, **kwargs):
            # Simulate out-of-space error the first time that we try to store
            # the second point.
            create_point_call_count_box[0] += 1
            if create_point_call_count_box[0] == 2:
                raise uploader_lib._OutOfSpaceError()
            return real_create_point(uploader_self, *args, **kwargs)

        with mock.patch.object(
            uploader_lib._ScalarBatchedRequestSender,
            "_create_point",
            mock_create_point,
        ):
            builder = uploader_lib._BatchedRequestSender(
                "123", mock_client, util.RateLimiter(0)
            )
            builder.send_requests(run_to_events)
        requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
        for request in requests:
            _clear_wall_times(request)

        expected = [
            write_service_pb2.WriteScalarRequest(experiment_id="123"),
            write_service_pb2.WriteScalarRequest(experiment_id="123"),
        ]
        (
            expected[0]
            .runs.add(name="train")
            .tags.add(name="foo", metadata=test_util.scalar_metadata("foo"))
            .points.add(step=1, value=1.0)
        )
        (
            expected[1]
            .runs.add(name="test")
            .tags.add(name="bar", metadata=test_util.scalar_metadata("bar"))
            .points.add(step=2, value=-2.0)
        )
        self.assertEqual(expected, requests)
Example #7
    def test_no_room_for_single_point(self):
        mock_client = _create_mock_client()
        event = event_pb2.Event(step=1, wall_time=123.456)
        event.summary.value.add(tag="foo", simple_value=1.0)
        long_run_name = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
        run_to_events = {long_run_name: [event]}
        with self.assertRaises(RuntimeError) as cm:
            builder = uploader_lib._BatchedRequestSender(
                "123", mock_client, util.RateLimiter(0))
            builder.send_requests(run_to_events)
        self.assertEqual(str(cm.exception), "add_event failed despite flush")
Example #8
    def _populate_run_from_events(self, run_proto, events):
        mock_client = _create_mock_client()
        builder = uploader_lib._BatchedRequestSender(
            experiment_id="123",
            api=mock_client,
            rpc_rate_limiter=util.RateLimiter(0),
        )
        builder.send_requests({"": events})
        requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
        if requests:
            self.assertLen(requests, 1)
            self.assertLen(requests[0].runs, 1)
            run_proto.MergeFrom(requests[0].runs[0])
Example #9
    def test_no_budget_for_experiment_id(self):
        mock_client = _create_mock_client()
        event = event_pb2.Event(step=1, wall_time=123.456)
        event.summary.value.add(tag="foo", simple_value=1.0)
        run_to_events = {"run_name": [event]}
        long_experiment_id = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
        with self.assertRaises(RuntimeError) as cm:
            builder = uploader_lib._BatchedRequestSender(
                long_experiment_id, mock_client, util.RateLimiter(0))
            builder.send_requests(run_to_events)
        self.assertEqual(str(cm.exception),
                         "Byte budget too small for experiment ID")
Example #10
    def __init__(
        self,
        writer_client,
        logdir,
        rpc_rate_limiter=None,
        name=None,
        description=None,
    ):
        """Constructs a TensorBoardUploader.

        Args:
          writer_client: a TensorBoardWriterService stub instance
          logdir: path of the log directory to upload
          rpc_rate_limiter: a `RateLimiter` to use to limit write RPC frequency.
            Note this limit applies at the level of single RPCs in the Scalar
            and Tensor case, but at the level of an entire blob upload in the
            Blob case, which may require a few preparatory RPCs and a stream
            of chunks. Note the chunk stream is internally rate-limited by
            backpressure from the server, so it is not a concern that we do not
            explicitly rate-limit within the stream here.
          name: String name to assign to the experiment.
          description: String description to assign to the experiment.
        """
        self._api = writer_client
        self._logdir = logdir
        self._name = name
        self._description = description
        self._request_sender = None
        if rpc_rate_limiter is None:
            self._rpc_rate_limiter = util.RateLimiter(
                _MIN_WRITE_RPC_INTERVAL_SECS)
        else:
            self._rpc_rate_limiter = rpc_rate_limiter
        active_filter = (
            lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time())
        directory_loader_factory = functools.partial(
            directory_loader.DirectoryLoader,
            loader_factory=event_file_loader.TimestampedEventFileLoader,
            path_filter=io_wrapper.IsTensorFlowEventsFile,
            active_filter=active_filter,
        )
        self._logdir_loader = logdir_loader.LogdirLoader(
            self._logdir, directory_loader_factory)
Example #11
    def test_rate_limiting(self):
        rate_limiter = util.RateLimiter(10)
        fake_time = test_util.FakeTime(current=1000)
        with mock.patch.object(rate_limiter, "_time", fake_time):
            self.assertEqual(1000, fake_time.time())
            # No sleeping for initial tick.
            rate_limiter.tick()
            self.assertEqual(1000, fake_time.time())
            # Second tick requires a full sleep.
            rate_limiter.tick()
            self.assertEqual(1010, fake_time.time())
            # Third tick requires a sleep just to make up the remaining second.
            fake_time.sleep(9)
            self.assertEqual(1019, fake_time.time())
            rate_limiter.tick()
            self.assertEqual(1020, fake_time.time())
            # Fourth tick requires no sleep since we have no remaining seconds.
            fake_time.sleep(11)
            self.assertEqual(1031, fake_time.time())
            rate_limiter.tick()
            self.assertEqual(1031, fake_time.time())
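These assertions pin down the `RateLimiter` contract: the first `tick()` never sleeps, and each later `tick()` sleeps only for whatever remains of the configured interval. A minimal sketch consistent with that behavior (the real `tensorboard.uploader.util.RateLimiter` may differ in detail); the `_time` attribute is the hook the test patches with `FakeTime`:

import time

class RateLimiter:
    """Sketch: enforces at least `interval_secs` between successive ticks."""

    def __init__(self, interval_secs):
        self._time = time  # module handle, swappable in tests via mock.patch
        self._interval_secs = interval_secs
        self._last_called_secs = None

    def tick(self):
        now = self._time.time()
        if self._last_called_secs is not None:
            remaining = self._last_called_secs + self._interval_secs - now
            if remaining > 0:
                self._time.sleep(remaining)  # first tick skips this entirely
        self._last_called_secs = self._time.time()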
Example #12
    def test_break_at_scalar_point_boundary(self):
        mock_client = _create_mock_client()
        point_count = 2000  # comfortably saturates a single 1024-byte request
        events = []
        for step in range(point_count):
            summary = scalar_v2.scalar_pb("loss", -2.0 * step)
            if step > 0:
                summary.value[0].ClearField("metadata")
            events.append(event_pb2.Event(summary=summary, step=step))
        run_to_events = {"train": events}

        builder = uploader_lib._BatchedRequestSender(
            "123", mock_client, util.RateLimiter(0)
        )
        builder.send_requests(run_to_events)
        requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
        for request in requests:
            _clear_wall_times(request)

        self.assertGreater(len(requests), 1)
        self.assertLess(len(requests), point_count)

        total_points_in_result = 0
        for request in requests:
            self.assertLen(request.runs, 1)
            run = request.runs[0]
            self.assertEqual(run.name, "train")
            self.assertLen(run.tags, 1)
            tag = run.tags[0]
            self.assertEqual(tag.name, "loss")
            for point in tag.points:
                self.assertEqual(point.step, total_points_in_result)
                self.assertEqual(point.value, -2.0 * point.step)
                total_points_in_result += 1
            self.assertLessEqual(
                request.ByteSize(), uploader_lib._MAX_REQUEST_LENGTH_BYTES
            )
        self.assertEqual(total_points_in_result, point_count)
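Examples #4, #5, #7, and #12 all exercise the same byte-budget discipline: points are appended to the current request until the next addition would push it past `_MAX_REQUEST_LENGTH_BYTES`, the request is then flushed and a fresh one started, and an item too large for even an empty request raises the "add_event failed despite flush" error seen in Example #7. A simplified, self-contained sketch of that discipline (the names and the 1024-byte budget are illustrative, not the library's API):

_MAX_REQUEST_LENGTH_BYTES = 1024  # illustrative budget

def batch_by_bytes(items, size_of, budget=_MAX_REQUEST_LENGTH_BYTES):
    """Greedily packs items into batches whose total size stays within budget."""
    batches, current, used = [], [], 0
    for item in items:
        cost = size_of(item)
        if cost > budget:  # cannot fit even after flushing
            raise RuntimeError("add_event failed despite flush")
        if used + cost > budget:  # no room left: flush the current batch
            batches.append(current)
            current, used = [], 0
        current.append(item)
        used += cost
    if current:
        batches.append(current)
    return batches

# Ten 300-byte items pack three per 1024-byte batch, with one left over.
assert [len(b) for b in batch_by_bytes(range(10), lambda _: 300)] == [3, 3, 3, 1]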
Example #13
def create_profile_request_sender() -> profile_uploader.ProfileRequestSender:
    """Creates the `ProfileRequestSender` for the profile plugin.

    A profile request sender is created for the plugin so that, after profiling
    runs have finished, data can be uploaded to the TensorBoard backend.

    Returns:
        A ProfileRequestSender object.
    """
    api_client = _get_api_client()

    experiment_name = _get_or_create_experiment(
        api_client, training_utils.environment_variables.cloud_ml_job_id)

    upload_limits = _make_upload_limits()

    blob_rpc_rate_limiter = util.RateLimiter(
        upload_limits.min_blob_request_interval / 1000)

    blob_storage_bucket, blob_storage_folder = _get_blob_items(api_client)

    source_bucket = uploader_utils.get_source_bucket(
        training_utils.environment_variables.tensorboard_log_dir)

    profile_request_sender = profile_uploader.ProfileRequestSender(
        experiment_name,
        api_client,
        upload_limits=upload_limits,
        blob_rpc_rate_limiter=blob_rpc_rate_limiter,
        blob_storage_bucket=blob_storage_bucket,
        blob_storage_folder=blob_storage_folder,
        source_bucket=source_bucket,
        tracker=upload_tracker.UploadTracker(verbosity=1),
        logdir=training_utils.environment_variables.tensorboard_log_dir,
    )

    return profile_request_sender
Example #14
    def __init__(
        self,
        writer_client,
        logdir,
        allowed_plugins,
        upload_limits,
        logdir_poll_rate_limiter=None,
        rpc_rate_limiter=None,
        tensor_rpc_rate_limiter=None,
        blob_rpc_rate_limiter=None,
        name=None,
        description=None,
        verbosity=None,
        one_shot=None,
    ):
        """Constructs a TensorBoardUploader.

        Args:
          writer_client: a TensorBoardWriterService stub instance
          logdir: path of the log directory to upload
          allowed_plugins: collection of string plugin names; events will only
            be uploaded if their time series's metadata specifies one of these
            plugin names
          upload_limits: instance of tensorboard.service.UploadLimits proto.
          logdir_poll_rate_limiter: a `RateLimiter` to use to limit logdir
            polling frequency, to avoid thrashing disks, especially on networked
            file systems
          rpc_rate_limiter: a `RateLimiter` to use to limit write RPC frequency.
            Note this limit applies at the level of single RPCs in the Scalar
            and Tensor case, but at the level of an entire blob upload in the
            Blob case, which may require a few preparatory RPCs and a stream
            of chunks. Note the chunk stream is internally rate-limited by
            backpressure from the server, so it is not a concern that we do not
            explicitly rate-limit within the stream here.
          tensor_rpc_rate_limiter: a `RateLimiter` to use to limit tensor
            write RPC frequency.
          blob_rpc_rate_limiter: a `RateLimiter` to use to limit blob write
            RPC frequency.
          name: String name to assign to the experiment.
          description: String description to assign to the experiment.
          verbosity: Level of verbosity, an integer. Supported values:
              0 - No upload statistics are printed.
              1 - Print upload statistics while uploading data (default).
          one_shot: Once uploading starts, upload only the existing data in
            the logdir and then return immediately, instead of the default
            behavior of continuing to listen for new data in the logdir and
            uploading it as it appears.
        """
        self._api = writer_client
        self._logdir = logdir
        self._allowed_plugins = frozenset(allowed_plugins)
        self._upload_limits = upload_limits

        self._name = name
        self._description = description
        self._verbosity = 1 if verbosity is None else verbosity
        self._one_shot = False if one_shot is None else one_shot
        self._request_sender = None
        if logdir_poll_rate_limiter is None:
            self._logdir_poll_rate_limiter = util.RateLimiter(
                _MIN_LOGDIR_POLL_INTERVAL_SECS)
        else:
            self._logdir_poll_rate_limiter = logdir_poll_rate_limiter

        if rpc_rate_limiter is None:
            self._rpc_rate_limiter = util.RateLimiter(
                self._upload_limits.min_scalar_request_interval / 1000)
        else:
            self._rpc_rate_limiter = rpc_rate_limiter

        if tensor_rpc_rate_limiter is None:
            self._tensor_rpc_rate_limiter = util.RateLimiter(
                self._upload_limits.min_tensor_request_interval / 1000)
        else:
            self._tensor_rpc_rate_limiter = tensor_rpc_rate_limiter

        if blob_rpc_rate_limiter is None:
            self._blob_rpc_rate_limiter = util.RateLimiter(
                self._upload_limits.min_blob_request_interval / 1000)
        else:
            self._blob_rpc_rate_limiter = blob_rpc_rate_limiter

        active_filter = (
            lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time())
        directory_loader_factory = functools.partial(
            directory_loader.DirectoryLoader,
            loader_factory=event_file_loader.TimestampedEventFileLoader,
            path_filter=io_wrapper.IsTensorFlowEventsFile,
            active_filter=active_filter,
        )
        self._logdir_loader = logdir_loader.LogdirLoader(
            self._logdir, directory_loader_factory)
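A hedged usage sketch for this constructor, assuming the uploader also exposes `create_experiment()` and `start_uploading()` as in the public TensorBoard uploader, and that `writer_client` and `upload_limits` were built elsewhere:

uploader = TensorBoardUploader(
    writer_client,
    "/tmp/logs",
    allowed_plugins=["scalars"],
    upload_limits=upload_limits,
    verbosity=0,    # suppress upload statistics
    one_shot=True,  # upload what is already in the logdir, then return
)
uploader.create_experiment()  # assumed API: registers the experiment first
uploader.start_uploading()    # returns after one pass because one_shot=True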
Example #15
    def __init__(
        self,
        writer_client,
        logdir,
        allowed_plugins,
        max_blob_size,
        logdir_poll_rate_limiter=None,
        rpc_rate_limiter=None,
        blob_rpc_rate_limiter=None,
        name=None,
        description=None,
    ):
        """Constructs a TensorBoardUploader.

        Args:
          writer_client: a TensorBoardWriterService stub instance
          logdir: path of the log directory to upload
          allowed_plugins: collection of string plugin names; events will only
            be uploaded if their time series's metadata specifies one of these
            plugin names
          max_blob_size: the maximum allowed size for blob uploads.
          logdir_poll_rate_limiter: a `RateLimiter` to use to limit logdir
            polling frequency, to avoid thrashing disks, especially on networked
            file systems
          rpc_rate_limiter: a `RateLimiter` to use to limit write RPC frequency.
            Note this limit applies at the level of single RPCs in the Scalar
            and Tensor case, but at the level of an entire blob upload in the
            Blob case, which may require a few preparatory RPCs and a stream
            of chunks. Note the chunk stream is internally rate-limited by
            backpressure from the server, so it is not a concern that we do not
            explicitly rate-limit within the stream here.
          blob_rpc_rate_limiter: a `RateLimiter` to use to limit blob write
            RPC frequency.
          name: String name to assign to the experiment.
          description: String description to assign to the experiment.
        """
        self._api = writer_client
        self._logdir = logdir
        self._allowed_plugins = frozenset(allowed_plugins)
        self._max_blob_size = max_blob_size
        self._name = name
        self._description = description
        self._request_sender = None
        if logdir_poll_rate_limiter is None:
            self._logdir_poll_rate_limiter = util.RateLimiter(
                _MIN_LOGDIR_POLL_INTERVAL_SECS
            )
        else:
            self._logdir_poll_rate_limiter = logdir_poll_rate_limiter
        if rpc_rate_limiter is None:
            self._rpc_rate_limiter = util.RateLimiter(
                _MIN_WRITE_RPC_INTERVAL_SECS
            )
        else:
            self._rpc_rate_limiter = rpc_rate_limiter

        if blob_rpc_rate_limiter is None:
            self._blob_rpc_rate_limiter = util.RateLimiter(
                _MIN_BLOB_WRITE_RPC_INTERVAL_SECS
            )
        else:
            self._blob_rpc_rate_limiter = blob_rpc_rate_limiter

        active_filter = (
            lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time()
        )
        directory_loader_factory = functools.partial(
            directory_loader.DirectoryLoader,
            loader_factory=event_file_loader.TimestampedEventFileLoader,
            path_filter=io_wrapper.IsTensorFlowEventsFile,
            active_filter=active_filter,
        )
        self._logdir_loader = logdir_loader.LogdirLoader(
            self._logdir, directory_loader_factory
        )