Пример #1
0
    def start(self, run: "Run"):
        """Prepare hardware-metric series on ``run`` and start the reporting thread.

        The gauge mode is ``GaugeMode.CGROUP`` when ``in_docker()`` is true and
        ``GaugeMode.SYSTEM`` otherwise.  A metrics container and a metric
        reporter are built for that mode, the number of gauges per resource
        type is stored in ``self._gauges_in_resource``, and a ``FloatSeries``
        is assigned for every gauge attribute path that ``run.get_attribute``
        reports as missing/falsy.  Finally the reporting thread is started and
        ``self._started`` is set.

        Args:
            run: Object the metric series are attached to and which is handed
                to the reporting thread.
        """
        mode = GaugeMode.CGROUP if in_docker() else GaugeMode.SYSTEM

        resource_info_factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=os.environ,
        )
        resource_info = resource_info_factory.create(gauge_mode=mode)

        container = MetricsFactory(
            gauge_factory=GaugeFactory(gauge_mode=mode),
            system_resource_info=resource_info,
        ).create_metrics_container()

        # The reporter takes the current wall-clock time as its reference.
        reporter_factory = MetricReporterFactory(time.time())
        reporter = reporter_factory.create(metrics=container.metrics())

        # First pass: remember how many gauges each resource type exposes.
        for metric in container.metrics():
            self._gauges_in_resource[metric.resource_type] = len(metric.gauges)

        # Second pass: create a series only where the run has none yet.
        for metric in container.metrics():
            for gauge in metric.gauges:
                attribute_path = self.get_attribute_name(
                    metric.resource_type, gauge.name()
                )
                if run.get_attribute(attribute_path):
                    continue
                run[attribute_path] = FloatSeries(
                    [],
                    min=metric.min_value,
                    max=metric.max_value,
                    unit=metric.unit,
                )

        self._thread = self.ReportingThread(self, self._period, run, reporter)
        self._thread.start()
        self._started = True
Пример #2
0
    def create(self, gauge_mode, experiment, reference_timestamp):
        """Assemble a ``MetricService`` for ``experiment``.

        Builds the system resource info and metrics container for
        ``gauge_mode``, creates every metric in the backend (storing the
        returned value as the metric's ``internal_id``), and wraps everything
        together with a metric reporter in a ``MetricService``.

        Args:
            gauge_mode: Mode passed to the resource-info and gauge factories.
            experiment: Experiment the hardware metrics are created for.
            reference_timestamp: Value handed to ``MetricReporterFactory``.

        Returns:
            The configured ``MetricService`` instance.
        """
        resource_info_factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=self.__os_environ,
        )
        resource_info = resource_info_factory.create(gauge_mode=gauge_mode)

        container = MetricsFactory(
            gauge_factory=GaugeFactory(gauge_mode=gauge_mode),
            system_resource_info=resource_info,
        ).create_metrics_container()

        # Create each metric in the backend and keep the id it hands back.
        for hardware_metric in container.metrics():
            hardware_metric.internal_id = self.__backend.create_hardware_metric(
                experiment, hardware_metric)

        reporter_factory = MetricReporterFactory(reference_timestamp)
        reporter = reporter_factory.create(metrics=container.metrics())

        return MetricService(
            backend=self.__backend,
            metric_reporter=reporter,
            experiment=experiment,
            metrics_container=container,
        )
    def test_empty_gpu_card_indices_on_cuda_env_variable_minus_one(self):
        """``CUDA_VISIBLE_DEVICES=-1`` yields an empty list of GPU card indices."""
        # given: a GPU monitor mock that reports four cards
        monitor_mock = MagicMock(spec_set=GPUMonitor)
        monitor_mock.get_card_count.return_value = 4
        # and: a factory whose environment sets CUDA_VISIBLE_DEVICES to -1
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=monitor_mock,
            os_environ={u'CUDA_VISIBLE_DEVICES': u'-1'},
        )

        # when
        info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([], info.gpu_card_indices)
    def test_cgroup_resource_info(self):
        """Resource info created in CGROUP mode reports sane, non-negative amounts."""
        # given: a factory backed by the real monitors and process environment
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=os.environ,
        )

        # when
        info = factory.create(GaugeMode.CGROUP)

        # then: CPU and memory are strictly positive, GPU figures may be zero
        self.assert_float_greater_than(info.cpu_core_count, 0)
        self.assert_int_greater_than(info.memory_amount_bytes, 0)
        self.assert_int_greater_or_equal(info.gpu_card_count, 0)
        self.assert_int_greater_or_equal(info.gpu_memory_amount_bytes, 0)
    def test_gpu_card_indices_without_cuda_env_variable(self):
        """With no ``CUDA_VISIBLE_DEVICES`` set, every reported card index is listed."""
        # given: a GPU monitor mock that reports two cards
        monitor_mock = MagicMock(spec_set=GPUMonitor)
        monitor_mock.get_card_count.return_value = 2
        # and: a factory with an empty environment
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=monitor_mock,
            os_environ=dict(),
        )

        # when
        info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([0, 1], info.gpu_card_indices)
    def test_should_ignore_invalid_cuda_env_variable_syntax(self):
        """A malformed ``CUDA_VISIBLE_DEVICES`` value is ignored: all cards are listed."""
        # given: a GPU monitor mock that reports four cards
        monitor_mock = MagicMock(spec_set=GPUMonitor)
        monitor_mock.get_card_count.return_value = 4
        # and: a factory whose environment carries a non-numeric device entry
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=monitor_mock,
            os_environ={u'CUDA_VISIBLE_DEVICES': u'1,3,abc'},
        )

        # when
        info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([0, 1, 2, 3], info.gpu_card_indices)
    def test_should_ignore_gpu_indices_after_index_out_of_range(self):
        """Indices from the first out-of-range entry onwards are dropped."""
        # given: a GPU monitor mock that reports four cards
        monitor_mock = MagicMock(spec_set=GPUMonitor)
        monitor_mock.get_card_count.return_value = 4
        # and: a factory whose environment lists index 5, which exceeds the card count
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=monitor_mock,
            os_environ={u'CUDA_VISIBLE_DEVICES': u'1,3,5,2'},
        )

        # when
        info = factory.create(GaugeMode.SYSTEM)

        # then: only the entries before the invalid index survive
        self.assertEqual([1, 3], info.gpu_card_indices)
Пример #8
0
class SystemCpuUsageGauge(Gauge):
    """Gauge named ``cpu`` whose value is ``SystemMonitor.cpu_percent()``.

    Equality is based purely on the class, so any two instances compare
    equal; ``__hash__`` and ``__ne__`` are defined to stay consistent with
    that rule.
    """

    def __init__(self):
        self.__system_monitor = SystemMonitor()

    def name(self):
        """Return the gauge name, ``u'cpu'``."""
        return u'cpu'

    def value(self):
        """Return the current reading of ``SystemMonitor.cpu_percent()``."""
        return self.__system_monitor.cpu_percent()

    def __eq__(self, other):
        return self.__class__ == other.__class__

    def __ne__(self, other):
        # Python 2 does not derive != from ==; keep the two in agreement.
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3.
        # Equal objects must hash equally, so hash the class that __eq__ compares.
        return hash(self.__class__)

    def __repr__(self):
        return str(u'SystemCpuUsageGauge')
Пример #9
0
class SystemMemoryUsageGauge(Gauge):
    """Gauge named ``ram`` reporting used memory as ``(total - available) / BYTES_IN_ONE_GB``.

    Equality is based purely on the class, so any two instances compare
    equal; ``__hash__`` and ``__ne__`` are defined to stay consistent with
    that rule.
    """

    def __init__(self):
        self.__system_monitor = SystemMonitor()

    def name(self):
        """Return the gauge name, ``u"ram"``."""
        return u"ram"

    def value(self):
        """Return ``total - available`` virtual memory divided by ``BYTES_IN_ONE_GB``."""
        virtual_mem = self.__system_monitor.virtual_memory()
        # float() forces true division even where ``/`` would be integer division.
        return (virtual_mem.total - virtual_mem.available) / float(BYTES_IN_ONE_GB)

    def __eq__(self, other):
        return self.__class__ == other.__class__

    def __ne__(self, other):
        # Python 2 does not derive != from ==; keep the two in agreement.
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3.
        # Equal objects must hash equally, so hash the class that __eq__ compares.
        return hash(self.__class__)

    def __repr__(self):
        return str(u"SystemMemoryUsageGauge")
Пример #10
0
    def start(self,
              abort_callback=None,
              logger=None,
              upload_stdout=True,
              upload_stderr=True,
              send_hardware_metrics=True,
              run_monitoring_thread=True,
              handle_uncaught_exceptions=True):
        """Switch on the optional background helpers selected by the arguments.

        Args:
            abort_callback: Passed to the aborting thread. The thread is run
                when this is not ``None`` or when
                ``DefaultAbortImpl.requirements_installed()`` is true.
            logger: When truthy, a ``StreamHandler`` writing to the system
                ``'logger'`` text channel is attached to it.
            upload_stdout: Wrap stdout with an uploader (skipped in notebooks).
            upload_stderr: Wrap stderr with an uploader (skipped in notebooks).
            send_hardware_metrics: Run the hardware-metrics reporting thread,
                provided ``SystemMonitor.requirements_installed()`` is true.
            run_monitoring_thread: Run the monitoring thread.
            handle_uncaught_exceptions: Install the uncaught-exception handler.
        """
        if handle_uncaught_exceptions:
            self._set_uncaught_exception_handler()

        can_abort = (abort_callback is not None
                     or DefaultAbortImpl.requirements_installed())
        if can_abort:
            self._run_aborting_thread(abort_callback)

        if logger:
            # pylint: disable=protected-access
            log_channel = self._experiment._get_channel(
                'logger', 'text', ChannelNamespace.SYSTEM)
            writer = ChannelWriter(
                self._experiment, log_channel.name, ChannelNamespace.SYSTEM)
            self._logger_handler = StreamHandler(writer)
            self._logger = logger
            logger.addHandler(self._logger_handler)

        # Standard streams are never wrapped when running inside a notebook.
        if upload_stdout and not is_notebook():
            self._stdout_uploader = StdOutWithUpload(self._experiment)
        if upload_stderr and not is_notebook():
            self._stderr_uploader = StdErrWithUpload(self._experiment)

        if run_monitoring_thread:
            self._run_monitoring_thread()

        if send_hardware_metrics and SystemMonitor.requirements_installed():
            self._run_hardware_metrics_reporting_thread()
Пример #11
0
 def __init__(self):
     """Create the ``SystemMonitor`` instance this object keeps privately."""
     monitor = SystemMonitor()
     self.__system_monitor = monitor
 def create():
     """Return a ``CGroupMonitor`` built from a new filesystem reader and system monitor."""
     filesystem_reader = CGroupFilesystemReader()
     system_monitor = SystemMonitor()
     return CGroupMonitor(filesystem_reader, system_monitor)
Пример #13
0
    def create_experiment(self,
                          name=None,
                          description=None,
                          params=None,
                          properties=None,
                          tags=None,
                          upload_source_files=None,
                          abort_callback=None,
                          upload_stdout=True,
                          upload_stderr=True,
                          send_hardware_metrics=True,
                          run_monitoring_thread=True,
                          handle_uncaught_exceptions=True):
        """Create an experiment in this project and attach its runtime helpers.

        Arguments left as ``None`` fall back to defaults (``"Untitled"``,
        ``""``, ``{}``, ``{}``, ``[]``).  When ``upload_source_files`` is
        ``None`` the script named by ``sys.argv[0]`` is uploaded if a file
        with that base name exists in the current working directory.
        Depending on the flags, the method then installs an uncaught-exception
        hook, starts the aborting, ping and hardware-metric threads, and
        wraps stdout/stderr (never inside a notebook).  The experiment is
        pushed with ``push_new_experiment`` before being returned.

        Returns:
            The experiment object returned by ``self.client.create_experiment``.

        Raises:
            `ExperimentValidationError`: When provided arguments are invalid.
            `ExperimentLimitReached`: When experiment limit in the project has been reached.
        """
        # Only ``None`` triggers a default; other falsy values are kept as given.
        name = "Untitled" if name is None else name
        description = "" if description is None else description
        params = {} if params is None else params
        properties = {} if properties is None else properties
        tags = [] if tags is None else tags

        abortable = (abort_callback is not None
                     or DefaultAbortImpl.requirements_installed())

        experiment = self.client.create_experiment(
            project=self,
            name=name,
            description=description,
            params=params,
            properties=properties,
            tags=tags,
            abortable=abortable,
            monitored=run_monitoring_thread)

        if upload_source_files is None:
            # Default to the entry-point script, looked up by base name in the cwd.
            script_name = os.path.basename(sys.argv[0])
            script_path = os.path.join(os.getcwd(), script_name)
            upload_source_files = []
            if os.path.isfile(script_path):
                upload_source_files.append(
                    os.path.relpath(script_path, os.getcwd()))

        experiment.upload_source_files(upload_source_files)

        def exception_handler(exc_type, exc_val, exc_tb):
            # Stop the experiment with the traceback text, then defer to the
            # interpreter's original hook.
            trace_text = "\n".join(traceback.format_tb(exc_tb))
            experiment.stop(trace_text + "\n" + repr(exc_val))

            sys.__excepthook__(exc_type, exc_val, exc_tb)

        if handle_uncaught_exceptions:
            # pylint:disable=protected-access
            experiment._uncaught_exception_handler = exception_handler
            sys.excepthook = exception_handler

        # pylint:disable=protected-access
        experiment._channels_values_sender = ChannelsValuesSender(experiment)

        if abortable:
            # pylint:disable=protected-access
            abort_impl = (CustomAbortImpl(abort_callback) if abort_callback
                          else DefaultAbortImpl(pid=os.getpid()))
            experiment._aborting_thread = AbortingThread(
                websocket_factory=ReconnectingWebsocketFactory(
                    client=self.client,
                    experiment_id=experiment.internal_id),
                abort_impl=abort_impl,
                experiment_id=experiment.internal_id)
            experiment._aborting_thread.start()

        # Standard streams are never wrapped when running inside a notebook.
        if upload_stdout and not is_notebook():
            # pylint:disable=protected-access
            experiment._stdout_uploader = StdOutWithUpload(experiment)

        if upload_stderr and not is_notebook():
            # pylint:disable=protected-access
            experiment._stderr_uploader = StdErrWithUpload(experiment)

        if run_monitoring_thread:
            # pylint:disable=protected-access
            experiment._ping_thread = PingThread(
                client=self.client, experiment=experiment)
            experiment._ping_thread.start()

        if send_hardware_metrics and SystemMonitor.requirements_installed():
            # pylint:disable=protected-access
            mode = GaugeMode.CGROUP if in_docker() else GaugeMode.SYSTEM
            service_factory = MetricServiceFactory(self.client, os.environ)
            metric_service = service_factory.create(
                gauge_mode=mode,
                experiment=experiment,
                reference_timestamp=time.time())

            experiment._hardware_metric_thread = HardwareMetricReportingThread(
                metric_service=metric_service,
                metric_sending_interval_seconds=3)
            experiment._hardware_metric_thread.start()

        push_new_experiment(experiment)

        return experiment