def start(self, run: "Run"):
    """Prepare hardware-metric series on ``run`` and launch the reporting thread.

    Gauges are cgroup-based when ``in_docker()`` is true and system-wide
    otherwise. The number of gauges per resource type is stored in
    ``self._gauges_in_resource``; every gauge attribute for which
    ``run.get_attribute`` returns a falsy value is initialised with an empty
    ``FloatSeries`` carrying the metric's min, max and unit. Finally a
    ``self.ReportingThread`` is started and ``self._started`` is set.

    Args:
        run: the run object that receives the metric series.
    """
    if in_docker():
        mode = GaugeMode.CGROUP
    else:
        mode = GaugeMode.SYSTEM

    resource_info_factory = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=GPUMonitor(),
        os_environ=os.environ,
    )
    resource_info = resource_info_factory.create(gauge_mode=mode)

    container = MetricsFactory(
        gauge_factory=GaugeFactory(gauge_mode=mode),
        system_resource_info=resource_info,
    ).create_metrics_container()

    reporter_factory = MetricReporterFactory(time.time())
    reporter = reporter_factory.create(metrics=container.metrics())

    # First pass: remember how many gauges each resource type exposes.
    for entry in container.metrics():
        self._gauges_in_resource[entry.resource_type] = len(entry.gauges)

    # Second pass: make sure every gauge has a series to log into.
    for entry in container.metrics():
        for single_gauge in entry.gauges:
            attribute_path = self.get_attribute_name(
                entry.resource_type, single_gauge.name()
            )
            if run.get_attribute(attribute_path):
                continue
            run[attribute_path] = FloatSeries(
                [], min=entry.min_value, max=entry.max_value, unit=entry.unit
            )

    self._thread = self.ReportingThread(self, self._period, run, reporter)
    self._thread.start()
    self._started = True
def create(self, gauge_mode, experiment, reference_timestamp):
    """Assemble a ``MetricService`` for ``experiment``.

    Builds the metrics container for the given gauge mode, registers each
    metric through ``create_hardware_metric`` on the backend (storing the
    returned value as the metric's ``internal_id``), and wires a metric
    reporter that uses ``reference_timestamp``.

    Args:
        gauge_mode: gauge mode passed to the resource-info and gauge factories.
        experiment: experiment the hardware metrics are created for.
        reference_timestamp: value handed to ``MetricReporterFactory``.

    Returns:
        The configured ``MetricService``.
    """
    resource_info = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=GPUMonitor(),
        os_environ=self.__os_environ,
    ).create(gauge_mode=gauge_mode)

    container = MetricsFactory(
        gauge_factory=GaugeFactory(gauge_mode=gauge_mode),
        system_resource_info=resource_info,
    ).create_metrics_container()

    # Each metric must be registered before the reporter is built.
    for entry in container.metrics():
        entry.internal_id = self.__backend.create_hardware_metric(
            experiment, entry
        )

    reporter_factory = MetricReporterFactory(reference_timestamp)
    reporter = reporter_factory.create(metrics=container.metrics())

    return MetricService(
        backend=self.__backend,
        metric_reporter=reporter,
        experiment=experiment,
        metrics_container=container,
    )
def test_empty_gpu_card_indices_on_cuda_env_variable_minus_one(self):
    # With CUDA_VISIBLE_DEVICES set to "-1" the resource info is expected to
    # list no GPU card indices, although the monitor reports four cards.

    # given
    mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
    mocked_gpu_monitor.get_card_count.return_value = 4

    # and
    factory = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=mocked_gpu_monitor,
        os_environ={u'CUDA_VISIBLE_DEVICES': u'-1'},
    )

    # when
    info = factory.create(GaugeMode.SYSTEM)

    # then
    self.assertEqual([], info.gpu_card_indices)
def test_cgroup_resource_info(self):
    # Resource info created in CGROUP mode from the real monitors and the
    # real environment must report positive CPU/memory figures and
    # non-negative GPU figures.

    # given
    factory = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=GPUMonitor(),
        os_environ=os.environ,
    )

    # when
    info = factory.create(GaugeMode.CGROUP)

    # then
    self.assert_float_greater_than(info.cpu_core_count, 0)
    self.assert_int_greater_than(info.memory_amount_bytes, 0)
    self.assert_int_greater_or_equal(info.gpu_card_count, 0)
    self.assert_int_greater_or_equal(info.gpu_memory_amount_bytes, 0)
def test_gpu_card_indices_without_cuda_env_variable(self):
    # With an empty environment and two cards reported by the monitor,
    # both card indices are expected.

    # given
    mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
    mocked_gpu_monitor.get_card_count.return_value = 2

    # and
    factory = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=mocked_gpu_monitor,
        os_environ=dict(),
    )

    # when
    info = factory.create(GaugeMode.SYSTEM)

    # then
    self.assertEqual([0, 1], info.gpu_card_indices)
def test_should_ignore_invalid_cuda_env_variable_syntax(self):
    # A CUDA_VISIBLE_DEVICES value containing a non-numeric entry is
    # expected to be disregarded, leaving all four reported cards listed.

    # given
    mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
    mocked_gpu_monitor.get_card_count.return_value = 4

    # and
    factory = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=mocked_gpu_monitor,
        os_environ={u'CUDA_VISIBLE_DEVICES': u'1,3,abc'},
    )

    # when
    info = factory.create(GaugeMode.SYSTEM)

    # then
    self.assertEqual([0, 1, 2, 3], info.gpu_card_indices)
def test_should_ignore_gpu_indices_after_index_out_of_range(self):
    # With four cards reported and CUDA_VISIBLE_DEVICES "1,3,5,2", only the
    # indices listed before the out-of-range entry (5) are expected.

    # given
    mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
    mocked_gpu_monitor.get_card_count.return_value = 4

    # and
    factory = SystemResourceInfoFactory(
        system_monitor=SystemMonitor(),
        gpu_monitor=mocked_gpu_monitor,
        os_environ={u'CUDA_VISIBLE_DEVICES': u'1,3,5,2'},
    )

    # when
    info = factory.create(GaugeMode.SYSTEM)

    # then
    self.assertEqual([1, 3], info.gpu_card_indices)
class SystemCpuUsageGauge(Gauge):
    """Gauge named ``cpu`` that reports ``SystemMonitor.cpu_percent()``.

    Instances carry no distinguishing state, so any two objects of this
    class compare equal and share the same hash.
    """

    def __init__(self):
        self.__system_monitor = SystemMonitor()

    def name(self):
        """Return the gauge name, ``u'cpu'``."""
        return u'cpu'

    def value(self):
        """Return the current reading of ``cpu_percent()`` from the monitor."""
        return self.__system_monitor.cpu_percent()

    def __eq__(self, other):
        return self.__class__ == other.__class__

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep both in sync.
        return not self.__eq__(other)

    def __hash__(self):
        # Defining ``__eq__`` alone makes instances unhashable on Python 3.
        # Equality depends only on the class, so the hash does as well.
        return hash(self.__class__)

    def __repr__(self):
        return str(u'SystemCpuUsageGauge')
class SystemMemoryUsageGauge(Gauge):
    """Gauge named ``ram`` that reports used memory from ``SystemMonitor``.

    Instances carry no distinguishing state, so any two objects of this
    class compare equal and share the same hash.
    """

    def __init__(self):
        self.__system_monitor = SystemMonitor()

    def name(self):
        """Return the gauge name, ``u"ram"``."""
        return u"ram"

    def value(self):
        """Return ``(total - available)`` virtual memory divided by ``BYTES_IN_ONE_GB``."""
        virtual_mem = self.__system_monitor.virtual_memory()
        # ``float(...)`` forces true division on Python 2 as well.
        return (virtual_mem.total - virtual_mem.available) / float(BYTES_IN_ONE_GB)

    def __eq__(self, other):
        return self.__class__ == other.__class__

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep both in sync.
        return not self.__eq__(other)

    def __hash__(self):
        # Defining ``__eq__`` alone makes instances unhashable on Python 3.
        # Equality depends only on the class, so the hash does as well.
        return hash(self.__class__)

    def __repr__(self):
        return str(u"SystemMemoryUsageGauge")
def start(self,
          abort_callback=None,
          logger=None,
          upload_stdout=True,
          upload_stderr=True,
          send_hardware_metrics=True,
          run_monitoring_thread=True,
          handle_uncaught_exceptions=True):
    """Start the background facilities selected by the flags.

    Args:
        abort_callback: passed to ``_run_aborting_thread``; the aborting
            thread is also run when this is ``None`` but
            ``DefaultAbortImpl.requirements_installed()`` is true.
        logger: when truthy, a ``StreamHandler`` writing to the system
            ``'logger'`` text channel is attached to it.
        upload_stdout: wrap stdout with ``StdOutWithUpload`` unless
            ``is_notebook()`` is true.
        upload_stderr: wrap stderr with ``StdErrWithUpload`` unless
            ``is_notebook()`` is true.
        send_hardware_metrics: run the hardware metrics reporting thread,
            provided ``SystemMonitor.requirements_installed()`` is true.
        run_monitoring_thread: run the monitoring thread.
        handle_uncaught_exceptions: install the uncaught exception handler.
    """
    if handle_uncaught_exceptions:
        self._set_uncaught_exception_handler()

    can_abort = (abort_callback is not None
                 or DefaultAbortImpl.requirements_installed())
    if can_abort:
        self._run_aborting_thread(abort_callback)

    if logger:
        # pylint: disable=protected-access
        logger_channel = self._experiment._get_channel(
            'logger', 'text', ChannelNamespace.SYSTEM)
        writer = ChannelWriter(
            self._experiment, logger_channel.name, ChannelNamespace.SYSTEM)
        self._logger_handler = StreamHandler(writer)
        self._logger = logger
        logger.addHandler(self._logger_handler)

    # Stream uploaders are skipped inside notebooks.
    if upload_stdout and not is_notebook():
        self._stdout_uploader = StdOutWithUpload(self._experiment)

    if upload_stderr and not is_notebook():
        self._stderr_uploader = StdErrWithUpload(self._experiment)

    if run_monitoring_thread:
        self._run_monitoring_thread()

    if send_hardware_metrics and SystemMonitor.requirements_installed():
        self._run_hardware_metrics_reporting_thread()
def __init__(self):
    """Create a ``SystemMonitor`` and store it on the instance."""
    monitor = SystemMonitor()
    self.__system_monitor = monitor
def create():
    """Return a ``CGroupMonitor`` built from a new ``CGroupFilesystemReader`` and ``SystemMonitor``."""
    filesystem_reader = CGroupFilesystemReader()
    system_monitor = SystemMonitor()
    return CGroupMonitor(filesystem_reader, system_monitor)
def create_experiment(self,
                      name=None,
                      description=None,
                      params=None,
                      properties=None,
                      tags=None,
                      upload_source_files=None,
                      abort_callback=None,
                      upload_stdout=True,
                      upload_stderr=True,
                      send_hardware_metrics=True,
                      run_monitoring_thread=True,
                      handle_uncaught_exceptions=True):
    """Create an experiment in this project and start its helper threads.

    Missing arguments default to ``"Untitled"`` (name), ``""`` (description),
    ``{}`` (params, properties) and ``[]`` (tags). When
    ``upload_source_files`` is ``None`` the base name of ``sys.argv[0]`` is
    looked up in the current working directory and uploaded if it is a file.

    Depending on the flags, the method also installs ``sys.excepthook``,
    starts an aborting thread, wraps stdout/stderr with uploaders (never in
    a notebook), starts a ping thread and starts a hardware metric
    reporting thread. The experiment is then handed to
    ``push_new_experiment`` and returned.

    Raises:
        `ExperimentValidationError`: When provided arguments are invalid.
        `ExperimentLimitReached`: When experiment limit in the project has been reached.
    """
    name = "Untitled" if name is None else name
    description = "" if description is None else description
    params = {} if params is None else params
    properties = {} if properties is None else properties
    tags = [] if tags is None else tags

    abortable = (abort_callback is not None
                 or DefaultAbortImpl.requirements_installed())

    experiment = self.client.create_experiment(
        project=self,
        name=name,
        description=description,
        params=params,
        properties=properties,
        tags=tags,
        abortable=abortable,
        monitored=run_monitoring_thread)

    if upload_source_files is None:
        # Default to the entry-point script, if it lives in the working dir.
        cwd = os.getcwd()
        entry_point = os.path.join(cwd, os.path.basename(sys.argv[0]))
        upload_source_files = (
            [os.path.relpath(entry_point, cwd)]
            if os.path.isfile(entry_point)
            else []
        )

    experiment.upload_source_files(upload_source_files)

    def exception_handler(exc_type, exc_val, exc_tb):
        # Stop the experiment with the traceback, then defer to the default hook.
        trace_text = "\n".join(traceback.format_tb(exc_tb))
        experiment.stop(trace_text + "\n" + repr(exc_val))
        sys.__excepthook__(exc_type, exc_val, exc_tb)

    # pylint:disable=protected-access
    if handle_uncaught_exceptions:
        experiment._uncaught_exception_handler = exception_handler
        sys.excepthook = exception_handler

    experiment._channels_values_sender = ChannelsValuesSender(experiment)

    if abortable:
        abort_impl = (CustomAbortImpl(abort_callback)
                      if abort_callback
                      else DefaultAbortImpl(pid=os.getpid()))
        socket_factory = ReconnectingWebsocketFactory(
            client=self.client,
            experiment_id=experiment.internal_id)
        experiment._aborting_thread = AbortingThread(
            websocket_factory=socket_factory,
            abort_impl=abort_impl,
            experiment_id=experiment.internal_id)
        experiment._aborting_thread.start()

    # Stream uploaders are skipped inside notebooks.
    if upload_stdout and not is_notebook():
        experiment._stdout_uploader = StdOutWithUpload(experiment)

    if upload_stderr and not is_notebook():
        experiment._stderr_uploader = StdErrWithUpload(experiment)

    if run_monitoring_thread:
        experiment._ping_thread = PingThread(client=self.client,
                                             experiment=experiment)
        experiment._ping_thread.start()

    if send_hardware_metrics and SystemMonitor.requirements_installed():
        if in_docker():
            gauge_mode = GaugeMode.CGROUP
        else:
            gauge_mode = GaugeMode.SYSTEM
        service_factory = MetricServiceFactory(self.client, os.environ)
        metric_service = service_factory.create(
            gauge_mode=gauge_mode,
            experiment=experiment,
            reference_timestamp=time.time())
        experiment._hardware_metric_thread = HardwareMetricReportingThread(
            metric_service=metric_service,
            metric_sending_interval_seconds=3)
        experiment._hardware_metric_thread.start()

    push_new_experiment(experiment)
    return experiment