Exemplo n.º 1
0
    def test_record_memory(self):
        # Records GPU used/free memory for `monitoring_time` seconds and
        # validates the records returned by the DCGM monitor. Also checks
        # that stopping a monitor that is not recording, and constructing a
        # monitor with an unknown metric, raise TritonModelAnalyzerException.

        # One measurement every 0.01 seconds
        frequency = 0.01
        monitoring_time = 10
        metrics = [GPUUsedMemory, GPUFreeMemory]
        gpus = ['all']
        dcgm_monitor = DCGMMonitor(gpus, frequency, metrics)
        dcgm_monitor.start_recording_metrics()
        time.sleep(monitoring_time)
        records = dcgm_monitor.stop_recording_metrics()

        # Recording has stopped; make sure the monitor is destroyed even
        # when one of the assertions below fails.
        try:
            # Assert instance types
            for record in records:
                self.assertIsInstance(record.device(), GPUDevice)
                self.assertIsInstance(record.value(), float)
                self.assertEqual(record.value(), TEST_RECORD_VALUE)
                self.assertIsInstance(record.timestamp(), int)

            # The number of records should be divisible by number of metrics
            self.assertEqual(len(records) % len(metrics), 0)
            self.assertGreater(len(records), 0)
            self.assertGreaterEqual(
                records[-1].timestamp() - records[0].timestamp(),
                monitoring_time)

            # A second stop without a matching start must be rejected
            with self.assertRaises(TritonModelAnalyzerException):
                dcgm_monitor.stop_recording_metrics()
        finally:
            dcgm_monitor.destroy()

        # An unknown metric must be rejected at construction time
        metrics = ['UndefinedTag']
        with self.assertRaises(TritonModelAnalyzerException):
            DCGMMonitor(gpus, frequency, metrics)
Exemplo n.º 2
0
    def test_gpu_id(self):
        # A DCGMMonitor built with an unknown GPU identifier is expected to
        # raise TritonModelAnalyzerException, while one built with TEST_UUID
        # is expected to construct and be destroyed without error.
        sampling_frequency = 0.01
        memory_metrics = [GPUUsedMemory, GPUFreeMemory]

        unknown_gpus = ['UndefinedId']
        with self.assertRaises(TritonModelAnalyzerException):
            DCGMMonitor(unknown_gpus, sampling_frequency, memory_metrics)

        # TEST_UUID is converted to str using the ASCII encoding
        known_gpus = [str(TEST_UUID, encoding='ascii')]
        monitor = DCGMMonitor(known_gpus, sampling_frequency, memory_metrics)
        monitor.destroy()
Exemplo n.º 3
0
    def _start_monitors(self):
        """
        Create the DCGM and CPU monitors, then begin
        recording metrics on each of them
        """

        interval = self._monitoring_interval

        self._dcgm_monitor = DCGMMonitor(self._gpus, interval,
                                         self._dcgm_metrics)
        self._cpu_monitor = CPUMonitor(self._server, interval,
                                       self._cpu_metrics)

        # Both monitors are constructed before either one starts recording
        for monitor in (self._dcgm_monitor, self._cpu_monitor):
            monitor.start_recording_metrics()
Exemplo n.º 4
0
 def test_immediate_start_stop(self):
     # Start recording and stop again right away, then destroy the monitor.
     # The test passes as long as none of these calls raises.
     sampling_frequency = 1
     memory_metrics = [GPUUsedMemory, GPUFreeMemory]

     monitor = DCGMMonitor(self._gpus, sampling_frequency, memory_metrics)
     monitor.start_recording_metrics()
     monitor.stop_recording_metrics()
     monitor.destroy()
Exemplo n.º 5
0
 def test_immediate_start_stop(self):
     # Start recording on all GPUs and stop again right away, then destroy
     # the monitor. The test passes as long as none of these calls raises.
     sampling_frequency = 0.01
     memory_tags = [GPUUsedMemory, GPUFreeMemory]
     all_gpus = ['all']

     monitor = DCGMMonitor(all_gpus, sampling_frequency, memory_tags)
     monitor.start_recording_metrics()
     monitor.stop_recording_metrics()
     monitor.destroy()
    def _start_monitors(self, cpu_only=False):
        """
        Start any metrics monitors

        Parameters
        ----------
        cpu_only : bool
            When True, no GPU monitor is created and only
            the CPU monitor is started

        Raises
        ------
        TritonModelAnalyzerException
            Re-raised, after the monitors have been destroyed, when
            creating, checking or starting the GPU monitor fails
        """

        if not cpu_only:
            try:
                if not self._config.use_local_gpu_monitor:
                    # GPU metrics are read from the Triton metrics URL
                    self._gpu_monitor = RemoteMonitor(
                        self._config.triton_metrics_url,
                        self._config.monitoring_interval, self._gpu_metrics)
                else:
                    # GPU metrics are collected locally through DCGM
                    self._gpu_monitor = DCGMMonitor(
                        self._gpus, self._config.monitoring_interval,
                        self._gpu_metrics)
                    self._check_triton_and_model_analyzer_gpus()
                self._gpu_monitor.start_recording_metrics()
            except TritonModelAnalyzerException:
                # Clean up before propagating the failure to the caller
                self._destroy_monitors()
                raise

        # The CPU monitor is started regardless of cpu_only
        self._cpu_monitor = CPUMonitor(self._server,
                                       self._config.monitoring_interval,
                                       self._cpu_metrics)
        self._cpu_monitor.start_recording_metrics()
Exemplo n.º 7
0
    def test_record_utilization(self):
        # Records GPU utilization for `monitoring_time` seconds and validates
        # the records returned by the DCGM monitor.

        # One measurement every 0.01 seconds
        frequency = 0.01
        monitoring_time = 10
        metrics = [GPUUtilization]
        gpus = ['all']
        dcgm_monitor = DCGMMonitor(gpus, frequency, metrics)
        dcgm_monitor.start_recording_metrics()
        time.sleep(monitoring_time)
        records = dcgm_monitor.stop_recording_metrics()

        # Recording has stopped; make sure the monitor is destroyed even
        # when one of the assertions below fails.
        try:
            # Assert instance types
            for record in records:
                self.assertIsInstance(record.device(), GPUDevice)
                self.assertIsInstance(record.value(), float)
                # Recorded utilization values must not exceed 100
                self.assertLessEqual(record.value(), 100)
                self.assertEqual(record.value(), TEST_RECORD_VALUE)
                self.assertIsInstance(record.timestamp(), int)

            # The number of records should be divisible by number of metrics
            self.assertEqual(len(records) % len(metrics), 0)
            self.assertGreater(len(records), 0)
            self.assertGreaterEqual(
                records[-1].timestamp() - records[0].timestamp(),
                monitoring_time)
        finally:
            dcgm_monitor.destroy()
Exemplo n.º 8
0
    def test_record_power(self):
        # Records GPU power usage for `monitoring_time` seconds and validates
        # the records returned by the DCGM monitor.

        # Sampling frequency handed to the monitor
        frequency = 1
        monitoring_time = 2
        metrics = [GPUPowerUsage]
        dcgm_monitor = DCGMMonitor(self._gpus, frequency, metrics)
        dcgm_monitor.start_recording_metrics()
        time.sleep(monitoring_time)
        records = dcgm_monitor.stop_recording_metrics()

        # Recording has stopped; make sure the monitor is destroyed even
        # when one of the assertions below fails.
        try:
            # Assert instance types
            for record in records:
                self.assertIsInstance(record.device_uuid(), str)
                self.assertIsInstance(record.value(), float)
                self.assertEqual(record.value(), TEST_RECORD_VALUE)
                self.assertIsInstance(record.timestamp(), int)

            # The number of records should be divisible by number of metrics
            self.assertEqual(len(records) % len(metrics), 0)
            self.assertGreater(len(records), 0)
            self.assertGreaterEqual(
                records[-1].timestamp() - records[0].timestamp(),
                monitoring_time)
        finally:
            dcgm_monitor.destroy()
Exemplo n.º 9
0
class MetricsManager:
    """
    This class handles the profiling and the
    categorization of metrics into DCGM (GPU),
    perf_analyzer and CPU groups
    """
    def __init__(self, config, metric_tags, server, result_manager):
        """
        Parameters
        ----------
        config : AnalyzerConfig
            The model analyzer's config
        metric_tags : List of str
            The list of tags corresponding to the metric types to monitor.
        server : TritonServer
            Handle to the instance of Triton being used
        result_manager : ResultManager
            instance that manages the result tables and
            adding results
        """

        self._server = server
        self._gpus = config.gpus
        self._monitoring_interval = config.monitoring_interval
        self._perf_analyzer_path = config.perf_analyzer_path
        self._config = config
        self._result_manager = result_manager

        # Filled in by _create_metric_tables
        self._dcgm_metrics = []
        self._perf_metrics = []
        self._cpu_metrics = []

        self._create_metric_tables(metrics=MetricsManager.get_metric_types(
            tags=metric_tags))

    def _create_metric_tables(self, metrics):
        """
        Splits up monitoring metrics into various
        categories, defined in __init__ and
        requests result manager to make
        corresponding table

        Parameters
        ----------
        metrics : list
            The record types to categorize. A metric that belongs
            to none of the three categories is ignored.
        """

        # Separates metrics into related lists; the first matching
        # category wins
        for metric in metrics:
            if metric in DCGMMonitor.model_analyzer_to_dcgm_field:
                self._dcgm_metrics.append(metric)
            elif metric in PerfAnalyzer.perf_metrics:
                self._perf_metrics.append(metric)
            elif metric in CPUMonitor.cpu_metrics:
                self._cpu_metrics.append(metric)

        self._result_manager.create_tables(
            gpu_specific_metrics=self._dcgm_metrics,
            non_gpu_specific_metrics=self._perf_metrics + self._cpu_metrics)

    def profile_server(self):
        """
        Runs the DCGM monitor on the triton server without the perf_analyzer
        and hands the GPU metrics to the result manager

        Raises
        ------
        TritonModelAnalyzerException
        """

        # NOTE(review): _start_monitors also starts the CPU monitor, which
        # is destroyed (but never explicitly stopped) on this path -
        # confirm that this is intended.
        self._start_monitors()
        server_gpu_metrics = self._get_gpu_inference_metrics()
        self._result_manager.add_server_data(data=server_gpu_metrics)

    def profile_model(self, perf_config, perf_output_writer=None):
        """
        Runs monitors while running perf_analyzer with a specific set of
        arguments. This will profile model inferencing.

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Returns
        -------
        (dict of lists, list)
            The gpu specific and non gpu metrics, or (None, None)
            when the perf_analyzer run reported a failure
        """

        # Start monitors and run perf_analyzer
        self._start_monitors()
        perf_analyzer_metrics_or_status = self._get_perf_analyzer_metrics(
            perf_config, perf_output_writer)

        # Failed status: tear the monitors down and report no measurement
        if perf_analyzer_metrics_or_status == 1:
            self._stop_monitors()
            self._destroy_monitors()
            return None, None

        perf_analyzer_metrics = perf_analyzer_metrics_or_status

        # Get metrics for model inference and combine metrics that do not
        # have GPU ID
        model_gpu_metrics = self._get_gpu_inference_metrics()
        model_cpu_metrics = self._get_cpu_inference_metrics()
        model_non_gpu_metrics = list(perf_analyzer_metrics.values()) + list(
            model_cpu_metrics.values())

        return model_gpu_metrics, model_non_gpu_metrics

    def _start_monitors(self):
        """
        Start any metrics monitors
        """

        self._dcgm_monitor = DCGMMonitor(self._gpus, self._monitoring_interval,
                                         self._dcgm_metrics)
        self._cpu_monitor = CPUMonitor(self._server, self._monitoring_interval,
                                       self._cpu_metrics)

        self._dcgm_monitor.start_recording_metrics()
        self._cpu_monitor.start_recording_metrics()

    def _stop_monitors(self):
        """
        Stop any metrics monitors
        """

        self._dcgm_monitor.stop_recording_metrics()
        self._cpu_monitor.stop_recording_metrics()

    def _destroy_monitors(self):
        """
        Destroy the monitors created by start
        """

        self._dcgm_monitor.destroy()
        self._cpu_monitor.destroy()

    def _get_perf_analyzer_metrics(self, perf_config, perf_output_writer=None):
        """
        Gets the aggregated metrics from the perf_analyzer

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Returns
        -------
        The aggregated perf_analyzer records, or the int 1 when the
        perf_analyzer run reported a failure status

        Raises
        ------
        TritonModelAnalyzerException
            If the perf_analyzer binary cannot be found
        """

        try:
            perf_analyzer = PerfAnalyzer(
                path=self._perf_analyzer_path,
                config=perf_config,
                timeout=self._config.perf_analyzer_timeout,
                max_cpu_util=self._config.perf_analyzer_cpu_util)
            status = perf_analyzer.run(self._perf_metrics)
            # PerfAnalyzer run was not successful
            if status == 1:
                return 1
        except FileNotFoundError as e:
            # Chain the original error so the traceback keeps the cause
            raise TritonModelAnalyzerException(
                f"perf_analyzer binary not found : {e}") from e

        if perf_output_writer:
            perf_output_writer.write(perf_analyzer.output() + '\n')

        perf_records = perf_analyzer.get_records()
        perf_record_aggregator = RecordAggregator()
        perf_record_aggregator.insert_all(perf_records)

        return perf_record_aggregator.aggregate()

    def _get_gpu_inference_metrics(self):
        """
        Stops GPU monitor and aggregates any records
        that are GPU specific

        Returns
        -------
        dict
            keys are gpu ids and values are metric values
            in the order specified in self._dcgm_metrics
        """

        # Stop the DCGM monitor and destroy the monitors
        # NOTE(review): _destroy_monitors() also destroys the CPU monitor,
        # and _get_cpu_inference_metrics() stops and destroys it again
        # afterwards - confirm destroy() tolerates this.
        dcgm_records = self._dcgm_monitor.stop_recording_metrics()
        self._destroy_monitors()

        # Insert all records into aggregator and get aggregated DCGM records
        dcgm_record_aggregator = RecordAggregator()
        dcgm_record_aggregator.insert_all(dcgm_records)

        records_groupby_gpu = dcgm_record_aggregator.groupby(
            self._dcgm_metrics, lambda record: record.device().device_id())

        # Regroup the per-metric mappings into one list of values per GPU
        gpu_metrics = defaultdict(list)
        for metric in records_groupby_gpu.values():
            for gpu_id, metric_value in metric.items():
                gpu_metrics[gpu_id].append(metric_value)

        return gpu_metrics

    def _get_cpu_inference_metrics(self):
        """
        Stops any monitors that just need the records to be aggregated
        like the CPU metrics

        Returns
        -------
        The aggregated CPU records
        """

        cpu_records = self._cpu_monitor.stop_recording_metrics()
        self._destroy_monitors()

        cpu_record_aggregator = RecordAggregator()
        cpu_record_aggregator.insert_all(cpu_records)
        return cpu_record_aggregator.aggregate()

    @staticmethod
    def get_metric_types(tags):
        """
        Parameters
        ----------
        tags : list of str
            Human readable names for the
            metrics to monitor. They correspond
            to actual record types.

        Returns
        -------
        List
            of record types being monitored
        """

        return [RecordType.get(tag) for tag in tags]
Exemplo n.º 10
0
class MetricsManager:
    """
    This class handles the profiling and the
    categorization of metrics into DCGM (GPU),
    perf_analyzer and CPU groups
    """
    def __init__(self, config, metric_tags, server, result_manager):
        """
        Parameters
        ----------
        config : AnalyzerConfig
            The model analyzer's config
        metric_tags : List of str
            The list of tags corresponding to the metric types to monitor.
        server : TritonServer
            Handle to the instance of Triton being used
        result_manager : ResultManager
            instance that manages the result tables and
            adding results
        """

        self._server = server
        self._gpus = config.gpus
        self._monitoring_interval = config.monitoring_interval
        self._perf_analyzer_path = config.perf_analyzer_path
        self._result_manager = result_manager

        # Filled in by _create_metric_tables
        self._dcgm_metrics = []
        self._perf_metrics = []
        self._cpu_metrics = []

        self._create_metric_tables(metrics=MetricsManager.get_metric_types(
            tags=metric_tags))

    def _create_metric_tables(self, metrics):
        """
        Splits up monitoring metrics into various
        categories, defined in __init__ and
        requests result manager to make
        corresponding table

        Parameters
        ----------
        metrics : list
            The record types to categorize. A metric that belongs
            to none of the three categories is ignored.
        """

        # Separates metrics into related lists; the first matching
        # category wins
        for metric in metrics:
            if metric in DCGMMonitor.model_analyzer_to_dcgm_field:
                self._dcgm_metrics.append(metric)
            elif metric in PerfAnalyzer.perf_metrics:
                self._perf_metrics.append(metric)
            elif metric in CPUMonitor.cpu_metrics:
                self._cpu_metrics.append(metric)

        self._result_manager.create_tables(
            gpu_specific_metrics=self._dcgm_metrics,
            non_gpu_specific_metrics=self._perf_metrics + self._cpu_metrics,
            aggregation_tag='Max')

    def configure_result_manager(self, config_model):
        """
        Processes the constraints and objectives
        for given ConfigModel and creates a result
        comparator to pass to the result manager

        Parameters
        ----------
        config_model : ConfigModel
            The config model object for the model that is currently being
            run
        """

        # Map each objective's record type to its configured value
        objectives_by_tag = config_model.objectives()
        objective_tags = list(objectives_by_tag.keys())
        objective_metrics = MetricsManager.get_metric_types(
            tags=objective_tags)
        objectives = {
            metric: objectives_by_tag[tag]
            for metric, tag in zip(objective_metrics, objective_tags)
        }

        # Constraints may be empty
        constraints = {}
        constraints_by_tag = config_model.constraints()
        if constraints_by_tag:
            constraint_tags = list(constraints_by_tag.keys())
            constraint_metrics = MetricsManager.get_metric_types(
                tags=constraint_tags)
            constraints = {
                metric: constraints_by_tag[tag]
                for metric, tag in zip(constraint_metrics, constraint_tags)
            }

        self._result_comparator = ResultComparator(
            gpu_metric_types=self._dcgm_metrics,
            non_gpu_metric_types=self._perf_metrics + self._cpu_metrics,
            metric_objectives=objectives)

        self._result_manager.set_constraints_and_comparator(
            constraints=constraints, comparator=self._result_comparator)

    def profile_server(self, default_value):
        """
        Runs the DCGM monitor on the triton server without the perf_analyzer
        and hands the GPU metrics to the result manager

        Parameters
        ----------
        default_value : str
            The value to fill in for columns in the table that don't apply to
            profiling server only

        Raises
        ------
        TritonModelAnalyzerException
        """

        # NOTE(review): _start_monitors also starts the CPU monitor, which
        # is destroyed (but never explicitly stopped) on this path -
        # confirm that this is intended.
        self._start_monitors()
        server_gpu_metrics = self._get_gpu_inference_metrics()
        self._result_manager.add_server_data(data=server_gpu_metrics,
                                             default_value=default_value)

    def profile_model(self, perf_config, perf_output_writer=None):
        """
        Runs monitors while running perf_analyzer with a specific set of
        arguments. This will profile model inferencing and add the
        resulting measurement to the result manager.

        configure_result_manager must have been called beforehand, since
        it is the only place where self._result_comparator is set.

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written
        """

        # Start monitors and run perf_analyzer
        self._start_monitors()
        perf_analyzer_metrics = self._get_perf_analyzer_metrics(
            perf_config, perf_output_writer)

        # Get metrics for model inference and combine metrics that do not
        # have GPU ID
        model_gpu_metrics = self._get_gpu_inference_metrics()
        model_cpu_metrics = self._get_cpu_inference_metrics()
        model_non_gpu_metric_values = list(
            perf_analyzer_metrics.values()) + list(model_cpu_metrics.values())

        # Construct a measurement
        model_measurement = Measurement(
            gpu_data=model_gpu_metrics,
            non_gpu_data=model_non_gpu_metric_values,
            perf_config=perf_config,
            comparator=self._result_comparator)

        self._result_manager.add_model_data(measurement=model_measurement)

    def _start_monitors(self):
        """
        Start any metrics monitors
        """

        self._dcgm_monitor = DCGMMonitor(self._gpus, self._monitoring_interval,
                                         self._dcgm_metrics)
        self._cpu_monitor = CPUMonitor(self._server, self._monitoring_interval,
                                       self._cpu_metrics)

        self._dcgm_monitor.start_recording_metrics()
        self._cpu_monitor.start_recording_metrics()

    def _destroy_monitors(self):
        """
        Destroy the monitors created by start
        """

        self._dcgm_monitor.destroy()
        self._cpu_monitor.destroy()

    def _get_perf_analyzer_metrics(self, perf_config, perf_output_writer=None):
        """
        Gets the aggregated metrics from the perf_analyzer

        Parameters
        ----------
        perf_config : dict
            The keys are arguments to perf_analyzer The values are their
            values
        perf_output_writer : OutputWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Returns
        -------
        The aggregated perf_analyzer records

        Raises
        ------
        TritonModelAnalyzerException
            If the perf_analyzer binary cannot be found
        """

        try:
            perf_analyzer = PerfAnalyzer(path=self._perf_analyzer_path,
                                         config=perf_config)
            perf_analyzer.run(self._perf_metrics)
        except FileNotFoundError as e:
            # Chain the original error so the traceback keeps the cause
            raise TritonModelAnalyzerException(
                f"perf_analyzer binary not found : {e}") from e

        if perf_output_writer:
            perf_output_writer.write(perf_analyzer.output() + '\n')

        perf_records = perf_analyzer.get_records()
        perf_record_aggregator = RecordAggregator()
        perf_record_aggregator.insert_all(perf_records)

        return perf_record_aggregator.aggregate()

    def _get_gpu_inference_metrics(self):
        """
        Stops GPU monitor and aggregates any records
        that are GPU specific

        Returns
        -------
        dict
            keys are gpu ids and values are metric values
            in the order specified in self._dcgm_metrics
        """

        # Stop the DCGM monitor and destroy the monitors
        # NOTE(review): _destroy_monitors() also destroys the CPU monitor,
        # and _get_cpu_inference_metrics() stops and destroys it again
        # afterwards - confirm destroy() tolerates this.
        dcgm_records = self._dcgm_monitor.stop_recording_metrics()
        self._destroy_monitors()

        # Insert all records into aggregator and get aggregated DCGM records
        dcgm_record_aggregator = RecordAggregator()
        dcgm_record_aggregator.insert_all(dcgm_records)

        records_groupby_gpu = dcgm_record_aggregator.groupby(
            self._dcgm_metrics, lambda record: record.device().device_id())

        # Regroup the per-metric mappings into one list of values per GPU
        gpu_metrics = defaultdict(list)
        for metric in records_groupby_gpu.values():
            for gpu_id, metric_value in metric.items():
                gpu_metrics[gpu_id].append(metric_value)

        return gpu_metrics

    def _get_cpu_inference_metrics(self):
        """
        Stops any monitors that just need the records to be aggregated
        like the CPU metrics

        Returns
        -------
        The aggregated CPU records
        """

        cpu_records = self._cpu_monitor.stop_recording_metrics()
        self._destroy_monitors()

        cpu_record_aggregator = RecordAggregator()
        cpu_record_aggregator.insert_all(cpu_records)
        return cpu_record_aggregator.aggregate()

    @staticmethod
    def get_metric_types(tags):
        """
        Parameters
        ----------
        tags : list of str
            Human readable names for the
            metrics to monitor. They correspond
            to actual record types.

        Returns
        -------
        List
            of record types being monitored
        """

        return [RecordType.get(tag) for tag in tags]