def destroy(self):
        """
        Destroy the DCGMMonitor. This function must be called
        in order to appropriately deallocate the resources.

        DCGM is shut down first and the parent class's ``destroy`` is
        called afterwards. The parent cleanup runs even when the DCGM
        shutdown raises, so a failing shutdown does not prevent the
        remaining resources from being released.
        """

        try:
            dcgm_agent.dcgmShutdown()
        finally:
            # Always give the base class a chance to release its own
            # resources, even if shutting DCGM down failed.
            super().destroy()
예제 #2
0
    def destroy(self):
        """
        Destroy the DCGMMonitor. This function must be called
        in order to appropriately deallocate the resources.

        DCGM is shut down first, then the monitor's thread pool is
        terminated and closed. The thread pool cleanup runs even when the
        DCGM shutdown raises, so the pool is not left behind on failure.
        """

        try:
            dcgm_agent.dcgmShutdown()
        finally:
            # Release the thread pool regardless of whether the DCGM
            # shutdown succeeded.
            self._thread_pool.terminate()
            self._thread_pool.close()
    def __init__(self, gpus, frequency, metrics, dcgmPath=None):
        """
        Initialize DCGM in embedded mode, create a GPU group containing
        the given devices and start watching the requested fields.

        Parameters
        ----------
        gpus : list of GPUDevice
            The gpus to be monitored
        frequency : int
            Sampling frequency for the metric
        metrics : list
            List of Record types to monitor
        dcgmPath : str (optional)
            DCGM installation path

        Raises
        ------
        TritonModelAnalyzerException
            If one of the requested metrics has no entry in
            ``model_analyzer_to_dcgm_field``.
        """

        super().__init__(frequency, metrics)
        structs._dcgmInit(dcgmPath)
        dcgm_agent.dcgmInit()

        self._gpus = gpus

        try:
            # Start DCGM in the embedded mode to use the shared library
            self.dcgm_handle = dcgm_handle = dcgm_agent.dcgmStartEmbedded(
                structs.DCGM_OPERATION_MODE_MANUAL)

            # Create DCGM monitor group
            self.group_id = dcgm_agent.dcgmGroupCreate(
                dcgm_handle, structs.DCGM_GROUP_EMPTY, "triton-monitor")
            # Add the GPUs to the group
            for gpu in self._gpus:
                dcgm_agent.dcgmGroupAddDevice(dcgm_handle, self.group_id,
                                              gpu.device_id())

            # NOTE(review): self._frequency is presumably stored by the
            # parent constructor; it is scaled by 1000 before being handed
            # to the watcher -- confirm the expected units.
            frequency = int(self._frequency * 1000)

            # Translate every requested metric into its DCGM field id.
            fields = []
            try:
                for metric in metrics:
                    fields.append(self.model_analyzer_to_dcgm_field[metric])
            except KeyError:
                # The KeyError carries no extra information beyond the
                # metric already named in the message, so suppress it.
                raise TritonModelAnalyzerException(
                    f'{metric} is not supported by Model Analyzer DCGM '
                    'Monitor') from None

            self.dcgm_field_group_id = dcgm_agent.dcgmFieldGroupCreate(
                dcgm_handle, fields, 'triton-monitor')

            self.group_watcher = dcgm_field_helpers.DcgmFieldGroupWatcher(
                dcgm_handle, self.group_id, self.dcgm_field_group_id.value,
                structs.DCGM_OPERATION_MODE_MANUAL, frequency, 3600, 0, 0)
        except BaseException:
            # Any failure during setup must not leave DCGM initialized,
            # since the caller never receives an object to destroy().
            dcgm_agent.dcgmShutdown()
            raise
예제 #4
0
    def create_device_by_uuid(uuid, dcgmPath=None):
        """
        Create a GPU device using the GPU uuid.

        Parameters
        ----------
        uuid : str
            UUID of the GPU to look up. It is ASCII-encoded and compared
            against the uuid reported by DCGM for each supported device.
        dcgmPath : str (optional)
            Path forwarded to ``structs._dcgmInit``.

        Returns
        -------
        Device
            The device associated with the uuid.

        Raises
        ------
        TritonModelAnalyzerException
            If the uuid does not exist this exception will be raised.
        """

        structs._dcgmInit(dcgmPath)
        dcgm_agent.dcgmInit()

        try:
            # Start DCGM in the embedded mode to use the shared library
            dcgm_handle = dcgm_agent.dcgmStartEmbedded(
                structs.DCGM_OPERATION_MODE_MANUAL)
            device_ids = dcgm_agent.dcgmGetAllSupportedDevices(dcgm_handle)
            for device_id in device_ids:
                identifiers = dcgm_agent.dcgmGetDeviceAttributes(
                    dcgm_handle, device_id).identifiers
                device_uuid = identifiers.uuid
                if bytes(uuid, encoding='ascii') != device_uuid:
                    continue

                # Normalize the bus id to upper-case ASCII bytes; this is
                # only needed for the device that is actually returned.
                pci_bus_id = bytes(
                    identifiers.pciBusId.decode('ascii').upper(),
                    encoding='ascii')
                return GPUDevice(device_id, pci_bus_id, device_uuid)

            raise TritonModelAnalyzerException(
                f'GPU UUID {uuid} was not found.')
        finally:
            # Shut DCGM down on every exit path: match found, uuid not
            # found, or an unexpected error from DCGM itself.
            dcgm_agent.dcgmShutdown()
예제 #5
0
    def create_device_by_bus_id(bus_id, dcgmPath=None):
        """
        Create a GPU device by using its bus ID.

        Parameters
        ----------
        bus_id : bytes
            Bus id corresponding to the GPU. The bus id should be created by
            converting the colon separated hex notation into a bytes type
            using ascii encoding. The bus id before conversion to bytes
            should look like "00:65:00".
        dcgmPath : str (optional)
            Path forwarded to ``structs._dcgmInit``.

        Returns
        -------
        Device
            The device associated with this bus id.

        Raises
        ------
        TritonModelAnalyzerException
            If none of the devices supported by DCGM has this bus id.
        """

        structs._dcgmInit(dcgmPath)
        dcgm_agent.dcgmInit()

        try:
            # Start DCGM in the embedded mode to use the shared library
            dcgm_handle = dcgm_agent.dcgmStartEmbedded(
                structs.DCGM_OPERATION_MODE_MANUAL)
            device_ids = dcgm_agent.dcgmGetAllSupportedDevices(dcgm_handle)
            for device_id in device_ids:
                identifiers = dcgm_agent.dcgmGetDeviceAttributes(
                    dcgm_handle, device_id).identifiers
                # DCGM's bus id is upper-cased before the comparison, so
                # the caller-supplied bus id is matched in upper-case form.
                pci_bus_id = bytes(
                    identifiers.pciBusId.decode('ascii').upper(),
                    encoding='ascii')
                if pci_bus_id == bus_id:
                    return GPUDevice(device_id, bus_id, identifiers.uuid)

            raise TritonModelAnalyzerException(
                f'GPU with {bus_id} bus id is not supported by DCGM.')
        finally:
            # Shut DCGM down on every exit path: match found, bus id not
            # found, or an unexpected error from DCGM itself.
            dcgm_agent.dcgmShutdown()
예제 #6
0
    def init_all_devices(self, dcgmPath=None):
        """
        Create GPUDevice objects for all DCGM visible
        devices.

        Each created device is appended to ``self._devices`` and indexed
        in ``self._devices_by_bus_id`` and ``self._devices_by_uuid``.
        Nothing is done when ``numba.cuda.is_available()`` is false.

        Parameters
        ----------
        dcgmPath : str
            Absolute path to dcgm shared library
        """

        if numba.cuda.is_available():
            logger.info("Initiliazing GPUDevice handles...")
            structs._dcgmInit(dcgmPath)
            dcgm_agent.dcgmInit()

            try:
                # Start DCGM in the embedded mode to use the shared library
                dcgm_handle = dcgm_agent.dcgmStartEmbedded(
                    structs.DCGM_OPERATION_MODE_MANUAL)

                # Create a GPU device for every supported DCGM device
                dcgm_device_ids = dcgm_agent.dcgmGetAllSupportedDevices(
                    dcgm_handle)

                for device_id in dcgm_device_ids:
                    identifiers = dcgm_agent.dcgmGetDeviceAttributes(
                        dcgm_handle, device_id).identifiers
                    pci_bus_id = identifiers.pciBusId.decode('utf-8').upper()
                    device_uuid = str(identifiers.uuid, encoding='utf-8')
                    device_name = str(identifiers.deviceName,
                                      encoding='utf-8')
                    gpu_device = GPUDevice(device_name, device_id,
                                           pci_bus_id, device_uuid)

                    self._devices.append(gpu_device)
                    self._devices_by_bus_id[pci_bus_id] = gpu_device
                    self._devices_by_uuid[device_uuid] = gpu_device
            finally:
                # Shut DCGM down even if enumeration or device
                # construction fails part-way through.
                dcgm_agent.dcgmShutdown()