Example #1
def test_ulimit_invalid_type(self):
    with pytest.raises(ValueError):
        Ulimit(name=None)
    with pytest.raises(ValueError):
        Ulimit(name='hello', soft='123')
    with pytest.raises(ValueError):
        Ulimit(name='hello', hard='456')
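For contrast with the invalid-type cases above, a minimal sketch of valid construction (the name and limit values are illustrative): Ulimit subclasses dict, so a valid instance serializes directly to the shape the Docker API expects.

from docker.types import Ulimit

# Valid construction: name is a str, soft/hard are ints (values illustrative).
nofile = Ulimit(name='nofile', soft=1024, hard=2048)
assert nofile == {'Name': 'nofile', 'Soft': 1024, 'Hard': 2048}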
Example #2
File: utils.py Project: wannaphong/epicbox
def create_ulimits(limits):
    ulimits = []
    if limits.get('cputime'):  # .get() avoids a KeyError when 'cputime' is absent
        cpu = limits['cputime']
        ulimits.append(Ulimit(name='cpu', soft=cpu, hard=cpu))
    if 'file_size' in limits:
        fsize = limits['file_size']
        ulimits.append(Ulimit(name='fsize', soft=fsize, hard=fsize))
    return ulimits or None
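A hedged usage sketch for create_ulimits() above; the limits dict and its units are assumptions for illustration.

# Illustrative input: 'cputime' in seconds, 'file_size' in bytes (assumed units).
limits = {'cputime': 10, 'file_size': 64 * 1024 * 1024}
ulimits = create_ulimits(limits)  # [Ulimit(cpu...), Ulimit(fsize...)], or None if neither limit is set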
Example #3
def test_compare_ulimits_neg(self):
    self.fake_data['params']['dimensions'] = {
        'ulimits': {
            'nofile': {
                'soft': 131072,
                'hard': 131072
            }
        }
    }
    ulimits_nofile = Ulimit(name='nofile', soft=131072, hard=131072)
    container_info = dict()
    container_info['HostConfig'] = {
        'CpuPeriod': 0,
        'KernelMemory': 0,
        'Memory': 0,
        'CpuQuota': 0,
        'CpusetCpus': '',
        'CpuShares': 0,
        'BlkioWeight': 0,
        'CpusetMems': '',
        'MemorySwap': 0,
        'MemoryReservation': 0,
        'Ulimits': [ulimits_nofile]
    }
    self.dw = get_DockerWorker(self.fake_data['params'])
    self.assertFalse(self.dw.compare_dimensions(container_info))
Example #4
def test_create_host_config_obj_ulimit(self):
    ulimit_dct = Ulimit(name='nofile', soft=8096)
    config = create_host_config(ulimits=[ulimit_dct],
                                version=DEFAULT_DOCKER_API_VERSION)
    assert 'Ulimits' in config
    assert len(config['Ulimits']) == 1
    ulimit_obj = config['Ulimits'][0]
    assert isinstance(ulimit_obj, Ulimit)
    assert ulimit_obj == ulimit_dct
Example #5
def test_create_host_config_obj_ulimit(self):
    ulimit_dct = Ulimit(name='nofile', soft=8096)
    config = create_host_config(ulimits=[ulimit_dct],
                                version=DEFAULT_DOCKER_API_VERSION)
    self.assertIn('Ulimits', config)
    self.assertEqual(len(config['Ulimits']), 1)
    ulimit_obj = config['Ulimits'][0]
    self.assertIsInstance(ulimit_obj, Ulimit)
    self.assertEqual(ulimit_obj, ulimit_dct)
Example #6
    def start(self) -> None:
        """
        Start Triton Server Container
        """
        devices = [
            DeviceRequest(capabilities=[["gpu"]], device_ids=self._devices),
        ]

        LOGGER.info(
            f"Triton environment: {json.dumps(self._environment, indent=4)}")

        LOGGER.info(f"Starting Triton container {self.name}.")
        self._container = self._docker_client.containers.run(
            image=self._image,
            name=self.name,
            device_requests=devices,
            detach=True,
            tty=True,
            shm_size=self._shm_size,
            ulimits=[
                Ulimit(name="memlock", soft=-1, hard=-1),
                Ulimit(name="stack", soft=67108864, hard=67108864),
            ],
            volumes=self._volumes,
            environment=self._environment,
            network_mode=self._network,
            auto_remove=True,
            ipc_mode="host",
        )
        LOGGER.info(f"Triton command:")
        LOGGER.info(f"  {self._command}")
        LOGGER.info(f"Starting Triton Server {self.name}.")
        self._triton_exec = self._docker_api_client.exec_create(
            container=self._container.id,
            cmd=self._command,
        )
        stream_generator = self._docker_api_client.exec_start(
            exec_id=self._triton_exec["Id"], stream=True)

        self._logging_thread = Thread(target=TritonServerContainer._logging,
                                      args=(self, stream_generator),
                                      daemon=True)
        self._logging_thread.start()
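A hedged sketch of a matching shutdown step; it is not part of the excerpt above, and the attribute names are assumed from start().

    def stop(self) -> None:
        """Sketch of a shutdown counterpart (attributes assumed from start())."""
        if self._container is not None:
            # auto_remove=True in start() removes the container once stopped.
            self._container.stop()
        if self._logging_thread is not None:
            self._logging_thread.join(timeout=5)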
Example #7
def test_ulimit_invalid_type(self):
    self.assertRaises(ValueError, lambda: Ulimit(name=None))
    self.assertRaises(ValueError, lambda: Ulimit(name='hello', soft='123'))
    self.assertRaises(ValueError, lambda: Ulimit(name='hello', hard='456'))
Example #8
def serve(
    save_path: Union[Path, str],
    device: str = 'cpu',
    name: Optional[str] = None,
    batch_size: int = 16,
) -> Container:
    """Serve the given model save path in a Docker container.

    Args:
        save_path (Union[Path, str]): Saved path to the model.
        device (str): Device name. E.g.: cpu, cuda, cuda:1.
        name (str): Container name. Defaults to None.
        batch_size (int): Batch size for passing to serving containers.

    Returns:
        Container: Docker container object created.

    """

    info = parse_path(Path(save_path))
    architecture: str = info['architecture']
    engine: Engine = info['engine']

    cuda, device_num = get_device(device)

    docker_client = docker.from_env()

    # set mount
    mounts = [
        Mount(target=f'/models/{architecture}',
              source=str(info['base_dir']),
              type='bind',
              read_only=True)
    ]

    common_kwargs = remove_dict_null({
        'detach': True,
        'auto_remove': True,
        'mounts': mounts,
        'name': name
    })
    environment = dict()

    if cuda:
        common_kwargs['runtime'] = 'nvidia'
        environment['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        environment['CUDA_VISIBLE_DEVICES'] = device_num

    if engine == Engine.TFS:
        # Tensorflow Serving 2.2.0 has the issue: https://github.com/tensorflow/serving/issues/1663
        docker_tag = '2.1.0-gpu' if cuda else '2.1.0'
        ports = {'8501': config.TFS_HTTP_PORT, '8500': config.TFS_GRPC_PORT}
        environment['MODEL_NAME'] = architecture
        container = docker_client.containers.run(
            f'tensorflow/serving:{docker_tag}',
            environment=environment,
            ports=ports,
            **common_kwargs)
    elif engine == Engine.TORCHSCRIPT:
        docker_tag = 'latest-gpu' if cuda else 'latest'
        ports = {
            '8000': config.TORCHSCRIPT_HTTP_PORT,
            '8001': config.TORCHSCRIPT_GRPC_PORT
        }
        environment['MODEL_NAME'] = architecture
        container = docker_client.containers.run(
            f'mlmodelci/pytorch-serving:{docker_tag}',
            environment=environment,
            ports=ports,
            **common_kwargs)
    elif engine == Engine.ONNX:
        docker_tag = 'latest-gpu' if cuda else 'latest'
        ports = {'8000': config.ONNX_HTTP_PORT, '8001': config.ONNX_GRPC_PORT}
        environment['MODEL_NAME'] = architecture
        container = docker_client.containers.run(
            f'mlmodelci/onnx-serving:{docker_tag}',
            environment=environment,
            ports=ports,
            **common_kwargs)
    elif engine == Engine.TRT:
        if not cuda:
            raise RuntimeError(
                'TensorRT cannot be run without CUDA. Please specify a CUDA device.'
            )

        ports = {
            '8000': config.TRT_HTTP_PORT,
            '8001': config.TRT_GRPC_PORT,
            '8002': config.TRT_PROMETHEUS_PORT
        }
        ulimits = [
            Ulimit(name='memlock', soft=-1, hard=-1),
            Ulimit(name='stack', soft=67108864, hard=67108864)  # 64 MiB
        ]
        trt_kwargs = {'ulimits': ulimits, 'shm_size': '1G'}
        container = docker_client.containers.run(
            'nvcr.io/nvidia/tensorrtserver:19.10-py3',
            'trtserver --model-repository=/models',
            environment=environment,
            ports=ports,
            **common_kwargs,
            **trt_kwargs,
        )
    else:
        raise RuntimeError(
            f'Not able to serve model with path `{str(save_path)}`.')

    return container
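A hypothetical invocation of serve(); the save path, device string, and container name are illustrative, and the path must follow whatever layout parse_path() expects.

# Hypothetical call; all argument values are illustrative only.
container = serve('path/to/saved/model', device='cuda:0', name='demo-serving')
print(container.name, container.status)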
Example #9
DOCKER_PARAMETERS = {
    "image": "pl:latest",
    "auto_remove": True,
    "cpu_period": 1000,
    "cpu_shares": 1024,
    "cpu_quota": 0,
    "cpuset_cpus": "0",
    "detach": True,
    "environment": {},
    "mem_limit": "100m",
    "memswap_limit": "200m",
    "network_mode": "none",
    "network_disabled": True,
    # "storage_opt":      {},
    "tty": True,
    "ulimits": [Ulimit(name="core", soft=0, hard=0)],
}

# Check whether any of the above settings are overridden by a config.py file.
logger = logging.getLogger(__name__)
try:
    from config import *  # noqa
    logger.info("Using config.py...")
except ModuleNotFoundError:
    logger.info("No config file found")
del logger

# Override some settings for testing purposes
if TESTING:
    DOCKER_COUNT = 5
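A hedged sketch of how a parameter dict like this is typically consumed; the client call below is an assumption, not part of this config module.

import docker

# Hypothetical consumer: unpack the settings into docker-py's run() call.
client = docker.from_env()
container = client.containers.run(**DOCKER_PARAMETERS)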
Example #10
    def _run_start_start(self):
        """Start the container and wait for it to finish starting"""
        # Get the value of the URL prefix that the app will have to the outside.
        url_prefix = reverse(
            "dockerapps:docker-proxy",
            kwargs={
                "project": self.process.project.sodar_uuid,
                "image": self.image.sodar_uuid,
                "process": self.image.process.sodar_uuid,
                "path": "",
            },
        )

        with transaction.atomic():
            self.process.refresh_from_db()
            if self.process.state in (STATE_IDLE, STATE_FAILED):
                self.job.add_log_entry("Starting container for %s:%s..." %
                                       (self.image.repository, self.image.tag))
                # Build environment, interpreting placeholders.
                environment = {}
                for entry in self.process.environment:
                    if "__KIOSC_URL_PREFIX__" in entry["value"]:
                        environment[entry["name"]] = entry["value"].replace(
                            "__KIOSC_URL_PREFIX__", url_prefix)
                    else:
                        environment[entry["name"]] = entry["value"]
                # Create and start the Docker container, update database record.
                container = self.cli.create_container(
                    detach=True,
                    image=self.image.image_id,
                    environment=environment,
                    command=(shlex.split(self.process.command)
                             if self.process.command else None),
                    ports=[self.process.internal_port],
                    host_config=self.cli.create_host_config(
                        port_bindings={
                            self.process.internal_port: self.process.host_port
                        },
                        ulimits=[
                            Ulimit(
                                name="nofile",
                                soft=settings.KIOSC_DOCKER_MAX_ULIMIT_NOFILE_SOFT,
                                hard=settings.KIOSC_DOCKER_MAX_ULIMIT_NOFILE_HARD,
                            )
                        ],
                    ),
                )
                self.cli.start(container=container.get("Id"))
                self.process.container_id = container.get("Id")
                self.process.state = STATE_STARTING
                self.process.save()
            else:
                self.job.add_log_entry(
                    "Process state is %s, not attempting to start" %
                    self.process.state)
        self.job.add_log_entry("Waiting for container to start...")
        timeout_start = time.time()
        while time.time() < timeout_start + self.timeout:
            if (self.cli.inspect_container(self.process.container_id).get(
                    "State", {}).get("Running")):
                self.job.add_log_entry("Container is running...")
                with transaction.atomic():
                    self.process.refresh_from_db()
                    self.process.state = STATE_RUNNING
                    self.process.save()
                break
            time.sleep(self.sleep_time)
        else:
            raise RuntimeError("Container did not start on time")
Example #11
    def create_container(self,
                         name,
                         image,
                         ram,
                         working_directory,
                         gpus=None,
                         environment=None,
                         enable_fuse=False):
        """
        Creates a docker container with the given arguments. This docker container is running endlessly until
        container.stop() is called.
        If nvidia gpus are specified, the nvidia runtime is used, if available. Otherwise a device request for nvidia
        gpus is added.

        :param name: The name of the container
        :type name: str
        :param image: The image to use for this container
        :type image: str
        :param ram: The ram limit for this container in megabytes
        :type ram: int
        :param working_directory: The working directory inside the docker container
        :type working_directory: str
        :param gpus: A specification of gpus to enable in this docker container
        :type gpus: List[GPUDevice]
        :param environment: A dictionary containing environment variables, which should be set inside the container
        :type environment: Dict[str, Any]
        :param enable_fuse: If True, SYS_ADMIN capabilities are granted for this container and /dev/fuse is mounted
        :type enable_fuse: bool

        :return: The created container
        :rtype: Container

        :raise RuntimeNotSupportedError: If the specified runtime is not installed on the docker host
        """
        if environment is None:
            environment = {}

        mem_limit = None
        if ram is not None:
            mem_limit = '{}m'.format(ram)

        gpu_ids = None
        if gpus:
            set_nvidia_environment_variables(
                environment, map(lambda gpu: gpu.device_id, gpus))
            gpu_ids = [gpu.device_id for gpu in gpus]

        # enable fuse
        devices = []
        capabilities = []
        if enable_fuse:
            devices.append('/dev/fuse')
            capabilities.append('SYS_ADMIN')

        container = create_container_with_gpus(
            self._client,
            image,
            command='/bin/sh',
            gpus=gpu_ids,
            available_runtimes=self._runtimes,
            name=name,
            user='******',
            working_dir=working_directory,
            mem_limit=mem_limit,
            memswap_limit=mem_limit,
            environment=environment,
            cap_add=capabilities,
            devices=devices,
            ulimits=[
                Ulimit(name='nofile', soft=NOFILE_LIMIT, hard=NOFILE_LIMIT)
            ],
            # needed to run the container endlessly
            tty=True,
            stdin_open=True,
            auto_remove=False,
        )
        container.start()

        return container
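A hypothetical call, assuming `engine` is an instance of the (unnamed) class this method belongs to; all argument values are illustrative.

# Hypothetical usage; `engine` stands in for the enclosing class instance.
container = engine.create_container(
    name='job-1',
    image='python:3.10-slim',
    ram=512,  # megabytes, per the docstring
    working_directory='/work',
)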
Example #12
def get_ulimits_config(config):
    return [
        Ulimit(name=ulimit['name'], soft=ulimit['soft'], hard=ulimit['hard'])
        for ulimit in config
    ]
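A brief usage sketch with an illustrative config list; each entry must carry the 'name', 'soft', and 'hard' keys the function reads.

config = [
    {'name': 'nofile', 'soft': 1024, 'hard': 2048},
    {'name': 'core', 'soft': 0, 'hard': 0},
]
ulimits = get_ulimits_config(config)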