def stop(self):
     """Stop every running container that belongs to this cluster.

     Containers are selected by the cluster label combined with the
     ``running`` status; each match is stopped and the stop is logged.
     """
     cluster_label = f'{MODELCI_DOCKER_LABEL}={self.cluster_name}'
     running_filter = {'label': [cluster_label], 'status': 'running'}
     running_containers = list_containers(self.docker_client, filters=running_filter)
     for container in running_containers:
         container.stop()
         self.logger.info(f'Container name={container.name} stopped.')
    def _start_mongo_db(self):
        """Start the MongoDB service container and create the application user.

        A ``mongo`` container with a randomized name is launched with
        ``self.mongo_port`` published for ``27017/tcp``. Afterwards the
        ``MONGO_USERNAME`` user is created with the ``readWrite`` role on
        ``MONGO_DB``. If that step fails, the error is logged, the freshly
        started container is killed and removed, and the method returns
        without raising.

        From https://stackoverflow.com/a/53522699/13173608.
        """
        mongo_name = f'mongo-{random.randint(0, 100000)}'

        self.docker_client.containers.run(
            'mongo', ports={'27017/tcp': self.mongo_port}, name=mongo_name,
            environment={
                'MONGO_INITDB_USERNAME': MONGO_USERNAME,
                'MONGO_INITDB_PASSWORD': MONGO_PASSWORD,
                'MONGO_INITDB_DATABASE': MONGO_DB,
            },
            labels={**self.common_labels, MODELCI_DOCKER_PORT_LABELS['mongo']: str(self.mongo_port)},
            **self.extra_container_kwargs
        )

        # Presumably gives mongod a moment to accept connections before the
        # user is created.
        time.sleep(1)
        client = None
        try:
            # create MongoDB user
            # NOTE(review): this connects to MONGO_PORT while the container is
            # published on self.mongo_port - confirm the two always agree.
            client = MongoClient(f'{MONGO_HOST}:{MONGO_PORT}')
            user_options = {'pwd': MONGO_PASSWORD, 'roles': [{'role': 'readWrite', 'db': MONGO_DB}]}
            client[MONGO_DB].command("createUser", MONGO_USERNAME, **user_options)
        except Exception as e:
            self.logger.error(f'Exception during starting MongoDB: {e}')
            # Docker's name filter matches substrings, so pick the container
            # whose name is exactly ours, and tolerate it being gone already.
            candidates = list_containers(self.docker_client, filters={'name': mongo_name})
            container = next((c for c in candidates if c.name == mongo_name), None)
            if container is not None:
                container.kill()
                container.remove()
            return
        finally:
            # Release the client's connections on both success and failure.
            if client is not None:
                client.close()

        check_container_status(self.docker_client, name=mongo_name)
        self.logger.info(f'Container name={mongo_name} started')
Пример #3
0
    def _start_gpu_metrics_node_exporter(self):
        """Start the GPU metrics exporter container next to the DCGM exporter.

        The exporter is launched privileged with a randomized name, publishes
        ``self.node_exporter_port`` for ``9400/tcp`` and mounts the volumes of
        the first container carrying the DCGM node exporter label.

        Raises:
            IndexError: If no container with the DCGM node exporter label
                exists to take volumes from.
        """
        gpu_metrics_name = f'gpu-metrics-exporter-{random.randint(0, 100000)}'

        dcgm_label = MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter']
        dcgm_containers = list_containers(
            self.docker_client,
            filters={'label': [dcgm_label]})
        if not dcgm_containers:
            # Fail with an explicit message instead of an opaque
            # "list index out of range".
            message = (
                f'Cannot start {gpu_metrics_name}: '
                f'no container with label {dcgm_label} found.'
            )
            self.logger.error(message)
            raise IndexError(message)
        dcgm_container = dcgm_containers[0]

        # start gpu-metric-exporter
        self.docker_client.containers.run(
            'bgbiao/gpu-metrics-exporter',
            privileged=True,
            name=gpu_metrics_name,
            ports={'9400/tcp': self.node_exporter_port},
            volumes_from=[dcgm_container.id],
            labels={
                **self.common_labels,
                MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter']:
                str(self.node_exporter_port),
                MODELCI_GPU_LABEL:
                str(self.enable_gpu),
            },
            **self.extra_container_kwargs)

        check_container_status(self.docker_client, gpu_metrics_name)
        self.logger.info(f'{gpu_metrics_name} started')
Пример #4
0
 def remove_all(self):
     """Stop and remove every container labelled with this cluster's name.

     Each removal is logged with the container id.
     """
     cluster_filter = {'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']}
     cluster_containers = list_containers(self.docker_client, filters=cluster_filter)
     for container in cluster_containers:
         container.stop()
         container.remove()
         self.logger.info(f'Container {container.id} is removed.')
Пример #5
0
    def start(self):
        """Start the ModelCI service.

        Steps:
            1. Remove containers labelled with the opposite GPU flag.
            2. Refuse to start when the cluster is already complete and
               running.
            3. Download the required images, restart any stopped containers
               of the cluster and launch the services that are missing.

        Returns:
            bool: ``False`` when the cluster already exists with all of its
            containers running; otherwise the result of ``self.connect()``.
        """

        # remove incorrect containers with different GPU enabled flag
        self._remove(
            filters={'label': [f'{MODELCI_GPU_LABEL}={not self.enable_gpu}']})

        containers_in_cluster = list_containers(
            docker_client=self.docker_client,
            filters={'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']})

        # One container per service started below: MongoDB, cAdvisor, the DCGM
        # node exporter and the GPU metrics exporter.
        expected_services = 4
        # Compare the container *count* (the list itself never equals an int).
        # Only a fully running cluster is rejected, so a stopped cluster can
        # still be restarted by the loop further down.
        cluster_is_up = (
            len(containers_in_cluster) == expected_services
            and all(
                container.attrs['State']['Status'] == 'running'
                for container in containers_in_cluster
            )
        )
        if cluster_is_up:
            self.logger.error(
                f'Cluster {self.cluster_name} cannot be started because it already exists.'
            )
            return False

        # download all required docker images
        self._download_serving_containers()

        # obtain which containers has started
        all_labels = dict()
        for container in containers_in_cluster:
            all_labels.update(container.labels)
            if container.attrs['State']['Status'] != 'running':
                # try start stopped container
                self.logger.warning(
                    f'Service already exist, found container name={container.name}.'
                )
                container.start()
                self.logger.info('Service started.')
            else:
                self.logger.warning(
                    f'Service with container name={container.name} already started.'
                )

        # Launch every service whose port label is absent from the cluster.
        if MODELCI_DOCKER_PORT_LABELS['mongo'] not in all_labels:
            self._start_mongo_db()

        if MODELCI_DOCKER_PORT_LABELS['cadvisor'] not in all_labels:
            self._start_cadvisor()

        if MODELCI_DOCKER_PORT_LABELS['dcgm_node_exporter'] not in all_labels:
            self._start_dcgm_node_exporter()

        if MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter'] not in all_labels:
            self._start_gpu_metrics_node_exporter()

        return self.connect()
    def connect(self):
        """Refresh the service ports from the labels of the cluster's containers.

        The labels of every container carrying this cluster's label are merged
        into one mapping, and the MongoDB, cAdvisor and GPU metrics exporter
        port attributes are overwritten with the values found there, since the
        ports stored on this object may not match the running containers.

        Returns:
            bool: Always ``True``.

        Raises:
            KeyError: If one of the expected port labels is missing.
        """
        cluster_filter = {'label': [f'{MODELCI_DOCKER_LABEL}={self.cluster_name}']}
        cluster_containers = list_containers(
            docker_client=self.docker_client, filters=cluster_filter)

        # Later containers overwrite labels shared with earlier ones.
        merged_labels = {}
        for member in cluster_containers:
            merged_labels.update(member.labels)

        mongo_label = MODELCI_DOCKER_PORT_LABELS['mongo']
        self.mongo_port = merged_labels[mongo_label]

        cadvisor_label = MODELCI_DOCKER_PORT_LABELS['cadvisor']
        self.cadvisor_port = merged_labels[cadvisor_label]

        exporter_label = MODELCI_DOCKER_PORT_LABELS['gpu_metrics_node_exporter']
        self.node_exporter_port = merged_labels[exporter_label]

        return True
 def _remove(self, filters):
     """Stop and remove every container selected by ``filters``.

     Args:
         filters: Filter specification passed straight to ``list_containers``.
     """
     for container in list_containers(self.docker_client, filters):
         container.stop()
         container.remove()
         self.logger.info(f'Container {container.id} is removed.')