def submit(self, data: Data, argv):
    """Run process with SLURM.

    For details, see
    :meth:`~resolwe.flow.managers.workload_connectors.base.BaseConnector.submit`.

    A batch script named ``slurm-<data pk>.sh`` is written into the
    runtime volume directory and handed to ``sbatch``. The call blocks
    until ``sbatch`` itself exits.

    :param data: The Data object to run; supplies the resource limits,
        the process slug (for partition selection) and the location
        subpath (for the SLURM output file name).
    :param argv: The argument vector of the command to run. Every item
        is shell-quoted before being written into the script.

    Errors are logged rather than raised: an :class:`OSError` raised
    while preparing or submitting the script is logged, and so is a
    non-zero exit status of ``sbatch``.
    """
    limits = data.get_resource_limits()
    logger.debug(
        __(
            "Connector '{}' running for Data with id {} ({}).",
            self.__class__.__module__,
            data.id,
            repr(argv),
        )
    )

    # Compute target partition: a per-process override wins over the
    # configured default.
    partition = getattr(settings, "FLOW_SLURM_PARTITION_OVERRIDES", {}).get(
        data.process.slug,
        getattr(settings, "FLOW_SLURM_PARTITION_DEFAULT", None),
    )

    try:
        runtime_dir = storage_settings.FLOW_VOLUMES["runtime"]["config"]["path"]
        script_path = os.path.join(runtime_dir, "slurm-{}.sh".format(data.pk))

        script_lines = [
            "#!/bin/bash\n",
            "#SBATCH --mem={}M\n".format(
                limits["memory"] + EXECUTOR_MEMORY_OVERHEAD
            ),
            "#SBATCH --cpus-per-task={}\n".format(limits["cores"]),
        ]
        if partition:
            script_lines.append("#SBATCH --partition={}\n".format(partition))
        script_lines.append(
            "#SBATCH --output slurm-url-{}-job-%j.out\n".format(
                data.location.subpath
            )
        )
        # Render the argument vector into a command line.
        script_lines.append(" ".join(map(shlex.quote, argv)) + "\n")

        # Make sure the resulting file is executable on creation.
        # O_TRUNC guarantees that a pre-existing script with the same name
        # cannot leave stale trailing content behind the new one.
        file_descriptor = os.open(
            script_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode=0o555
        )
        with os.fdopen(file_descriptor, "wt") as script:
            script.writelines(script_lines)

        command = ["/usr/bin/env", "sbatch", script_path]
        return_code = subprocess.Popen(
            command, cwd=runtime_dir, stdin=subprocess.DEVNULL
        ).wait()
        if return_code != 0:
            # The submission was rejected; report it instead of silently
            # dropping the job.
            logger.error(
                __(
                    "sbatch exited with status {} while submitting Data {}.",
                    return_code,
                    data.id,
                )
            )
    except OSError as err:
        logger.error(
            __(
                "OSError occurred while preparing SLURM script for Data {}: {}",
                data.id,
                err,
            )
        )
def start(self, data: Data, listener_connection: Tuple[str, str, str]):
    """Start process execution.

    Construct the kubernetes job description and pass it to kubernetes.

    The job consists of one init container and two regular containers
    (the processing container and the communicator container). Before the
    job is created, persistent volume claims are created for the
    processing volume and for the inputs volume (when the latter is
    configured), whenever ``_should_create_pvc`` says so.

    :param data: The Data object to run; supplies the resource limits,
        process requirements, location subpath and primary key.
    :param listener_connection: Passed unchanged to
        ``_prepare_environment`` to build the container environment.
    """
    container_environment = self._prepare_environment(
        data, listener_connection)
    location_subpath = Path(data.location.subpath)

    # Create kubernetes API every time otherwise it will time out
    # eventually and raise API exception.
    # Fall back to the in-cluster configuration when no kube config can be
    # loaded.
    try:
        kubernetes.config.load_kube_config()
    except kubernetes.config.config_exception.ConfigException:
        kubernetes.config.load_incluster_config()

    batch_api = kubernetes.client.BatchV1Api()
    core_api = kubernetes.client.CoreV1Api()

    # Underscores are replaced with dashes and the prefix is lowercased.
    container_name_prefix = (getattr(settings, "FLOW_EXECUTOR", {}).get(
        "CONTAINER_NAME_PREFIX", "resolwe").replace("_", "-").lower())
    container_name = self._generate_container_name(container_name_prefix,
                                                   data.pk)

    # Set resource limits.
    requests = dict()
    limits = data.get_resource_limits()

    # The "cores" entry is replaced by a "cpu" entry: the CPU limit is one
    # core above the number of cores reported for the data object.
    requests["cpu"] = limits.pop("cores")
    limits["cpu"] = requests["cpu"] + 1
    # Overcommit CPU by 20% (request only 80% of the cores).
    requests["cpu"] *= 0.8

    # The memory in the database is stored in megabytes but kubernetes
    # requires memory in bytes.
    # We request 10% less memory than stored in the database and set limit
    # at 10% more plus KUBERNETES_MEMORY_HARD_LIMIT_BUFFER. The processes
    # usually require 16GB, 32GB... and since the node usually has 64GB of
    # memory and some of it is consumed by the system processes only one
    # process that requires 32GB can run on a node instead of 2.
    # NOTE(review): the buffer is added before the conversion to bytes, so
    # it is presumably expressed in megabytes as well - confirm.
    requests["memory"] = 0.9 * limits["memory"]
    limits["memory"] = 1.1 * limits[
        "memory"] + KUBERNETES_MEMORY_HARD_LIMIT_BUFFER
    limits["memory"] *= 2**20  # 2 ** 20 = mebibyte
    requests["memory"] *= 2**20

    # Get the limits and requests for the communicator container.
    communicator_limits = getattr(
        settings,
        "FLOW_KUBERNETES_COMMUNICATOR_LIMITS",
        {
            "memory": "256M",
            "cpu": 0.1
        },
    )
    communicator_requests = getattr(
        settings,
        "FLOW_KUBERNETES_COMMUNICATOR_REQUESTS",
        {
            "memory": "256M",
            "cpu": 0.1
        },
    )

    resources = data.process.requirements.get("resources", {})
    network = "bridge"
    use_host_network = False
    if "network" in resources:
        # Configure Docker network mode for the container (if specified).
        # By default, current Docker versions use the 'bridge' mode which
        # creates a network stack on the default Docker bridge.
        # Only the "host" mode has an effect here: it switches the pod to
        # the host network.
        network = getattr(settings, "FLOW_EXECUTOR", {}).get("NETWORK", "")
        use_host_network = network == "host"

    # Security context shared by the processing and the communicator
    # containers: run as the current user and group, without privilege
    # escalation and with all capabilities dropped.
    security_context = {
        "runAsUser": os.getuid(),
        "runAsGroup": os.getgid(),
        "allowPrivilegeEscalation": False,
        "privileged": False,
        "capabilities": {
            "drop": ["ALL"]
        },
    }

    annotations = dict()

    # Do not evict job from node.
    annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"

    # Set seccomp policy to limit syscalls (unless disabled in settings).
    if not getattr(settings, "FLOW_DOCKER_DISABLE_SECCOMP", False):
        # The path is a relative path in the kubelet root
        # directory:
        # <seccomp_root>/<path>, where <seccomp_root> is defined via the
        # --seccomp-profile-root flag on the Kubelet. If the
        # --seccomp-profile-root flag is not defined, the default path will
        # be used, which is <root-dir>/seccomp where <root-dir> is
        # specified by the --root-dir flag.
        # https://kubernetes.io/docs/concepts/policy/pod-security-policy/
        #
        # The file is transferred to kubelets with daemonset ? Currently I
        # mount my /tmp directory to the /seccomp directory in minikube.
        annotations[
            "seccomp.security.alpha.kubernetes.io/pod"] = "runtime/default"

    # Both container images are passed through the image mapper.
    mapper = getattr(settings, "FLOW_CONTAINER_IMAGE_MAP", {})
    communicator_image = getattr(
        settings,
        "FLOW_DOCKER_COMMUNICATOR_IMAGE",
        "public.ecr.aws/s4q6j6e8/resolwe/com:latest",
    )
    communicator_image = self._image_mapper(communicator_image, mapper)

    # The image named in the process requirements wins over the default
    # processing image from the settings.
    requirements = data.process.requirements.get("executor",
                                                 {}).get("docker", {})
    processing_container_image = str(
        requirements.get(
            "image",
            getattr(
                settings,
                "FLOW_DOCKER_DEFAULT_PROCESSING_CONTAINER_IMAGE",
                "public.ecr.aws/s4q6j6e8/resolwe/base:ubuntu-20.04",
            ),
        ), )
    processing_container_image = self._image_mapper(
        processing_container_image, mapper)

    # When FLOW_KUBERNETES_AFFINITY is set, require nodes whose
    # "nodegroup" label matches that value.
    affinity = {}
    kubernetes_affinity = getattr(settings, "FLOW_KUBERNETES_AFFINITY",
                                  None)
    if kubernetes_affinity:
        affinity = {
            "nodeAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": {
                    "nodeSelectorTerms": [{
                        "matchExpressions": [{
                            "key": "nodegroup",
                            "operator": "In",
                            "values": [kubernetes_affinity],
                        }]
                    }]
                }
            }
        }

    # Look up the display value of the process scheduling class.
    job_type = dict(
        Process.SCHEDULING_CLASS_CHOICES)[data.process.scheduling_class]

    job_description = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": sanitize_kubernetes_label(container_name)
        },
        "spec": {
            # Keep the finished job around for 300 seconds. If job is not
            # deleted its PVC claim persists and it causes PV to stay
            # around.
            # This can be changed by running a cron job that periodically
            # checks for PVC that can be deleted.
            "ttlSecondsAfterFinished": 300,
            "template": {
                "metadata": {
                    "name": sanitize_kubernetes_label(container_name),
                    "labels": {
                        "app": "resolwe",
                        "data_id": str(data.pk),
                        "process":
                        sanitize_kubernetes_label(data.process.slug),
                        "job_type": sanitize_kubernetes_label(job_type),
                    },
                    "annotations": annotations,
                },
                "spec": {
                    "affinity": affinity,
                    "hostNetwork": use_host_network,
                    "volumes": self._volumes(data.id, location_subpath,
                                             core_api),
                    "initContainers": [
                        {
                            "name": sanitize_kubernetes_label(
                                f"{container_name}-init"),
                            "image": communicator_image,
                            "imagePullPolicy": "Always",
                            "workingDir": "/",
                            "command": ["/usr/local/bin/python3"],
                            "args": ["-m", "executors.init_container"],
                            # Unlike the other containers, the init
                            # container runs privileged.
                            "securityContext": {
                                "privileged": True
                            },
                            "volumeMounts":
                            self._init_container_mountpoints(),
                            "env": container_environment,
                        },
                    ],
                    "containers": [
                        {
                            "name":
                            sanitize_kubernetes_label(container_name),
                            "image": processing_container_image,
                            # NOTE: ``limits`` is referenced here, not
                            # copied; later changes to the dict are
                            # visible in the job description.
                            "resources": {
                                "limits": limits,
                                "requests": requests
                            },
                            "securityContext": security_context,
                            "env": container_environment,
                            "workingDir":
                            os.fspath(constants.PROCESSING_VOLUME),
                            "imagePullPolicy": "Always",
                            "command": ["/usr/bin/python3"],
                            "args": ["/processing.py"],
                            "volumeMounts": self._processing_mountpoints(
                                location_subpath,
                                data.process.run.get("language", None),
                            ),
                        },
                        {
                            "name": sanitize_kubernetes_label(
                                f"{container_name}-communicator"),
                            "image": communicator_image,
                            "imagePullPolicy": "Always",
                            "resources": {
                                "limits": communicator_limits,
                                "requests": communicator_requests,
                            },
                            "securityContext": security_context,
                            "env": container_environment,
                            "command": ["/usr/local/bin/python3"],
                            "args": ["/startup.py"],
                            "volumeMounts": self._communicator_mountpoints(
                                location_subpath),
                        },
                    ],
                    "restartPolicy": "Never",
                },
            },
            # Do not retry a failed job.
            "backoffLimit": 0,
        },
    }

    # Measure how long it takes to create the claims and the job.
    start_time = time.time()

    processing_name = constants.PROCESSING_VOLUME_NAME
    input_name = constants.INPUTS_VOLUME_NAME
    if self._should_create_pvc(
            storage_settings.FLOW_VOLUMES[processing_name]):
        claim_name = unique_volume_name(
            storage_settings.FLOW_VOLUMES[processing_name]["config"]
            ["name"],
            data.id,
        )
        # Popping "storage" also removes it from the limits embedded in
        # ``job_description`` above, since both refer to the same dict.
        claim_size = limits.pop("storage", 200) * (
            2**30)  # Default 200 gibibytes
        core_api.create_namespaced_persistent_volume_claim(
            body=self._persistent_volume_claim(
                claim_name,
                claim_size,
                storage_settings.FLOW_VOLUMES[processing_name]["config"],
            ),
            namespace=self.kubernetes_namespace,
            _request_timeout=KUBERNETES_TIMEOUT,
        )

    # The inputs volume is optional; its claim is sized by
    # ``_data_inputs_size``.
    if input_name in storage_settings.FLOW_VOLUMES:
        if self._should_create_pvc(
                storage_settings.FLOW_VOLUMES[input_name]):
            claim_size = self._data_inputs_size(data)
            claim_name = unique_volume_name(
                storage_settings.FLOW_VOLUMES[input_name]["config"]
                ["name"],
                data.id,
            )
            core_api.create_namespaced_persistent_volume_claim(
                body=self._persistent_volume_claim(
                    claim_name,
                    claim_size,
                    storage_settings.FLOW_VOLUMES[input_name]["config"],
                ),
                namespace=self.kubernetes_namespace,
                _request_timeout=KUBERNETES_TIMEOUT,
            )

    logger.debug(f"Creating namespaced job: {job_description}")
    batch_api.create_namespaced_job(
        body=job_description,
        namespace=self.kubernetes_namespace,
        _request_timeout=KUBERNETES_TIMEOUT,
    )
    end_time = time.time()
    logger.info(
        "It took {:.2f}s to send config to kubernetes".format(end_time -
                                                              start_time))
def process_data_object(data: Data):
    """Move one resolving data object on to its next state.

    The object is re-fetched with a row lock and then either marked as
    failed, left untouched (its inputs are not done yet), evaluated in
    place (workflows) or marked as waiting and scheduled for execution
    once the transaction commits.

    Relies on ``self`` from the enclosing scope.

    :param data: The data object to process; only its primary key is
        used, the object itself is reloaded from the database.
    """
    # Keep the locked section short to reduce contention and avoid
    # deadlocks: only this single object is locked for update instead of
    # every resolving object, which lets managers running in parallel
    # work on different objects.
    data = Data.objects.select_for_update().get(pk=data.pk)

    # Somebody else may have handled the object while we were waiting for
    # the lock; nothing left to do in that case.
    if data.status != Data.STATUS_RESOLVING:
        return

    def flag_worker_error():
        """Mark the attached worker (if any) as failed during preparation."""
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save(update_fields=["status"])

    inputs_status = dependency_status(data)

    if inputs_status == Data.STATUS_ERROR:
        # A failed input fails this object as well.
        data.status = Data.STATUS_ERROR
        data.process_error.append("One or more inputs have status ERROR")
        data.process_rc = 1
        data.save()
        flag_worker_error()
        return

    if inputs_status != Data.STATUS_DONE:
        # Inputs are still being computed.
        return

    needs_executor = False
    if data.process.run:
        try:
            # Verify the execution engine and evaluate workflows directly.
            engine_name = data.process.run.get("language", None)
            engine = self.get_execution_engine(engine_name)
            needs_executor = engine_name != "workflow"
            if needs_executor:
                # Record the resources allocated to the process.
                allocated = data.get_resource_limits()
                data.process_memory = allocated["memory"]
                data.process_cores = allocated["cores"]
            else:
                engine.evaluate(data)
        except (ExecutionError, InvalidEngineError) as error:
            data.status = Data.STATUS_ERROR
            data.process_error.append(
                "Error in process script: {}".format(error)
            )
            data.save()
            flag_worker_error()
            return

    # The execution engine may already have marked the object as done;
    # such a status must not be reverted to STATUS_WAITING.
    if data.status != Data.STATUS_DONE:
        data.status = Data.STATUS_WAITING

    data.save(render_name=True)

    # Hand the object to the executor only after the transaction went
    # through, and only when it was not evaluated above.
    if not needs_executor:
        return

    # Bind the current object as a default argument so the callback keeps
    # this value even though the name is rebound by the surrounding loop.
    transaction.on_commit(lambda obj=data: self._data_execute(obj))