Example #1
0
            def load_machines(self, service_name):
                """Rebuild the in-memory allocation state of every machine.

                Reads the per-resource gpu/cpu allocation hashes from redis,
                re-registers each already-allocated task on its machine,
                accounts that usage against the owning entity, and records the
                capacity still available on each machine.

                Returns True when at least one machine is configured.
                """
                self.entities_usage = EntityUsage.initialize_entities_usage(
                    self.worker._mongo_client, service_name)
                for resource, machine in six.iteritems(self._machines):
                    used = Capacity()
                    gpu_tasks = self.worker._redis.hgetall(
                        'gpu_resource:%s:%s' % (self.worker._service, resource))
                    cpu_tasks = self.worker._redis.hgetall(
                        'cpu_resource:%s:%s' % (self.worker._service, resource))

                    # can not launch multiple tasks on service with no
                    # multi-tasking (ec2) or launch multiple tasks on service
                    # with hybrid task mode and dynamic resource mode (nova).
                    # NOTE(review): `service` is not defined in this method's
                    # visible scope — presumably bound in an enclosing scope;
                    # confirm it should not be derived from `service_name`.
                    if not _is_resource_multitask(
                            service, resource) and (gpu_tasks or cpu_tasks):
                        continue

                    # task_id -> owning entity, so each task is resolved once
                    # even when it appears in both the gpu and cpu hashes.
                    entity_by_task = {}
                    for tasks, kind in ((gpu_tasks, 'ngpus'),
                                        (cpu_tasks, 'ncpus')):
                        for task_id in six.itervalues(tasks):
                            if task_id in entity_by_task:
                                entity = entity_by_task[task_id]
                            else:
                                entity = task.get_owner_entity(
                                    self.worker._redis, task_id)
                                entity_by_task[task_id] = entity

                            if task_id not in self.preallocated_task_resource:
                                self.preallocated_task_resource[task_id] = resource
                            machine.add_task(task_id, self.worker._redis)
                            getattr(used, 'incr_' + kind)(1)
                            self.entities_usage[entity].add_current_usage(
                                Capacity(**{kind: 1}))

                    free = machine._init_capacity - used
                    machine.set_available(free)
                    self.worker._logger.debug("\tresource %s: - free %s",
                                              resource, free)

                # NOTE(review): `resource_mgr` is not defined in this method's
                # visible scope; presumably the enclosing scope's manager —
                # confirm it should not simply be `self`.
                return len(resource_mgr._machines) > 0
Example #2
0
    def _allocate_resource(self, task_id, request_resource, service,
                           task_expected_capacity):
        """Allocates a resource for task_id and returns the name of the resource
           (or None if none where allocated), and the number of allocated gpus/cpus
        """
        task_entity = task.get_owner_entity(self._redis, task_id)
        resources = service.list_resources()

        # Partition the service's machines by the kind of task they accept.
        cpu_only, gpu_only, mixed = self._split_machines_by_task_support(
            resources=resources, service=service)

        def pick_from(pool):
            # Try to place the task on one of the machines in `pool`.
            return self._distribute_machine_for_task(
                task_id, task_entity, task_expected_capacity,
                request_resource, service, pool)

        if self._is_required_gpu_task(task_expected_capacity):
            # GPU tasks may land on gpu-only or mixed machines.
            gpu_capable = dict(gpu_only)
            gpu_capable.update(mixed)
            return pick_from(gpu_capable)

        # CPU-only tasks prefer cpu-only machines, falling back to mixed ones.
        return pick_from(cpu_only) or pick_from(mixed)
Example #3
0
 def _get_current_config(self, task_id):
     """Return the effective redis-stored configuration for *task_id*,
     scoped to the task's owner entity and its storage entities."""
     owner = task.get_owner_entity(self._redis, task_id)
     storage_filter = task.get_storages_entity(self._redis, task_id)
     return config.get_entity_cfg_from_redis(
         self._redis, self._service, storage_filter, owner)
Example #4
0
            def try_create(next_task_id):
                """Check whether *next_task_id* can be considered for scheduling.

                Returns a CandidateTask when the task's owner entity has a
                usage quota, any parent-task dependency is satisfied, and at
                least one machine could in principle host it; returns None
                otherwise (in the dependency-error case the task is also
                terminated).
                """
                next_keyt = 'task:%s' % next_task_id
                parent = self._redis.hget(next_keyt, 'parent')
                task_entity = task.get_owner_entity(self._redis, next_task_id)

                # An entity with no registered usage limit cannot be scheduled
                # at all — log it as an error so the misconfiguration is visible.
                if task_entity not in resource_mgr.entities_usage:
                    self._logger.error(
                        "\t[Task %s] entity %s - without usage limit !",
                        next_task_id, task_entity)
                    return None

                # check parent dependency
                if parent:
                    keyp = 'task:%s' % parent
                    if self._redis.exists(keyp):
                        # if the parent task is in the database, check for dependencies
                        parent_status = self._redis.hget(keyp, 'status')
                        if parent_status != 'stopped':
                            if parent_status == 'running':
                                # parent is still running so update queued time to be as close
                                # as possible to terminate time of parent task
                                self._redis.hset(next_keyt, "queued_time",
                                                 time.time())
                            return None

                        # Parent stopped but did not complete: this task's
                        # dependency can never be met, so terminate it.
                        if self._redis.hget(keyp, 'message') != 'completed':
                            task.terminate(self._redis,
                                           next_task_id,
                                           phase='dependency_error')
                            return None

                # NOTE(review): hget returns raw redis values — presumably
                # Capacity parses them into counts; confirm.
                task_capacity = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                                         self._redis.hget(next_keyt, 'ncpus'))
                candidate_task = CandidateTask(
                    next_task_id, task_entity, self._redis, task_capacity,
                    resource_mgr.entities_usage[task_entity], self._logger)
                # check now the task has a chance to be processed by any machine
                for _, machine in six.iteritems(resource_mgr._machines):
                    can_be_processed = machine._is_authorized(candidate_task._entity, candidate_task._capacity) \
                                       and candidate_task._capacity.inf_or_eq(machine._init_capacity)
                    if can_be_processed:
                        return candidate_task

                return None