示例#1
0
    async def submit_jobs(self, req):
        """Submit jobs to the manager instance and record how many were accepted.

        A ``submit`` request is sent over the socket returned by ``get_socket()``,
        the JSON reply is awaited and the number of accepted jobs is added to
        ``self.submitted_jobs``.

        Args:
            req: the submit request to send (placed under the ``jobs`` key)

        Returns:
            dict: with:
               njobs (int): number of submitted jobs
               names (list(str)): list of submitted job names

        Raises:
            InvalidRequest: when the reply carries a non-zero ``code``
        """
        outgoing = {'request': 'submit', 'jobs': req}
        await self.get_socket().send_json(outgoing)

        reply = await self.get_socket().recv_json()
        if reply['code'] != 0:
            reason = reply.get('message', '')
            raise InvalidRequest('Failed to submit jobs: {}'.format(reason))

        # both fields are optional in the reply - fall back to "nothing submitted"
        payload = reply.get('data', {})
        accepted = payload.get('submitted', 0)
        job_names = payload.get('jobs', [])

        self.submitted_jobs += accepted
        return {'njobs': accepted, 'names': job_names}
示例#2
0
    def __init__(self, data):
        """Validate and store a notify request.

        Args:
            data (dict): request data; must contain an ``entity`` listed in
                ``NOTIFY_ENTITY`` and ``params`` holding the ``name``, ``state``
                and ``attributes`` entries

        Raises:
            InvalidRequest: when the entity is missing or unknown, ``params`` is
                missing, or any of the required parameters is absent
        """
        assert data is not None

        entity_known = 'entity' in data and data['entity'] in self.NOTIFY_ENTITY
        if not entity_known:
            raise InvalidRequest('Wrong notify request - missing/unknown entity')

        if 'params' not in data:
            raise InvalidRequest('Wrong notify request - missing register parameters')

        required_params = ('name', 'state', 'attributes')
        if any(key not in data['params'] for key in required_params):
            raise InvalidRequest('Wrong notify request - missing key notify parameters')

        self.entity, self.params = data['entity'], data['params']
示例#3
0
    def __init__(self, data):
        """Validate and store a register request.

        Args:
            data (dict): request data; must contain an ``entity`` listed in
                ``REQ_REGISTER_ENTITIES`` and ``params`` holding the ``id``,
                ``address`` and ``resources`` entries

        Raises:
            InvalidRequest: when the entity is missing or unknown, ``params`` is
                missing, or any of the required parameters is absent
        """
        assert data is not None

        entity_known = 'entity' in data and data['entity'] in self.REQ_REGISTER_ENTITIES
        if not entity_known:
            raise InvalidRequest('Wrong register request - missing/unknown entity')

        if 'params' not in data:
            raise InvalidRequest('Wrong register request - missing register parameters')

        required_params = ('id', 'address', 'resources')
        if any(key not in data['params'] for key in required_params):
            raise InvalidRequest('Wrong register request - missing key register parameters')

        self.entity, self.params = data['entity'], data['params']
示例#4
0
    def parse(cls, data):
        """
        Build a request object from already-decoded request data.

        The value stored under the ``request`` key selects an entry of the
        ``__REQS__`` registry; that entry is called with the whole ``data``.

        Args:
            data (dict): parsed data

        Returns:
            req (Request): request object

        Raises:
            InvalidRequest: in case of wrong or unknown request
        """
        well_formed = isinstance(data, dict) and 'request' in data and data['request']
        if not well_formed:
            raise InvalidRequest('Invalid request')

        req_name = data['request']
        if req_name not in __REQS__:
            raise InvalidRequest('Unknown request name: %s' % req_name)

        req_factory = __REQS__[req_name]
        return req_factory(data)
示例#5
0
    def __init__(self, data):
        """Validate and store a remove job request.

        Args:
            data (dict): request data; ``jobNames`` must be a non-empty list

        Raises:
            InvalidRequest: when ``jobNames`` is missing, is not a list or is empty
        """
        assert data is not None

        names_given = (
            'jobNames' in data
            and isinstance(data['jobNames'], list)
            and len(data['jobNames']) >= 1
        )
        if not names_given:
            raise InvalidRequest('Wrong remove job request - missing job names')

        self.job_names = data['jobNames']
示例#6
0
    def __init__(self, data):
        """Validate and store a job info request.

        Args:
            data (dict): request data; ``jobNames`` must be a non-empty list, and
                a truthy ``params['withChilds']`` switches ``include_childs`` on

        Raises:
            InvalidRequest: in case of wrong request format
        """
        assert data is not None

        names_given = (
            'jobNames' in data
            and isinstance(data['jobNames'], list)
            and len(data['jobNames']) >= 1
        )
        if not names_given:
            raise InvalidRequest('Wrong job info request - missing job names')

        # the flag is enabled only when 'params' is present and asks for it
        params = data.get('params')
        self.include_childs = bool(params and params.get('withChilds'))
        self.job_names = data['jobNames']
示例#7
0
    def generate(self):
        """Generate exact job's resource requirements for next iteration.

        When all iterations can get their minimum resources at once
        (``iterations * min <= avail_resources``) they are planned in a single
        round and the available resources are divided between them as evenly
        as possible. Otherwise the iterations are spread over the minimum number
        of rounds, and the whole pool of available resources is divided anew
        between the iterations of every round.

        Yields:
            exact resource requirements for following iterations

        Raises:
            InvalidRequest: when parameter ``max`` is used in resource description,
                or when the available resources are lower than the minimum
                resources required by a single iteration
        """
        _logger.debug("iteration scheduler '%s' algorithm called",
                      MaximumIters.SCHED_NAME)

        if 'max' in self.job_resources:
            raise InvalidRequest(
                'Wrong submit request - split-into directive mixed with max directive'
            )

        # an iteration needs a single resource unless 'min' says otherwise
        pmin = 1
        if 'min' in self.job_resources:
            pmin = self.job_resources['min']

        if self.iterations * pmin <= self.avail_resources:
            # a single round
            _logger.debug(
                "iterations in single round to schedule: %s, available resources: %s, "
                "minimum iteration resources: %s", self.iterations,
                self.avail_resources, pmin)

            avail_resources = self.avail_resources
            for iteration in range(self.iterations):
                # assign an even share of what is left to this iteration
                iteration_resources = math.floor(avail_resources /
                                                 (self.iterations - iteration))
                avail_resources -= iteration_resources

                _logger.debug(
                    "iteration: %s/%s, iteration_resources: %s, rest avail_resources: %s",
                    iteration, self.iterations, iteration_resources,
                    avail_resources)

                yield IterScheduler.get_exact_iter_plan(
                    self.job_resources.copy(), iteration_resources)
        else:
            # more than one round

            # not even one iteration fits into a round - without this guard the
            # rounds computation below would divide by zero
            if self.avail_resources < pmin:
                raise InvalidRequest(
                    'Wrong submit request - available resources ({}) lower than minimum '
                    'iteration resources ({})'.format(self.avail_resources, pmin))

            # minimum number of needed rounds
            rounds = math.ceil(
                (self.iterations /
                 math.floor(float(self.avail_resources) / pmin)))
            iterations_to_schedule = self.iterations

            _logger.debug(
                "iterations to schedule: %s, rounds: %s, resources: %s, minimum iteration "
                "resources: %s", iterations_to_schedule, rounds,
                self.avail_resources, pmin)

            while iterations_to_schedule > 0:
                for ex_round in range(rounds):
                    # spread the remaining iterations evenly over the remaining rounds
                    iterations_in_round = math.ceil(iterations_to_schedule /
                                                    (rounds - ex_round))
                    # every round starts with the full pool of available resources
                    round_resources = self.avail_resources

                    _logger.debug("round: %s/%s, iterations_in_round: %s",
                                  ex_round, rounds, iterations_in_round)

                    for iteration_in_round in range(iterations_in_round):
                        # assign part of round_resources to the iteration_in_round
                        iteration_resources = math.floor(
                            round_resources /
                            (iterations_in_round - iteration_in_round))
                        round_resources -= iteration_resources

                        _logger.debug(
                            "round: %s/%s, iteration_in_round: %s/%s, iteration_resources: %s, rest "
                            "round_resources: %s", ex_round, rounds,
                            iteration_in_round, iterations_in_round,
                            iteration_resources, round_resources)

                        yield IterScheduler.get_exact_iter_plan(
                            self.job_resources.copy(), iteration_resources)

                    iterations_to_schedule -= iterations_in_round
                    _logger.debug(
                        "end of round: %s/%s, iterations_to_schedule: %s",
                        ex_round, rounds, iterations_to_schedule)
示例#8
0
    def __init__(self, data):
        """Initialize request.

        Every job description is validated and stored in ``self.jobs`` as
        a ``{'req': <job description>, 'vars': <job variables>}`` entry. The
        variables (``rcnt``, ``uniq``, ``sname``, ``date``, ``time``,
        ``dateTime``) are computed once per request and copied for each job.
        ``SubmitReq.REQ_CNT`` is incremented as soon as the ``jobs`` list itself
        has been accepted, before the individual jobs are validated.

        Note that a job description without ``resources`` is updated in place
        with the default single-core requirement.

        Args:
            data (dict): request data

        Raises:
            InvalidRequest: in case of wrong job description format
        """
        self.jobs = []

        assert data is not None

        if 'jobs' not in data or not data['jobs'] or not isinstance(data['jobs'], list):
            raise InvalidRequest('Wrong submit request - missing jobs data')

        # one timestamp for all three variables, so that 'date', 'time' and
        # 'dateTime' describe the same moment
        now = datetime.datetime.now()

        # watch out for values - this data can be copied with the 'shallow' method
        # so complex structures should be omitted
        job_vars = {
            'rcnt': str(SubmitReq.REQ_CNT),
            'uniq': str(uuid.uuid4()),
            'sname': 'local',
            'date': str(now.date()),
            'time': str(now.time()),
            'dateTime': str(now)
        }

        SubmitReq.REQ_CNT += 1

        _logger.debug('request data contains %s jobs', len(data['jobs']))
        new_jobs = []

        for job_desc in data['jobs']:
            if not isinstance(job_desc, dict):
                raise InvalidRequest('Wrong submit request - wrong job data')

            if 'name' not in job_desc:
                raise InvalidRequest('Missing name in job description')

            if 'execution' not in job_desc:
                raise InvalidRequest('Missing execution element in job description')

            # look for 'iterate' directive
            if 'iteration' in job_desc:
                iteration = job_desc['iteration']

                if not isinstance(iteration, dict):
                    raise InvalidRequest('Wrong format of iteration directive: not a dictionary')

                # the directive must define the iterations by 'stop' or by 'values'
                if 'stop' not in iteration and 'values' not in iteration:
                    raise InvalidRequest(
                        'Wrong format of iteration directive: missing stop or values element')

                if 'stop' in iteration:
                    start = iteration.get('start', 0)
                    end = iteration['stop']
                    if start > end:
                        raise InvalidRequest('Wrong format of iteration directive: start index larger then stop one')

            # default value for missing 'resources' definition
            if 'resources' not in job_desc:
                job_desc['resources'] = {'numCores': {'exact': 1}}

            new_jobs.append({'req': job_desc, 'vars': job_vars.copy()})

        # jobs are published only after the whole request passed validation
        self.jobs.extend(new_jobs)
示例#9
0
    async def _launch_partition_managers(self, nodes_in_partition):
        """Launch partition managers.

        The information about Slurm allocation is gathered, and all nodes are split by ``nodes_in_partition`` to form
        a single partition, and partition manager instance is created to control each partition. The last partition
        may contain fewer nodes when the allocation size is not a multiple of ``nodes_in_partition``.

        After the configurations are created, two background tasks are scheduled: one that starts the partition
        managers and one that schedules buffered submit requests. References to both tasks are kept on the
        instance.

        Args:
            nodes_in_partition (int): how many nodes each partition should have

        Raises:
            InvalidRequest: when
                * ``nodes_in_partition`` is less than 1
                * governor manager has not been launched in Slurm allocation
                * missing nodes in Slurm allocation
            InternalError: when
                * missing ZMQ interface in governor manager
        """
        _logger.info('setup allocation split into partitions by %s nodes',
                     nodes_in_partition)

        if nodes_in_partition < 1:
            message = 'Failed to partition resources - partition size must be greater or equal 1'
            _logger.error(message)
            raise InvalidRequest(message)

        # get slurm resources - currently only slurm is supported
        if not in_slurm_allocation():
            message = ('Failed to partition resources - partitioning resources is currently available only '
                       'within slurm allocation')
            _logger.error(message)
            raise InvalidRequest(message)

        if not self.zmq_address:
            message = 'Failed to partition resources - missing zmq interface address'
            _logger.error(message)
            raise InternalError(message)

        slurm_resources = parse_slurm_resources(self._config)

        if slurm_resources.total_nodes < 1:
            raise InvalidRequest(
                'Failed to partition resources - allocation contains no nodes')

        # round up, so the nodes left over after the full partitions get their own one
        npartitions = math.ceil(slurm_resources.total_nodes /
                                nodes_in_partition)
        _logger.info(
            '%s partitions will be created (in allocation containing %s total nodes)',
            npartitions, slurm_resources.total_nodes)

        # launch partition manager in the same directory as governor
        partition_manager_wdir = Config.EXECUTOR_WD.get(self._config)
        partition_manager_auxdir = Config.AUX_DIR.get(self._config)

        _logger.debug('partition managers working directory %s',
                      partition_manager_wdir)

        for part_idx in range(npartitions):
            _logger.debug('creating partition manager %s configuration',
                          part_idx)

            # node range of the partition: [first_node, end_node), clipped to the allocation size
            first_node = part_idx * nodes_in_partition
            end_node = min(first_node + nodes_in_partition,
                           slurm_resources.total_nodes)

            # the first node of the partition is the one passed to the partition manager
            part_node = slurm_resources.nodes[first_node]

            _logger.debug('partition manager node %s', part_node.name)

            self._partition_managers.append(
                PartitionManager(
                    'partition-{}'.format(part_idx), part_node.name,
                    first_node, end_node, partition_manager_wdir,
                    self.zmq_address, partition_manager_auxdir, self._config))

            _logger.debug('partition manager %s configuration created',
                          part_idx)

        self._min_scheduling_managers = len(self._partition_managers)

        _logger.info(
            'created partition managers configuration and set minimum scheduling managers to %s',
            self._min_scheduling_managers)

        # keep a reference to the start-up task - the event loop holds only weak
        # references to tasks, so an unreferenced one may be garbage collected
        # before it finishes
        self._start_partition_managers_task = asyncio.ensure_future(
            self.manage_start_partition_managers())

        # launch task in the background that schedule buffered submit requests
        self._schedule_buffered_jobs_task = asyncio.ensure_future(
            self._schedule_buffered_jobs())