def _create_blue_batch(self, batch):
    """
    Builds the blue batch dictionary for the given batch description.

    :param batch: The batch description
    :type batch: dict
    :return: A dictionary containing a blue batch
    :rtype: dict
    :raise TrusteeServiceError: If the trustee service is unavailable or
                                unable to collect the requested secret keys
    :raise ValueError: If there was more than one blue batch after
                       red_to_blue
    """
    batch_id = str(batch['_id'])

    # Collect the secret values referenced by this batch from the trustee
    # service; a failed collection marks the batch as failed in the db.
    secret_keys = get_batch_secret_keys(batch)
    trustee_response = self._trustee_client.collect(secret_keys)

    if trustee_response['state'] == 'failed':
        debug_info = 'Trustee service failed:\n{}'.format(
            trustee_response['debug_info'])
        batch_failure(
            self._mongo,
            batch_id,
            debug_info,
            None,
            batch['state'],
            disable_retry_if_failed=trustee_response.get('disable_retry'))
        raise TrusteeServiceError(debug_info)

    batch = fill_batch_secrets(batch, trustee_response['secrets'])

    # Combine the experiment-level red data with this batch's in-/outputs
    # and convert it into (exactly one) blue batch.
    experiment = self._mongo.db['experiments'].find_one(
        {'_id': ObjectId(batch['experimentId'])})
    red_data = {
        'redVersion': experiment['redVersion'],
        'cli': experiment['cli'],
        'inputs': batch['inputs'],
        'outputs': batch['outputs']
    }
    blue_batches = convert_red_to_blue(red_data)

    if len(blue_batches) != 1:
        raise ValueError(
            'Got {} batches, but only one was asserted.'.format(
                len(blue_batches)))
    return blue_batches[0]
def _set_offline(self, debug_info):
    """
    Marks this node as offline in the database and fails every batch that
    is currently scheduled on, or being processed by, this node.

    :param debug_info: A description of why the node went offline; stored
                       in the node's state history.
    :type debug_info: str
    """
    print('Node offline:', self._node_name)
    self._online.clear()
    timestamp = time.time()
    bson_node_id = ObjectId(self._node_id)
    self._mongo.db['nodes'].update_one({'_id': bson_node_id}, {
        '$set': {
            'state': 'offline'
        },
        '$push': {
            'history': {
                'state': 'offline',
                'time': timestamp,
                'debugInfo': debug_info
            }
        }
    })

    # change state of assigned batches
    cursor = self._mongo.db['batches'].find(
        {
            'node': self._node_name,
            'state': {
                '$in': ['scheduled', 'processing']
            }
        }, {
            '_id': 1,
            'state': 1
        })

    # The failure message is loop invariant, so build it once. Use a
    # distinct name instead of shadowing the debug_info parameter, which
    # describes the node event rather than the batch failures.
    batch_debug_info = 'Node offline: {}'.format(self._node_name)
    for batch in cursor:
        batch_id = str(batch['_id'])
        batch_failure(self._mongo, batch_id, batch_debug_info, None,
                      batch['state'])
def _check_exited_container(self, container, batch):
    """
    Inspects the logs of the given exited container and updates the
    database accordingly.

    On any failure (unreadable logs, invalid agent JSON, schema violation,
    an agent-reported failure, or a batch no longer in state 'processing')
    the batch is marked as failed via batch_failure; otherwise the batch is
    transitioned from 'processing' to 'succeeded'.

    :param container: The container to inspect
    :type container: Container
    :param batch: The batch to update according to the result of the
                  container execution.
    :type batch: dict
    """
    bson_batch_id = batch['_id']
    batch_id = str(bson_batch_id)

    # Fetch stdout/stderr and resource stats from the exited container.
    # Any docker error here fails the batch, since the agent result
    # (delivered on stdout) cannot be recovered.
    try:
        stdout_logs = container.logs(stderr=False).decode('utf-8')
        stderr_logs = container.logs(stdout=False).decode('utf-8')
        docker_stats = container.stats(stream=False)
    except Exception as e:
        err_str = repr(e)
        self._log('Failed to get container logs:\n{}'.format(err_str))
        debug_info = 'Could not get logs or stats of container: {}'.format(
            err_str)
        batch_failure(self._mongo, batch_id, debug_info, None,
                      batch['state'])
        return

    # The agent is expected to emit its result as JSON on stdout.
    data = None
    try:
        data = json.loads(stdout_logs)
    except json.JSONDecodeError as e:
        err_str = repr(e)
        debug_info = 'CC-Agent data is not a valid json object: {}\n\nstdout was:\n{}'.format(
            err_str, stdout_logs)
        batch_failure(self._mongo,
                      batch_id,
                      debug_info,
                      data,
                      batch['state'],
                      docker_stats=docker_stats)
        self._log(
            'Failed to load json from blue agent:\n{}'.format(err_str))
        return

    # Validate the parsed result against the agent result schema.
    try:
        jsonschema.validate(data, agent_result_schema)
    except jsonschema.ValidationError as e:
        err_str = repr(e)
        debug_info = 'CC-Agent data sent by callback does not comply with jsonschema: {}'.format(
            err_str)
        batch_failure(self._mongo,
                      batch_id,
                      debug_info,
                      data,
                      batch['state'],
                      docker_stats=docker_stats)
        self._log(
            'Failed to validate blue agent output:\n{}'.format(err_str))
        return

    # The agent itself may report failure; include its stderr for context.
    if data['state'] == 'failed':
        debug_info = 'Batch failed.\nContainer stderr:\n{}\ndebug info:\n{}'.format(
            stderr_logs, data['debugInfo'])
        batch_failure(self._mongo,
                      batch_id,
                      debug_info,
                      data,
                      batch['state'],
                      docker_stats=docker_stats)
        return

    # Re-read the batch to get its current state; another actor may have
    # moved it out of 'processing' while the container was running.
    batch = self._mongo.db['batches'].find_one({'_id': bson_batch_id}, {
        'attempts': 1,
        'node': 1,
        'state': 1
    })
    if batch['state'] != 'processing':
        debug_info = 'Batch failed.\nExited container, but not in state processing.'
        batch_failure(self._mongo,
                      batch_id,
                      debug_info,
                      data,
                      batch['state'],
                      docker_stats=docker_stats)
        return

    # Success path: the state filter in the query guards against a
    # concurrent state change between the read above and this update.
    self._mongo.db['batches'].update_one(
        {
            '_id': bson_batch_id,
            'state': 'processing'
        }, {
            '$set': {
                'state': 'succeeded'
            },
            '$push': {
                'history': {
                    'state': 'succeeded',
                    'time': time.time(),
                    'debugInfo': None,
                    'node': batch['node'],
                    'ccagent': data,
                    'dockerStats': docker_stats
                }
            }
        })
def _pull_image_failure(self, debug_info, batch_id, current_state):
    """
    Marks the given batch as failed after a docker image could not be
    pulled.

    :param debug_info: A description of the pull failure
    :type debug_info: str
    :param batch_id: The id of the batch to mark as failed
    :type batch_id: str
    :param current_state: The batch's current state
    :type current_state: str
    """
    batch_failure(self._mongo, batch_id, debug_info, None, current_state)
def _run_batch_container_failure(self, batch_id, debug_info, current_state):
    """
    Marks the given batch as failed after its container could not be run.

    :param batch_id: The id of the batch to mark as failed
    :type batch_id: str
    :param debug_info: A description of the container failure
    :type debug_info: str
    :param current_state: The batch's current state
    :type current_state: str
    """
    batch_failure(self._mongo, batch_id, debug_info, None, current_state)
def _schedule_batch(self, next_batch, nodes, batch_count_cache):
    """
    Tries to find a node that is capable of processing the given batch.

    If no capable node could be found, None is returned.

    If a node was found, that is capable of processing the given batch,
    this node is written to the node property of the batch. The batches
    state is then updated to 'scheduled'.

    :param next_batch: The batch to schedule.
    :param nodes: The nodes on which the batch should be scheduled.
    :type nodes: List[CompleteNode]
    :param batch_count_cache: A dictionary mapping experiment ids to the
                              number of batches of this experiment, which
                              in state processing or scheduled. This
                              dictionary is allowed to overestimate the
                              number of batches.
    :type batch_count_cache: Dict[str, int]
    :return: The name of the node on which the given batch is scheduled
             If the batch could not be scheduled None is returned
    :raise TrusteeServiceError: If the trustee service is unavailable.
    """
    batch_id = str(next_batch['_id'])
    experiment_id = next_batch['experimentId']

    # A batch whose experiment cannot be loaded can never succeed, so
    # retries are disabled on this failure path.
    try:
        experiment = self._get_experiment_of_batch(experiment_id)
    except Exception as e:
        batch_failure(self._mongo,
                      batch_id,
                      repr(e),
                      None,
                      next_batch['state'],
                      disable_retry_if_failed=True)
        return None

    ram = experiment['container']['settings']['ram']

    # limit the number of currently executed batches from a single experiment
    concurrency_limit = experiment.get('execution', {}).get(
        'settings', {}).get('batchConcurrencyLimit', 64)

    # number of batches which are scheduled or processing of the given experiment
    batch_count = self._get_number_of_batches_of_experiment(
        experiment_id, batch_count_cache)

    if batch_count >= concurrency_limit:
        return None

    # check impossible experiments
    if not Scheduler._check_nodes_possibly_sufficient(nodes, experiment):
        debug_info = 'There are no nodes configured that are possibly sufficient for experiment "{}"' \
            .format(next_batch['experimentId'])
        batch_failure(self._mongo,
                      batch_id,
                      debug_info,
                      None,
                      next_batch['state'],
                      disable_retry_if_failed=True)
        return None

    # select node
    selected_node = Scheduler._get_best_node(nodes, experiment)
    if selected_node is None:
        return None

    # calculate ram / gpus
    # NOTE(review): the node's in-memory resources are decremented here,
    # before the mount check and the db update below — if either of those
    # fails, the resources stay reserved for the rest of this scheduling
    # pass. Presumably an acceptable overestimate; confirm intent.
    selected_node.ram_available -= ram
    used_gpu_ids = None
    if selected_node.gpus_available:
        gpu_requirements = get_gpu_requirements(
            experiment['container']['settings'].get('gpus'))
        available_gpus = selected_node.gpus_available
        used_gpus = match_gpus(available_gpus,
                               requirements=gpu_requirements)
        used_gpu_ids = []
        for gpu in used_gpus:
            used_gpu_ids.append(gpu.device_id)
            available_gpus.remove(gpu)

    # check mounting
    mount_connectors = red_get_mount_connectors_from_inputs(
        next_batch['inputs'])
    is_mounting = bool(mount_connectors)
    allow_insecure_capabilities = self._conf.d['controller']['docker'].get(
        'allow_insecure_capabilities', False)

    if not allow_insecure_capabilities and is_mounting:
        # set state to failed, because insecure_capabilities are not allowed but needed, by this batch.
        debug_info = 'FUSE support for this agency is disabled, but the following input/output-keys are ' \
                     'configured to mount inside a docker container.{}{}'.format(os.linesep, mount_connectors)
        batch_failure(self._mongo,
                      batch_id,
                      debug_info,
                      None,
                      next_batch['state'],
                      disable_retry_if_failed=True)
        return None

    # update batch data
    # The state filter makes this update atomic: it only succeeds if no
    # other scheduler instance changed the batch state in the meantime.
    update_result = self._mongo.db['batches'].update_one(
        {
            '_id': next_batch['_id'],
            'state': next_batch['state']
        }, {
            '$set': {
                'state': 'scheduled',
                'node': selected_node.node_name,
                'usedGPUs': used_gpu_ids,
                'mount': is_mounting
            },
            '$push': {
                'history': {
                    'state': 'scheduled',
                    'time': time(),
                    'debugInfo': None,
                    'node': selected_node.node_name,
                    'ccagent': None,
                    'dockerStats': None
                }
            },
            '$inc': {
                'attempts': 1
            }
        })

    if update_result.modified_count == 1:
        # The state of the scheduled batch switched from 'registered' to 'scheduled', so increase the batch_count.
        # batch_count_cache always contains experiment_id, because _get_number_of_batches_of_experiment()
        # always inserts the given experiment_id
        batch_count_cache[experiment_id] += 1
        return selected_node.node_name
    else:
        return None