def test_max_resources(self):
    """Tests successfully calculating the max resources in a cluster.

    Two agents are offered resources: agent_1 gets one offer, agent_2 gets
    two offers (which should be summed per-agent). The expected maximum is
    the per-resource max across agents: CPUs come from agent_2 (25 + 225),
    memory from agent_1 (22048), and disk from agent_2 (2048 + 22048).
    """
    offer_1 = ResourceOffer(
        'offer_1', self.agent_1.agent_id, self.framework_id,
        NodeResources([Cpus(2.0), Mem(22048.0), Disk(1024.0)]), now(), None)
    offer_2 = ResourceOffer(
        'offer_2', self.agent_2.agent_id, self.framework_id,
        NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
    offer_3 = ResourceOffer(
        'offer_3', self.agent_2.agent_id, self.framework_id,
        NodeResources([Cpus(225.0), Mem(1024.0), Disk(22048.0)]), now(), None)

    resource_mgr.add_new_offers([offer_1, offer_2, offer_3])
    resource_mgr.refresh_agent_resources([], now())

    # Renamed from `max` to avoid shadowing the builtin max().
    max_resources = resource_mgr.get_max_available_resources()
    self.assertTrue(
        max_resources.is_equal(
            NodeResources([Cpus(250.0), Mem(22048.0), Disk(24096.0)])))
def _process_queue(self, nodes, job_types, job_type_limits, job_type_resources, workspaces):
    """Retrieves the top of the queue and schedules new job executions on available nodes as resources and limits allow

    :param nodes: The dict of scheduling nodes stored by node ID for all nodes ready to accept new job executions
    :type nodes: dict
    :param job_types: The dict of job type models stored by job type ID
    :type job_types: dict
    :param job_type_limits: The dict of job type IDs mapping to job type limits
    :type job_type_limits: dict
    :param job_type_resources: The list of all of the job type resource requirements
    :type job_type_resources: list
    :param workspaces: A dict of all workspaces stored by name
    :type workspaces: dict
    :returns: The list of queued job executions that were scheduled
    :rtype: list
    """

    scheduled_job_executions = []
    started = now()
    # Maps job type name -> {'warning': message str, 'count': int}; aggregated
    # so each job type logs one summary line at the end instead of one per queue entry
    type_warnings = {}

    # We can schedule as long as there are nodes
    if not nodes:
        logger.warning(
            'There are no nodes available. Waiting to schedule until there are free resources...'
        )
        return scheduled_job_executions

    ignore_job_type_ids = self._calculate_job_types_to_ignore(
        job_types, job_type_limits)
    # Per-resource maximum available across all agents; used below to reject
    # queue entries that no single node could ever satisfy
    max_cluster_resources = resource_mgr.get_max_available_resources()
    for queue in Queue.objects.get_queue(
            scheduler_mgr.config.queue_mode, ignore_job_type_ids)[:QUEUE_LIMIT]:
        job_exe = QueuedJobExecution(queue)

        # Canceled job executions get processed as scheduled executions
        if job_exe.is_canceled:
            scheduled_job_executions.append(job_exe)
            continue

        jt = job_type_mgr.get_job_type(queue.job_type.id)
        # Build the per-job-type "invalid resources" warning identity so we can
        # check/activate/deactivate it consistently for this job type
        name = INVALID_RESOURCES.name + jt.name
        title = INVALID_RESOURCES.title % jt.name
        warning = SchedulerWarning(name=name, title=title, description=None)
        if jt.unmet_resources and scheduler_mgr.is_warning_active(warning):
            # previously checked this job type and found we lacked resources; wait until warning is inactive to check again
            continue

        invalid_resources = []
        insufficient_resources = []
        # get resource names offered and compare to job type resources
        for resource in job_exe.required_resources.resources:
            # Check for invalid resource or sharedmem
            if (resource.name not in max_cluster_resources._resources) or (
                    resource.name.lower() == 'sharedmem'):
                # Skip sharedmem if it's 0 (a zero request is harmless)
                if (resource.name.lower() == 'sharedmem') and (resource.value <= 0):
                    continue
                if jt.name in type_warnings:
                    type_warnings[jt.name]['count'] += 1
                    # Only append the resource name if it isn't already in the message
                    if resource.name not in type_warnings[jt.name]['warning']:
                        type_warnings[jt.name]['warning'] += (', %s' % resource.name)
                else:
                    type_warnings[jt.name] = {
                        'warning':
                        '%s job types could not be scheduled as the following resources do not exist in the available cluster resources: %s'
                        % (jt.name, resource.name),
                        'count': 1
                    }
                # resource does not exist in cluster
                invalid_resources.append(resource.name)
            elif resource.value > max_cluster_resources._resources[resource.name].value:
                # resource exceeds the max available from any node
                insufficient_resources.append(resource.name)

        if invalid_resources:
            description = INVALID_RESOURCES.description % invalid_resources
            scheduler_mgr.warning_active(warning, description)
        if insufficient_resources:
            # NOTE(review): reuses the same warning object, so this description
            # overwrites the invalid-resources one when both lists are non-empty
            description = INSUFFICIENT_RESOURCES.description % insufficient_resources
            scheduler_mgr.warning_active(warning, description)
        if invalid_resources or insufficient_resources:
            # Persist the combined unmet-resource names so the early-skip check
            # above can avoid re-evaluating this job type while the warning is active
            invalid_resources.extend(insufficient_resources)
            jt.unmet_resources = ','.join(invalid_resources)
            jt.save(update_fields=["unmet_resources"])
            continue
        else:
            # reset unmet_resources flag
            jt.unmet_resources = None
            scheduler_mgr.warning_inactive(warning)
            jt.save(update_fields=["unmet_resources"])

        # Make sure execution's job type and workspaces have been synced to the scheduler
        job_type_id = queue.job_type_id
        if job_type_id not in job_types:
            scheduler_mgr.warning_active(
                UNKNOWN_JOB_TYPE,
                description=UNKNOWN_JOB_TYPE.description % job_type_id)
            continue
        workspace_names = job_exe.configuration.get_input_workspace_names()
        workspace_names.extend(job_exe.configuration.get_output_workspace_names())
        missing_workspace = False
        # NOTE(review): this loop rebinds `name` (previously the warning name above);
        # harmless here since the warning object was already constructed
        for name in workspace_names:
            missing_workspace = missing_workspace or name not in workspaces
        if missing_workspace:
            if jt.name in type_warnings:
                type_warnings[jt.name]['count'] += 1
            else:
                type_warnings[jt.name] = {
                    'warning':
                    '%s job types could not be scheduled due to missing workspace'
                    % jt.name,
                    'count': 1
                }
            continue

        # Check limit for this execution's job type
        if job_type_id in job_type_limits and job_type_limits[job_type_id] < 1:
            if jt.name in type_warnings:
                type_warnings[jt.name]['count'] += 1
            else:
                type_warnings[jt.name] = {
                    'warning':
                    '%s job types could not be scheduled due to scheduling limit reached'
                    % jt.name,
                    'count': 1
                }
            continue

        # Try to schedule job execution and adjust job type limit if needed
        if self._schedule_new_job_exe(job_exe, nodes, job_type_resources):
            scheduled_job_executions.append(job_exe)
            if job_type_id in job_type_limits:
                job_type_limits[job_type_id] -= 1

    duration = now() - started
    if type_warnings:
        # Emit one aggregated warning line per job type that failed to schedule
        for warn in type_warnings:
            logger.warning('%d %s',
                           type_warnings[warn]['count'],
                           type_warnings[warn]['warning'])
    msg = 'Processing queue took %.3f seconds'
    if duration > PROCESS_QUEUE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
    return scheduled_job_executions