def tryRun(self):
    while not self.stop:
        with throttle(self.scaler.config.scaleInterval):
            try:
                queuedJobs = self.scaler.leader.getJobs()
                queuedJobShapes = [
                    Shape(wallTime=self.scaler.getAverageRuntime(
                              jobName=job.jobName,
                              service=isinstance(job, ServiceJobNode)),
                          memory=job.memory,
                          cores=job.cores,
                          disk=job.disk,
                          preemptable=job.preemptable)
                    for job in queuedJobs]
                currentNodeCounts = {}
                for nodeShape in self.scaler.nodeShapes:
                    nodeType = self.scaler.nodeShapeToType[nodeShape]
                    currentNodeCounts[nodeShape] = len(
                        self.scaler.leader.provisioner.getProvisionedWorkers(
                            nodeType=nodeType,
                            preemptable=nodeShape.preemptable))
                estimatedNodeCounts = self.scaler.getEstimatedNodeCounts(
                    queuedJobShapes, currentNodeCounts)
                self.scaler.updateClusterSize(estimatedNodeCounts)
                if self.stats:
                    self.stats.checkStats()
            except Exception:
                logger.exception("Exception encountered in scaler thread. Making a best-effort "
                                 "attempt to keep going, but things may go wrong from now on.")
    self.scaler.shutDown()
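# For reference, a minimal sketch of the semantics the scaling loop above
# relies on from its `throttle` context manager (in Toil this lives in
# toil.lib.throttle): the body runs, and if it finished in less than the
# given interval the manager sleeps for the remainder, so each loop
# iteration takes at least `interval` seconds. This is an illustrative
# assumption, not the library's actual implementation.
import time
from contextlib import contextmanager

@contextmanager
def throttle_sketch(interval: float):
    start = time.time()
    yield  # run the body of the `with` block
    elapsed = time.time() - start
    if elapsed < interval:
        time.sleep(interval - elapsed)  # pad the iteration out to `interval`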
def _startServices(jobGraphsWithServicesToStart,
                   jobGraphsWithServicesThatHaveStarted,
                   serviceJobsToStart,
                   terminate, jobStore):
    """
    Thread used to schedule services.
    """
    servicesThatAreStarting = set()
    servicesRemainingToStartForJob = {}
    serviceToJobGraph = {}
    while True:
        with throttle(1.0):
            if terminate.is_set():
                logger.debug('Received signal to quit starting services.')
                break
            try:
                jobGraph = jobGraphsWithServicesToStart.get_nowait()
                if len(jobGraph.services) > 1:
                    # Have to fall back to the old blocking behavior to
                    # ensure entire service "groups" are issued as a whole.
                    blockUntilServiceGroupIsStarted(jobGraph,
                                                    jobGraphsWithServicesThatHaveStarted,
                                                    serviceJobsToStart, terminate, jobStore)
                    continue

                # Found a new job that needs to schedule its services.
                for serviceJob in jobGraph.services[0]:
                    serviceToJobGraph[serviceJob] = jobGraph
                servicesRemainingToStartForJob[jobGraph] = len(jobGraph.services[0])

                # Issue the service jobs all at once.
                for serviceJob in jobGraph.services[0]:
                    logger.debug("Service manager is starting service job: %s, start ID: %s",
                                 serviceJob, serviceJob.startJobStoreID)
                    serviceJobsToStart.put(serviceJob)

                # We should now start to monitor these services to see if
                # they've started yet.
                servicesThatAreStarting.update(jobGraph.services[0])
            except Empty:
                # No new jobs that need services scheduled.
                pass

            for serviceJob in list(servicesThatAreStarting):
                if not jobStore.fileExists(serviceJob.startJobStoreID):
                    # Service has started!
                    servicesThatAreStarting.remove(serviceJob)
                    parentJob = serviceToJobGraph[serviceJob]
                    servicesRemainingToStartForJob[parentJob] -= 1
                    assert servicesRemainingToStartForJob[parentJob] >= 0
                    del serviceToJobGraph[serviceJob]

            # Find if any jobGraphs have had *all* their services started.
            jobGraphsToRemove = set()
            for jobGraph, remainingServices in servicesRemainingToStartForJob.items():
                if remainingServices == 0:
                    jobGraphsWithServicesThatHaveStarted.put(jobGraph)
                    jobGraphsToRemove.add(jobGraph)
            for jobGraph in jobGraphsToRemove:
                del servicesRemainingToStartForJob[jobGraph]
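# The startup handshake above is a flag-file protocol: each service host job
# is created with a flag file (its startJobStoreID) in the job store, and
# deletes that file once its service is actually up, so the manager only
# ever has to poll jobStore.fileExists(). A minimal self-contained sketch of
# the same idea; FakeJobStore and the file name are hypothetical, for
# illustration only.
class FakeJobStore:
    def __init__(self):
        # The flag exists until the service reports that it has started.
        self.files = {'service-1-start-flag'}

    def fileExists(self, file_id):
        return file_id in self.files

    def deleteFile(self, file_id):
        self.files.discard(file_id)

store = FakeJobStore()
assert store.fileExists('service-1-start-flag')      # manager: still starting
store.deleteFile('service-1-start-flag')             # service signals readiness
assert not store.fileExists('service-1-start-flag')  # manager: started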
def _startServices(jobDescriptionsWithServicesToStart,
                   jobDescriptionsWithServicesThatHaveStarted,
                   jobDescriptionsWithServicesThatHaveFailedToStart,
                   serviceJobsToStart,
                   terminate, jobStore):
    """
    Thread used to schedule services.
    """
    # Keep the user informed, but not too informed, as services start up
    logLimiter = LocalThrottle(60)

    # These are all keyed by service JobDescription object, not ID
    # TODO: refactor!
    servicesThatAreStarting = set()
    servicesRemainingToStartForJob = {}
    serviceToParentJobDescription = {}
    jobDescriptionsWithFailedServices = set()
    while True:
        with throttle(1.0):
            if terminate.is_set():
                logger.debug('Received signal to quit starting services.')
                break
            try:
                jobDesc = jobDescriptionsWithServicesToStart.get_nowait()
                if len(list(jobDesc.serviceHostIDsInBatches())) > 1:
                    # Have to fall back to the old blocking behavior to
                    # ensure entire service "groups" are issued as a whole.
                    blockUntilServiceGroupIsStarted(jobDesc,
                                                    jobDescriptionsWithServicesThatHaveStarted,
                                                    jobDescriptionsWithServicesThatHaveFailedToStart,
                                                    serviceJobsToStart, terminate, jobStore)
                    continue

                # Found a new job that needs to schedule its services.
                for onlyBatch in jobDesc.serviceHostIDsInBatches():
                    # There should be just one batch so we can do it here.
                    servicesRemainingToStartForJob[jobDesc] = len(onlyBatch)
                    for serviceJobID in onlyBatch:
                        # Load up the service object.
                        # TODO: cache?
                        serviceJobDesc = jobStore.load(serviceJobID)
                        # Remember the parent job
                        serviceToParentJobDescription[serviceJobDesc] = jobDesc
                        # We should now start to monitor this service to see if
                        # it has started yet.
                        servicesThatAreStarting.add(serviceJobDesc)
                        # Send the service JobDescription off to be started
                        logger.debug('Service manager is starting service job: %s, start ID: %s',
                                     serviceJobDesc, serviceJobDesc.startJobStoreID)
                        serviceJobsToStart.put(serviceJobDesc)
            except Empty:
                # No new jobs that need services scheduled.
                pass

            pendingServiceCount = len(servicesThatAreStarting)
            if pendingServiceCount > 0 and logLimiter.throttle(False):
                logger.debug('%d services are starting...', pendingServiceCount)

            for serviceJobDesc in list(servicesThatAreStarting):
                if not jobStore.fileExists(serviceJobDesc.startJobStoreID):
                    # Service has started (or failed)
                    logger.debug('Service %s has removed %s and is therefore started',
                                 serviceJobDesc, serviceJobDesc.startJobStoreID)
                    servicesThatAreStarting.remove(serviceJobDesc)
                    parentJob = serviceToParentJobDescription[serviceJobDesc]
                    servicesRemainingToStartForJob[parentJob] -= 1
                    assert servicesRemainingToStartForJob[parentJob] >= 0
                    del serviceToParentJobDescription[serviceJobDesc]

                    if not jobStore.fileExists(serviceJobDesc.errorJobStoreID):
                        logger.error('Service %s has immediately failed before it could be used',
                                     serviceJobDesc)
                        # It probably hasn't filled in the promise that the job that uses the service needs.
                        jobDescriptionsWithFailedServices.add(parentJob)

            # Find if any JobDescriptions have had *all* their services started.
            jobDescriptionsToRemove = set()
            for jobDesc, remainingServices in servicesRemainingToStartForJob.items():
                if remainingServices == 0:
                    if jobDesc in jobDescriptionsWithFailedServices:
                        logger.error('Job %s has had all its services try to start, but at least one failed',
                                     jobDesc)
                        jobDescriptionsWithServicesThatHaveFailedToStart.put(jobDesc)
                    else:
                        logger.debug('Job %s has all its services started', jobDesc)
                        jobDescriptionsWithServicesThatHaveStarted.put(jobDesc)
                    jobDescriptionsToRemove.add(jobDesc)
            for jobDesc in jobDescriptionsToRemove:
                del servicesRemainingToStartForJob[jobDesc]
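# The logLimiter above rate-limits the "%d services are starting..." message.
# A minimal sketch of the assumed LocalThrottle semantics: with wait=False,
# throttle() returns True at most once per interval and False otherwise, so
# the caller can simply skip logging instead of blocking. Illustrative only;
# Toil's real toil.lib.throttle.LocalThrottle is per-thread and may differ
# in detail.
import time

class LocalThrottleSketch:
    def __init__(self, interval: float):
        self.interval = interval
        self.last = float('-inf')  # so the first call is always allowed

    def throttle(self, wait: bool = True) -> bool:
        now = time.time()
        if now - self.last >= self.interval:
            self.last = now
            return True
        if wait:
            # Block until the interval has elapsed, then report success.
            time.sleep(self.interval - (now - self.last))
            self.last = time.time()
            return True
        return False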
def __start_services(self) -> None:
    """
    Thread used to schedule services.
    """
    # Keep the user informed, but not too informed, as services start up
    log_limiter = LocalThrottle(60)

    # These are all keyed by ID
    starting_services = set()
    remaining_services_by_client = {}
    service_to_client = {}
    clients_with_failed_services = set()
    while True:
        with throttle(1.0):
            if self.__terminate.is_set():
                logger.debug('Received signal to quit starting services.')
                break
            try:
                client_id = self.__clients_in.get_nowait()
                client = self.__toil_state.get_job(client_id)
                host_id_batches = list(client.serviceHostIDsInBatches())
                logger.debug("Service manager processing client %s with %d batches of services",
                             client, len(host_id_batches))
                if len(host_id_batches) > 1:
                    # Have to fall back to the old blocking behavior to
                    # ensure entire service "groups" are issued as a whole.
                    self.__start_batches_blocking(client_id)
                    continue

                # Found a new job that needs to schedule its services.
                for batch in host_id_batches:
                    # There should be just one batch so we can do it here.
                    remaining_services_by_client[client_id] = len(batch)
                    for service_id in batch:
                        # Load up the service object.
                        service_job_desc = self.__toil_state.get_job(service_id)
                        # Remember the parent job
                        service_to_client[service_id] = client_id
                        # We should now start to monitor this service to see if
                        # it has started yet.
                        starting_services.add(service_id)
                        # Send the service JobDescription off to be started
                        logger.debug('Service manager is starting service job: %s, start ID: %s',
                                     service_job_desc, service_job_desc.startJobStoreID)
                        self.__services_out.put(service_id)
            except Empty:
                # No new jobs that need services scheduled.
                pass

            pending_service_count = len(starting_services)
            if pending_service_count > 0 and log_limiter.throttle(False):
                logger.debug('%d services are starting...', pending_service_count)

            for service_id in list(starting_services):
                service_job_desc = self.__toil_state.get_job(service_id)
                if not self.__job_store.fileExists(service_job_desc.startJobStoreID):
                    # Service has started (or failed)
                    logger.debug('Service %s has removed %s and is therefore started',
                                 service_job_desc, service_job_desc.startJobStoreID)
                    starting_services.remove(service_id)
                    client_id = service_to_client[service_id]
                    remaining_services_by_client[client_id] -= 1
                    assert remaining_services_by_client[client_id] >= 0
                    del service_to_client[service_id]

                    if not self.__job_store.fileExists(service_job_desc.errorJobStoreID):
                        logger.error('Service %s has immediately failed before it could be used',
                                     service_job_desc)
                        # It probably hasn't filled in the promise that the job that uses the service needs.
                        clients_with_failed_services.add(client_id)

            # Find if any clients have had *all* their services started.
            ready_clients = set()
            for client_id, remaining_services in remaining_services_by_client.items():
                if remaining_services == 0:
                    if client_id in clients_with_failed_services:
                        logger.error('Job %s has had all its services try to start, but at least one failed',
                                     self.__toil_state.get_job(client_id))
                        self.__failed_clients_out.put(client_id)
                    else:
                        logger.debug('Job %s has all its services started',
                                     self.__toil_state.get_job(client_id))
                        self.__clients_out.put(client_id)
                    ready_clients.add(client_id)
            for client_id in ready_clients:
                del remaining_services_by_client[client_id]
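# A hypothetical harness showing how the queues and terminate event around
# __start_services fit together; this is illustrative glue, not Toil's
# actual leader code. The leader feeds client job IDs in, issues the service
# IDs that come out, and collects clients back on the ready or failed queue.
from queue import Queue
from threading import Event

clients_in: Queue = Queue()          # clients whose services need scheduling
services_out: Queue = Queue()        # service host jobs to actually issue
clients_out: Queue = Queue()         # clients whose services all started
failed_clients_out: Queue = Queue()  # clients with a service that failed early
terminate = Event()

clients_in.put('client-0')  # leader hands a client to the manager thread
# ... the manager emits service IDs on services_out; once every start flag
# is gone, 'client-0' appears on clients_out (or on failed_clients_out if a
# service's error flag is also gone).
terminate.set()  # leader asks the manager thread to exit its polling loop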