def test_adding_endpoints(self):
    """Two endpoints added to a container must both be stored."""
    collected = arc.EndpointContainer()
    first = arc.Endpoint()
    second = arc.Endpoint()
    for endpoint in (first, second):
        collected.addEntity(endpoint)
    self.expect(collected).to_have(2).endpoints()
def test_getting_the_endpoints(self):
    """A registry query should deliver the plugin's endpoint to the consumer."""
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    control.endpoints.push_back([arc.Endpoint()])
    control.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    consumer = arc.EndpointContainer()
    self.retriever.addConsumer(consumer)
    # nothing has been queried yet, so the consumer starts out empty
    self.expect(consumer).to_be_empty()
    registry_endpoint = arc.Endpoint("test.nordugrid.org",
                                     arc.Endpoint.REGISTRY,
                                     "org.nordugrid.sertest")
    self.retriever.addEndpoint(registry_endpoint)
    self.retriever.wait()
    self.expect(consumer).to_have(1).endpoint()
def test_filtering(self):
    """Capability filtering must only deliver endpoints with a matching capability.

    Three identical result sets are queued (one per retriever run below),
    then three capability filters are exercised:
      * "cap1" matches test1 and test3  -> 2 endpoints
      * "cap2" matches only test1       -> 1 endpoint
      * "cap5" matches nothing          -> 0 endpoints
    """
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    # The original queued three verbatim copies of the same fixture;
    # a loop removes the duplication without changing what is queued.
    for _ in range(3):
        control.endpoints.push_back([
            arc.Endpoint("test1.nordugrid.org", ["cap1", "cap2"]),
            arc.Endpoint("test2.nordugrid.org", ["cap3", "cap4"]),
            arc.Endpoint("test3.nordugrid.org", ["cap1", "cap3"]),
        ])
        control.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

    registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                            "org.nordugrid.sertest")

    # one retriever run per capability filter, checking the result count
    for capability, expected_count in [("cap1", 2), ("cap2", 1), ("cap5", 0)]:
        options = arc.ServiceEndpointQueryOptions(False, [capability])
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(expected_count).endpoints()
def test_getting_status(self):
    """getStatusOfEndpoint must report the status produced by the plugin."""
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    control.endpoints.push_back([arc.Endpoint()])
    control.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.FAILED))
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    consumer = arc.EndpointContainer()
    self.retriever.addConsumer(consumer)
    queried_registry = arc.Endpoint("test.nordugrid.org",
                                    arc.Endpoint.REGISTRY,
                                    "org.nordugrid.sertest")
    self.retriever.addEndpoint(queried_registry)
    self.retriever.wait()
    reported = self.retriever.getStatusOfEndpoint(queried_registry)
    self.expect(reported).to_be_an_instance_of(arc.EndpointQueryingStatus)
    self.expect(reported).to_be(arc.EndpointQueryingStatus.FAILED)
def example():
    """List all jobs on a given CE, update their states and print the failed ones."""
    uc = arc.UserConfig()

    # The JobSupervisor collects the retrieved jobs and manages them
    supervisor = arc.JobSupervisor(uc)

    # Query the computing element's job-list endpoint
    endpoint = arc.Endpoint("https://piff.hep.lu.se:443/arex", arc.Endpoint.JOBLIST)
    sys.stdout.write("Querying %s for jobs...\n" % endpoint.str())
    retriever = arc.JobListRetriever(uc)
    retriever.addConsumer(supervisor)
    retriever.addEndpoint(endpoint)
    retriever.wait()
    sys.stdout.write("%s jobs found\n" % len(supervisor.GetAllJobs()))

    sys.stdout.write("Getting job states...\n")
    # Refresh the state of every retrieved job
    supervisor.Update()

    states = [job.State.GetGeneralState() for job in supervisor.GetAllJobs()]
    sys.stdout.write("The jobs have the following states: %s\n" % (", ".join(states)))

    # Keep only the failed jobs and dump their details to stdout
    supervisor.SelectByStatus(["Failed"])
    failed_jobs = supervisor.GetSelectedJobs()
    sys.stdout.write("The failed jobs:\n")
    for job in failed_jobs:
        job.SaveToStream(arc.CPyOstream(sys.stdout), True)
def test_resubmit(self):
    """Resubmitting a mixed job set should run against the given endpoints."""
    self.usercfg.Broker("TEST")

    arc.TargetInformationRetrieverPluginTESTControl.targets = [
        self.create_test_target("http://test2.nordugrid.org")
    ]
    arc.TargetInformationRetrieverPluginTESTControl.status = \
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL)

    failed_job = self.create_test_job(
        job_id="http://test.nordugrid.org/1234567890test1",
        state=arc.JobState.FAILED)
    running_job = self.create_test_job(
        job_id="http://test.nordugrid.org/1234567890test2",
        state=arc.JobState.RUNNING)
    js = arc.JobSupervisor(self.usercfg, [failed_job, running_job])
    self.expect(js.GetAllJobs()).to_have(2).jobs()

    endpoints = [
        arc.Endpoint("http://test2.nordugrid.org",
                     arc.Endpoint.COMPUTINGINFO, "org.nordugrid.tirtest")
    ]
    resubmitted = arc.JobList()
    # NOTE(review): the Resubmit outcome is captured but never asserted here
    result = js.Resubmit(0, endpoints, resubmitted)
def get_job(self, job_id):
    """
    Return an instance of ``arc.Job`` representing the job with the given ID

    :param job_id: ID of the job as returned by `submit_job`
    :raises JobNotFoundError: if no job with the given ID could be found
    :return: Instance of ``arc.Job`` representing the job
    """
    user_config = self.get_user_config()

    # The JobSupervisor acts as the consumer that collects retrieved jobs
    supervisor = arc.JobSupervisor(user_config)

    # Query the configured ARC server's job-list endpoint
    joblist_endpoint = arc.Endpoint(self.config.ARC_SERVER, arc.Endpoint.JOBLIST)
    retriever = arc.JobListRetriever(user_config)
    retriever.addConsumer(supervisor)
    retriever.addEndpoint(joblist_endpoint)
    retriever.wait()

    # Refresh job states before searching for a match
    supervisor.Update()

    for job in supervisor.GetAllJobs():
        if job.JobID == job_id:
            return job
    raise JobNotFoundError(
        "Could not find a job with ID '{}'".format(job_id))
def example():
    """Retrieve the jobs known to a single Computing Element and print them."""
    # UserConfig with the user's proxy and the trusted CA directory
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # The CE's job-information endpoint
    ce = arc.Endpoint("piff.hep.lu.se:443/arex", arc.Endpoint.COMPUTINGINFO)

    # Container that receives the retrieved jobs
    found_jobs = arc.JobContainer()

    retriever = arc.JobListRetriever(uc)
    retriever.addConsumer(found_jobs)  # results are delivered into found_jobs
    retriever.addEndpoint(ce)          # adding the endpoint starts the query
    retriever.wait()                   # block until the query finishes

    # Report the outcome of the query and the jobs themselves
    sys.stdout.write("%s\n" % retriever.getStatusOfEndpoint(ce).str())
    sys.stdout.write("Number of jobs found: %d\n" % len(found_jobs))
    for job in found_jobs:
        job.SaveToStream(arc.CPyOstream(sys.stdout), True)
def test_removing_consumer(self):
    """A consumer removed before the query runs must receive nothing."""
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    detached = arc.EndpointContainer()
    self.retriever.addConsumer(detached)
    self.retriever.removeConsumer(detached)
    registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                            "org.nordugrid.sertest")
    self.retriever.addEndpoint(registry)
    self.retriever.wait()
    self.expect(detached).to_have(0).endpoints()
def test_rejected_services(self):
    """URLs on the rejection list must be dropped from the results."""
    rejected_url = "http://test.nordugrid.org"
    accepted_url = "http://test2.nordugrid.org"
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    control.endpoints.push_back(
        [arc.Endpoint(rejected_url), arc.Endpoint(accepted_url)])
    control.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    # third argument of the options is the list of rejected URLs
    options = arc.ServiceEndpointQueryOptions(False, [], [rejected_url])
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
    results = arc.EndpointContainer()
    self.retriever.addConsumer(results)
    registry = arc.Endpoint("registry.nordugrid.org", arc.Endpoint.REGISTRY)
    self.retriever.addEndpoint(registry)
    self.retriever.wait()
    self.expect(results).to_have(1).endpoint()
    self.expect(results[0].URLString).to_be(accepted_url)
def test_the_status_is_started_first(self):
    """While the query is blocked the endpoint's status must read STARTED;
    once the query completes it must change to SUCCESSFUL."""
    arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
        [arc.Endpoint()])
    arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    # the TEST plugin blocks on this condition, keeping the query pending
    # until we signal it
    self.condition = arc.SimpleCondition()
    arc.ServiceEndpointRetrieverPluginTESTControl.condition.push_back(
        self.condition)
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    container = arc.EndpointContainer()
    self.retriever.addConsumer(container)
    registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                            "org.nordugrid.sertest")
    self.retriever.addEndpoint(registry)
    # the plugin has not been allowed to finish, so the status is STARTED
    status = self.retriever.getStatusOfEndpoint(registry)
    self.expect(status).to_be(arc.EndpointQueryingStatus.STARTED)
    # unblock the plugin and let the query run to completion
    self.condition.signal()
    self.retriever.wait()
    status = self.retriever.getStatusOfEndpoint(registry)
    self.expect(status).to_be(arc.EndpointQueryingStatus.SUCCESSFUL)
def example():
    """Discover ComputingServices via registries and local information systems."""
    # UserConfig with the user's proxy and the trusted CA directory
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Query two registries (index servers) for Computing Services
    index_servers = [
        # index1 is explicitly declared to be an EGIIS service
        arc.Endpoint("index1.nordugrid.org:2135/Mds-Vo-name=NorduGrid,o=grid",
                     arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis"),
        # no InterfaceName given here: let the system try all possibilities
        arc.Endpoint("nordugrid.org", arc.Endpoint.REGISTRY,
                     "org.nordugrid.archery")
    ]
    from_registries = retrieve(uc, index_servers)
    # the retriever is iterable over the discovered ComputingServices
    sys.stdout.write("Discovered ComputingServices: %s\n" %
                     (", ".join([service.Name for service in from_registries])))
    registry_targets = from_registries.GetExecutionTargets()
    sys.stdout.write(
        "Number of ExecutionTargets on these ComputingServices: %d\n" %
        len(registry_targets))

    # Query the local infosys (COMPUTINGINFO) of computing elements
    ce_endpoints = [
        # for piff, query the LDAP GLUE2 tree explicitly
        arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                     "org.nordugrid.ldapglue2"),
        # for pgs03, the interface is unspecified: try all possibilities
        arc.Endpoint("pgs03.grid.upjs.sk", arc.Endpoint.COMPUTINGINFO)
    ]
    from_ces = retrieve(uc, ce_endpoints)
    ce_targets = from_ces.GetExecutionTargets()
    sys.stdout.write("The discovered ExecutionTargets:\n")
    for target in ce_targets:
        sys.stdout.write("%s\n" % str(target))

    # Query both registries and computing elements at the same time:
    mixed_endpoints = [
        arc.Endpoint("arc-emi.grid.upjs.sk/O=Grid/Mds-Vo-Name=ARC-EMI",
                     arc.Endpoint.REGISTRY),
        arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                     "org.nordugrid.ldapglue2")
    ]
    from_mixed = retrieve(uc, mixed_endpoints)
    sys.stdout.write("Discovered ComputingServices: %s\n" %
                     (", ".join([service.Name for service in from_mixed])))
def getCEStatus(self):
    """ Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.

        Returns an S_OK dict with 'SubmittedJobs', 'RunningJobs' and
        'WaitingJobs' on success, or the S_ERROR from a failed step.
    """
    result = self._prepareProxy()
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
        vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
        # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
        # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
        endpoints = [arc.Endpoint("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid",
                                  arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
        retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
        retriever.wait()  # Takes a bit of time to get and parse the ldap information
        targets = retriever.GetExecutionTargets()
        # NOTE(review): targets[0] assumes at least one target was returned;
        # an empty result would raise IndexError here — confirm intended.
        ceStats = targets[0].ComputingShare
        gLogger.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
        gLogger.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
        result['RunningJobs'] = ceStats.RunningJobs
        result['WaitingJobs'] = ceStats.WaitingJobs
    else:
        # The system which works properly at present for ARC CEs that are configured correctly.
        # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
        cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
            self.ceHost, vo.lower())
        res = shellCall(0, cmd)
        if not res['OK']:
            gLogger.debug("Could not query CE %s - is it down?"
                          % self.ceHost)
            return res
        try:
            # parse the GlueCEState* attributes out of the raw ldapsearch output
            ldapValues = res['Value'][1].split("\n")
            running = [lValue for lValue in ldapValues if 'GlueCEStateRunningJobs' in lValue]
            waiting = [lValue for lValue in ldapValues if 'GlueCEStateWaitingJobs' in lValue]
            result['RunningJobs'] = int(running[0].split(":")[1])
            result['WaitingJobs'] = int(waiting[0].split(":")[1])
        except IndexError:
            # the expected attributes were missing from the ldap response
            res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
            return res
    return result
def setUp(self):
    """Prepare a credential-less UserConfig and a TEST-plugin CE endpoint."""
    self.usercfg = arc.UserConfig(
        arc.initializeCredentialsType(
            arc.initializeCredentialsType.SkipCredentials))
    endpoint = arc.Endpoint()
    endpoint.URLString = "test.nordugrid.org"
    endpoint.InterfaceName = "org.nordugrid.tirtest"
    self.ce = endpoint
    control = arc.TargetInformationRetrieverPluginTESTControl
    control.delay = 0
    control.targets = [arc.ComputingServiceType()]
    control.status = arc.EndpointQueryingStatus(
        arc.EndpointQueryingStatus.SUCCESSFUL)
def test_constructor_returns_immediately(self):
    """The retriever constructor and addEndpoint must be asynchronous:
    they return before the (deliberately blocked) plugin delivers anything."""
    arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
        [arc.Endpoint()])
    arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    # the TEST plugin blocks on this condition until it is signalled
    self.condition = arc.SimpleCondition()
    arc.ServiceEndpointRetrieverPluginTESTControl.condition.push_back(
        self.condition)
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    container = arc.EndpointContainer()
    self.retriever.addConsumer(container)
    registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                            "org.nordugrid.sertest")
    self.retriever.addEndpoint(registry)
    # the endpoint should not arrive yet
    self.expect(container).to_have(0).endpoints()
    self.condition.signal()
    # we are not interested in it anymore
    self.retriever.removeConsumer(container)
    # we must wait until self.retriever is done otherwise 'condition' will go out of scope while being used.
    self.retriever.wait()
def test_recursivity_with_filtering(self):
    """Recursive discovery with a capability filter must recurse into found
    registries while only delivering endpoints that match the filter.

    The TEST plugin answers each registry query (the initial registry and
    the emir registry discovered through it) with one registry and one
    computing element, so the computing element arrives twice and the
    registries are filtered out of the delivered results.
    """
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    # The original queued two verbatim copies of the same answer;
    # a loop removes the duplication without changing what is queued.
    for _ in range(2):
        control.endpoints.push_back([
            arc.Endpoint("emir.nordugrid.org", arc.Endpoint.REGISTRY,
                         "org.nordugrid.sertest"),
            arc.Endpoint("ce.nordugrid.org", arc.Endpoint.COMPUTINGINFO,
                         "org.ogf.glue.emies.resourceinfo"),
        ])
        control.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

    # recursive = True, only deliver computing-element information endpoints
    options = arc.ServiceEndpointQueryOptions(
        True, ["information.discovery.resource"])
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
    container = arc.EndpointContainer()
    self.retriever.addConsumer(container)
    registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                            "org.nordugrid.sertest")
    self.retriever.addEndpoint(registry)
    self.retriever.wait()
    # expect to only get the ce.nordugrid.org, but that will be there twice
    # once from test.nordugrid.org, once from emir.nordugrid.org
    self.expect(container).to_have(2).endpoints()
    emirs = [
        endpoint for endpoint in container if "emir" in endpoint.URLString
    ]
    ces = [
        endpoint for endpoint in container if "ce" in endpoint.URLString
    ]
    self.expect(emirs).to_have(0).endpoints()
    self.expect(ces).to_have(2).endpoints()
def test_status_of_typeless_registry(self):
    """Querying a registry without an interface type still yields a status."""
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    control.endpoints.push_back(arc.EndpointList())
    control.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    sink = arc.EndpointContainer()
    self.retriever.addConsumer(sink)
    typeless_registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY)
    self.retriever.addEndpoint(typeless_registry)
    self.retriever.wait()
    final_status = self.retriever.getStatusOfEndpoint(typeless_registry)
    self.expect(final_status).to_be(arc.EndpointQueryingStatus.SUCCESSFUL)
def test_deleting_the_consumer_before_the_retriever(self):
    """Destroying a consumer right after removeConsumer() must be safe even
    while the retriever is still running — wait() afterwards must not touch
    the removed consumer."""
    arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
        arc.EndpointList())
    arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    container = arc.EndpointContainer()
    self.retriever.addConsumer(container)
    registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                            "org.nordugrid.sertest")
    self.retriever.addEndpoint(registry)
    # detach and destroy the consumer before the query is finished
    self.retriever.removeConsumer(container)
    del container
    self.retriever.wait()
def test_empty_registry_type(self):
    """A registry without an interface type is queried with every available
    plugin; the TEST plugin among them returns no endpoints."""
    control = arc.ServiceEndpointRetrieverPluginTESTControl
    control.endpoints.push_back(arc.EndpointList())
    control.status.push_back(
        arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
    self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
    collected = arc.EndpointContainer()
    self.retriever.addConsumer(collected)
    untyped_registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY)
    self.retriever.addEndpoint(untyped_registry)
    self.retriever.wait()
    # the empty type is filled with the available plugins: among them the
    # TEST plugin which doesn't return any endpoint
    self.expect(collected).to_have(0).endpoint()
def setUp(self):
    """Prepare a credential-less UserConfig and a COMPUTINGINFO-capable
    TEST-plugin CE endpoint backed by one test target."""
    self.usercfg = arc.UserConfig(
        arc.initializeCredentialsType(
            arc.initializeCredentialsType.SkipCredentials))
    endpoint = arc.Endpoint()
    endpoint.URLString = "test.nordugrid.org"
    endpoint.InterfaceName = "org.nordugrid.tirtest"
    endpoint.Capability.append(
        arc.Endpoint_GetStringForCapability(arc.Endpoint.COMPUTINGINFO))
    self.ce = endpoint
    control = arc.TargetInformationRetrieverPluginTESTControl
    control.delay = 0
    control.targets = [self.create_test_target()]
    control.status = arc.EndpointQueryingStatus(
        arc.EndpointQueryingStatus.SUCCESSFUL)
def example():
    """Submit /bin/hostname directly to execution targets, without a broker."""
    # UserConfig with the user's proxy and the trusted CA directory
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Query the CE's GLUE2 information system for its execution targets
    info_endpoint = arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                                 "org.nordugrid.ldapglue2")
    retriever = arc.ComputingServiceRetriever(uc, [info_endpoint])
    retriever.wait()

    # Shuffle the targets to simulate a random broker
    candidates = list(retriever.GetExecutionTargets())
    random.shuffle(candidates)

    # A minimal job description: run hostname, capture stdout
    jobdesc = arc.JobDescription()
    jobdesc.Application.Executable.Path = "/bin/hostname"
    jobdesc.Application.Output = "stdout.txt"

    # empty job object which will contain our submitted job
    job = arc.Job()
    success = False
    for target in candidates:
        sys.stdout.write("Trying to submit to %s (%s) ... " %
                         (target.ComputingEndpoint.URLString,
                          target.ComputingEndpoint.InterfaceName))
        sys.stdout.flush()
        success = target.Submit(uc, jobdesc, job)
        if success:
            sys.stdout.write("succeeded!\n")
            break
        sys.stdout.write("failed!\n")

    if success:
        sys.stdout.write("Job was submitted:\n")
        job.SaveToStream(arc.CPyOstream(sys.stdout), False)
    else:
        sys.stdout.write("Job submission failed\n")
def getCEStatus(self):
    """Method to return information on running and pending jobs.

    Queries the CE's information system for the ExecutionTarget matching
    the configured queue and reads its ComputingShare statistics.

    :return: S_OK dict with 'SubmittedJobs', 'RunningJobs' and 'WaitingJobs',
             or S_ERROR on failure
    """
    result = self._prepareProxy()
    if not result["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
        return result
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    # Creating an endpoint
    endpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

    # Get the ExecutionTargets of the ComputingElement (Can be REST, EMI-ES or GRIDFTP)
    retriever = arc.ComputingServiceRetriever(self.usercfg, [endpoint])
    retriever.wait()  # Takes a bit of time to get and parse the ldap information
    targetsWithQueues = retriever.GetExecutionTargets()

    # Targets also include queues
    # Some of them might be used by different VOs
    targets = []
    for target in targetsWithQueues:
        if target.ComputingShare.Name == self.arcQueue:
            self.log.debug(
                "Adding target:",
                "%s (%s)" % (target.ComputingEndpoint.URLString, target.ComputingEndpoint.InterfaceName),
            )
            targets.append(target)

    # BUGFIX: targets[0] below raised IndexError when no ComputingShare
    # matched the configured queue; fail gracefully instead.
    if not targets:
        return S_ERROR("No ComputingShare matching queue %s found on CE %s" % (self.arcQueue, self.ceHost))

    # We extract stat from the AREX service (targets[0])
    ceStats = targets[0].ComputingShare
    self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
    self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))

    result = S_OK()
    result["SubmittedJobs"] = 0
    result["RunningJobs"] = ceStats.RunningJobs
    result["WaitingJobs"] = ceStats.WaitingJobs
    return result
usercfg = arc.UserConfig("", "") # Two simple job descriptions which output hostname to stdout jobdescstring = "+(&(executable=/bin/hostname)(stdout=stdout))(&(executable=/bin/hostname)(stdout=stdout))" # Parse job description jobdescs = arc.JobDescriptionList() if not arc.JobDescription_Parse(jobdescstring, jobdescs): logger.msg(arc.ERROR, "Invalid job description") sys.exit(1) # Use 'arc.JobDescription_ParseFromFile("helloworld.xrsl", jobdescs)' # to parse job description from file. # Use top-level NorduGrid information index to find resources index = arc.Endpoint("nordugrid.org", arc.Endpoint.REGISTRY, "org.nordugrid.archery") services = arc.EndpointList(1, index) # Do the submission jobs = arc.JobList() submitter = arc.Submitter(usercfg) if submitter.BrokeredSubmit(services, jobdescs, jobs) != arc.SubmissionStatus.NONE: logger.msg(arc.ERROR, "Failed to submit job") sys.exit(1) # Write information on submitted job to local job list (~/.arc/jobs.xml) jobList = arc.JobInformationStorageSQLite(usercfg.JobListFile()) if not jobList.Write(jobs): logger.msg(arc.WARNING, "Failed to write to local job list %s" % usercfg.JobListFile())
def _arc_submit(self, xrsl, arcces, userconfig, log):
    '''Check the available CEs and submit

    :param xrsl: job description (XRSL) to submit
    :param arcces: iterable of (ce_endpoint, ce_queue) pairs to consider
    :param userconfig: arc.UserConfig used for queries and submission
    :param log: logger
    :raises Exception: if no matching queue is found or the job description
        cannot be parsed
    '''
    queuelist = []

    for arcce in arcces:
        (ce_endpoint, ce_queue) = arcce
        aris = arc.URL(str(ce_endpoint))
        ce_host = aris.Host()
        if aris.Protocol() == 'https':
            # https CE: query the AREX EMI-ES resource-info endpoint
            aris.ChangePath('/arex')
            infoendpoints = [
                arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                             'org.ogf.glue.emies.resourceinfo')
            ]
        else:
            # otherwise query the classic LDAP information system
            aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
            infoendpoints = [
                arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                             'org.nordugrid.ldapng')
            ]

        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        for target in targets:
            if not target.ComputingService.ID:
                log.info(
                    "Target {0} does not have ComputingService ID defined, skipping"
                    .format(target.ComputingService.Name))
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
               and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                log.debug(
                    "Rejecting target interface {0} because not EMI-ES".
                    format(target.ComputingEndpoint.InterfaceName))
                continue
            # Check for matching host and queue
            targethost = re.sub(
                ':arex$', '',
                re.sub('urn:ogf:ComputingService:', '',
                       target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if targethost != ce_host:
                log.debug(
                    'Rejecting target host {0} as it does not match {1}'.
                    format(targethost, ce_host))
                continue
            if targetqueue != ce_queue:
                log.debug(
                    'Rejecting target queue {0} as it does not match {1}'.
                    format(targetqueue, ce_queue))
                continue

            queuelist.append(target)
            log.debug("Adding target {0}:{1}".format(
                targethost, targetqueue))

    # check if any queues are available, if not leave and try again next time
    if not queuelist:
        raise Exception("No free queues available")

    log.debug("preparing submission")
    jobdescs = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(str(xrsl), jobdescs):
        raise Exception("Failed to prepare job description")

    # Run the submission in a separate thread
    thr = SubmitThr(queuelist, jobdescs, userconfig)
    return self._run_submit(thr)
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """Submit one or more pilot jobs to the ARC CE via gridftp.

    :param executableFile: path of the executable wrapper to submit
    :param proxy: unused here; the proxy is set up via self._prepareProxy()
    :param numberOfJobs: number of identical jobs to submit
    :return: S_OK(list of pilot references) carrying 'PilotStampDict',
             or S_ERROR if nothing was submitted
    """
    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-", 2)[2]
    result = self._prepareProxy()
    # BUGFIX: check the proxy setup result *before* reading
    # os.environ['X509_USER_PROXY'], which may be unset when it failed.
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    gLogger.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        # make the wrapper executable for user/group/others (bitwise OR, not +)
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    batchIDList = []
    stampDict = {}

    endpoint = arc.Endpoint(self.ceHost + ":2811/jobs", arc.Endpoint.JOBSUBMIT,
                            "org.nordugrid.gridftpjob")

    # Submit jobs iteratively for now. Tentatively easier than mucking around
    # with the JobSupervisor class
    for __i in range(numberOfJobs):
        # The basic job description
        jobdescs = arc.JobDescriptionList()
        # Get the job into the ARC way
        xrslString, diracStamp = self.__writeXRSL(executableFile)
        gLogger.debug("XRSL string submitted : %s" % xrslString)
        gLogger.debug("DIRAC stamp for job : %s" % diracStamp)
        if not arc.JobDescription_Parse(xrslString, jobdescs):
            gLogger.error("Invalid job description")
            break
        # Submit the job
        jobs = arc.JobList()  # filled by the submit process
        submitter = arc.Submitter(self.usercfg)
        result = submitter.Submit(endpoint, jobdescs, jobs)
        # Save info or else ..else.
        if result == arc.SubmissionStatus.NONE:
            # Job successfully submitted
            pilotJobReference = jobs[0].JobID
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
            gLogger.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
        else:
            # Spell out every failure flag the bindings can set
            message = "Failed to submit job because "
            if result.isSet(arc.SubmissionStatus.NOT_IMPLEMENTED):
                gLogger.warn("%s feature not implemented on CE? (weird I know - complain to site admins" % message)
            if result.isSet(arc.SubmissionStatus.NO_SERVICES):
                gLogger.warn("%s no services are running on CE? (open GGUS ticket to site admins" % message)
            if result.isSet(arc.SubmissionStatus.ENDPOINT_NOT_QUERIED):
                gLogger.warn("%s endpoint was not even queried. (network ..?)" % message)
            if result.isSet(arc.SubmissionStatus.BROKER_PLUGIN_NOT_LOADED):
                gLogger.warn("%s BROKER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message)
            if result.isSet(arc.SubmissionStatus.DESCRIPTION_NOT_SUBMITTED):
                gLogger.warn("%s Job not submitted - incorrect job description? (missing field in XRSL string?)" % message)
            if result.isSet(arc.SubmissionStatus.SUBMITTER_PLUGIN_NOT_LOADED):
                gLogger.warn("%s SUBMITTER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message)
            if result.isSet(arc.SubmissionStatus.AUTHENTICATION_ERROR):
                gLogger.warn("%s authentication error - screwed up / expired proxy? Renew / upload pilot proxy on machine?" % message)
            if result.isSet(arc.SubmissionStatus.ERROR_FROM_ENDPOINT):
                gLogger.warn("%s some error from the CE - possibly CE problems?" % message)
            gLogger.warn("%s ... maybe above messages will give a hint." % message)
            break  # Boo hoo *sniff*

    if batchIDList:
        result = S_OK(batchIDList)
        result['PilotStampDict'] = stampDict
    else:
        result = S_ERROR('No pilot references obtained from the ARC job submission')
    return result
def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=None):
    """Method to submit job

    :param executableFile: path of the executable wrapper to submit
    :param proxy: unused here; the proxy is set up via self._prepareProxy()
    :param numberOfJobs: number of identical jobs to submit
    :param inputs: extra input files passed through to the XRSL writer
    :param outputs: extra output files passed through to the XRSL writer
    :return: S_OK(list of pilot references) carrying 'PilotStampDict',
             or S_ERROR if nothing was submitted
    """
    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-", 2)[2]
    result = self._prepareProxy()
    if not result["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
        return result
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        # make the wrapper executable for user/group/others
        os.chmod(
            executableFile,
            stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH + stat.S_IXOTH)

    executables = None
    if self.preamble:
        # wrap the executable with the configured preamble script
        executables = [executableFile]
        executableFile = self._bundlePreamble(executableFile)

    batchIDList = []
    stampDict = {}

    # choose the submission endpoint matching the configured protocol
    if self.endpointType == "Gridftp":
        endpoint = arc.Endpoint(str(self.ceHost + ":2811/jobs"),
                                arc.Endpoint.JOBSUBMIT,
                                "org.nordugrid.gridftpjob")
    else:
        endpoint = arc.Endpoint(
            str("https://" + self.ceHost + ":8443/arex"),
            arc.Endpoint.JOBSUBMIT,
            "org.ogf.glue.emies.activitycreation",
        )

    # Submit jobs iteratively for now. Tentatively easier than mucking
    # around with the JobSupervisor class
    for __i in range(numberOfJobs):
        # The basic job description
        jobdescs = arc.JobDescriptionList()
        # Get the job into the ARC way
        xrslString, diracStamp = self._writeXRSL(executableFile, inputs, outputs, executables)
        self.log.debug("XRSL string submitted : %s" % xrslString)
        self.log.debug("DIRAC stamp for job : %s" % diracStamp)
        # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast
        result = arc.JobDescription_Parse(str(xrslString), jobdescs)
        if not result:
            self.log.error("Invalid job description", "%r, message=%s" % (xrslString, result.str()))
            break
        # Submit the job
        jobs = arc.JobList()  # filled by the submit process
        submitter = arc.Submitter(self.usercfg)
        result = submitter.Submit(endpoint, jobdescs, jobs)
        # Save info or else ..else.
        if result == arc.SubmissionStatus.NONE:
            # Job successfully submitted
            pilotJobReference = jobs[0].JobID
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
            self.log.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
        else:
            self._analyzeSubmissionError(result)
            break  # Boo hoo *sniff*

    if self.preamble:
        # remove the temporary bundled wrapper created above
        os.unlink(executableFile)

    if batchIDList:
        result = S_OK(batchIDList)
        result["PilotStampDict"] = stampDict
    else:
        result = S_ERROR(
            "No pilot references obtained from the ARC job submission")
    return result
def submit(self):
    """Main function to submit jobs.

    Pulls 'tosubmit' jobs from the DB per (fairshare, proxyid), queries the
    information system for matching execution targets, and submits via a
    multiprocessing pool. Raises ExceptInterrupt(15) to force a restart when
    a submission worker hangs or when multiple proxies exist (proxy bug).
    """

    global queuelist

    # check for stopsubmission flag
    if self.conf.get(['downtime', 'stopsubmission']) == "true":
        self.log.info('Submission suspended due to downtime')
        return

    # check for any site-specific limits or status
    clusterstatus = self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["status"]) or 'online'
    if clusterstatus == 'offline':
        self.log.info('Site status is offline')
        return

    clustermaxjobs = int(self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["maxjobs"]) or 999999)
    nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'")
    if nsubmitted >= clustermaxjobs:
        self.log.info(f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}')
        return

    # Get cluster host and queue: cluster/queue
    clusterhost = clusterqueue = None
    if self.cluster:
        cluster = self.cluster
        if cluster.find('://') == -1:
            cluster = 'gsiftp://' + cluster
        clusterurl = arc.URL(cluster)
        clusterhost = clusterurl.Host()
        clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

    # Apply fair-share
    if self.cluster:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'",
            ['fairshare', 'proxyid'])
    else:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist=''", ['fairshare', 'proxyid'])

    if not fairshares:
        self.log.info('Nothing to submit')
        return

    # split by proxy for GU queues
    fairshares = list(set([(p['fairshare'], p['proxyid']) for p in fairshares]))
    # For proxy bug - see below
    shuffle(fairshares)

    for fairshare, proxyid in fairshares:

        # apply maxjobs limit (check above should make sure greater than zero)
        # Note: relies on exit after first loop
        limit = min(clustermaxjobs - nsubmitted, 10)

        try:
            # catch any exceptions here to avoid leaving lock
            if self.cluster:
                # Lock row for update in case multiple clusters are specified
                #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}"
                    .format(self.cluster, fairshare, proxyid, limit),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist"],
                    lock=True)
                if jobs:
                    self.log.debug("started lock for writing %d jobs" % len(jobs))
            else:
                # NOTE(review): quoting looks wrong here — the closing quote after
                # '{0}' is misplaced, so fairshare='{0} and proxyid={1}' becomes one
                # SQL string literal; confirm against the DB schema/intent.
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and clusterlist='' and fairshare='{0} and proxyid={1}' limit {2}"
                    .format(fairshare, proxyid, limit),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist"])
            # mark submitting in db
            jobs_taken = []
            for j in jobs:
                jd = {'cluster': self.cluster, 'arcstate': 'submitting', 'tarcstate': self.db.getTimeStamp()}
                self.db.updateArcJobLazy(j['id'], jd)
                jobs_taken.append(j)
            jobs = jobs_taken
        finally:
            # Always release the row lock, even if the selection/update failed.
            if self.cluster:
                try:
                    self.db.Commit(lock=True)
                    self.log.debug("ended lock")
                except:
                    self.log.warning("Failed to release DB lock")
            else:
                self.db.Commit()

        if len(jobs) == 0:
            #self.log.debug("No jobs to submit")
            continue
        self.log.info("Submitting %d jobs for fairshare %s and proxyid %d" % (len(jobs), fairshare, proxyid))

        # max waiting priority
        try:
            maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority']
        except:
            maxpriowaiting = 0
        self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting)

        # Query infosys - either local or index
        if self.cluster:
            if self.cluster.find('://') != -1:
                aris = arc.URL(self.cluster)
            else:
                aris = arc.URL('gsiftp://%s' % self.cluster)
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.ogf.glue.emies.resourceinfo')]
            elif aris.Protocol() == 'local':
                infoendpoints = [arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.local')]
            else:
                # Fall back to the classic LDAP info endpoint.
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
        else:
            giises = self.conf.getList(['atlasgiis', 'item'])
            infoendpoints = []
            for g in giises:
                # Specify explicitly EGIIS
                infoendpoints.append(arc.Endpoint(str(g), arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis"))

        # Set UserConfig credential for querying infosys
        proxystring = str(self.db.getProxy(proxyid))
        self.uc.CredentialString(proxystring)
        global usercred
        usercred = self.uc
        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        queuelist = []
        for target in targets:
            if not target.ComputingService.ID:
                self.log.info("Target %s does not have ComputingService ID defined, skipping" % target.ComputingService.Name)
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                self.log.debug("Rejecting target interface %s because not EMI-ES" % target.ComputingEndpoint.InterfaceName)
                continue
            # Check for matching host and queue
            targethost = re.sub(':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if clusterhost and targethost != clusterhost:
                self.log.debug('Rejecting target host %s as it does not match %s' % (targethost, clusterhost))
                continue
            if clusterqueue and targetqueue != clusterqueue:
                self.log.debug('Rejecting target queue %s as it does not match %s' % (targetqueue, clusterqueue))
                continue
            if targetqueue in self.conf.getList(['queuesreject', 'item']):
                self.log.debug('Rejecting target queue %s in queuesreject list' % targetqueue)
                continue
            elif targethost in self.conf.getList(['clustersreject', 'item']):
                self.log.debug('Rejecting target host %s in clustersreject list' % targethost)
                continue
            else:
                # tmp hack
                target.ComputingShare.LocalWaitingJobs = 0
                target.ComputingShare.PreLRMSWaitingJobs = 0
                target.ExecutionEnvironment.CPUClockSpeed = 2000
                qjobs = self.db.getArcJobsInfo("cluster='" + str(self.cluster) + "' and  arcstate='submitted' and fairshare='%s'" % fairshare, ['id', 'priority'])
                rjobs = self.db.getArcJobsInfo("cluster='" + str(self.cluster) + "' and  arcstate='running' and fairshare='%s'" % fairshare, ['id'])

                # max queued priority
                try:
                    maxprioqueued = max(qjobs, key=lambda x: x['priority'])['priority']
                except:
                    maxprioqueued = 0
                self.log.info("Max priority queued: %d" % maxprioqueued)

                # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares)
                # Note: assumes only a few shares are used
                qfraction = float(self.conf.get(['jobs', 'queuefraction'])) if self.conf.get(['jobs', 'queuefraction']) else 0.15
                qoffset = int(self.conf.get(['jobs', 'queueoffset'])) if self.conf.get(['jobs', 'queueoffset']) else 100
                jlimit = len(rjobs) * qfraction + qoffset / len(fairshares)
                self.log.debug("running %d, queued %d, queue limit %d" % (len(rjobs), len(qjobs), jlimit))

                # Per-cluster hard-coded overrides of the queue limit.
                if str(self.cluster).find('arc-boinc-0') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                if str(self.cluster).find('XXXpikolit') != -1:
                    jlimit = len(rjobs) * 0.15 + 100
                if str(self.cluster).find('arc05.lcg') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)):
                    if maxpriowaiting > maxprioqueued:
                        self.log.info("Overriding limit, maxpriowaiting: %d > maxprioqueued: %d" % (maxpriowaiting, maxprioqueued))
                    queuelist.append(target)
                    self.log.debug("Adding target %s:%s" % (targethost, targetqueue))
                else:
                    self.log.info("%s/%s already at limit of submitted jobs for fairshare %s" % (targethost, targetqueue, fairshare))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            self.log.info("No free queues available")
            self.db.Commit()
            continue

        self.log.info("start submitting")

        # Just run one thread for each job in sequence. Strange things happen
        # when trying to create a new UserConfig object for each thread.
        tasks = []
        for j in jobs:
            self.log.debug("%s: preparing submission" % j['appjobid'])
            jobdescstr = str(self.db.getArcJobDescription(str(j['jobdesc'])))
            jobdescs = arc.JobDescriptionList()
            if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
                self.log.error("%s: Failed to prepare job description" % j['appjobid'])
                continue
            tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring, int(self.conf.get(['atlasgiis', 'timeout']))))

        npools = 1
        if any(s in self.cluster for s in self.conf.getList(['parallelsubmit', 'item'])):
            npools = int(self.conf.get(['parallelsubmit', 'npools']))
        self.log.debug("Starting submitters: %s" % npools)

        pool = multiprocessing.Pool(npools)
        #results = []
        #for task in tasks:
        #    result = pool.apply_async(Submit,(task))
        #    results.append(result)
        # Submit in workers
        # Note: (t) is not a 1-tuple, so the 5-tuple t is unpacked as
        # Submit's five positional arguments.
        results = [pool.apply_async(Submit, (t)) for t in tasks]

        # timeout per submission
        timeout = 60
        stopflag = False
        for result, task in zip(results, tasks):
            try:
                jdb = result.get(timeout)
                jconv = JobConv()
                job = jconv.db2job(jdb)
            except multiprocessing.TimeoutError:
                self.log.error("%s: submission timeout: exit and try again" % task[1])
                # abort submission if Submit process is stuck
                #pool.terminate()
                KillPool(pool)
                pool.join()
                stopflag = True
                # reduce timeout to finish quickly
                timeout = 0.1
                continue
            if job is None:
                self.log.error("%s: no job defined for %d" % (task[1], task[0]))
                continue
            jd = {}
            jd['arcstate'] = 'submitted'
            # initial offset to 1 minute to force first status check
            jd['tarcstate'] = self.db.getTimeStamp(time.time() - int(self.conf.get(['jobs', 'checkinterval'])) + 120)
            jd['tstate'] = self.db.getTimeStamp()
            # extract hostname of cluster (depends on JobID being a URL)
            self.log.info("%s: job id %s" % (task[1], job.JobID))
            jd['cluster'] = self.cluster
            self.db.updateArcJobLazy(task[0], jd, job)
        if not stopflag:
            pool.terminate()
            pool.join()
        else:
            # stop submitting, gsiftp connection problem likely
            raise ExceptInterrupt(15)

        self.log.info("threads finished")
        # commit transaction to release row locks
        self.db.Commit()

        # still proxy bug - exit if there are multiple proxies
        if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1:
            raise ExceptInterrupt(15)

    self.log.info("end submitting")
    return
# Simple job description which outputs hostname to stdout jobdescstring = "&(executable=/bin/hostname)(stdout=stdout)" # Parse job description jobdescs = arc.JobDescriptionList() if not arc.JobDescription_Parse(jobdescstring, jobdescs): logger.msg(arc.ERROR, "Invalid job description") sys.exit(1) # Use 'arc.JobDescription_ParseFromFile("helloworld.xrsl", jobdescs)' # to parse job description from file. # Use top-level NorduGrid information index to find resources index = arc.Endpoint( "ldap://index1.nordugrid.org:2135/Mds-Vo-name=NorduGrid,o=grid", arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis") services = arc.EndpointList(1, index) # Do the submission jobs = arc.JobList() submitter = arc.Submitter(usercfg) if submitter.BrokeredSubmit(services, jobdescs, jobs) != arc.SubmissionStatus.NONE: logger.msg(arc.ERROR, "Failed to submit job") sys.exit(1) # Write information on submitted job to local job list (~/.arc/jobs.xml) jobList = arc.JobInformationStorageXML(usercfg.JobListFile()) if not jobList.Write(jobs): logger.msg(arc.WARNING, "Failed to write to local job list %s",
def getCEStatus(self):
    """ Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.

        :return: S_OK dict carrying 'SubmittedJobs' (always 0 here),
                 'RunningJobs' and 'WaitingJobs', or S_ERROR on failure
    """
    result = self._prepareProxy()
    if not result['OK']:
        self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
        vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
        # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
        # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
        endpoints = [arc.Endpoint(str("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid"),
                                  arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
        retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
        retriever.wait()  # Takes a bit of time to get and parse the ldap information
        targets = retriever.GetExecutionTargets()
        ceStats = targets[0].ComputingShare
        self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
        self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
        result['RunningJobs'] = ceStats.RunningJobs
        result['WaitingJobs'] = ceStats.WaitingJobs
    else:
        # The system which works properly at present for ARC CEs that are configured correctly.
        # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
        # cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
        #     self.ceHost, vo.lower())
        if not self.queue:
            self.log.error('ARCComputingElement: No queue ...')
            res = S_ERROR('Unknown queue (%s) failure for site %s' % (self.queue, self.ceHost))
            return res
        # Build a shell pipeline: find the GLUE2 share mapped to this VO/queue,
        # then fetch its running/waiting counters from the CE's ldap.
        cmd1 = "ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135 -b \'o=glue\' " % self.ceHost
        cmd2 = '"(&(objectClass=GLUE2MappingPolicy)(GLUE2PolicyRule=vo:%s))"' % vo.lower()
        cmd3 = ' | grep GLUE2MappingPolicyShareForeignKey | grep %s' % (self.queue.split("-")[-1])
        cmd4 = ' | sed \'s/GLUE2MappingPolicyShareForeignKey: /GLUE2ShareID=/\' '
        cmd5 = ' | xargs -L1 ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135 -b \'o=glue\' ' % self.ceHost
        cmd6 = ' | egrep \'(ShareWaiting|ShareRunning)\''
        res = shellCall(0, cmd1 + cmd2 + cmd3 + cmd4 + cmd5 + cmd6)
        if not res['OK']:
            self.log.debug("Could not query CE %s - is it down?" % self.ceHost)
            return res
        try:
            ldapValues = res['Value'][1].split("\n")
            running = [lValue for lValue in ldapValues if 'GLUE2ComputingShareRunningJobs' in lValue]
            waiting = [lValue for lValue in ldapValues if 'GLUE2ComputingShareWaitingJobs' in lValue]
            result['RunningJobs'] = int(running[0].split(":")[1])
            result['WaitingJobs'] = int(waiting[0].split(":")[1])
        except (IndexError, ValueError):
            # IndexError: expected attribute missing from the ldap output.
            # ValueError: attribute present but not an integer — previously
            # this escaped the handler and crashed the method.
            res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
            return res

    return result
def submit(self):
    """Main function to submit jobs.

    Older threaded variant: selects 'tosubmit' jobs per fairshare, queries
    the infosys for execution targets, and submits one SubmitThr thread per
    job in sequence. Returns the number of jobs handed to submission threads.
    Raises ExceptInterrupt(15) on the EMI-ES proxy problem (bug 3685).
    """

    global queuelist

    # check for stopsubmission flag
    if self.conf.get(['downtime', 'stopsubmission']) == "true":
        self.log.info('Submission suspended due to downtime')
        return 0

    # Get cluster host and queue: cluster/queue
    clusterhost = clusterqueue = None
    if self.cluster:
        cluster = self.cluster
        if cluster.find('://') == -1:
            cluster = 'gsiftp://' + cluster
        clusterurl = arc.URL(cluster)
        clusterhost = clusterurl.Host()
        clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

    # Apply fair-share
    if self.cluster:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'", ['fairshare'])
    else:
        fairshares = self.db.getArcJobsInfo("arcstate='tosubmit' and clusterlist=''", ['fairshare'])

    if not fairshares:
        self.log.info('Nothing to submit')
        return 0

    fairshares = list(set([p['fairshare'] for p in fairshares]))
    # For EMI-ES proxy bug - see below
    shuffle(fairshares)
    count = 0

    for fairshare in fairshares:

        try:
            # catch any exceptions here to avoid leaving lock
            if self.cluster:
                # Lock row for update in case multiple clusters are specified
                #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10"
                    .format(self.cluster, fairshare),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid"], lock=True)
                if jobs:
                    self.log.debug("started lock for writing %d jobs" % len(jobs))
            else:
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10".format(fairshare),
                    columns=["id", "jobdesc", "appjobid", "priority"])
            # mark submitting in db
            jobs_taken = []
            for j in jobs:
                jd = {'cluster': self.cluster, 'arcstate': 'submitting', 'tarcstate': self.db.getTimeStamp()}
                self.db.updateArcJobLazy(j['id'], jd)
                jobs_taken.append(j)
            jobs = jobs_taken
        finally:
            # Always release the row lock, even if selection/update failed.
            if self.cluster:
                try:
                    self.db.Commit(lock=True)
                    self.log.debug("ended lock")
                except:
                    self.log.warning("Failed to release DB lock")
            else:
                self.db.Commit()

        if len(jobs) == 0:
            #self.log.debug("No jobs to submit")
            continue
        self.log.info("Submitting %d jobs for fairshare %s" % (len(jobs), fairshare))

        # max waiting priority
        try:
            maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority']
        except:
            maxpriowaiting = 0
        self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting)

        # Query infosys - either local or index
        if self.cluster:
            if self.cluster.find('://') != -1:
                aris = arc.URL(self.cluster)
            else:
                aris = arc.URL('gsiftp://%s' % self.cluster)
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.ogf.glue.emies.resourceinfo')]
            elif aris.Protocol() == 'local':
                infoendpoints = [arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.local')]
            else:
                # Fall back to the classic LDAP info endpoint.
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
        else:
            giises = self.conf.getList(['atlasgiis', 'item'])
            infoendpoints = []
            for g in giises:
                # Specify explicitly EGIIS
                infoendpoints.append(arc.Endpoint(str(g), arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis"))

        # Set UserConfig credential for each proxy. Assumes that any proxy
        # in the fairshare can query the CE infosys
        self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid']))
        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        queuelist = []
        for target in targets:
            if not target.ComputingService.ID:
                self.log.info("Target %s does not have ComputingService ID defined, skipping" % target.ComputingService.Name)
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                self.log.debug("Rejecting target interface %s because not EMI-ES" % target.ComputingEndpoint.InterfaceName)
                continue
            # Check for matching host and queue
            targethost = re.sub(':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if clusterhost and targethost != clusterhost:
                self.log.debug('Rejecting target host %s as it does not match %s' % (targethost, clusterhost))
                continue
            if clusterqueue and targetqueue != clusterqueue:
                self.log.debug('Rejecting target queue %s as it does not match %s' % (targetqueue, clusterqueue))
                continue
            if targetqueue in self.conf.getList(['queuesreject', 'item']):
                self.log.debug('Rejecting target queue %s in queuesreject list' % targetqueue)
                continue
            elif targethost in self.conf.getList(['clustersreject', 'item']):
                self.log.debug('Rejecting target host %s in clustersreject list' % targethost)
                continue
            else:
                # tmp hack
                target.ComputingShare.LocalWaitingJobs = 0
                target.ComputingShare.PreLRMSWaitingJobs = 0
                target.ExecutionEnvironment.CPUClockSpeed = 2000
                qjobs = self.db.getArcJobsInfo("cluster='" + str(self.cluster) + "' and  arcstate='submitted' and fairshare='%s'" % fairshare, ['id', 'priority'])
                rjobs = self.db.getArcJobsInfo("cluster='" + str(self.cluster) + "' and  arcstate='running' and fairshare='%s'" % fairshare, ['id'])

                # max queued priority
                try:
                    maxprioqueued = max(qjobs, key=lambda x: x['priority'])['priority']
                except:
                    maxprioqueued = 0
                self.log.info("Max priority queued: %d" % maxprioqueued)

                # Set number of submitted jobs to running * 0.15 + 400/num of shares
                # Note: assumes only a few shares are used
                # NOTE(review): comment above says 400/num of shares but the code
                # uses 100 — confirm which is intended.
                jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
                if str(self.cluster).find('arc-boinc-0') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                if str(self.cluster).find('XXXpikolit') != -1:
                    jlimit = len(rjobs) * 0.15 + 100
                if str(self.cluster).find('arc05.lcg') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)):
                    if maxpriowaiting > maxprioqueued:
                        self.log.info("Overriding limit, maxpriowaiting: %d > maxprioqueued: %d" % (maxpriowaiting, maxprioqueued))
                    queuelist.append(target)
                    self.log.debug("Adding target %s:%s" % (targethost, targetqueue))
                else:
                    self.log.info("%s/%s already at limit of submitted jobs for fairshare %s" % (targethost, targetqueue, fairshare))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            self.log.info("No free queues available")
            self.db.Commit()
            # EMI-ES proxy problem - see bug 3685
            if self.cluster and self.cluster.startswith('https://'):
                raise ExceptInterrupt(15)
            continue

        self.log.info("start submitting")

        # Just run one thread for each job in sequence. Strange things happen
        # when trying to create a new UserConfig object for each thread.
        for j in jobs:
            self.log.debug("%s: preparing submission" % j['appjobid'])
            jobdescstr = str(self.db.getArcJobDescription(str(j['jobdesc'])))
            jobdescs = arc.JobDescriptionList()
            if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
                self.log.error("%s: Failed to prepare job description" % j['appjobid'])
                continue
            # TODO: might not work if proxies are different within a share
            # since same uc object is shared among threads
            self.uc.CredentialString(self.db.getProxy(j['proxyid']))
            t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs, self.uc, self.log)
            self.RunThreadsSplit([t], 1)
            count = count + 1

        self.log.info("threads finished")
        # commit transaction to release row locks
        self.db.Commit()
        # EMI-ES proxy problem - see bug 3685
        if self.cluster and self.cluster.startswith('https://'):
            raise ExceptInterrupt(15)

    self.log.info("end submitting")
    return count