def run(self):
    jd = saga.job.description()
    jd.arguments = ["-c", self.bootstrap_script]
    jd.executable = "python"
    jd.working_directory = self.working_directory
    jd.set_attribute("Interactive", "True")

    # Submit job
    js = None
    if self.userproxy != None and self.userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = self.userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", self.userproxy)
        s.add_context(ctx)
        print "use proxy: " + self.userproxy
        js = saga.job.service(s, self.lrms_saga_url)
    else:
        print "use standard proxy"
        js = saga.job.service(self.lrms_saga_url)

    sgesshjob = js.create_job(jd)
    print "Submit pilot job to: " + str(self.lrms_saga_url)
    sgesshjob.run()
    sgesshjob.wait()

    outstr = (sgesshjob.get_stdout().read()).rstrip()
    errstr = sgesshjob.get_stderr().read()
    print "Output - \n" + str(outstr)
    self.job_id = outstr.split("\n")[-1]
    print "SGE JobID: " + str(self.job_id)
    if self.job_id == None or self.job_id == "":
        raise Exception("BigJob submission via sge-ssh:// failed: %s %s" % (outstr, errstr))
def test_context_and_session(self):
    """ Test for the saga.context and saga.session API:
        add two contexts to a session and list them.
    """
    try:
        ctx1 = saga.context(saga.attributes.context_server)
        ctx2 = saga.context(saga.attributes.SSH)
        s = saga.session()
        s.add_context(ctx1)
        s.add_context(ctx2)
        l = s.list_contexts()
        print len(l)
        print l[0].type
        print l[1].type
        print ctx1.get_ctype()
        print ctx2.get_ctype()
    except saga.exception, e:
        self.fail(e)
def run(self):
    jd = saga.job.description()
    jd.arguments = ["-c", self.bootstrap_script]
    jd.executable = "python"
    jd.working_directory = self.working_directory
    jd.set_attribute("Interactive", "True")

    # Submit job
    js = None
    if self.userproxy != None and self.userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = self.userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", self.userproxy)
        s.add_context(ctx)
        print "use proxy: " + self.userproxy
        js = saga.job.service(s, self.lrms_saga_url)
    else:
        print "use standard proxy"
        js = saga.job.service(self.lrms_saga_url)

    pbssshjob = js.create_job(jd)
    print "Submit pilot job to: " + str(self.lrms_saga_url)
    pbssshjob.run()
    joboutput = pbssshjob.get_stdout()
    self.job_id = joboutput.read().split(".")[0]
import saga

try:
    c1 = saga.context("x509")
    c1.set_attribute("UserProxy", "/tmp/x509up_u500_cern")
    c2 = saga.context("x509")
    c2.set_attribute("UserProxy", "/tmp/x509up_u500_ncsa")

    s = saga.session()
    s.add_context(c1)
    s.add_context(c2)

    js = saga.job.service(s, saga.url("gram://qb1.loni.org"))
except saga.exception, e:
    print e.get_full_message()
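# A minimal follow-up sketch, not part of the original sources: it extends the example
# above by actually submitting a trivial interactive job through the multi-context job
# service. It uses only SAGA calls that appear elsewhere in these snippets
# (saga.job.description(), set_attribute("Interactive", ...), create_job(), run(),
# wait(), get_stdout()); the URL and proxy path are taken from the example above.
import saga

try:
    c1 = saga.context("x509")
    c1.set_attribute("UserProxy", "/tmp/x509up_u500_cern")

    s = saga.session()
    s.add_context(c1)
    js = saga.job.service(s, saga.url("gram://qb1.loni.org"))

    # describe and run a trivial interactive job
    jd = saga.job.description()
    jd.executable = "/bin/date"
    jd.set_attribute("Interactive", "True")

    job = js.create_job(jd)
    job.run()
    job.wait()
    print job.get_stdout().read()
except saga.exception, e:
    print e.get_full_message()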
def start_pilot_job(self,
                    lrms_url,
                    bigjob_agent_executable=None,
                    number_nodes=1,
                    queue=None,
                    project=None,
                    working_directory=None,
                    userproxy=None,
                    walltime=None,
                    processes_per_node=1,
                    filetransfers=None):
    """ Start a batch job (using the SAGA Job API) at the resource manager.
        Currently, the following resource managers are supported:
        fork://localhost/                   (Default Job Adaptor)
        gram://qb1.loni.org/jobmanager-pbs  (Globus Adaptor)
        pbspro://localhost                  (PBSPro Adaptor)
    """
    if self.job != None:
        raise BigJobError("One BigJob already active. Please stop BigJob first.")
        return

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = saga.url(lrms_url)
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    pilot_url_dict[self.pilot_url] = self
    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    logger.debug("set pilot state to: " + str(Unknown))
    ##############################################################################
    self.number_nodes = int(number_nodes)

    # create job description
    jd = saga.job.description()

    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    if lrms_saga_url.scheme == "condorg":
        jd.arguments = ["-a", self.coordination.get_address(), "-b", self.pilot_url]
        logger.debug("-a %s -b %s" % (self.coordination.get_address(), self.pilot_url))
        agent_exe = os.path.abspath(os.path.join(os.getcwd(), "..", "bootstrap", "bigjob-condor-bootstrap.py"))
        logger.debug(agent_exe)
        jd.executable = agent_exe
    else:
        bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro":
            bootstrap_script = self.escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
        ############ submit pbs script which launches bigjob agent using ssh adaptors ##########
        elif lrms_saga_url.scheme == "pbs-ssh":
            # change the url scheme to ssh to use the ssh adaptor to launch the job
            bootstrap_script = self.escape_ssh(bootstrap_script)
            ### convert walltime in minutes to PBS representation of time ###
            hrs = walltime / 60
            minu = walltime % 60
            walltimepbs = "" + str(hrs) + ":" + str(minu) + ":00"
            if number_nodes % processes_per_node == 0:
                number_nodes = number_nodes / processes_per_node
            else:
                number_nodes = (number_nodes / processes_per_node) + 1
            pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltimepbs, number_nodes,
                             processes_per_node, userproxy, working_directory)
            self.job = pbssshj
            self.job.run()
            return
        elif is_bliss:
            bootstrap_script = self.escape_bliss(bootstrap_script)
        #logger.debug(bootstrap_script)

        if is_bliss == False:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
        else:
            jd.TotalCPUCount = str(int(number_nodes) * int(processes_per_node))

        jd.spmd_variation = "single"
        #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
        jd.arguments = ["-c", bootstrap_script]
        jd.executable = "python"

    if queue != None:
        jd.queue = queue
    if project != None:
        jd.job_project = [project]
    if walltime != None:
        jd.wall_time_limit = str(walltime)

    # XXX Isn't the working directory about the remote site?
    if working_directory != None:
        if not os.path.isdir(working_directory) and lrms_saga_url.scheme == "fork":
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        self.working_directory = os.path.expanduser("~")
    jd.working_directory = self.working_directory

    logger.debug("Working directory: " + jd.working_directory)
    jd.output = os.path.join(self.__get_bigjob_working_dir(), "stdout-bigjob_agent.txt")
    jd.error = os.path.join(self.__get_bigjob_working_dir(), "stderr-bigjob_agent.txt")

    # Stage BJ input files
    # build target url
    bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()
    self.__stage_files(filetransfers, bigjob_working_directory_url)

    # Submit job
    js = None
    if userproxy != None and userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", userproxy)
        s.add_context(ctx)
        logger.debug("use proxy: " + userproxy)
        js = saga.job.service(s, lrms_saga_url)
    else:
        logger.debug("use standard proxy")
        js = saga.job.service(lrms_saga_url)

    self.job = js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
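# A usage sketch for start_pilot_job() above, not part of the original code: the import
# path and the coordination URL are assumptions (BigJob deployments differ), and the
# host, queue, and directory names are placeholders. Note that walltime is given in
# minutes; for the pbs-ssh scheme it is converted to HH:MM:00 inside start_pilot_job(),
# and number_nodes is divided by processes_per_node to obtain the PBS node count.
from bigjob import bigjob                      # assumed import path

bj = bigjob("redis://localhost:6379")          # hypothetical coordination backend URL
bj.start_pilot_job(lrms_url="pbs-ssh://user@cluster.example.org",
                   number_nodes=8,
                   processes_per_node=4,
                   queue="workq",              # placeholder queue name
                   walltime=90,                # minutes
                   working_directory="/tmp/bigjob")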
def start_pilot_job(self, lrms_url, bigjob_agent_executable, number_nodes, queue, project,
                    working_directory, userproxy, walltime, processes_per_node=1):
    if self.job != None:
        raise BigJobError("One BigJob already active. Please stop BigJob first.")
        return

    # register advert entry
    lrms_saga_url = saga.url(lrms_url)
    self.pilot_url = self.app_url.get_string() + "/" + lrms_saga_url.host
    print "create advert entry: " + self.pilot_url
    self.pilot_dir = saga.advert.directory(saga.url(self.pilot_url),
                                           saga.advert.Create |
                                           saga.advert.CreateParents |
                                           saga.advert.ReadWrite)
    # application level state since globus adaptor does not support state detail
    self.pilot_dir.set_attribute("state", str(saga.job.Unknown))
    self.pilot_dir.set_attribute("stopped", "false")
    logging.debug("set pilot state to: " + self.pilot_dir.get_attribute("state"))
    self.number_nodes = int(number_nodes)

    # create job description
    jd = saga.job.description()
    jd.number_of_processes = str(number_nodes)
    jd.processes_per_host = str(processes_per_node)
    jd.spmd_variation = "single"
    jd.arguments = [bigjob_agent_executable, self.database_host, self.pilot_url]
    jd.executable = "/bin/bash"
    #jd.executable = bigjob_agent_executable
    if queue != None:
        jd.queue = queue
    if project != None:
        jd.job_project = [project]
    if walltime != None:
        jd.wall_time_limit = str(walltime)

    # XXX Isn't the working directory about the remote site?
    if working_directory != None:
        if not os.path.isdir(working_directory) and lrms_saga_url.scheme == "fork":
            os.mkdir(working_directory)
        jd.working_directory = working_directory
    else:
        jd.working_directory = "$(HOME)"
    print "Working directory: " + jd.working_directory

    jd.output = "stdout-bigjob_agent-" + str(self.uuid) + ".txt"
    jd.error = "stderr-bigjob_agent-" + str(self.uuid) + ".txt"

    # Submit job
    js = None
    if userproxy != None and userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", userproxy)
        s.add_context(ctx)
        print "use proxy: " + userproxy
        js = saga.job.service(s, lrms_saga_url)
    else:
        print "use standard proxy"
        js = saga.job.service(lrms_saga_url)

    self.job = js.create_job(jd)
    print "Submit pilot job to: " + str(lrms_saga_url)
    self.job.run()
def start_pilot_job(self,
                    lrms_url=None,                # in future version one can specify a URL for a cloud (ec2:// vs. nimbus:// vs. eu://)
                    bigjob_agent_executable=None, # n/a
                    number_nodes=1,               # number of images requested
                    queue=None,                   # n/a
                    project=None,                 # n/a
                    working_directory=None,       # working directory
                    userproxy=None,               # optional: path to user credential (X509 cert or proxy cert)
                    walltime=None,                # optional: walltime
                    cloud_type=None,              # optional: EC2 or Nimbus
                    image_name=None):
    """ The start_pilot_job method will initialize the requested number of images """
    print "Working directory: " + working_directory
    if not os.path.isdir(working_directory):
        os.mkdir(working_directory)
    self.working_directory = working_directory
    self.walltime = walltime
    self.nodes = []
    self.free_nodes = []
    self.busynodes = []
    self.subjobs = {}
    self.job_service_cache = {}
    # EC2 environment
    self.env_dict = {}
    self.cloud_type = cloud_type
    self.image_name = image_name
    self.number_requested_nodes = number_nodes
    self.init_thread = None

    # for locking
    self.resource_lock = threading.RLock()

    # spawn Cloud images
    start = time.time()
    host = socket.gethostname()
    if cloud_type == "EC2":
        self.pilot_url = "ec2://" + host
        # SSH Context
        self.ssh_context = saga.context("ssh")
        self.ssh_context.set_attribute("UserKey", EC2_SSH_PRIVATE_KEY_FILE)
        self.session = saga.session()
        self.session.add_context(self.ssh_context)
        self.instance_type = EC2_INSTANCE_TYPE
        self.key_name = EC2_KEYNAME
        # setup environment
        self.env_dict = self.read_ec2_environments(EC2_ENV_FILE)
        self.start_ec2_images_in_background(number_nodes)
    elif cloud_type == "EUCA":
        self.pilot_url = "euca://" + host
        self.ssh_context = saga.context("ssh")
        self.ssh_context.set_attribute("UserKey", EUCA_SSH_PRIVATE_KEY_FILE)
        self.session = saga.session()
        self.session.add_context(self.ssh_context)
        self.instance_type = EUCA_INSTANCE_TYPE
        self.key_name = EUCA_KEYNAME
        # setup environment
        self.env_dict = self.read_ec2_environments(EUCA_ENV_FILE)
        self.start_ec2_images_in_background(number_nodes)
    elif cloud_type == "NIMBUS":
        self.pilot_url = "nimbus://" + host
        self.ssh_context = saga.context("ssh")
        self.ssh_context.set_attribute("UserKey", NIMBUS_SSH_PRIVATE_KEY_FILE)
        self.session = saga.session()  # use default
        self.session.add_context(self.ssh_context)
        self.start_nimbus_images_in_background(number_nodes)
    else:
        raise UnsupportedCloudType("Cloud Type not supported")

    # for fast debugging
    #self.nodes = [{"hostname":"149.165.228.103", "vmid":"i-48F80882", "private_hostname":"192.168.8.2", "cpu_count":1},
    #              {"hostname":"149.165.228.108", "vmid":"i-40820878", "private_hostname":"192.168.8.3", "cpu_count":1},
    #              ]
    #self.nodes = [{"hostname":"tp-x001.ci.uchicago.edu", "vmid":"vm-049", "cpu_count":2},
    #              {"hostname":"tp-x002.ci.uchicago.edu", "vmid":"vm-050", "cpu_count":2},
    #              {"hostname":"tp-x004.ci.uchicago.edu", "vmid":"vm-050", "cpu_count":2},
    #              {"hostname":"tp-x005.ci.uchicago.edu", "vmid":"vm-050", "cpu_count":2}]

    print "Started " + str(len(self.nodes)) + " nodes in " + str(time.time() - start)

    # check whether all requested nodes have been started
    #if len(self.nodes) < number_nodes:
    #    print "Not sufficient resources available: " + str(self.pilot_url)
    #    raise NoResourceAvailable("Not sufficient resources available.")

    self.launcher_thread = threading.Thread(target=self.start_background_thread)
    self.launcher_thread.start()
    print "Finished launching of pilot jobs"
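# A usage sketch for the cloud variant above, not part of the original code: the module
# and class name "bigjob_cloud" and its no-argument constructor are assumptions, and all
# values below are placeholders. Credentials, key files, and instance types are expected
# to come from the EC2_*/EUCA_*/NIMBUS_* constants referenced in the method itself.
from bigjob_cloud import bigjob_cloud         # assumed module/class name

bj = bigjob_cloud()                           # hypothetical class providing start_pilot_job()
bj.start_pilot_job(number_nodes=2,            # number of VM instances to boot
                   working_directory="/tmp/bigjob-cloud",
                   walltime=60,
                   cloud_type="EC2",          # or "EUCA" / "NIMBUS"
                   image_name="ami-00000000") # placeholder image id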
def start_pilot_job(self,
                    lrms_url,
                    bigjob_agent_executable=None,
                    number_nodes=1,
                    queue=None,
                    project=None,
                    working_directory=None,
                    userproxy=None,
                    walltime=None,
                    processes_per_node=1,
                    filetransfers=None):
    """ Start a batch job (using the SAGA Job API) at the resource manager.
        Currently, the following resource managers are supported:
        fork://localhost/                   (Default Job Adaptor)
        gram://qb1.loni.org/jobmanager-pbs  (Globus Adaptor)
        pbspro://localhost                  (PBSPro Adaptor)
    """
    if self.job != None:
        raise BigJobError("One BigJob already active. Please stop BigJob first.")
        return

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = saga.url(lrms_url)
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    pilot_url_dict[self.pilot_url] = self
    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    logger.debug("set pilot state to: " + str(Unknown))
    ##############################################################################
    self.number_nodes = int(number_nodes)

    # create job description
    jd = saga.job.description()

    # XXX Isn't the working directory about the remote site?
    # Yes, it is: the directory is only created locally when the fork adaptor is used
    if working_directory != None:
        if not os.path.isdir(working_directory) and lrms_saga_url.scheme == "fork":
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working directory is set, assume use of the home directory;
        # this will fail if the home directory is not the same on the remote machine,
        # but it is just a guess to avoid failing outright
        self.working_directory = os.path.expanduser("~")

    # Stage BJ input files
    # build target url
    # this will also create the remote directory for the BJ
    if lrms_saga_url.username != None and lrms_saga_url.username != "":
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir()
    else:
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()

    # determine working directory of bigjob
    # if a remote sandbox can be created via ssh => create its own dir for each bj job id
    # otherwise use specified working directory
    if self.__create_remote_directory(bigjob_working_directory_url) == True:
        self.working_directory = self.__get_bigjob_working_dir()
        self.__stage_files(filetransfers, bigjob_working_directory_url)
    else:
        logger.warn("For file staging, SSH (incl. password-less authentication) is required.")

    logger.debug("BJ Working Directory: %s", self.working_directory)
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    if lrms_saga_url.scheme == "condorg":
        jd.arguments = [self.coordination.get_address(), self.pilot_url]
        agent_exe = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "bootstrap", "bigjob-condor-bootstrap.py"))
        logger.debug("agent_exe: %s", agent_exe)
        jd.executable = agent_exe
    else:
        bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" or lrms_saga_url.scheme == "torque":
            bootstrap_script = self.escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
        ############ submit pbs script which launches bigjob agent using ssh adaptors ##########
        elif lrms_saga_url.scheme == "pbs-ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
            # PBS specific BJ plugin
            pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltime, number_nodes,
                             processes_per_node, userproxy, self.working_directory, self.working_directory)
            self.job = pbssshj
            self.job.run()
            return
        ############ submit sge script which launches bigjob agent using ssh adaptors ##########
        elif lrms_saga_url.scheme == "sge-ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
            # SGE specific BJ plugin
            sgesshj = sgessh(bootstrap_script, lrms_saga_url, walltime, number_nodes,
                             processes_per_node, userproxy, project, queue,
                             self.working_directory, self.working_directory)
            self.job = sgesshj
            self.job.run()
            return
        elif is_bliss:
            bootstrap_script = self.escape_bliss(bootstrap_script)
        #logger.debug(bootstrap_script)

        if is_bliss == False:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
        else:
            jd.TotalCPUCount = str(int(number_nodes) * int(processes_per_node))

        jd.spmd_variation = "single"
        #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"

    if queue != None:
        jd.queue = queue
    if project != None:
        jd.job_project = [project]
    if walltime != None:
        jd.wall_time_limit = str(walltime)

    jd.working_directory = self.working_directory
    logger.debug("Working directory: " + jd.working_directory)
    jd.output = os.path.join(self.working_directory, "stdout-bigjob_agent.txt")
    jd.error = os.path.join(self.working_directory, "stderr-bigjob_agent.txt")

    # Submit job
    js = None
    if userproxy != None and userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", userproxy)
        s.add_context(ctx)
        logger.debug("use proxy: " + userproxy)
        js = saga.job.service(s, lrms_saga_url)
    else:
        logger.debug("use standard proxy")
        js = saga.job.service(lrms_saga_url)

    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
    return self.pilot_url
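# A usage sketch for the sge-ssh path handled above, not part of the original code: the
# import path and coordination URL are assumptions, and the host, project, queue, and
# directory are placeholders. start_pilot_job() returns the pilot URL that identifies
# the pilot on the coordination backend; note that the sgessh plugin above is passed
# both project and queue, so SGE submissions typically need both set.
from bigjob import bigjob                      # assumed import path

bj = bigjob("redis://localhost:6379")          # hypothetical coordination backend URL
pilot_url = bj.start_pilot_job(lrms_url="sge-ssh://user@sge-cluster.example.org",
                               number_nodes=32,
                               processes_per_node=16,
                               project="TG-XXXXXX",   # placeholder allocation/project
                               queue="normal",        # placeholder queue name
                               walltime=120,          # minutes
                               working_directory="/tmp/bigjob")
print "Pilot started: " + str(pilot_url)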