def start_pilot_job(self, lrms_url, bigjob_agent_executable, number_nodes, queue, project, working_directory, userproxy, walltime):
    """ start advert_launcher on specified host

    Registers a pilot entry in the advert service, builds a SAGA job
    description that runs the bigjob agent under /bin/bash, and submits
    it to the job service at lrms_url.  Returns the saga.job object.
    """
    # A non-empty userproxy overrides the default X.509 proxy location.
    if userproxy != None and userproxy != '':
        os.environ["X509_USER_PROXY"]=userproxy
        print "use proxy: " + userproxy
    else:
        print "use standard proxy"
    #register advert entry
    lrms_saga_url = saga.url(lrms_url)
    # pilot entry is keyed by the target host under the application dir
    self.pilot_url = self.app_url.get_string() + "/" + lrms_saga_url.host
    print "create advert entry: " + self.pilot_url
    self.pilot_dir = saga.advert.directory(saga.url(self.pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
    # application level state since globus adaptor does not support state detail
    self.pilot_dir.set_attribute("state", str(saga.job.Unknown))
    logging.debug("set pilot state to: " + self.pilot_dir.get_attribute("state"))
    self.number_nodes=int(number_nodes)
    # create job description
    jd = saga.job.description()
    jd.number_of_processes = str(number_nodes)
    jd.spmd_variation = "single"
    # agent script receives the advert db host and its own pilot URL
    jd.arguments = [bigjob_agent_executable, self.database_host, self.pilot_url]
    jd.executable = "/bin/bash"
    #jd.executable = bigjob_agent_executable
    if queue != None:
        jd.queue = queue
    if project !=None:
        jd.job_project = [project]
    if walltime!=None:
        jd.wall_time_limit=str(walltime)
    if working_directory != None:
        jd.working_directory = working_directory
    else:
        jd.working_directory = "$(HOME)"
    #    if not os.path.isdir(jd.working_directory):
    #        os.mkdir(jd.working_directory)
    print "Working directory: " + jd.working_directory
    # per-pilot unique stdout/stderr filenames so concurrent pilots don't collide
    jd.output = "stdout-bigjob_agent-" + str(self.uuid) + ".txt"
    jd.error = "stderr-bigjob_agent-" + str(self.uuid) + ".txt"
    # Submit jbo
    js = saga.job.service(lrms_saga_url)
    self.job = js.create_job(jd)
    print "Submit pilot job to: " + str(lrms_saga_url)
    self.job.run()
    return self.job
def copy_files_to_location(self, new_pilot_data_url): new_file_locations = [] for i in self.file_registry: logging.debug(i) try: dir = saga.filesystem.directory(saga.url(new_pilot_data_url), saga.filesystem.Create | saga.filesystem.ReadWrite) except: traceback.print_exc(file=sys.stdout) print "Could not create: " + str(new_pilot_data_url) # copy files try: source_url = saga.url(str(i)) filename = os.path.split(source_url.path)[1] dest_url = saga.url(os.path.join(new_pilot_data_url, filename)) print "copy file: " + str(i) + " to " + str(dest_url) sagafile = saga.filesystem.file(source_url) sagafile.copy(dest_url, saga.filesystem.Overwrite) new_file_locations.append(dest_url) except saga.exception, e: traceback.print_exc(file=sys.stdout) error_msg = "file %s failed to be copied to"%(i) logging.error(error_msg)
def dequeue_job(self, pilot_url):
    """ deque to new job of a certain pilot

    Atomically takes the first entry out of the pilot's /new/ queue and
    returns its job URL; returns None (after a 1 s back-off) when the
    queue is empty.  The try/finally guarantees the resource lock is
    released even if an advert call raises (the original leaked it).
    """
    self.resource_lock.acquire()
    try:
        new_job_dir_url = self.get_url(pilot_url + "/new/")
        new_job_dir = saga.advert.directory(saga.url(new_job_dir_url),
                                            saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        new_jobs = new_job_dir.list()
        logger.debug("Pilot Job base dir: " + new_job_dir_url
                     + " #new jobs: " + str(len(new_jobs))
                     + " jobs: " + str(new_jobs))
        if len(new_jobs) >= 1:
            job_entry = new_jobs[0]
            job_dir_url = self.get_url(pilot_url + "/new/" + "/" + job_entry.get_string())
            logger.debug("Open job at " + str(job_dir_url))
            job_dir = saga.advert.directory(saga.url(job_dir_url),
                                            saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
            job_url = job_dir.get_attribute("joburl")
            # remove old job entry so no other consumer can dequeue it
            job_dir.remove(self.__remove_dbtype(job_dir_url), saga.name_space.Recursive)
            logger.debug("Dequeued new job: " + str(job_url))
            return self.__remove_dbtype(job_url)
    finally:
        self.resource_lock.release()
    # queue was empty: back off briefly without holding the lock
    time.sleep(1)
    return
def file_stage_in_with_saga(self, input_file_list_with_path, remote_url_prefix, remote_dir):
    """Stage local input files into remote_dir under remote_url_prefix.

    Missing source files are reported and skipped; copy failures are
    reported per file and do not abort the loop.
    """
    cwd = os.getcwd()
    for ifile in input_file_list_with_path:
        # destination url
        dest_url = saga.url(remote_url_prefix + "/")
        ifile_basename = os.path.basename(ifile)
        # best-effort creation of the remote target directory
        try:
            dest_dir = saga.url(remote_url_prefix)
            dest_dir.path = remote_dir
            saga.file.directory(dest_dir, saga.file.Create | saga.file.ReadWrite)
        except:
            print "Could not create: " + dest_dir.get_string()
        dest_url.path = os.path.join(remote_dir, ifile_basename)
        # source url -- ifile is resolved relative to the current working dir
        source_url = saga.url('file://' + os.path.join(cwd, ifile))
        if not os.path.isfile(ifile):
            error_msg = "Input file %s does not exist in %s"%(ifile_basename, os.path.dirname(ifile))
            print(error_msg)
        else:
            try:
                print "stage file: " + source_url.get_string() + " to " + dest_url.get_string()
                sagafile = saga.filesystem.file(source_url)
                sagafile.copy(dest_url)
            except saga.exception, e:
                error_msg = "Input file %s failed to be staged in"%(ifile_basename)
                print(error_msg)
def dequeue_job(self, pilot_url):
    """ deque to new job of a certain pilot

    Pops the first entry from the pilot's /new/ queue and returns its job
    URL, or None after a 1 s back-off when the queue is empty.  Lock
    release is now guarded by try/finally; the original leaked the
    resource lock if any advert call raised.
    """
    self.resource_lock.acquire()
    try:
        new_job_dir_url = self.get_url(pilot_url + "/new/")
        new_job_dir = saga.advert.directory(saga.url(new_job_dir_url),
                                            saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        new_jobs = new_job_dir.list()
        logger.debug("Pilot Job base dir: " + new_job_dir_url
                     + " #new jobs: " + str(len(new_jobs))
                     + " jobs: " + str(new_jobs))
        if len(new_jobs) >= 1:
            job_entry = new_jobs[0]
            job_dir_url = self.get_url(pilot_url + "/new/" + "/" + job_entry.get_string())
            logger.debug("Open job at " + str(job_dir_url))
            job_dir = saga.advert.directory(saga.url(job_dir_url),
                                            saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
            job_url = job_dir.get_attribute("joburl")
            # remove old job entry so it cannot be dequeued twice
            job_dir.remove(self.__remove_dbtype(job_dir_url), saga.name_space.Recursive)
            logger.debug("Dequeued new job: " + str(job_url))
            return self.__remove_dbtype(job_url)
    finally:
        self.resource_lock.release()
    # nothing queued: sleep outside the critical section
    time.sleep(1)
    return
def file_stage_in_with_saga(self, input_file_list_with_path, remote_machine_ip, remote_dir):
    """Stage local input files into remote_dir on remote_machine_ip.

    Local targets ("localhost" in the ip) use file:// URLs; anything else
    goes over gridftp://.  Missing sources are logged and skipped.
    """
    cwd = os.getcwd()
    for ifile in input_file_list_with_path:
        # destination url
        if remote_machine_ip.find("localhost") >= 0:
            dest_url_str = "file://"
        else:
            dest_url_str = "gridftp://" + remote_machine_ip + "/"
        ifile_basename = os.path.basename(ifile)
        # best-effort remote directory creation
        try:
            dest_dir = dest_url_str + remote_dir
            saga.file.directory(saga.url(dest_dir), saga.file.Create | saga.file.ReadWrite)
        except:
            print "Could not create: " + dest_dir
        dest_url_str = dest_url_str + os.path.join(remote_dir, ifile_basename)
        # source url -- ifile is resolved relative to the current working dir
        source_url_str = "file://" + os.path.join(cwd, ifile)
        if not os.path.isfile(ifile):
            error_msg = "Input file %s does not exist in %s" % (ifile_basename, os.path.dirname(ifile))
            logging.error(error_msg)
        else:
            try:
                source_url = saga.url(source_url_str)
                dest_url = saga.url(dest_url_str)
                print "stage file: " + source_url_str + " to " + dest_url_str
                sagafile = saga.file.file(source_url)
                sagafile.copy(dest_url)
                logging.info("Now Input file %s is staged into %s" % (ifile_basename, dest_url_str))
            except saga.exception, e:
                error_msg = "Input file %s failed to be staged in" % (ifile_basename)
                logging.error(error_msg)
def remote_filecopy_with_saga(filename_with_local_path_from, machine_url_from, filename_with_local_path_to, machine_url_to): source_url = saga.url('file://' + machine_url_from + filename_with_local_path_from) dest_url = saga.url('gridftp://' + machine_url_to + filename_with_local_path_to) sagafile = saga.file.file(source_url) try: sagafile.copy(dest_url) print "\n(DEBUG) remote file copy from %s of %s to %s of %s is attempted"%(filename_with_local_path_from, machine_url_from, filename_with_local_path_to, machine_url_to) except saga.exception, e: print "\n(WARNING) remote file copy from %s of %s to %s of %s is failed"%(filename_with_local_path_from, machine_url_from, filename_with_local_path_to, machine_url_to) return "Failed"
def file_stage_out_with_saga(self, file_list, local_dir, remote_url_prefix, remote_dir): for ifile in file_list: try: source_url = saga.url(remote_url_prefix) source_url.path= os.path.join(remote_dir, ifile) dest_url = saga.url("file:///" + local_dir + "/" + ifile) #dest_url.path = ifile print "(DEBUG) Staging out output.txt file at %s to %s"%(source_url.get_string(), dest_url.get_string()) sagafile = saga.filesystem.file(source_url) sagafile.copy(dest_url) except saga.exception, e: error_msg = "File stage out failed: "+ source_url.get_string()
def copy_with_saga(i):
    """Distribute NPT.conf to agent directory i.

    Replicas 0..RPB-1 run on the local host (plain cp); replicas
    RPB..2*RPB-1 live on REMOTE1 and receive the file over gridftp.
    NOTE(review): indentation reconstructed -- the SAGA copy is assumed to
    belong to the remote branch only; confirm against the original layout.
    """
    if i<RPB:
        os.system("cp "+ WORK_DIR + "/NPT.conf " + WORK_DIR + "agent/" + str(i) + "/NPT.conf")
        # source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
        # dest_url = saga.url('file://' + WORK_DIR + 'agent/' + str(i) + '/')
    elif (i>=RPB and i<(2*RPB)):
        source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
        dest_url = saga.url('gridftp://' + REMOTE1 + WORK_DIR+'agent/'+str(i)+'/')
        sagafile = saga.filesystem.file(source_url)
        try:
            sagafile.copy(dest_url)
        except saga.exception, e:
            print "\n(ERROR) remote ###NPT.CONF####file copy from %s to %s failed"%(HOST, REMOTE1)
def chunk_size(self,chunksize):
    """Split every file in the input directory into byte chunks.

    Runs `split -b chunksize` on each input file, writing chunk files named
    "<fname>--NNNNN" into the tmp directory, and returns a list of
    single-element lists, one per chunk file.  (Removed the unused
    `group_chunks` dict and the `input` name that shadowed the builtin.)
    """
    self.__chunksize = chunksize
    chunk_list=[]
    input_path = saga.url(self.__input_dir).path
    temp = saga.url(self.__tmp_dir).path
    for fname in os.listdir(input_path):
        # NOTE(review): shell command assembled from directory/file names;
        # acceptable only because these paths are locally controlled.
        os.system("cd " + temp + "; split -d -a 5 -b " + str(chunksize) + " " + input_path + "/" + fname + " " + fname + "--" )
    for fname in os.listdir(temp):
        chunk_list.append([temp + "/" + fname])
    return chunk_list
def stage_ifiles(i):
    """Stage all files from REPLICA_DIR into local agent directory i.

    Only even replica ids are staged (odd ids return immediately).
    NOTE(review): indentation reconstructed -- the copy loop is assumed to
    sit inside the even-id branch; confirm against the original layout.
    """
    if not i%2:
        # directory may already exist from a previous run
        try:
            os.mkdir(WORK_DIR + 'agent/' + str(i))
        except OSError:
            pass
        for ifile in os.listdir(REPLICA_DIR):
            source_url = saga.url('file://' + REPLICA_DIR + ifile)
            dest_url = saga.url('file://' + WORK_DIR + 'agent/'+ str(i)+'/')
            sagafile = saga.filesystem.file(source_url)
            try:
                sagafile.copy(dest_url)
            except saga.exception, e:
                print str(e) + "\n(ERROR) local file ####STAGING### copy from %s to %s failed"%(REPLICA_DIR, HOST)
def stage_ifiles(i): if not i % 2: try: os.mkdir(WORK_DIR + "agent/" + str(i)) except OSError: pass for ifile in os.listdir(REPLICA_DIR): source_url = saga.url("file://" + REPLICA_DIR + ifile) dest_url = saga.url("file://" + WORK_DIR + "agent/" + str(i) + "/") sagafile = saga.filesystem.file(source_url) try: sagafile.copy(dest_url) except saga.exception, e: print str(e) + "\n(ERROR) local file ####STAGING### copy from %s to %s failed" % (REPLICA_DIR, HOST)
def file_stage_in_with_saga(input_file_list_with_path, remote_machine_ip, remote_dir, RE_info): userproxy = None try: userproxy = RE_info.userproxy[RE_info.remote_hosts.index( remote_machine_ip)] except: try: userproxy = RE_info.userproxy[RE_info.gridftp_hosts.index( remote_machine_ip)] except: pass if userproxy != None or userproxy == "": os.environ["X509_USER_PROXY"] = userproxy print "use proxy: " + userproxy else: print "use standard proxy" for ifile in input_file_list_with_path: if remote_machine_ip.find('localhost') >= 0: dest_url_str = 'file://' else: dest_url_str = 'gridftp://' + remote_machine_ip + "/" source_url_str = 'file://' print "stage file: " + ifile + " to " + dest_url_str ifile_basename = os.path.basename(ifile) if not os.path.isfile(ifile): error_msg = "Input file %s does not exist in %s" % ( ifile_basename, os.path.dirname(ifile)) logging.error(error_msg) else: try: source_url_str = source_url_str + ifile dest_url_str = dest_url_str + os.path.join( remote_dir, ifile_basename) source_url = saga.url(source_url_str) dest_url = saga.url(dest_url_str) print "stage file: " + source_url_str + " to " + dest_url_str sagafile = saga.file.file(source_url) sagafile.copy(dest_url) logging.info("Now Input file %s is staged into %s" % (ifile_basename, dest_url_str)) except saga.exception, e: error_msg = "Input file %s failed to be staged in" % ( ifile_basename) logging.error(error_msg)
def copy_with_saga(i):
    """Distribute NPT.conf to agent directory i, printing a start timestamp.

    Local replicas (i < RPB) get a plain cp; remote replicas go via gridftp
    to REMOTE1.  NOTE(review): `start` is captured but not used in the
    visible code; indentation of the SAGA copy reconstructed as belonging
    to the remote branch -- confirm against the original layout.
    """
    print "####################start time(npt.conf copy)" + time.asctime(time.localtime(time.time())) + "##################"
    start = time.time()
    if i<RPB:
        os.system("cp "+ WORK_DIR + "/NPT.conf " + WORK_DIR + "agent/" + str(i) + "/NPT.conf")
        # source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
        # dest_url = saga.url('file://' + WORK_DIR + 'agent/' + str(i) + '/')
    elif (i>=RPB and i<(2*RPB)):
        source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
        dest_url = saga.url('gridftp://' + REMOTE1 + WORK_DIR+'agent/'+str(i)+'/')
        sagafile = saga.filesystem.file(source_url)
        try:
            sagafile.copy(dest_url)
        except saga.exception, e:
            print "\n(ERROR) remote ###NPT.CONF####file copy from %s to %s failed"%(HOST, REMOTE1)
def test_file_move(self): """ Test for the saga.filesystem.move() API call http://saga.cct.lsu.edu/python/apidoc/saga.filesystem._file.html#file-move """ try: source_url = saga.url("file:///"+self.filename_a) target_url = saga.url("file:///"+self.filename_d) my_file = saga.filesystem.file(source_url) my_file.move(target_url) my_file.close() except saga.exception, e: self.fail(e.get_full_message())
def copy_with_saga(i):
    """Distribute NPT.conf (taken from the replica_id agent dir) to agent dir i.

    Remote replicas are sent via the Ranger gridftp endpoint into WORK_DIR1.
    NOTE(review): the error message still names HOST/REMOTE1 although the
    destination is hard-coded to gridftp.ranger.tacc.teragrid.org -- likely
    stale; confirm.  Indentation of the SAGA copy reconstructed as belonging
    to the remote branch.
    """
    # print "####################start time(npt.conf copy)" + time.asctime(time.localtime(time.time())) + "##################"
    start = time.time()
    if i<RPB:
        # uses module-level replica_id as the source agent directory
        os.system("cp "+ WORK_DIR +"agent/"+ str(replica_id)+ "/NPT.conf " + WORK_DIR + "agent/" + str(i) + "/NPT.conf")
        # source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
        # dest_url = saga.url('file://' + WORK_DIR + 'agent/' + str(i) + '/')
    elif (i>=RPB and i<(2*RPB)):
        source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
        dest_url = saga.url('gridftp://' +"gridftp.ranger.tacc.teragrid.org:2811" + WORK_DIR1+'agent/'+str(i)+'/')
        sagafile = saga.filesystem.file(source_url)
        try:
            sagafile.copy(dest_url)
        except saga.exception, e:
            print "\n(ERROR) remote ###NPT.CONF####file copy from %s to %s failed"%(HOST, REMOTE1)
def __parse_url(self, url):
    """Parse url into (scheme, username, password, host, port, query).

    Tries the SAGA URL parser first and falls back to Python's urlparse
    when SAGA rejects the URL.  BUG FIX: the fallback log call passed a
    "%s" format string with no argument; the URL is now supplied lazily.
    """
    try:
        surl = saga.url(url)
        host = surl.host
        port = surl.port
        username = surl.username
        password = surl.password
        query = surl.query
        scheme = "%s://"%surl.scheme
    except:
        """ Fallback URL parser based on Python urlparse library """
        logger.error("URL %s could not be parsed", url)
        traceback.print_exc(file=sys.stderr)
        result = urlparse.urlparse(url)
        host = result.hostname
        port = result.port
        username = result.username
        password = result.password
        # urlparse drops the query into .query, but keep the original
        # manual extraction to preserve behaviour for odd URLs
        if url.find("?")>0:
            query = url[url.find("?")+1:]
        else:
            query = None
        scheme = "%s://"%result.scheme
    return scheme, username, password, host, port, query
def delete_du(cls, du_url):
    """Recursively remove the advert subtree that backs the given data unit."""
    resolved = cls.__get_url(du_url)
    flags = saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite
    directory = saga.advert.directory(saga.url(resolved), flags)
    directory.remove(resolved, saga.name_space.Recursive)
def setup_charmpp_nodefile(self, allocated_nodes): """ Setup charm++ nodefile to use for executing NAMD HACK!! Method violates layering principle File $HOME/machinefile in charm++ nodefileformat is written to first node in list """ # Nodelist format: # # host tp-x001 ++cpus 2 ++shell ssh # host tp-x002 ++cpus 2 ++shell ssh nodefile_string="" for i in allocated_nodes: if i.has_key("private_hostname"): nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" else: nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n" # copy nodefile to rank 0 node jd = saga.job.description() jd.executable = "echo" jd.number_of_processes = "1" jd.spmd_variation = "single" # ssh [email protected] "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys" jd.arguments = ["\""+nodefile_string+"\"", ">", "machinefile"] jd.output = "stdout.txt" jd.error = "stderr.txt" job_service_url = saga.url("ssh://root@"+allocated_nodes[0]["hostname"]) job_service = saga.job.service(self.session, job_service_url) job = job_service.create_job(jd) job.run() job.wait()
def set_job_state(self, job_url, new_state):
    """Persist new_state into the job's advert entry (thread-safe).

    The lock release is wrapped in try/finally; the original leaked the
    resource lock if the advert backend raised.
    """
    self.resource_lock.acquire()
    try:
        job_url = self.get_url(job_url)
        logger.debug("Set state of job: " + str(job_url) + " to: " + str(new_state))
        job_dir = saga.advert.directory(saga.url(job_url),
                                        saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        job_dir.set_attribute("state", str(new_state))
    finally:
        self.resource_lock.release()
def submit_job(self, glidin_url, jd): """ submit job via advert service to NAMD-Launcher dest_url - url reference to advert job or host on which the advert job is going to run""" print "submit job: " + str(glidin_url) if self.job_url==None: self.job_url=self.get_job_url(glidin_url) for i in range(0,3): try: print "create job entry " self.job_dir = saga.advert.directory(saga.url(self.job_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) print "initialized advert directory for job: " + self.job_url # put job description attributes to advert attributes = jd.list_attributes() for i in attributes: if jd.attribute_is_vector(i): self.job_dir.set_vector_attribute(i, jd.get_vector_attribute(i)) else: logging.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i)) self.job_dir.set_attribute(i, jd.get_attribute(i)) self.job_dir.set_attribute("state", str(saga.job.Unknown)) # return self object for get_state() query #logging.debug("Submission time (time to create advert entries): " + str(time.time()-start) + " s") return self except: traceback.print_exc(file=sys.stdout)
def __init__(self, database_host): self.database_host = database_host print "init advert service session at host: " + database_host self.uuid = uuid.uuid1() self.app_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(self.uuid) + "/") self.app_dir = saga.advert.directory(self.app_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) print "created advert directory for application: " + self.app_url.get_string()
def get_jobs_of_pilot(self, pilot_url):
    """ returns array of job_url that are associated with a pilot """
    # FIX: the docstring above was previously placed *after* the first
    # statement, so it was a no-op expression rather than a docstring.
    pilot_url = self.get_url(pilot_url + "/jobs")
    pilot_dir = saga.advert.directory(saga.url(pilot_url),
                                      saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
    jobs = pilot_dir.list()
    # strip the dbtype suffix so callers get clean job URLs
    j = [self.__remove_dbtype(pilot_url) + "/" + i.get_string() for i in jobs]
    return j
def submit_job(self, pilot_url, jd, rid): """ submit job via advert service to NAMD-Launcher dest_url - url reference to advert job or host on which the advert job is going to run""" print "submit job: " + str(pilot_url) if self.job_url == None: self.job_url = self.get_job_url(pilot_url) for i in range(0, 3): try: print "create job entry " self.job_dir = saga.advert.directory( saga.url(self.job_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) print "initialized advert directory for job: " + self.job_url # put job description attributes to advert attributes = jd.list_attributes() for i in attributes: if jd.attribute_is_vector(i): self.job_dir.set_vector_attribute( i, jd.get_vector_attribute(i)) else: logging.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i)) self.job_dir.set_attribute(i, jd.get_attribute(i)) self.job_dir.set_attribute("state", str(saga.job.Unknown)) self.job_dir.set_attribute("energy", "unknown energy") self.job_dir.set_attribute("temp", "unknown temp") self.job_dir.set_attribute("replica_id", rid) # return self object for get_state() query #logging.debug("Submission time (time to create advert entries): " + str(time.time()-start) + " s") return self except: traceback.print_exc(file=sys.stdout)
def setup_image(self, hostname):
    """ ensure ssh keys are properly setup (works for Nimbus, Eucalyptus and EC2

    Appends the VM's own public key to authorized_keys by running cat
    remotely as root.  Retries up to 3 times with a 30 s pause.
    NOTE(review): indentation reconstructed -- time.sleep(30) is assumed to
    run at loop level between attempts; confirm against the original layout.
    """
    jd = saga.job.description()
    jd.executable = "/usr/bin/cat"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    # ssh [email protected] "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys"
    jd.arguments = ["~/.ssh/id_rsa.pub", ">>", "~/.ssh/authorized_keys" ]
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"
    for i in range (0, 3):
        try:
            job_service_url = saga.url("ssh://root@" + hostname)
            job_service = saga.job.service(self.session, job_service_url)
            job = job_service.create_job(jd)
            job.run()
            # Cache job service object for later usage
            self.job_service_cache[job_service_url] =job_service
            # wait for completion of job
            job.wait()
            return
        except:
            # VM may not accept ssh connections yet -- retry after a pause
            pass
        time.sleep(30)
def set_pilot_state(self, pilot_url, new_state, stopped=False):
    """Write the pilot's state and stopped flag into its advert directory."""
    resolved = self.get_url(pilot_url)
    logger.debug("create advert entry: " + resolved)
    flags = saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite
    entry = saga.advert.directory(saga.url(resolved), flags)
    logger.debug("update state of pilot job to: " + str(new_state) + " Stopped: " + str(stopped))
    entry.set_attribute("state", str(new_state))
    entry.set_attribute("stopped", str(stopped))
def delete_pds(cls, pds_url):
    """Recursively delete the advert directory of a pilot data service."""
    target = cls.__get_url(pds_url)
    # open (creating if absent) and remove the whole subtree in one chain
    saga.advert.directory(
        saga.url(target),
        saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite
    ).remove(target, saga.name_space.Recursive)
def m_dirs(): global i_fsize , c_size , s_out, e_time, nbr_maps, nbr_reduces, app_url, app_dir, qtimes,b_uuid global database_host for m,q in machine.iteritems(): pilot_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(b_uuid) + "/" + m + "/" ) pilot_dir = saga.advert.directory(pilot_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) pilot_dir.set_attribute("state", "Unknown") print " created pilot directory " + str(app_url) + " with state Unknown" new_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(b_uuid) + "/" + m + "/" + "new") new_dir = saga.advert.directory(new_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite) print " created new directory " + str(new_url) for i in range(int(q)): pilot_dir.set_attribute("state", "Unknown") pilot_dir.set_attribute("state", "Running")
def allocate_nodes(self, number_of_nodes):
    """ allocate nodes - remove nodes from free nodes list
        return SAGA-URL to resource ssh://tx.domain.org

    Moves up to number_of_nodes entries from free_nodes to busynodes and
    returns (ssh URL of the first allocated node, list of node dicts);
    returns ("", []) when not enough free nodes are available.
    NOTE(review): allocation counts whole nodes -- cpu_count is only
    printed, not subtracted from the request; confirm that is intended.
    """
    allocated_nodes = []
    self.resource_lock.acquire()
    if (len(self.free_nodes)>=number_of_nodes):
        # iterate over a copy since entries are removed while iterating
        for i in self.free_nodes[:]:
            number = i["cpu_count"]
            print "Pilot: " + self.pilot_url + " Allocate: " + i["hostname"] + " number cores: " + str(number)
            if(number_of_nodes > 0):
                allocated_nodes.append(i)
                self.free_nodes.remove(i)
                self.busynodes.append(i)
                number_of_nodes = number_of_nodes - 1
            else:
                break
        self.resource_lock.release()
        # write the charm++ machinefile for the freshly allocated node set
        self.setup_charmpp_nodefile(allocated_nodes)
        return saga.url("ssh://root@" + allocated_nodes[0]["hostname"]), allocated_nodes
    else:
        print "BigJob: " + str(self.pilot_url) + ": Not sufficient resources for job."
        self.resource_lock.release()
        return "", []
def delete_job(self): print "delete job and close dirs: " + self.job_url try: self.job_dir.change_dir("..") self.job_dir.remove(saga.url(self.job_url), saga.name_space.Recursive) self.job_dir.close() except: pass
def from_advert(ps_url):
    """Reconstruct a pilot_store object from its advert directory at ps_url."""
    logging.debug("Open pilot store at: " + ps_url.get_string())
    store_dir = saga.advert.directory(ps_url,
                                      saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
    ps = pilot_store()
    ps.base_dir = saga.url(store_dir.get_attribute("base_dir"))
    ps.name = store_dir.get_attribute("name")
    ps.uuid = store_dir.get_attribute("uuid")
    ps.weight = store_dir.get_attribute("weight")
    ps.pd_url = saga.url(store_dir.get_attribute("pd_url"))
    ps.number_of_chunks = int(store_dir.get_attribute("number_of_chunks"))
    # file_registry is optional -- older entries may not carry it
    if store_dir.attribute_exists("file_registry") == True:
        ps.file_registry = [saga.url(x) for x in store_dir.get_vector_attribute("file_registry")]
    else:
        ps.file_registry = []
    return ps
def cancel(self): print "delete job and close dirs: " + self.job_url try: self.job_dir.change_dir("..") self.job_dir.remove(saga.url(self.job_url), saga.name_space.Recursive) self.job_dir.close() except: pass
def get_job(self, job_url):
    """Load and return the JSON job-description dict stored under job_url."""
    desc_url = self.get_url(job_url + "/job-description")
    logger.debug("Get job description from: %s" % (desc_url))
    entry = saga.advert.entry(saga.url(desc_url), saga.advert.Read)
    return json.loads(entry.retrieve_string())
def __init__(self, database_host):
    """Initialise the advert session: create the per-application directory
    on database_host and reset pilot bookkeeping state."""
    self.database_host = database_host
    self.uuid = get_uuid()
    base = "advert://" + database_host + "/" + APPLICATION_NAME + "-" + str(self.uuid) + "/"
    self.app_url = saga.url(base)
    flags = saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite
    self.app_dir = saga.advert.directory(self.app_url, flags)
    # pilot not yet started
    self.state = saga.job.Unknown
    self.pilot_url = ""
def __store_entry(cls, entry_url, content):
    """Serialise content as JSON into the advert entry at entry_url."""
    target = cls.__get_url(entry_url)
    # Create|CreateParents builds the directory tree on demand
    flags = saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite
    saga.advert.entry(saga.url(target), flags).store_string(json.dumps(content))
def __store_entry(cls, entry_url, content):
    """Write content (JSON-encoded) into the advert entry at entry_url."""
    entry_url = cls.__get_url(entry_url)
    payload = json.dumps(content)
    # missing parent directories are created implicitly (CreateParents)
    handle = saga.advert.entry(saga.url(entry_url),
                               saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
    handle.store_string(payload)
def __init__(self, args): self.database_host = args[1] # objects to store running jobs and processes self.jobs = [] self.processes = {} self.freenodes = [] self.busynodes = [] self.restarted = {} # read config file conf_file = os.path.dirname(args[0]) + "/" + CONFIG_FILE config = ConfigParser.ConfigParser() print ("read configfile: " + conf_file) config.read(conf_file) default_dict = config.defaults() self.CPR = default_dict["cpr"] self.SHELL=default_dict["shell"] self.MPIRUN=default_dict["mpirun"] print "cpr: " + self.CPR + " mpi: " + self.MPIRUN + " shell: " + self.SHELL # init cpr monitoring self.init_cpr() # init rms (SGE/PBS) self.init_rms() self.failed_polls = 0 # open advert service base url hostname = socket.gethostname() self.base_url = args[2] print "Open advert: " + self.base_url try: self.base_dir = saga.advert.directory(saga.url(self.base_url), saga.advert.Create | saga.advert.ReadWrite) self.new_job_dir = saga.advert.directory(saga.url(self.base_url+"/new/"), saga.advert.Create| saga.advert.CreateParents | saga.advert.ReadWrite) except: print "No advert entry found at specified url: " + self.base_url traceback.print_exc(file=sys.stderr) # update state of glidin job to running self.update_glidin_state() # start background thread for polling new jobs and monitoring current jobs self.resource_lock=threading.Lock() self.launcher_thread=threading.Thread(target=self.start_background_thread()) self.launcher_thread.start()
def get_job(self, job_url):
    """Fetch and decode the JSON job description stored for job_url."""
    description_url = self.get_url(job_url + "/job-description")
    logger.debug("Get job description from: %s" % (description_url))
    advert_entry = saga.advert.entry(saga.url(description_url), saga.advert.Read)
    raw = advert_entry.retrieve_string()
    job_dict = json.loads(raw)
    return job_dict
def __init__(self, server=ADVERT_SERVER, server_port=ADVERT_SERVER_PORT, server_connect_url=None, username=None, password=None, dbtype=None, url_prefix=None): ''' Constructor ''' #pdb.set_trace() if url_prefix == None: url_prefix = ADVERT_URL_SCHEME if username != None and username != "": url_prefix = url_prefix + username if password != None: url_prefix = url_prefix + ":" + password url_prefix = url_prefix + "@" if server_connect_url != None: self.address = server_connect_url elif server_port != None: self.address = url_prefix + "%s:%i" % (server, server_port) elif server != None: self.address = url_prefix + "%s" % (server) self.username = "" self.password = "" self.dbtype = "" surl = saga.url(self.address) if server_connect_url == None: # Manager if username != None: surl.username = username self.username = username if password != None: surl.password = password self.password = password if dbtype != None: #surl.query = dbtype self.dbtype = dbtype else: # Agent if surl.query != None: self.dbtype = surl.query surl.query = "" self.address = str(surl) self.pilot_url = self.address logger.debug("Server: " + str(server) + " Port " + str(server_port) + " Url prefix: " + str(url_prefix) + " Address: " + str(self.get_address()) + " server_connect_url: " + str(server_connect_url)) logger.debug("Initialized Coordination to: %s (DB: %s)" % (self.address, self.dbtype)) self.resource_lock = threading.RLock()
def map_job_submit(self): ########################################################################################## print " >>> Starting Mapping ..................... \n" jobs = [] job_start_times = {} job_states = {} for u in self.__chunk_list: k = u.replace('//', '/').split('/') uname = (os.path.split(u))[1] temp_abs_path = "/" + "/".join(k[2:len(k) - 1]) + "/" + uname print " >>> chunk path/name to be submitted to map subjob " + temp_abs_path + " >>> " + uname # create job description try: jd = saga.job.description() jd.executable = self.__mapper jd.number_of_processes = self.workers jd.spmd_variation = "single" jd.arguments = [temp_abs_path, str(self.__nbr_reduces)] jd.working_directory = saga.url(self.__tmp_dir).path jd.output = self.workingdirectory + "/stdout-" + uname + ".txt" jd.error = self.workingdirectory + "/stderr-" + uname + ".txt" js = saga.job.service(saga.url(self.resource_url)) job = js.create_job(jd) print "Submited sub-job " + self.resource_url + "." job.run() jobs.append(job) job_start_times[job] = time.time() job_states[job] = job.get_state() except: #traceback.print_exc(file=sys.stdout) print " Map Job failed. Cancelling framework......" sys.exit(0) print "************************ All Jobs submitted ************************" print " No of map subjobs created - " + str(len(jobs)) # Wait for task completion of map tasks - synchronization ############################################################################################ # Wait for task completion of map tasks - synchronization wait_for_all_jobs(jobs, job_start_times, job_states, 5)
def get_pilot_state(self, pilot_url):
    """Return {"state": ..., "stopped": bool} read from the pilot's advert entry."""
    url = self.get_url(pilot_url)
    entry = saga.advert.directory(saga.url(url), saga.advert.Read)
    state = entry.get_attribute("state")
    stopped_flag = entry.get_attribute("stopped")
    # "stopped" is persisted as a string; anything but false/False means stopped
    is_stopped = stopped_flag not in ("false", "False")
    return {"state": state, "stopped": is_stopped}
def set_pilot_state(self, pilot_url, new_state, stopped=False):
    """Persist the pilot's state and stopped flag to its advert directory."""
    pilot_url = self.get_url(pilot_url)
    logger.debug("create advert entry: " + pilot_url)
    advert_flags = saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite
    pilot_entry = saga.advert.directory(saga.url(pilot_url), advert_flags)
    logger.debug("update state of pilot job to: " + str(new_state) + " Stopped: " + str(stopped))
    # both values are stored as strings in the advert backend
    for key, value in (("state", str(new_state)), ("stopped", str(stopped))):
        pilot_entry.set_attribute(key, value)
def get_job_url(self, pilot_url):
    """Derive (and cache) the advert URL under which this job is stored.

    Non-advert pilot URLs are mapped to advert://<db-host>/<app>/<host>,
    falling back to the local hostname when no host can be extracted.
    """
    self.saga_pilot_url = saga.url(pilot_url)
    if self.saga_pilot_url.scheme != "advert":
        # any other url, try to guess pilot job url from its host part
        target_host = ""
        try:
            target_host = self.saga_pilot_url.host
        except:
            pass
        if target_host == "":
            target_host = socket.gethostname()
        # create dir for destination url
        self.saga_pilot_url = saga.url("advert://" + self.database_host + "/"+APPLICATION_NAME + "/" + target_host)
    # create dir for job
    self.job_url = self.saga_pilot_url.get_string() + "/" + str(self.uuid)
    return self.job_url
def fastq_chunk(self,lines):
    """Split each FASTQ file in the input dir into `lines`-line chunks and
    group the resulting chunk files by sequence index across inputs.

    Returns a list of chunk groups (list of lists of chunk paths).
    BUG FIX: a new group was started with chunk.split(), which splits the
    path on whitespace and so breaks on paths containing spaces; it now
    starts with a proper one-element list.
    """
    chunk_list=[]
    group_chunks={}
    input_dir = saga.url(self.__input_dir).path
    temp = saga.url(self.__tmp_dir).path
    for fname in os.listdir(input_dir):
        # split writes files named "<fname>--NNNNN" into the temp dir
        os.system("cd " + temp + "; split -d -a 5 -l " + str(lines) + " " + input_dir + "/" + fname + " " + fname + "--" )
    for fname in sorted(os.listdir(temp)):
        chunk_list.append(temp + "/" + fname)
    for chunk in chunk_list:
        # group chunks that share the same numeric suffix (sequence index)
        seq = chunk.split("--")[1]
        if seq in group_chunks:
            group_chunks[seq].append(chunk)
        else:
            group_chunks[seq] = [chunk]
    self.__chunk_list = group_chunks.values()
    return self.__chunk_list
def submit_job_cpr(self, dest_url_string, jd, checkpt_files): error_string = "" start = time.time() js = saga.cpr.service(saga.url(dest_url_string)) jd_start = jd jd_restart = jd new_cpr_job = js.create_job(jd_start, jd_restart) new_cpr_job.run() print "job state: " + str(new_cpr_job.get_state()) print "spawning time " + "%d" % (time.time() - start) + " s" return error_string, new_cpr_job
def __retrieve_entry(cls, entry_url):
    """Fetch and JSON-decode the content stored at an advert entry.

    The advert directory hierarchy is created recursively if missing.
    """
    entry_url = cls.__get_url(entry_url)
    flags = (saga.advert.Create | saga.advert.CreateParents
             | saga.advert.ReadWrite)
    advert_entry = saga.advert.entry(saga.url(entry_url), flags)
    return json.loads(advert_entry.retrieve_string())
def get_job_url(self, glidin_url):
    """Derive the advert URL under which this glide-in job is registered.

    If `glidin_url` is not already an advert url, an advert url is guessed
    from the target host name (falling back to the local host).
    """
    self.saga_glidin_url = saga.url(glidin_url)
    if self.saga_glidin_url.scheme != "advert":
        # any other url: guess the glide-in job url from the host part
        host = ""
        try:
            host = self.saga_glidin_url.host
        except:
            pass
        if host == "":
            host = socket.gethostname()
        # destination advert directory for this host
        self.saga_glidin_url = saga.url("advert://" + self.database_host
                                        + "/" + APPLICATION_NAME + "/" + host)
    # per-job sub-entry keyed by this instance's uuid
    self.job_url = self.saga_glidin_url.get_string() + "/" + str(self.uuid)
    return self.job_url
def queue_job(self, pilot_url, job_url):
    """Queue a new job with a pilot.

    Registers `job_url` under <pilot_url>/new/<uuid> so the pilot agent can
    pick it up. The resource lock is held while the advert entry is created.
    (The string literal that sat mid-function in the original was a no-op,
    not a docstring; it is now the real docstring.)
    """
    self.resource_lock.acquire()
    try:
        job_url = self.get_url(job_url)
        new_job_url = self.get_url(pilot_url + "/new/" + str(uuid.uuid1()))
        logger.debug("Job URL: %s Create new job entry at: %s"
                     % (job_url, new_job_url))
        new_job_dir = saga.advert.directory(
            saga.url(new_job_url),
            saga.advert.Create | saga.advert.CreateParents
            | saga.advert.ReadWrite)
        new_job_dir.set_attribute("joburl", job_url)
    finally:
        # bug fix: release even when the advert operation raises; the
        # original leaked the lock on error, deadlocking later callers
        self.resource_lock.release()
def file_stage_in_with_saga(self, input_file_list_with_path, remote_machine_ip, remote_dir):
    """Stage local input files to a remote directory via SAGA file copy.

    For each file in `input_file_list_with_path` (path relative to the
    current working directory), copies file://<cwd>/<file> to
    gridftp://<remote_machine_ip>/<remote_dir>/<basename> -- or to a
    local file:// destination when `remote_machine_ip` contains
    'localhost'. Missing source files and failed copies are logged and
    skipped; nothing is returned.
    """
    cwd = os.getcwd()
    for ifile in input_file_list_with_path:
        # destination url: local scheme for localhost, gridftp otherwise
        if remote_machine_ip.find('localhost') >= 0:
            dest_url_str = 'file://'
        else:
            dest_url_str = 'gridftp://' + remote_machine_ip + "/"
        ifile_basename = os.path.basename(ifile)
        # best-effort creation of the remote directory; failure is only
        # reported, the copy below is still attempted
        try:
            dest_dir = dest_url_str + remote_dir
            saga.file.directory(saga.url(dest_dir),
                                saga.file.Create | saga.file.ReadWrite)
        except:
            print "Could not create: " + dest_dir
        dest_url_str = dest_url_str + os.path.join(remote_dir, ifile_basename)
        # source url: file in the current working directory
        source_url_str = 'file://' + os.path.join(cwd, ifile)
        if not os.path.isfile(ifile):
            error_msg = "Input file %s does not exist in %s" % (
                ifile_basename, os.path.dirname(ifile))
            logging.error(error_msg)
        else:
            try:
                source_url = saga.url(source_url_str)
                dest_url = saga.url(dest_url_str)
                print "stage file: " + source_url_str + " to " + dest_url_str
                sagafile = saga.file.file(source_url)
                sagafile.copy(dest_url)
                logging.info("Now Input file %s is staged into %s"
                             % (ifile_basename, dest_url_str))
            except saga.exception, e:
                # copy failed -- log and continue with the next file
                error_msg = "Input file %s failed to be staged in" % (
                    ifile_basename)
                logging.error(error_msg)
def add_pd(cls, pds_url, pd):
    """Store a pilot-data description under the pilot-data service URL.

    Serializes pd.data_unit_description as JSON into the advert entry
    <pds_url>/<pd.id>/description and returns the PD url.
    """
    pds_url = cls.__remove_dbtype(pds_url)
    pd_url = pds_url + "/" + pd.id
    pd_description_url = cls.__get_url(pd_url + "/description")
    logger.debug("PDS URL: %s, PD Description URL: %s"
                 % (pds_url, pd_description_url))
    # the advert entry (and its parent directories) is created recursively
    desc_entry = saga.advert.entry(
        saga.url(pd_description_url),
        saga.advert.Create | saga.advert.CreateParents
        | saga.advert.ReadWrite)
    logger.debug("initialized advert entry for pds: " + pd_description_url)
    desc_entry.store_string(json.dumps(pd.data_unit_description))
    return pd_url
def set_job(self, job_url, job_dict):
    """Serialize `job_dict` into the job's advert entry and reset its state.

    The job state is initialized to saga.job.Unknown after the description
    has been stored.
    """
    job_dir_url = self.get_url(job_url)
    job_description_url = self.get_url(job_url + "/job-description")
    logger.debug("Job URL: %s, Job Description URL: %s"
                 % (job_dir_url, job_description_url))
    # the advert entry is created recursively, so no explicit directory
    # creation is needed
    desc_entry = saga.advert.entry(
        saga.url(job_description_url),
        saga.advert.Create | saga.advert.CreateParents
        | saga.advert.ReadWrite)
    logger.debug("initialized advert entry for job: " + job_dir_url)
    desc_entry.store_string(json.dumps(job_dict))
    self.set_job_state(job_url, str(saga.job.Unknown))
def get_jobs_of_pilot(self, pilot_url):
    """Return the array of job urls associated with a pilot.

    (The string literal that sat after the first statement in the original
    was a no-op, not a docstring; it is now the real docstring.)
    """
    pilot_url = self.get_url(pilot_url + "/jobs")
    pilot_dir = saga.advert.directory(
        saga.url(pilot_url),
        saga.advert.Create | saga.advert.CreateParents
        | saga.advert.ReadWrite)
    jobs = pilot_dir.list()
    job_urls = [self.__get_colon_url(self.__remove_dbtype(pilot_url)
                                     + "/" + i.get_string())
                for i in jobs]
    # re-append the db-type query string if one is configured
    if self.dbtype is not None:  # idiom fix: was `!= None`
        job_urls = [i + "?" + self.dbtype for i in job_urls]
    return job_urls
def monitor_checkpoints(self): """ parses all job working directories and registers files with Migol via SAGA/CPR """ #get current files from AIS url = saga.url("advert_launcher_checkpoint") checkpoint = saga.cpr.checkpoint(url) files = checkpoint.list_files() for i in files: print i dir_listing = os.listdir(os.getcwd()) for i in dir_listing: filename = dir + "/" + i if (os.path.isfile(filename)): if (check_file(files, filename == False)): url = self.build_url(filename) print str(self.build_url(filename))