def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None, external_queue="", pilot_compute_description=None): """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported: fork://localhost/ (Default Job Adaptor gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor) pbspro://localhost (PBS Pro Adaptor) """ if self.job != None: raise BigJobError( "One BigJob already active. Please stop BigJob first.") return ############################################################################## # initialization of coordination and communication subsystem # Communication & Coordination initialization lrms_saga_url = SAGAUrl(lrms_url) self.url = lrms_saga_url self.pilot_url = self.app_url + ":" + lrms_saga_url.host self.number_nodes = int(number_nodes) * int(processes_per_node) # Store references to BJ in global dict _pilot_url_dict[self.pilot_url] = self _pilot_url_dict[external_queue] = self logger.debug("create pilot job entry on backend server: " + self.pilot_url) self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False) self.coordination.set_pilot_description(self.pilot_url, filetransfers) logger.debug("set pilot state to: " + str(Unknown)) ############################################################################## # Create Job Service (Default: SAGA Job Service, alternative Job Services supported) self.js = None if lrms_saga_url.scheme == "gce+ssh": self.js = GCEService(lrms_saga_url, pilot_compute_description) elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \ or lrms_saga_url.scheme=="nova+ssh": self.js = EC2Service(lrms_saga_url, pilot_compute_description) else: self.js = SAGAJobService(lrms_saga_url) ############################################################################## # create job description jd = SAGAJobDescription() # Attempt to create working directory (e.g. in local scenario) if working_directory != None: if not os.path.isdir(working_directory) \ and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \ and working_directory.startswith("go:")==False: os.mkdir(working_directory) self.working_directory = working_directory else: # if no working dir is set assume use home directory # will fail if home directory is not the same on remote machine # but this is just a guess to avoid failing #self.working_directory = os.path.expanduser("~") self.working_directory = "" if queue != None: jd.queue = queue if project != None: jd.project = project if walltime != None: if is_bliss: jd.wall_time_limit = int(walltime) else: jd.wall_time_limit = str(walltime) ############################################################################## # File Management and Stage-In # Determine whether target machine use gsissh or ssh to logon. # logger.debug("Detect launch method for: " + lrms_saga_url.host) # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username) self.bigjob_working_directory_url = "" if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\ or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"): logger.debug( "File Staging for Cloud Instances currently not supported.") elif lrms_saga_url.scheme.startswith("condor") == True: logger.debug("Using Condor file staging") else: # build target url for working directory # this will also create the remote directory for the BJ # Fallback if working directory is not a valid URL if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")): if lrms_saga_url.username != None and lrms_saga_url.username != "": self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir( ) else: self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir( ) elif self.working_directory.startswith("go:"): self.bigjob_working_directory_url = os.path.join( self.working_directory, self.uuid) else: # working directory is a valid file staging URL self.bigjob_working_directory_url = self.working_directory # initialize file manager that takes care of file movement and directory creation if self.__filemanager == None: self.__initialize_pilot_data( self.bigjob_working_directory_url) # determines the url if self.__filemanager != None and not self.working_directory.startswith( "/"): self.working_directory = self.__filemanager.get_path( self.bigjob_working_directory_url) # determine working directory of bigjob # if a remote sandbox can be created via ssh => create a own dir for each bj job id # otherwise use specified working directory logger.debug("BigJob working directory: %s" % self.bigjob_working_directory_url) if self.__filemanager != None and self.__filemanager.create_remote_directory( self.bigjob_working_directory_url) == True: self.working_directory = self.__get_bigjob_working_dir() self.__stage_files(filetransfers, self.bigjob_working_directory_url) else: logger.warn("No file staging adaptor found.") logger.debug("BJ Working Directory: %s", self.working_directory) if lrms_saga_url.scheme.startswith("condor") == False: jd.working_directory = self.working_directory else: jd.working_directory = "" ############################################################################## # Create and process BJ bootstrap script bootstrap_script = self.__generate_bootstrap_script( self.coordination.get_address(), self.pilot_url, # Queue 1 used by this BJ object external_queue # Queue 2 used by Pilot Compute Service # or another external scheduler ) logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme)) if is_bliss: bootstrap_script = self.__escape_bliss(bootstrap_script) else: if lrms_saga_url.scheme == "gram": bootstrap_script = self.__escape_rsl(bootstrap_script) elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" or lrms_saga_url.scheme == "torque": bootstrap_script = self.__escape_pbs(bootstrap_script) elif lrms_saga_url.scheme == "ssh": bootstrap_script = self.__escape_ssh(bootstrap_script) logger.debug(bootstrap_script) # Define Agent Executable in Job description # in Condor case bootstrap script is staged # (Python app cannot be passed inline in Condor job description) if lrms_saga_url.scheme.startswith("condor") == True: condor_bootstrap_filename = os.path.join( "/tmp", "bootstrap-" + str(self.uuid)) condor_bootstrap_file = open(condor_bootstrap_filename, "w") condor_bootstrap_file.write(bootstrap_script) condor_bootstrap_file.close() logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename) jd.executable = "/usr/bin/env" jd.arguments = [ "python", os.path.basename(condor_bootstrap_filename) ] bj_file_transfers = [] file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename( condor_bootstrap_filename) bj_file_transfers.append(file_transfer_spec) output_file_name = "output-" + str(self.uuid) + ".tar.gz" output_file_transfer_spec = os.path.join( self.working_directory, output_file_name) + " < " + output_file_name #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz" logger.debug("Output transfer: " + output_file_transfer_spec) bj_file_transfers.append(output_file_transfer_spec) if filetransfers != None: for t in filetransfers: bj_file_transfers.append(t) logger.debug("Condor file transfers: " + str(bj_file_transfers)) jd.file_transfer = bj_file_transfers else: if is_bliss: jd.total_cpu_count = int(number_nodes) else: jd.number_of_processes = str(number_nodes) jd.processes_per_host = str(processes_per_node) jd.spmd_variation = "single" jd.arguments = ["python", "-c", bootstrap_script] jd.executable = "/usr/bin/env" logger.debug("Working directory: " + jd.working_directory) jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt") jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt") ############################################################################## # Create and submit pilot job to job service logger.debug("Creating pilot job with description: %s" % str(jd)) self.job = self.js.create_job(jd) logger.debug("Submit pilot job to: " + str(lrms_saga_url)) self.job.run() return self.pilot_url
def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None, external_queue="", pilot_compute_description=None): """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported: fork://localhost/ (Default Job Adaptor gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor) pbspro://localhost (PBS Pro Adaptor) """ if self.job != None: raise BigJobError("One BigJob already active. Please stop BigJob first.") return ############################################################################## # initialization of coordination and communication subsystem # Communication & Coordination initialization lrms_saga_url = SAGAUrl(lrms_url) self.url = lrms_saga_url self.pilot_url = self.app_url + ":" + lrms_saga_url.host self.number_nodes=int(number_nodes)*int(processes_per_node) # Store references to BJ in global dict _pilot_url_dict[self.pilot_url]=self _pilot_url_dict[external_queue]=self logger.debug("create pilot job entry on backend server: " + self.pilot_url) self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False) if pilot_compute_description==None: pilot_compute_description={"service_url": lrms_url, "number_of_processes": number_nodes, "processes_per_node": processes_per_node, "working_directory": working_directory} self.coordination.set_pilot_description(self.pilot_url, pilot_compute_description) logger.debug("set pilot state to: " + str(Unknown)) # Create Job Service (Default: SAGA Job Service, alternative Job Services supported) self.js =None if lrms_saga_url.scheme=="gce+ssh": self.js = GCEService(lrms_saga_url, pilot_compute_description) elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \ or lrms_saga_url.scheme=="nova+ssh": self.js = EC2Service(lrms_saga_url, pilot_compute_description) elif lrms_saga_url.scheme=="slurm+ssh": self.js = SlurmService(lrms_saga_url, pilot_compute_description) else: self.js = SAGAJobService(lrms_saga_url) ############################################################################## # create job description jd = SAGAJobDescription() # Attempt to create working directory (e.g. in local scenario) if working_directory != None: if not os.path.isdir(working_directory) \ and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \ and working_directory.startswith("go:")==False: os.mkdir(working_directory) self.working_directory = working_directory else: # if no working dir is set assume use home directory # will fail if home directory is not the same on remote machine # but this is just a guess to avoid failing #self.working_directory = os.path.expanduser("~") self.working_directory = "" if queue != None: jd.queue = queue if project !=None: jd.project=project if walltime!=None: logger.debug("setting walltime to: " + str(walltime)) if is_bliss: jd.wall_time_limit=int(walltime) else: jd.wall_time_limit=str(walltime) ############################################################################## # File Management and Stage-In # Determine whether target machine use gsissh or ssh to logon. # logger.debug("Detect launch method for: " + lrms_saga_url.host) # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username) self.bigjob_working_directory_url="" if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\ or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"): logger.debug("File Staging for Cloud Instances currently not supported.") elif lrms_saga_url.scheme.startswith("condor") == True: logger.debug("Using Condor file staging") else: # build target url for working directory # this will also create the remote directory for the BJ # Fallback if working directory is not a valid URL if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")): if lrms_saga_url.username!=None and lrms_saga_url.username!="": self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir() else: self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir() elif self.working_directory.startswith("go:"): self.bigjob_working_directory_url=os.path.join(self.working_directory, self.uuid) else: # working directory is a valid file staging URL self.bigjob_working_directory_url=self.working_directory # initialize file manager that takes care of file movement and directory creation if self.__filemanager==None: self.__initialize_pilot_data(self.bigjob_working_directory_url) # determines the url if self.__filemanager != None and not self.working_directory.startswith("/"): self.working_directory = self.__filemanager.get_path(self.bigjob_working_directory_url) # determine working directory of bigjob # if a remote sandbox can be created via ssh => create a own dir for each bj job id # otherwise use specified working directory logger.debug("BigJob working directory: %s"%self.bigjob_working_directory_url) if self.__filemanager!=None and self.__filemanager.create_remote_directory(self.bigjob_working_directory_url)==True: self.working_directory = self.__get_bigjob_working_dir() self.__stage_files(filetransfers, self.bigjob_working_directory_url) else: logger.warn("No file staging adaptor found.") logger.debug("BJ Working Directory: %s", self.working_directory) if lrms_saga_url.scheme.startswith("condor")==False: jd.working_directory = self.working_directory else: jd.working_directory="" ############################################################################## # Create and process BJ bootstrap script bootstrap_script = self.__generate_bootstrap_script( self.coordination.get_address(), self.pilot_url, # Queue 1 used by this BJ object external_queue # Queue 2 used by Pilot Compute Service # or another external scheduler ) logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme)) if is_bliss and lrms_saga_url.scheme.startswith("condor")==False: bootstrap_script = self.__escape_bliss(bootstrap_script) else: if lrms_saga_url.scheme == "gram": bootstrap_script = self.__escape_rsl(bootstrap_script) elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme=="xt5torque" or lrms_saga_url.scheme=="torque": bootstrap_script = self.__escape_pbs(bootstrap_script) elif lrms_saga_url.scheme == "ssh" and lrms_saga_url.scheme == "slurm+ssh": bootstrap_script = self.__escape_ssh(bootstrap_script) logger.debug(bootstrap_script) # Define Agent Executable in Job description # in Condor case bootstrap script is staged # (Python app cannot be passed inline in Condor job description) if lrms_saga_url.scheme.startswith("condor")==True: bootstrap_script = self.__generate_bootstrap_script_from_binary( self.coordination.get_address(), self.pilot_url, # Queue 1 used by this BJ object external_queue # Queue 2 used by Pilot Compute Service # or another external scheduler ) condor_bootstrap_filename = os.path.join("/tmp", "bootstrap-"+str(self.uuid)) condor_bootstrap_file = open(condor_bootstrap_filename, "w") condor_bootstrap_file.write(bootstrap_script) condor_bootstrap_file.close() logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename) jd.executable = "/usr/bin/env" jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)] if pilot_compute_description.has_key("candidate_hosts"): jd.candidate_hosts = pilot_compute_description["candidate_hosts"] bj_file_transfers = [] file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename) bj_file_transfers.append(file_transfer_spec) output_file_name = "output-" + str(self.uuid) + ".tar.gz" #output_file_transfer_spec = os.path.join(self.working_directory, output_file_name) +" < " + output_file_name output_file_transfer_spec = output_file_name +" < " + output_file_name #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz" #logger.debug("Output transfer: " + output_file_transfer_spec) #bj_file_transfers.append(output_file_transfer_spec) if filetransfers != None: for t in filetransfers: bj_file_transfers.append(t) logger.debug("Condor file transfers: " + str(bj_file_transfers)) jd.file_transfer = bj_file_transfers else: if is_bliss: jd.total_cpu_count=int(number_nodes) else: jd.number_of_processes=str(number_nodes) jd.processes_per_host=str(processes_per_node) jd.spmd_variation = "single" if pilot_compute_description!=None and pilot_compute_description.has_key("spmd_variation"): jd.spmd_variation=pilot_compute_description["spmd_variation"] jd.arguments = ["python", "-c", bootstrap_script] jd.executable = "/usr/bin/env" logger.debug("Working directory: " + jd.working_directory + " Job Description: " + str(jd)) jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt") jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt") ############################################################################## # Create and submit pilot job to job service logger.debug("Creating pilot job with description: %s" % str(jd)) self.job = self.js.create_job(jd) logger.debug("Submit pilot job to: " + str(lrms_saga_url)) self.job.run() return self.pilot_url
class bigjob(api.base.bigjob): ''' BigJob: Class for managing pilot jobs: Example: bj = bigjob("redis://localhost") bj.start_pilot_job("fork://localhost") .. bj.cancel() ''' __APPLICATION_NAME = "bigjob" def __init__(self, coordination_url="advert://localhost/?dbtype=sqlite3", pilot_url=None): """ Initializes BigJob's coordination system advert://localhost (SAGA/Advert SQLITE) advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) redis://localhost:6379 (Redis at localhost) tcp://localhost (ZMQ) The following formats for pilot_url are supported: 1.) Including root path at distributed coordination service: redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost This path is returned when call bigjob.get_url() 2.) BigJob unique ID: bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost """ self.coordination_url = coordination_url #self.launch_method="" self.__filemanager = None # restore existing BJ or initialize new BJ if pilot_url != None: logger.debug("Reconnect to BJ: %s" % pilot_url) if pilot_url.startswith("bigjob:"): self.pilot_url = pilot_url else: self.coordination_url, self.pilot_url = self.__parse_pilot_url( pilot_url) self.uuid = self.__get_bj_id(pilot_url) self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid) self.job = None self.working_directory = None # Coordination subsystem must be initialized before get_state_detail self.coordination = self.__init_coordination(self.coordination_url) self.state = self.get_state_detail() _pilot_url_dict[self.pilot_url] = self else: self.coordination = self.__init_coordination(self.coordination_url) self.uuid = "bj-" + str(get_uuid()) logger.debug("init BigJob w/: " + coordination_url) self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid) self.state = Unknown self.pilot_url = "" self.job = None self.working_directory = None logger.debug("initialized BigJob: " + self.app_url) def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None, external_queue="", pilot_compute_description=None): """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported: fork://localhost/ (Default Job Adaptor gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor) pbspro://localhost (PBS Pro Adaptor) """ if self.job != None: raise BigJobError( "One BigJob already active. Please stop BigJob first.") return ############################################################################## # initialization of coordination and communication subsystem # Communication & Coordination initialization lrms_saga_url = SAGAUrl(lrms_url) self.url = lrms_saga_url self.pilot_url = self.app_url + ":" + lrms_saga_url.host self.number_nodes = int(number_nodes) * int(processes_per_node) # Store references to BJ in global dict _pilot_url_dict[self.pilot_url] = self _pilot_url_dict[external_queue] = self logger.debug("create pilot job entry on backend server: " + self.pilot_url) self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False) self.coordination.set_pilot_description(self.pilot_url, filetransfers) logger.debug("set pilot state to: " + str(Unknown)) ############################################################################## # Create Job Service (Default: SAGA Job Service, alternative Job Services supported) self.js = None if lrms_saga_url.scheme == "gce+ssh": self.js = GCEService(lrms_saga_url, pilot_compute_description) elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \ or lrms_saga_url.scheme=="nova+ssh": self.js = EC2Service(lrms_saga_url, pilot_compute_description) else: self.js = SAGAJobService(lrms_saga_url) ############################################################################## # create job description jd = SAGAJobDescription() # Attempt to create working directory (e.g. in local scenario) if working_directory != None: if not os.path.isdir(working_directory) \ and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \ and working_directory.startswith("go:")==False: os.mkdir(working_directory) self.working_directory = working_directory else: # if no working dir is set assume use home directory # will fail if home directory is not the same on remote machine # but this is just a guess to avoid failing #self.working_directory = os.path.expanduser("~") self.working_directory = "" if queue != None: jd.queue = queue if project != None: jd.project = project if walltime != None: if is_bliss: jd.wall_time_limit = int(walltime) else: jd.wall_time_limit = str(walltime) ############################################################################## # File Management and Stage-In # Determine whether target machine use gsissh or ssh to logon. # logger.debug("Detect launch method for: " + lrms_saga_url.host) # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username) self.bigjob_working_directory_url = "" if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\ or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"): logger.debug( "File Staging for Cloud Instances currently not supported.") elif lrms_saga_url.scheme.startswith("condor") == True: logger.debug("Using Condor file staging") else: # build target url for working directory # this will also create the remote directory for the BJ # Fallback if working directory is not a valid URL if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")): if lrms_saga_url.username != None and lrms_saga_url.username != "": self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir( ) else: self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir( ) elif self.working_directory.startswith("go:"): self.bigjob_working_directory_url = os.path.join( self.working_directory, self.uuid) else: # working directory is a valid file staging URL self.bigjob_working_directory_url = self.working_directory # initialize file manager that takes care of file movement and directory creation if self.__filemanager == None: self.__initialize_pilot_data( self.bigjob_working_directory_url) # determines the url if self.__filemanager != None and not self.working_directory.startswith( "/"): self.working_directory = self.__filemanager.get_path( self.bigjob_working_directory_url) # determine working directory of bigjob # if a remote sandbox can be created via ssh => create a own dir for each bj job id # otherwise use specified working directory logger.debug("BigJob working directory: %s" % self.bigjob_working_directory_url) if self.__filemanager != None and self.__filemanager.create_remote_directory( self.bigjob_working_directory_url) == True: self.working_directory = self.__get_bigjob_working_dir() self.__stage_files(filetransfers, self.bigjob_working_directory_url) else: logger.warn("No file staging adaptor found.") logger.debug("BJ Working Directory: %s", self.working_directory) if lrms_saga_url.scheme.startswith("condor") == False: jd.working_directory = self.working_directory else: jd.working_directory = "" ############################################################################## # Create and process BJ bootstrap script bootstrap_script = self.__generate_bootstrap_script( self.coordination.get_address(), self.pilot_url, # Queue 1 used by this BJ object external_queue # Queue 2 used by Pilot Compute Service # or another external scheduler ) logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme)) if is_bliss: bootstrap_script = self.__escape_bliss(bootstrap_script) else: if lrms_saga_url.scheme == "gram": bootstrap_script = self.__escape_rsl(bootstrap_script) elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" or lrms_saga_url.scheme == "torque": bootstrap_script = self.__escape_pbs(bootstrap_script) elif lrms_saga_url.scheme == "ssh": bootstrap_script = self.__escape_ssh(bootstrap_script) logger.debug(bootstrap_script) # Define Agent Executable in Job description # in Condor case bootstrap script is staged # (Python app cannot be passed inline in Condor job description) if lrms_saga_url.scheme.startswith("condor") == True: condor_bootstrap_filename = os.path.join( "/tmp", "bootstrap-" + str(self.uuid)) condor_bootstrap_file = open(condor_bootstrap_filename, "w") condor_bootstrap_file.write(bootstrap_script) condor_bootstrap_file.close() logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename) jd.executable = "/usr/bin/env" jd.arguments = [ "python", os.path.basename(condor_bootstrap_filename) ] bj_file_transfers = [] file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename( condor_bootstrap_filename) bj_file_transfers.append(file_transfer_spec) output_file_name = "output-" + str(self.uuid) + ".tar.gz" output_file_transfer_spec = os.path.join( self.working_directory, output_file_name) + " < " + output_file_name #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz" logger.debug("Output transfer: " + output_file_transfer_spec) bj_file_transfers.append(output_file_transfer_spec) if filetransfers != None: for t in filetransfers: bj_file_transfers.append(t) logger.debug("Condor file transfers: " + str(bj_file_transfers)) jd.file_transfer = bj_file_transfers else: if is_bliss: jd.total_cpu_count = int(number_nodes) else: jd.number_of_processes = str(number_nodes) jd.processes_per_host = str(processes_per_node) jd.spmd_variation = "single" jd.arguments = ["python", "-c", bootstrap_script] jd.executable = "/usr/bin/env" logger.debug("Working directory: " + jd.working_directory) jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt") jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt") ############################################################################## # Create and submit pilot job to job service logger.debug("Creating pilot job with description: %s" % str(jd)) self.job = self.js.create_job(jd) logger.debug("Submit pilot job to: " + str(lrms_saga_url)) self.job.run() return self.pilot_url def list_subjobs(self): sj_list = self.coordination.get_jobs_of_pilot(self.pilot_url) logger.debug(str(sj_list)) subjobs = [] for i in sj_list: url = i #if url.find("/")>0: # url = url[url.find("bigjob"):] # url = url.replace("/", ":") #sj = subjob(coordination_url=self.coordination_url, subjob_url=url) sj = subjob(subjob_url=url) subjobs.append(sj.get_url()) return subjobs def get_state(self): """ duck typing for get_state of saga.job.job state of saga job that is used to spawn the pilot agent """ return self.get_state_detail() def get_state_detail(self): """ internal state of BigJob agent """ try: return self.coordination.get_pilot_state(self.pilot_url)["state"] except: return None def get_url(self): """ Get unique URL of big-job. This URL can be used to reconnect to BJ later, e.g.: redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost """ url = os.path.join(self.coordination.address, self.pilot_url) if self.coordination.dbtype != "" and self.coordination.dbtype != None: url = os.path.join(url, "?" + self.coordination.dbtype) return url def get_free_nodes(self): jobs = self.coordination.get_jobs_of_pilot(self.pilot_url) number_used_nodes = 0 for i in jobs: job_detail = self.coordination.get_job(i) if job_detail != None and job_detail.has_key("state") == True\ and job_detail["state"]==str(Running): job_np = "1" if (job_detail["NumberOfProcesses"] == True): job_np = job_detail["NumberOfProcesses"] number_used_nodes = number_used_nodes + int(job_np) return (self.number_nodes - number_used_nodes) def cancel(self): """ duck typing for cancel of saga.cpr.job and saga.job.job """ logger.debug("Cancel Pilot Job") try: if self.url.scheme.startswith("condor") == False: self.job.cancel() else: logger.debug( "Output files are being transfered to file: outpt.tar.gz. Please wait until transfer is complete." ) except: pass #traceback.print_stack() try: self._stop_pilot_job() logger.debug("delete pilot job: " + str(self.pilot_url)) if _CLEANUP: self.coordination.delete_pilot(self.pilot_url) #os.remove(os.path.join("/tmp", "bootstrap-"+str(self.uuid))) except: pass #traceback.print_stack() logger.debug("Cancel Pilot Job finished") def wait(self): """ Waits for completion of all sub-jobs """ while 1: jobs = self.coordination.get_jobs_of_pilot(self.pilot_url) finish_counter = 0 result_map = {} for i in jobs: # parse job id out of sj url surl = SAGAUrl(i) state = self.coordination.get_job_state(surl.path) #state = job_detail["state"] if result_map.has_key(state) == False: result_map[state] = 1 else: result_map[state] = result_map[state] + 1 if self.__has_finished(state) == True: finish_counter = finish_counter + 1 logger.debug("Total Jobs: %s States: %s" % (len(jobs), str(result_map))) if finish_counter == len(jobs): break time.sleep(2) ########################################################################### # internal and protected methods def _stop_pilot_job(self): """ mark in database entry of pilot-job as stopped """ try: logger.debug("stop pilot job: " + self.pilot_url) self.coordination.set_pilot_state(self.pilot_url, str(Done), True) self.job = None except: pass def _delete_subjob(self, job_url): self.coordination.delete_job(job_url) def _get_subjob_state(self, job_url): return self.coordination.get_job_state(job_url) def _get_subjob_details(self, job_url): return self.coordination.get_job(job_url) def _add_subjob(self, queue_url, jd, job_url, job_id): logger.debug("add subjob to queue of PJ: " + str(queue_url)) for i in range(0, 3): try: logger.debug( "create dictionary for job description. Job-URL: " + job_url) # put job description attributes to Coordination Service job_dict = {} # to accomendate current bug in bliss (Number of processes is not returned from list attributes) job_dict["NumberOfProcesses"] = "1" attributes = jd.list_attributes() logger.debug("SJ Attributes: " + str(jd)) for i in attributes: if jd.attribute_is_vector(i): vector_attr = [] for j in jd.get_vector_attribute(i): vector_attr.append(j) job_dict[i] = vector_attr else: #logger.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i)) job_dict[i] = jd.get_attribute(i) # Other pilot state information job_dict["state"] = str(Unknown) job_dict["job-id"] = str(job_id) logger.debug("job dict: " + str(job_dict)) if job_dict.has_key("FileTransfer"): files = job_dict["FileTransfer"] sj_work_dir = self.__get_subjob_working_dir(job_id) self.__stage_files(files, sj_work_dir) #logger.debug("update job description at communication & coordination sub-system") self.coordination.set_job(job_url, job_dict) self.coordination.queue_job(queue_url, job_url) break except: self.__print_traceback() time.sleep(2) def _get_subjob_url(self, subjob_url): """ Get unique URL for a sub-job. This URL can be used to reconnect to SJ later, e.g.: redis://localhost/bigjob:bj-9a9ba4d8-b162-11e1-9c42-109addae22a3:localhost:jobs:sj-6f44da6e-b178-11e1-bc99-109addae22a3 """ url = subjob_url if subjob_url.find("bigjob") == 0: url = os.path.join(self.coordination.address, subjob_url) if self.coordination.dbtype != "" and self.coordination.dbtype != None: url = os.path.join(url, "?" + self.coordination.dbtype) return url def __generate_bootstrap_script(self, coordination_host, coordination_namespace, external_coordination_namespace=""): script = textwrap.dedent("""import sys import os import urllib import sys import time start_time = time.time() home = os.environ.get("HOME") #print "Home: " + home if home==None: home = os.getcwd() BIGJOB_AGENT_DIR= os.path.join(home, ".bigjob") if not os.path.exists(BIGJOB_AGENT_DIR): os.mkdir (BIGJOB_AGENT_DIR) BIGJOB_PYTHON_DIR=BIGJOB_AGENT_DIR+"/python/" if not os.path.exists(BIGJOB_PYTHON_DIR): os.mkdir(BIGJOB_PYTHON_DIR) BOOTSTRAP_URL="https://raw.github.com/saga-project/BigJob/master/bootstrap/bigjob-bootstrap.py" BOOTSTRAP_FILE=BIGJOB_AGENT_DIR+"/bigjob-bootstrap.py" #ensure that BJ in .bigjob is upfront in sys.path sys.path.insert(0, os.getcwd() + "/../") #sys.path.insert(0, /User/luckow/.bigjob/python/lib") #sys.path.insert(0, os.getcwd() + "/../../") p = list() for i in sys.path: if i.find(\".bigjob/python\")>1: p.insert(0, i) for i in p: sys.path.insert(0, i) print "Python path: " + str(sys.path) print "Python version: " + str(sys.version_info) try: import saga except: print "SAGA and SAGA Python Bindings not found: BigJob only work w/ non-SAGA backends e.g. Redis, ZMQ."; try: import bigjob.bigjob_agent except: print "BigJob not installed. Attempt to install it."; opener = urllib.FancyURLopener({}); opener.retrieve(BOOTSTRAP_URL, BOOTSTRAP_FILE); print "Execute: " + "python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR os.system("/usr/bin/env") try: os.system("python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR); activate_this = BIGJOB_PYTHON_DIR+'bin/activate_this.py'; execfile(activate_this, dict(__file__=activate_this)) except: print "BJ installation failed. Trying system-level python (/usr/bin/python)"; os.system("/usr/bin/python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR); activate_this = BIGJOB_PYTHON_DIR+'bin/activate_this.py'; execfile(activate_this, dict(__file__=activate_this)) #try to import BJ once again import bigjob.bigjob_agent # execute bj agent args = list() args.append("bigjob_agent.py") args.append(\"%s\") args.append(\"%s\") args.append(\"%s\") print "Bootstrap time: " + str(time.time()-start_time) print "Starting BigJob Agents with following args: " + str(args) bigjob_agent = bigjob.bigjob_agent.bigjob_agent(args) """ % (coordination_host, coordination_namespace, external_coordination_namespace)) return script def __escape_rsl(self, bootstrap_script): logger.debug("Escape RSL") bootstrap_script = bootstrap_script.replace("\"", "\"\"") return bootstrap_script def __escape_pbs(self, bootstrap_script): logger.debug("Escape PBS") bootstrap_script = "\'" + bootstrap_script + "\'" return bootstrap_script def __escape_ssh(self, bootstrap_script): logger.debug("Escape SSH") bootstrap_script = bootstrap_script.replace("\"", "\\\"") bootstrap_script = bootstrap_script.replace("\'", "\\\"") bootstrap_script = "\"" + bootstrap_script + "\"" return bootstrap_script def __escape_bliss(self, bootstrap_script): logger.debug("Escape Bliss") #bootstrap_script = bootstrap_script.replace("\'", "\"") #bootstrap_script = "\'" + bootstrap_script+ "\'" bootstrap_script = bootstrap_script.replace('"', '\\"') bootstrap_script = '"' + bootstrap_script + '"' return bootstrap_script def __parse_pilot_url(self, pilot_url): #pdb.set_trace() pilot_saga_url = SAGAUrl(pilot_url) dbtype = pilot_saga_url.query coordination = pilot_url[:pilot_url.index("bigjob")] if dbtype != None: coordination = os.path.join(coordination, "?" + dbtype) pilot_url = pilot_saga_url.path[1:] #dbtype = None #coordination = pilot_url[:pilot_url.index("bigjob")] #pilot_url = pilot_url[pilot_url.find("bigjob"):] #if pilot_url.find("/") > 0: # comp = pilot_url.split("/") # pilot_url = comp[0] # if comp[1].find("dbtype")>0: # dbtype=comp[1][comp[1].find("dbtype"):] #if dbtype!=None: # coordination = os.path.join(coordination, "?"+dbtype) logger.debug("Parsed URL - Coordination: %s Pilot: %s" % (coordination, pilot_url)) return coordination, pilot_url def __has_finished(self, state): state = state.lower() if state == "done" or state == "failed" or state == "canceled": return True else: return False def __parse_url(self, url): try: surl = SAGAUrl(url) host = surl.host port = surl.port username = surl.username password = surl.password query = surl.query if query != None and query.endswith("/"): query = query[:-1] scheme = "%s://" % surl.scheme except: """ Fallback URL parser based on Python urlparse library """ logger.error("URL %s could not be parsed" % (url)) traceback.print_exc(file=sys.stderr) result = urlparse.urlparse(url) logger.debug("Result: " + str(result)) host = result.hostname #host = None port = result.port username = result.username password = result.password scheme = "%s://" % result.scheme if host == None: logger.debug("Python 2.6 fallback") if url.find("/", len(scheme)) > 0: host = url[len(scheme):url.find("/", len(scheme))] else: host = url[len(scheme):] if host.find(":") > 1: logger.debug(host) comp = host.split(":") host = comp[0] port = int(comp[1]) if url.find("?") > 0: query = url[url.find("?") + 1:] else: query = None logger.debug("%s %s %s" % (scheme, host, port)) return scheme, username, password, host, port, query def __get_bj_id(self, pilot_url): start = pilot_url.index("bj-") end = pilot_url.index(":", start) return pilot_url[start:end] def __init_coordination(self, coordination_url): if (coordination_url.startswith("advert://") or coordination_url.startswith("sqlasyncadvert://")): try: from coordination.bigjob_coordination_advert import bigjob_coordination logger.debug("Utilizing ADVERT Backend") except: logger.error("Advert Backend could not be loaded") elif (coordination_url.startswith("redis://")): try: from coordination.bigjob_coordination_redis import bigjob_coordination logger.debug("Utilizing Redis Backend") except: logger.error("Error loading pyredis.") elif (coordination_url.startswith("tcp://")): try: from coordination.bigjob_coordination_zmq import bigjob_coordination logger.debug("Utilizing ZMQ Backend") except: logger.error( "ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " + "PYZMQ (http://zeromq.github.com/pyzmq/)") else: logger.error("No suitable coordination backend found.") logger.debug("Parsing URL: " + coordination_url) scheme, username, password, host, port, dbtype = self.__parse_url( coordination_url) if port == -1: port = None coordination = bigjob_coordination(server=host, server_port=port, username=username, password=password, dbtype=dbtype, url_prefix=scheme) return coordination def __get_bigjob_working_dir(self): if self.working_directory.find( self.uuid) != -1: # working directory already contains BJ id return self.working_directory else: return os.path.join(self.working_directory, self.uuid) def __get_subjob_working_dir(self, sj_id): base_url = self.bigjob_working_directory_url url = os.path.join(base_url, sj_id) return url ########################################################################### # File Management def __initialize_pilot_data(self, service_url): # initialize file adaptor # Pilot Data API for File Management if service_url.startswith("ssh:"): logger.debug("Use SSH backend") try: from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor self.__filemanager = SSHFileAdaptor(service_url) except: logger.debug("SSH/Paramiko package not found.") self.__print_traceback() elif service_url.startswith("http:"): logger.debug("Use WebHDFS backend") try: from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor self.__filemanager = WebHDFSFileAdaptor(service_url) except: logger.debug("WebHDFS package not found.") elif service_url.startswith("go:"): logger.debug("Use Globus Online backend") try: from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor self.__filemanager = GlobusOnlineFileAdaptor(service_url) except: logger.debug("Globus Online package not found.") self.__print_traceback() def __stage_files(self, filetransfers, target_url): logger.debug("Stage: %s to %s" % (filetransfers, target_url)) if filetransfers == None: return if self.__filemanager: self.__filemanager.create_remote_directory(target_url) for i in filetransfers: source_file = i if i.find(">") > 0: source_file = i[:i.find(">")].strip() if source_file.startswith( "ssh://") == False and source_file.startswith( "go://") == False: logger.error( "Staging of file: %s not supported. Please use URL in form ssh://<filename>" % source_file) continue target_url_full = os.path.join(target_url, os.path.basename(source_file)) logger.debug("Stage: %s to %s" % (source_file, target_url_full)) #self.__third_party_transfer(source_file, target_url_full) if self.__filemanager: self.__filemanager.transfer(source_file, target_url_full) def __get_launch_method(self, hostname, user=None): """ returns desired execution method: ssh, aprun """ if user == None: user = self.__discover_ssh_user(hostname) host = "" if user != None and user != "": logger.debug("discovered user: "******"@" + hostname else: host = hostname gsissh_available = False try: cmd = "gsissh " + host + " /bin/date" logger.debug("Execute: " + cmd) gsissh_available = (subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0) except: pass ssh_available = False try: cmd = "ssh " + host + " /bin/date" logger.debug("Execute: " + cmd) ssh_available = (subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0) except: pass launch_method = "ssh" if ssh_available == False and gsissh_available == True: launch_method = "gsissh" else: launch_method = "ssh" logger.info("SSH: %r GSISSH: %r Use: %s" % (ssh_available, gsissh_available, launch_method)) return launch_method def __discover_ssh_user(self, hostname): # discover username user = None ssh_config = os.path.join(os.path.expanduser("~"), ".ssh/config") ssh_config_file = open(ssh_config, "r") lines = ssh_config_file.readlines() for i in range(0, len(lines)): line = lines[i] if line.find(hostname) > 0: for k in range(i + 1, len(lines)): sub_line = lines[k] if sub_line.startswith( " ") == True and sub_line.startswith("\t") == True: break # configuration for next host elif sub_line.find("User") != -1: stripped_sub_line = sub_line.strip() user = stripped_sub_line.split()[1] break ssh_config_file.close() return user def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() logger.debug("*** print_exception:", exc_info=(exc_type, exc_value, exc_traceback)) #traceback.print_exception(exc_type, exc_value, exc_traceback, # limit=2, file=sys.stdout) def __repr__(self): return self.pilot_url def __del__(self): """ BJ is not cancelled when object terminates Application can reconnect to BJ via pilot url later on""" pass
class bigjob(api.base.bigjob): ''' BigJob: Class for managing pilot jobs: Example: bj = bigjob("redis://localhost") bj.start_pilot_job("fork://localhost") .. bj.cancel() ''' __APPLICATION_NAME="bigjob" def __init__(self, coordination_url="advert://localhost/?dbtype=sqlite3", pilot_url=None): """ Initializes BigJob's coordination system advert://localhost (SAGA/Advert SQLITE) advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) redis://localhost:6379 (Redis at localhost) tcp://localhost (ZMQ) The following formats for pilot_url are supported: 1.) Including root path at distributed coordination service: redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost This path is returned when call bigjob.get_url() 2.) BigJob unique ID: bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost """ self.coordination_url = coordination_url if self.coordination_url==None: logger.error("Coordination URL not set. Exiting BigJob.") #self.launch_method="" self.__filemanager=None # restore existing BJ or initialize new BJ if pilot_url!=None: logger.debug("Reconnect to BJ: %s"%pilot_url) if pilot_url.startswith("bigjob:"): self.pilot_url=pilot_url else: self.coordination_url, self.pilot_url = self.__parse_pilot_url(pilot_url) self.uuid = self.__get_bj_id(pilot_url) self.app_url = self.__APPLICATION_NAME +":" + str(self.uuid) self.job = None self.working_directory = None # Coordination subsystem must be initialized before get_state_detail self.coordination = self.__init_coordination(self.coordination_url) self.state=self.get_state_detail() _pilot_url_dict[self.pilot_url]=self else: self.coordination = self.__init_coordination(self.coordination_url) self.uuid = "bj-" + str(get_uuid()) logger.debug("init BigJob w/: " + coordination_url) self.app_url =self. __APPLICATION_NAME +":" + str(self.uuid) self.state=Unknown self.pilot_url="" self.job = None self.working_directory = None logger.debug("initialized BigJob: " + self.app_url) def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None, external_queue="", pilot_compute_description=None): """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported: fork://localhost/ (Default Job Adaptor gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor) pbspro://localhost (PBS Pro Adaptor) """ if self.job != None: raise BigJobError("One BigJob already active. Please stop BigJob first.") return ############################################################################## # initialization of coordination and communication subsystem # Communication & Coordination initialization lrms_saga_url = SAGAUrl(lrms_url) self.url = lrms_saga_url self.pilot_url = self.app_url + ":" + lrms_saga_url.host self.number_nodes=int(number_nodes)*int(processes_per_node) # Store references to BJ in global dict _pilot_url_dict[self.pilot_url]=self _pilot_url_dict[external_queue]=self logger.debug("create pilot job entry on backend server: " + self.pilot_url) self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False) if pilot_compute_description==None: pilot_compute_description={"service_url": lrms_url, "number_of_processes": number_nodes, "processes_per_node": processes_per_node, "working_directory": working_directory} self.coordination.set_pilot_description(self.pilot_url, pilot_compute_description) logger.debug("set pilot state to: " + str(Unknown)) # Create Job Service (Default: SAGA Job Service, alternative Job Services supported) self.js =None if lrms_saga_url.scheme=="gce+ssh": self.js = GCEService(lrms_saga_url, pilot_compute_description) elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \ or lrms_saga_url.scheme=="nova+ssh": self.js = EC2Service(lrms_saga_url, pilot_compute_description) elif lrms_saga_url.scheme=="slurm+ssh": self.js = SlurmService(lrms_saga_url, pilot_compute_description) else: self.js = SAGAJobService(lrms_saga_url) ############################################################################## # create job description jd = SAGAJobDescription() # Attempt to create working directory (e.g. in local scenario) if working_directory != None: if not os.path.isdir(working_directory) \ and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \ and working_directory.startswith("go:")==False: os.mkdir(working_directory) self.working_directory = working_directory else: # if no working dir is set assume use home directory # will fail if home directory is not the same on remote machine # but this is just a guess to avoid failing #self.working_directory = os.path.expanduser("~") self.working_directory = "" if queue != None: jd.queue = queue if project !=None: jd.project=project if walltime!=None: logger.debug("setting walltime to: " + str(walltime)) if is_bliss: jd.wall_time_limit=int(walltime) else: jd.wall_time_limit=str(walltime) ############################################################################## # File Management and Stage-In # Determine whether target machine use gsissh or ssh to logon. # logger.debug("Detect launch method for: " + lrms_saga_url.host) # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username) self.bigjob_working_directory_url="" if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\ or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"): logger.debug("File Staging for Cloud Instances currently not supported.") elif lrms_saga_url.scheme.startswith("condor") == True: logger.debug("Using Condor file staging") else: # build target url for working directory # this will also create the remote directory for the BJ # Fallback if working directory is not a valid URL if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")): if lrms_saga_url.username!=None and lrms_saga_url.username!="": self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir() else: self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir() elif self.working_directory.startswith("go:"): self.bigjob_working_directory_url=os.path.join(self.working_directory, self.uuid) else: # working directory is a valid file staging URL self.bigjob_working_directory_url=self.working_directory # initialize file manager that takes care of file movement and directory creation if self.__filemanager==None: self.__initialize_pilot_data(self.bigjob_working_directory_url) # determines the url if self.__filemanager != None and not self.working_directory.startswith("/"): self.working_directory = self.__filemanager.get_path(self.bigjob_working_directory_url) # determine working directory of bigjob # if a remote sandbox can be created via ssh => create a own dir for each bj job id # otherwise use specified working directory logger.debug("BigJob working directory: %s"%self.bigjob_working_directory_url) if self.__filemanager!=None and self.__filemanager.create_remote_directory(self.bigjob_working_directory_url)==True: self.working_directory = self.__get_bigjob_working_dir() self.__stage_files(filetransfers, self.bigjob_working_directory_url) else: logger.warn("No file staging adaptor found.") logger.debug("BJ Working Directory: %s", self.working_directory) if lrms_saga_url.scheme.startswith("condor")==False: jd.working_directory = self.working_directory else: jd.working_directory="" ############################################################################## # Create and process BJ bootstrap script bootstrap_script = self.__generate_bootstrap_script( self.coordination.get_address(), self.pilot_url, # Queue 1 used by this BJ object external_queue # Queue 2 used by Pilot Compute Service # or another external scheduler ) logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme)) if is_bliss and lrms_saga_url.scheme.startswith("condor")==False: bootstrap_script = self.__escape_bliss(bootstrap_script) else: if lrms_saga_url.scheme == "gram": bootstrap_script = self.__escape_rsl(bootstrap_script) elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme=="xt5torque" or lrms_saga_url.scheme=="torque": bootstrap_script = self.__escape_pbs(bootstrap_script) elif lrms_saga_url.scheme == "ssh" and lrms_saga_url.scheme == "slurm+ssh": bootstrap_script = self.__escape_ssh(bootstrap_script) logger.debug(bootstrap_script) # Define Agent Executable in Job description # in Condor case bootstrap script is staged # (Python app cannot be passed inline in Condor job description) if lrms_saga_url.scheme.startswith("condor")==True: bootstrap_script = self.__generate_bootstrap_script_from_binary( self.coordination.get_address(), self.pilot_url, # Queue 1 used by this BJ object external_queue # Queue 2 used by Pilot Compute Service # or another external scheduler ) condor_bootstrap_filename = os.path.join("/tmp", "bootstrap-"+str(self.uuid)) condor_bootstrap_file = open(condor_bootstrap_filename, "w") condor_bootstrap_file.write(bootstrap_script) condor_bootstrap_file.close() logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename) jd.executable = "/usr/bin/env" jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)] if pilot_compute_description.has_key("candidate_hosts"): jd.candidate_hosts = pilot_compute_description["candidate_hosts"] bj_file_transfers = [] file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename) bj_file_transfers.append(file_transfer_spec) output_file_name = "output-" + str(self.uuid) + ".tar.gz" #output_file_transfer_spec = os.path.join(self.working_directory, output_file_name) +" < " + output_file_name output_file_transfer_spec = output_file_name +" < " + output_file_name #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz" #logger.debug("Output transfer: " + output_file_transfer_spec) #bj_file_transfers.append(output_file_transfer_spec) if filetransfers != None: for t in filetransfers: bj_file_transfers.append(t) logger.debug("Condor file transfers: " + str(bj_file_transfers)) jd.file_transfer = bj_file_transfers else: if is_bliss: jd.total_cpu_count=int(number_nodes) else: jd.number_of_processes=str(number_nodes) jd.processes_per_host=str(processes_per_node) jd.spmd_variation = "single" if pilot_compute_description!=None and pilot_compute_description.has_key("spmd_variation"): jd.spmd_variation=pilot_compute_description["spmd_variation"] jd.arguments = ["python", "-c", bootstrap_script] jd.executable = "/usr/bin/env" logger.debug("Working directory: " + jd.working_directory + " Job Description: " + str(jd)) jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt") jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt") ############################################################################## # Create and submit pilot job to job service logger.debug("Creating pilot job with description: %s" % str(jd)) self.job = self.js.create_job(jd) logger.debug("Submit pilot job to: " + str(lrms_saga_url)) self.job.run() return self.pilot_url def list_subjobs(self): sj_list = self.coordination.get_jobs_of_pilot(self.pilot_url) logger.debug(str(sj_list)) subjobs = [] for i in sj_list: url = i #if url.find("/")>0: # url = url[url.find("bigjob"):] # url = url.replace("/", ":") #sj = subjob(coordination_url=self.coordination_url, subjob_url=url) sj = subjob(subjob_url=url) subjobs.append(sj.get_url()) return subjobs def get_state(self): """ duck typing for get_state of saga.job.job state of saga job that is used to spawn the pilot agent """ return self.get_state_detail() def get_state_detail(self): """ internal state of BigJob agent """ try: return self.coordination.get_pilot_state(self.pilot_url)["state"] except: return None def get_url(self): """ Get unique URL of big-job. This URL can be used to reconnect to BJ later, e.g.: redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost """ url = os.path.join(self.coordination.address, self.pilot_url) if self.coordination.dbtype!="" and self.coordination.dbtype!=None: url = os.path.join(url, "?" + self.coordination.dbtype) return url def get_free_nodes(self): jobs = self.coordination.get_jobs_of_pilot(self.pilot_url) number_used_nodes=0 for i in jobs: job_detail = self.coordination.get_job(i) if job_detail != None and job_detail.has_key("state") == True\ and job_detail["state"]==str(Running): job_np = "1" if (job_detail["NumberOfProcesses"] == True): job_np = job_detail["NumberOfProcesses"] number_used_nodes=number_used_nodes + int(job_np) return (self.number_nodes - number_used_nodes) def cancel(self): """ duck typing for cancel of saga.cpr.job and saga.job.job """ logger.debug("Cancel Pilot Job") try: if self.url.scheme.startswith("condor")==False: self.job.cancel() else: pass #logger.debug("Output files are being transfered to file: outpt.tar.gz. Please wait until transfer is complete.") except: pass #traceback.print_stack() try: self._stop_pilot_job() logger.debug("delete pilot job: " + str(self.pilot_url)) if _CLEANUP: self.coordination.delete_pilot(self.pilot_url) #os.remove(os.path.join("/tmp", "bootstrap-"+str(self.uuid))) except: pass #traceback.print_stack() logger.debug("Cancel Pilot Job finished") def wait(self): """ Waits for completion of all sub-jobs """ while 1: jobs = self.coordination.get_jobs_of_pilot(self.pilot_url) finish_counter=0 result_map = {} for i in jobs: # parse job id out of sj url surl = SAGAUrl(i) sj_id = surl.path if sj_id.startswith("/"): sj_id = sj_id[1:] state = str(self.coordination.get_job_state(sj_id)) #logger.debug("SJ: %s : State: %s"%(sj_id, str(state))) #state = job_detail["state"] if result_map.has_key(state)==False: result_map[state]=1 else: result_map[state] = result_map[state]+1 if self.__has_finished(state)==True: finish_counter = finish_counter + 1 logger.debug("Total Jobs: %s States: %s"%(len(jobs), str(result_map))) if finish_counter == len(jobs): break time.sleep(2) ########################################################################### # internal and protected methods def _stop_pilot_job(self): """ mark in database entry of pilot-job as stopped """ try: logger.debug("stop pilot job: " + self.pilot_url) self.coordination.set_pilot_state(self.pilot_url, str(Done), True) self.job=None except: pass def _delete_subjob(self, job_url): self.coordination.delete_job(job_url) def _get_subjob_state(self, job_url): logger.debug("Get subjob state: " + str(job_url)) return self.coordination.get_job_state(job_url) def _get_subjob_details(self, job_url): return self.coordination.get_job(job_url) def _add_subjob(self, queue_url, jd, job_url, job_id): logger.debug("add subjob to queue of PJ: " + str(queue_url)) for i in range(0,3): try: logger.debug("create dictionary for job description. Job-URL: " + job_url) # put job description attributes to Coordination Service job_dict = {} # to accomendate current bug in bliss (Number of processes is not returned from list attributes) job_dict["NumberOfProcesses"] = "1" attributes = jd.list_attributes() logger.debug("SJ Attributes: " + str(jd)) for i in attributes: if jd.attribute_is_vector(i): vector_attr = [] for j in jd.get_vector_attribute(i): vector_attr.append(j) job_dict[i]=vector_attr else: #logger.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i)) job_dict[i] = jd.get_attribute(i) # Other pilot state information job_dict["state"] = str(Unknown) job_dict["job-id"] = str(job_id) logger.debug("job dict: " + str(job_dict)) if job_dict.has_key("FileTransfer"): files = job_dict["FileTransfer"] sj_work_dir = self.__get_subjob_working_dir(job_id) self.__stage_files(files, sj_work_dir) #logger.debug("update job description at communication & coordination sub-system") self.coordination.set_job(job_url, job_dict) self.coordination.queue_job(queue_url, job_url) break except: self.__print_traceback() time.sleep(2) def _get_subjob_url(self, subjob_url): """ Get unique URL for a sub-job. This URL can be used to reconnect to SJ later, e.g.: redis://localhost/bigjob:bj-9a9ba4d8-b162-11e1-9c42-109addae22a3:localhost:jobs:sj-6f44da6e-b178-11e1-bc99-109addae22a3 """ url = subjob_url if subjob_url.find("bigjob")==0: url = os.path.join(self.coordination.address, subjob_url) if self.coordination.dbtype!="" and self.coordination.dbtype!=None: url = os.path.join(url, "?" + self.coordination.dbtype) return url def __generate_bootstrap_script(self, coordination_host, coordination_namespace, external_coordination_namespace="", bigjob_home=None): script = textwrap.dedent("""import sys import os import urllib import sys import time start_time = time.time() home = os.environ.get("HOME") #print "Home: " + home if home==None: home = os.getcwd() BIGJOB_AGENT_DIR= os.path.join(home, ".bigjob") if not os.path.exists(BIGJOB_AGENT_DIR): os.mkdir (BIGJOB_AGENT_DIR) BIGJOB_PYTHON_DIR=BIGJOB_AGENT_DIR+"/python/" if not os.path.exists(BIGJOB_PYTHON_DIR): os.mkdir(BIGJOB_PYTHON_DIR) BOOTSTRAP_URL="https://raw.github.com/saga-project/BigJob/master/bootstrap/bigjob-bootstrap.py" BOOTSTRAP_FILE=BIGJOB_AGENT_DIR+"/bigjob-bootstrap.py" #ensure that BJ in .bigjob is upfront in sys.path sys.path.insert(0, os.getcwd() + "/../") p = list() for i in sys.path: if i.find(\".bigjob/python\")>1: p.insert(0, i) for i in p: sys.path.insert(0, i) print "Python path: " + str(sys.path) print "Python version: " + str(sys.version_info) try: import saga except: print "SAGA and SAGA Python Bindings not found."; try: import bigjob.bigjob_agent except: print "BigJob not installed. Attempt to install it."; opener = urllib.FancyURLopener({}); opener.retrieve(BOOTSTRAP_URL, BOOTSTRAP_FILE); print "Execute: " + "python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR os.system("/usr/bin/env") try: os.system("python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR); activate_this = os.path.join(BIGJOB_PYTHON_DIR, "bin/activate_this.py"); execfile(activate_this, dict(__file__=activate_this)) except: print "BJ installation failed. Trying system-level python (/usr/bin/python)"; os.system("/usr/bin/python " + BOOTSTRAP_FILE + " " + BIGJOB_PYTHON_DIR); activate_this = os.path.join(BIGJOB_PYTHON_DIR, "bin/activate_this.py"); execfile(activate_this, dict(__file__=activate_this)) #try to import BJ once again import bigjob.bigjob_agent # execute bj agent args = list() args.append("bigjob_agent.py") args.append(\"%s\") args.append(\"%s\") args.append(\"%s\") print "Bootstrap time: " + str(time.time()-start_time) print "Starting BigJob Agents with following args: " + str(args) bigjob_agent = bigjob.bigjob_agent.bigjob_agent(args) """ % (coordination_host, coordination_namespace, external_coordination_namespace)) return script def __generate_bootstrap_script_from_binary(self, coordination_host, coordination_namespace, external_coordination_namespace="", bigjob_home=None): script = textwrap.dedent("""import sys import os import urllib import sys import time start_time = time.time() home = os.environ.get("HOME") #print "Home: " + home if home==None: home = os.getcwd() BIGJOB_AGENT_DIR= os.path.join(home, ".bigjob") if not os.path.exists(BIGJOB_AGENT_DIR): os.mkdir (BIGJOB_AGENT_DIR) BIGJOB_PYTHON_DIR=BIGJOB_AGENT_DIR+"/python/" if not os.path.exists(BIGJOB_PYTHON_DIR): os.mkdir(BIGJOB_PYTHON_DIR) BOOTSTRAP_URL="http://s3.amazonaws.com/bigjob/bigjob-Linux-x86_64.tar.gz" BOOTSTRAP_FILE="bigjob-Linux-x86_64.tar.gz" #ensure that BJ in .bigjob is upfront in sys.path sys.path.insert(0, os.getcwd() + "/../") #sys.path.insert(0, /User/luckow/.bigjob/python/lib") #sys.path.insert(0, os.getcwd() + "/../../") p = list() for i in sys.path: if i.find(\".bigjob/python\")>1: p.insert(0, i) for i in p: sys.path.insert(0, i) print "Python path: " + str(sys.path) print "Python version: " + str(sys.version_info) try: import saga except: print "SAGA and SAGA Python Bindings not found."; try: import bigjob.bigjob_agent except: print "BigJob not installed. Attempt to install it."; #opener = urllib.FancyURLopener({}); #ret = opener.retrieve(BOOTSTRAP_URL, BOOTSTRAP_FILE); print "Download via wget" os.system("wget " + BOOTSTRAP_URL) os.system("rm -rf .bigjob") print "Execute: " + "tar -xzf " + BOOTSTRAP_FILE os.system("ls -lta") try: os.system("tar -xzf " + BOOTSTRAP_FILE); os.system("ls -lta") os.system(".bigjob/python/bin/python -c 'import bigjob; import bigjob.bigjob_agent; print bigjob.version; bigjob.bigjob_agent.bigjob_agent([\\"bigjob_agent.py\\", \\"%s\\", \\"%s\\", \\"%s\\"])'") except: print "BJ installation failed!"; """ % (coordination_host, coordination_namespace, external_coordination_namespace)) return script def __escape_rsl(self, bootstrap_script): logger.debug("Escape RSL") bootstrap_script = bootstrap_script.replace("\"", "\"\"") return bootstrap_script def __escape_pbs(self, bootstrap_script): logger.debug("Escape PBS") bootstrap_script = "\'" + bootstrap_script+ "\'" return bootstrap_script def __escape_ssh(self, bootstrap_script): logger.debug("Escape SSH") bootstrap_script = bootstrap_script.replace("\"", "\\\"") bootstrap_script = bootstrap_script.replace("\'", "\\\"") bootstrap_script = "\"" + bootstrap_script+ "\"" return bootstrap_script def __escape_bliss(self, bootstrap_script): logger.debug("Escape Bliss") #bootstrap_script = bootstrap_script.replace("\'", "\"") #bootstrap_script = "\'" + bootstrap_script+ "\'" bootstrap_script = bootstrap_script.replace('"','\\"') bootstrap_script = '"' + bootstrap_script+ '"' return bootstrap_script def __parse_pilot_url(self, pilot_url): #pdb.set_trace() pilot_saga_url = SAGAUrl(pilot_url) dbtype = pilot_saga_url.query coordination = pilot_url[:pilot_url.index("bigjob")] if dbtype!=None: coordination = os.path.join(coordination, "?"+dbtype) pilot_url = pilot_saga_url.path[1:] #dbtype = None #coordination = pilot_url[:pilot_url.index("bigjob")] #pilot_url = pilot_url[pilot_url.find("bigjob"):] #if pilot_url.find("/") > 0: # comp = pilot_url.split("/") # pilot_url = comp[0] # if comp[1].find("dbtype")>0: # dbtype=comp[1][comp[1].find("dbtype"):] #if dbtype!=None: # coordination = os.path.join(coordination, "?"+dbtype) logger.debug("Parsed URL - Coordination: %s Pilot: %s"%(coordination, pilot_url)) return coordination, pilot_url def __has_finished(self, state): state = state.lower() if state=="done" or state=="failed" or state=="canceled": return True else: return False def __parse_url(self, url): try: surl = SAGAUrl(url) host = surl.host port = surl.port username = surl.username password = surl.password query = surl.query if query!=None and query.endswith("/"): query = query[:-1] scheme = "%s://"%surl.scheme except: """ Fallback URL parser based on Python urlparse library """ logger.error("URL %s could not be parsed"%(url)) traceback.print_exc(file=sys.stderr) result = urlparse.urlparse(url) logger.debug("Result: " + str(result)) host = result.hostname #host = None port = result.port username = result.username password = result.password scheme = "%s://"%result.scheme if host==None: logger.debug("Python 2.6 fallback") if url.find("/", len(scheme)) > 0: host = url[len(scheme):url.find("/", len(scheme))] else: host = url[len(scheme):] if host.find(":")>1: logger.debug(host) comp = host.split(":") host = comp[0] port = int(comp[1]) if url.find("?")>0: query = url[url.find("?")+1:] else: query = None logger.debug("%s %s %s"%(scheme, host, port)) return scheme, username, password, host, port, query def __get_bj_id(self, pilot_url): start = pilot_url.index("bj-") end =pilot_url.index(":", start) return pilot_url[start:end] def __init_coordination(self, coordination_url): if(coordination_url.startswith("advert://") or coordination_url.startswith("sqlasyncadvert://")): try: from coordination.bigjob_coordination_advert import bigjob_coordination logger.debug("Utilizing ADVERT Backend") except: logger.error("Advert Backend could not be loaded") elif (coordination_url.startswith("redis://")): try: from coordination.bigjob_coordination_redis import bigjob_coordination logger.debug("Utilizing Redis Backend") except: logger.error("Error loading pyredis.") elif (coordination_url.startswith("tcp://")): try: from coordination.bigjob_coordination_zmq import bigjob_coordination logger.debug("Utilizing ZMQ Backend") except: logger.error("ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " +"PYZMQ (http://zeromq.github.com/pyzmq/)") else: logger.error("No suitable coordination backend found.") logger.debug("Parsing URL: " + coordination_url) scheme, username, password, host, port, dbtype = self.__parse_url(coordination_url) if port == -1: port = None coordination = bigjob_coordination(server=host, server_port=port, username=username, password=password, dbtype=dbtype, url_prefix=scheme) return coordination def __get_bigjob_working_dir(self): self.working_directory = os.path.abspath(self.working_directory) if self.working_directory.find(self.uuid)!=-1: # working directory already contains BJ id return self.working_directory else: return os.path.join(self.working_directory, self.uuid) def __get_subjob_working_dir(self, sj_id): base_url = self.bigjob_working_directory_url url = os.path.join(base_url, sj_id) return url ########################################################################### # File Management def __initialize_pilot_data(self, service_url): # initialize file adaptor # Pilot Data API for File Management if service_url.startswith("ssh:"): logger.debug("Use SSH backend") try: from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor self.__filemanager = SSHFileAdaptor(service_url) except: logger.debug("SSH package not found.") self.__print_traceback() elif service_url.startswith("http:"): logger.debug("Use WebHDFS backend") try: from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor self.__filemanager = WebHDFSFileAdaptor(service_url) except: logger.debug("WebHDFS package not found.") elif service_url.startswith("go:"): logger.debug("Use Globus Online backend") try: from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor self.__filemanager = GlobusOnlineFileAdaptor(service_url) except: logger.debug("Globus Online package not found.") self.__print_traceback() def __stage_files(self, filetransfers, target_url): logger.debug("Stage: %s to %s"%(filetransfers, target_url)) if filetransfers==None: return if self.__filemanager: self.__filemanager.create_remote_directory(target_url) for i in filetransfers: source_file=i if i.find(">")>0: source_file = i[:i.find(">")].strip() if source_file.startswith("ssh://")==False and source_file.startswith("go://")==False: logger.error("Staging of file: %s not supported. Please use URL in form ssh://<filename>"%source_file) continue target_url_full = os.path.join(target_url, os.path.basename(source_file)) logger.debug("Stage: %s to %s"%(source_file, target_url_full)) #self.__third_party_transfer(source_file, target_url_full) if self.__filemanager: self.__filemanager.transfer(source_file, target_url_full) def __get_launch_method(self, hostname, user=None): """ returns desired execution method: ssh, aprun """ if user == None: user = self.__discover_ssh_user(hostname) host = "" if user!=None and user!="": logger.debug("discovered user: "******"@" + hostname else: host = hostname gsissh_available = False try: cmd = "gsissh " + host + " /bin/date" logger.debug("Execute: " + cmd) gsissh_available = (subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)==0) except: pass ssh_available = False try: cmd = "ssh " + host + " /bin/date" logger.debug("Execute: " + cmd) ssh_available = (subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)==0) except: pass launch_method = "ssh" if ssh_available == False and gsissh_available == True: launch_method="gsissh" else: launch_method="ssh" logger.info("SSH: %r GSISSH: %r Use: %s"%(ssh_available, gsissh_available, launch_method)) return launch_method def __discover_ssh_user(self, hostname): # discover username user = None ssh_config = os.path.join(os.path.expanduser("~"), ".ssh/config") ssh_config_file = open(ssh_config, "r") lines = ssh_config_file.readlines() for i in range(0, len(lines)): line = lines[i] if line.find(hostname)>0: for k in range(i + 1, len(lines)): sub_line = lines[k] if sub_line.startswith(" ")==True and sub_line.startswith("\t")==True: break # configuration for next host elif sub_line.find("User")!=-1: stripped_sub_line = sub_line.strip() user = stripped_sub_line.split()[1] break ssh_config_file.close() return user def __print_traceback(self): exc_type, exc_value, exc_traceback = sys.exc_info() logger.debug("*** print_exception:", exc_info=(exc_type, exc_value, exc_traceback)) #traceback.print_exception(exc_type, exc_value, exc_traceback, # limit=2, file=sys.stdout) def __repr__(self): return self.pilot_url def __del__(self): """ BJ is not cancelled when object terminates Application can reconnect to BJ via pilot url later on""" pass