def __init__(self, pilot_data_service=None, pilot_data_description=None, pd_url=None):
    """ Initialize PilotData at given service url:

            ssh://<hostname>
            gsissh://<hostname>

        Currently only ssh schemes are supported. In the future all
        SAGA URL schemes/adaptors should be supported.

        Two modes of construction:
          * pilot_data_service (+ pilot_data_description): create a NEW
            Pilot Data and register it with the coordination service.
          * pd_url: RECONNECT to an existing Pilot Data entry.
    """
    # default all persisted attributes; reconnect path overwrites them below
    self.id = None
    self.url = None
    self.pilot_data_description = None
    self.service_url=None
    self.size = None
    self.data_unit_description = None
    self.data_units={}

    if pd_url==None and pilot_data_service!=None:      # new pd
        self.id = self.PD_ID_PREFIX+str(uuid.uuid1())
        self.pilot_data_description = pilot_data_description
        # register new PD entry under <base-url>/<pilot-data-service-id>
        self.url = CoordinationAdaptor.add_pd(CoordinationAdaptor.get_base_url(application_id)+"/"+pilot_data_service.id, self)
    elif pd_url != None:
        # reconnect: restore every persisted attribute from the backend dict
        logger.warn("Reconnect to PilotData: %s"%pd_url)
        dictionary = CoordinationAdaptor.get_pd(pd_url)
        pd_dict = dictionary["pilot_data"]
        for i in pd_dict:
            self.__setattr__(i, pd_dict[i])

    # NOTE(review): other variants of this class call a name-mangled
    # __initialize_pilot_data() — confirm this public name exists on the class.
    self.initialize_pilot_data()
def __create_remote_directory(self, target_url):
    """Create the directory named by *target_url* locally or via SFTP.

    :param target_url: URL such as ``ssh://[user@]host/path`` or
        ``fork://localhost/path``.
    :return: ``True`` on success, ``False`` if the remote creation failed.
    """
    # Python 2.6 compatible URL parsing; urlparse of that era mishandles
    # ssh:// URLs, so split scheme/host/path by hand.
    scheme = target_url[:target_url.find("://")+3]
    target_host = target_url[len(scheme):target_url.find("/", len(scheme))]
    target_path = target_url[len(scheme)+len(target_host):]
    target_user = None
    # split optional "user@" prefix off the host component
    if target_host.find("@")>1:
        comp = target_host.split("@")
        target_host = comp[1]
        target_user = comp[0]
    logger.debug("Create remote directory; scheme: %s, host: %s, path: %s"%(scheme, target_host, target_path))
    if scheme.startswith("fork") or target_host.startswith("localhost"):
        # local case: create directory directly
        os.makedirs(target_path)
        return True
    else:
        try:
            client = self.__get_ssh_client(target_host, target_user)
            sftp = client.open_sftp()
            sftp.mkdir(target_path)
            sftp.close()
            client.close()
            return True
        except Exception:
            # narrowed from bare except: so SystemExit/KeyboardInterrupt propagate;
            # failure is reported to the caller via the False return value
            self.__print_traceback()
            logger.warn("Error creating directory: " + str(target_path)
                        + " at: " + str(target_host)
                        + " SSH password-less login activated?" )
            return False
def __refresh(self):
    """ Update list of data unit items from coordination service.

        Best-effort: any failure is logged and the cached
        ``data_unit_items`` list is left unchanged.
    """
    try:
        if self.url is not None:
            du_dict = CoordinationAdaptor.get_du(self.url)
            # SECURITY NOTE(review): eval() of backend-provided data — safe only
            # while the coordination backend is trusted; consider ast.literal_eval.
            data_unit_dict_list = eval(du_dict["data_unit_items"])
            self.data_unit_items = [DataUnitItem.create_data_unit_from_dict(i)
                                    for i in data_unit_dict_list]
    except Exception:
        # narrowed from bare except: so SystemExit/KeyboardInterrupt propagate
        logger.warn("Refresh of DU %s failed"%(self.get_url()))
def __refresh(self):
    """ Update list of data units items from coordination service.

        Best-effort: failures are logged; the cached list is not modified
        when the backend cannot be reached.
    """
    try:
        if self.url is not None:
            du_dict = CoordinationAdaptor.get_du(self.url)
            # SECURITY NOTE(review): eval() of backend data; trusted backend
            # assumed — ast.literal_eval would be the safer choice.
            data_unit_dict_list = eval(du_dict["data_unit_items"])
            self.data_unit_items = [
                DataUnitItem.create_data_unit_from_dict(i)
                for i in data_unit_dict_list
            ]
    except Exception:
        # narrowed from bare except: so SystemExit/KeyboardInterrupt propagate
        logger.warn("Refresh of DU %s failed" % (self.get_url()))
def __init__(self, pilot_data_service=None, pilot_data_description=None, pd_url=None):
    """ Initialize PilotData at given service url::

            ssh://<hostname>
            gsissh://<hostname>
            go://<hostname>
            gs://google.com
            s3://aws.amazon.com

        In the future more SAGA/Bliss URL schemes/adaptors are supported.

        Two modes of construction:
          * pilot_data_service (+ pilot_data_description): create a NEW
            Pilot Data entry on the coordination backend.
          * pd_url: RECONNECT to an existing Pilot Data entry.
    """
    self.id = None
    self.url = pd_url
    self.pilot_data_description = None
    self.pilot_data_service = pilot_data_service
    self.service_url = None
    self.size = None
    self.data_unit_urls = []
    self.security_context = None

    if pd_url is None and pilot_data_service is not None:  # new pd
        self.id = self.PD_ID_PREFIX + str(uuid.uuid1())
        self.pilot_data_description = pilot_data_description
        self.url = CoordinationAdaptor.add_pd(
            CoordinationAdaptor.get_base_url(application_id) + ":" + pilot_data_service.id, self)
    elif pd_url is not None:
        logger.warn("Reconnect to PilotData: %s" % pd_url)
        dictionary = CoordinationAdaptor.get_pd(pd_url)
        # 'in' replaces Python-2-only dict.has_key (works on Py2 and Py3)
        if "security_context" in dictionary:
            self.security_context = dictionary["security_context"]
        # SECURITY NOTE(review): eval() of backend data; trusted backend assumed
        pd_dict = eval(dictionary["pilot_data"])
        for i in pd_dict:
            self.__setattr__(i, pd_dict[i])
        # A Pilot Data does not hold a direct reference to a Data Unit (only URL refs are stored)
        self.data_unit_urls = eval(dictionary["data_unit_urls"])

    self.__initialize_pilot_data()
    CoordinationAdaptor.update_pd(self)
def __init__(self, pilot_data_service=None, pilot_data_description=None, pd_url=None):
    """ Initialize PilotData at given service url::

            ssh://<hostname>
            gsissh://<hostname>
            go://<hostname>
            gs://google.com
            s3://aws.amazon.com

        In the future more SAGA/Bliss URL schemes/adaptors are supported.

        Pass pilot_data_service (and optionally pilot_data_description)
        to create a new Pilot Data, or pd_url to reconnect to an
        existing one.
    """
    self.id = None
    self.url = pd_url
    self.pilot_data_description = None
    self.pilot_data_service = pilot_data_service
    self.service_url = None
    self.size = None
    self.data_unit_urls = []
    self.security_context = None

    if pd_url is None and pilot_data_service is not None:  # new pd
        self.id = self.PD_ID_PREFIX + str(uuid.uuid1())
        self.pilot_data_description = pilot_data_description
        self.url = CoordinationAdaptor.add_pd(
            CoordinationAdaptor.get_base_url(application_id) + ":" + pilot_data_service.id,
            self
        )
    elif pd_url is not None:
        logger.warn("Reconnect to PilotData: %s" % pd_url)
        dictionary = CoordinationAdaptor.get_pd(pd_url)
        # 'in' replaces Python-2-only dict.has_key (works on Py2 and Py3)
        if "security_context" in dictionary:
            self.security_context = dictionary["security_context"]
        # SECURITY NOTE(review): eval() of backend data; trusted backend assumed
        pd_dict = eval(dictionary["pilot_data"])
        for i in pd_dict:
            self.__setattr__(i, pd_dict[i])
        # A Pilot Data does not hold a direct reference to a Data Unit (only URL refs are stored)
        self.data_unit_urls = eval(dictionary["data_unit_urls"])

    self.__initialize_pilot_data()
    CoordinationAdaptor.update_pd(self)
def __create_remote_directory(self, target_url):
    """Create the directory named by *target_url*.

    For ``localhost`` the directory is created directly; for any other
    host an SSH/SFTP session is opened with paramiko and the directory
    is created remotely. Failures are logged and treated as non-fatal
    (the directory may already exist).

    :param target_url: URL such as ``ssh://host/path``.
    """
    # Python 2.6 compatible URL parsing; urlparse of that era mishandles
    # ssh:// URLs, so split scheme/host/path by hand.
    scheme = target_url[:target_url.find("://")+3]
    target_host = target_url[len(scheme):target_url.find("/", len(scheme))]
    target_path = target_url[len(scheme)+len(target_host):]
    if target_host == "localhost":
        os.makedirs(target_path)
    else:
        try:
            client = paramiko.SSHClient()
            client.load_system_host_keys()
            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            client.connect(target_host)
            sftp = client.open_sftp()
            sftp.mkdir(target_path)
            sftp.close()
            client.close()
        except Exception:
            # narrowed from bare except: so SystemExit/KeyboardInterrupt
            # propagate; mkdir failure is best-effort by design
            logger.warn("Error creating directory: " + str(target_path)
                        + " at: " + str(target_host)
                        + " Already exists?" )
#!/usr/bin/env python import textwrap import re from bigjob import logger try: import saga except: logger.warn("pbs-ssh://<hostname> plugin not compatible with SAGA Bliss. Use pbs+ssh://<hostname>") import os class pbsssh: """Constructor""" def __init__(self,bootstrap_script,lrms_saga_url,walltime,nodes,ppn,userproxy,working_directory=None): self.job_id = "" self.lrms_saga_url = lrms_saga_url self.lrms_saga_url.scheme="ssh" self.userproxy = userproxy self.working_directory = "" if working_directory == None: self.working_directory = "" else: self.working_directory = working_directory self.bootstrap_script = textwrap.dedent("""import sys import os import urllib import sys
pass # import other BigJob packages # import API import api.base sys.path.append(os.path.dirname(__file__)) if SAGA_BLISS == False: try: import saga logger.info("Using SAGA C++/Python.") is_bliss=False except: logger.warn("SAGA C++ and Python bindings not found. Using Bliss.") try: import bliss.saga as saga is_bliss=True except: logger.warn("SAGA Bliss not found") else: logger.info("Using SAGA Bliss.") try: import bliss.saga as saga is_bliss=True except: logger.warn("SAGA Bliss not found") """BigJob Job Description is always derived from BLISS Job Description
Set environment variable BIGJOB_HOME to installation directory """ import sys from bigjob import logger import time import os import traceback import logging import textwrap import urlparse try: import paramiko except: logger.warn("Paramiko not found. Without Paramiko file staging is not supported!") from bigjob import SAGA_BLISS from bigjob.state import Running, New, Failed, Done, Unknown if SAGA_BLISS == False: try: import saga logger.debug("Using SAGA C++/Python.") is_bliss=False except: logger.warn("SAGA C++ and Python bindings not found. Using Bliss.") try: import bliss.sagacompat as saga is_bliss=True except:
def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None,
                    working_directory=None, userproxy=None, walltime=None,
                    processes_per_node=1, filetransfers=None, external_queue="",
                    pilot_compute_description=None):
    """ Start a batch job (using SAGA Job API) at resource manager.

        Currently, the following resource managers are supported:

            fork://localhost/                  (Default Job Adaptor)
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost                 (PBS Pro Adaptor)

        Returns the pilot URL of the started BigJob. Raises BigJobError
        if a BigJob is already active on this instance.
    """
    if self.job != None:
        raise BigJobError(
            "One BigJob already active. Please stop BigJob first.")
        return  # unreachable (after raise) — kept from original

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = SAGAUrl(lrms_url)
    self.url = lrms_saga_url
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    self.number_nodes = int(number_nodes) * int(processes_per_node)

    # Store references to BJ in global dict
    _pilot_url_dict[self.pilot_url] = self
    _pilot_url_dict[external_queue] = self

    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    self.coordination.set_pilot_description(self.pilot_url, filetransfers)
    logger.debug("set pilot state to: " + str(Unknown))

    ##############################################################################
    # Create Job Service (Default: SAGA Job Service, alternative Job Services supported)
    self.js = None
    if lrms_saga_url.scheme == "gce+ssh":
        self.js = GCEService(lrms_saga_url, pilot_compute_description)
    elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \
        or lrms_saga_url.scheme=="nova+ssh":
        self.js = EC2Service(lrms_saga_url, pilot_compute_description)
    else:
        self.js = SAGAJobService(lrms_saga_url)

    ##############################################################################
    # create job description
    jd = SAGAJobDescription()

    # Attempt to create working directory (e.g. in local scenario)
    if working_directory != None:
        # only create locally for fork/condor schemes; "go:" URLs are not paths
        if not os.path.isdir(working_directory) \
            and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \
            and working_directory.startswith("go:")==False:
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working dir is set assume use home directory
        # will fail if home directory is not the same on remote machine
        # but this is just a guess to avoid failing
        #self.working_directory = os.path.expanduser("~")
        self.working_directory = ""

    if queue != None:
        jd.queue = queue
    if project != None:
        jd.project = project
    if walltime != None:
        # Bliss expects an int walltime, SAGA C++ a string
        if is_bliss:
            jd.wall_time_limit = int(walltime)
        else:
            jd.wall_time_limit = str(walltime)

    ##############################################################################
    # File Management and Stage-In
    # Determine whether target machine use gsissh or ssh to logon.
    # logger.debug("Detect launch method for: " + lrms_saga_url.host)
    # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username)
    self.bigjob_working_directory_url = ""
    if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\
        or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"):
        logger.debug(
            "File Staging for Cloud Instances currently not supported.")
    elif lrms_saga_url.scheme.startswith("condor") == True:
        logger.debug("Using Condor file staging")
    else:
        # build target url for working directory
        # this will also create the remote directory for the BJ
        # Fallback if working directory is not a valid URL
        if not (self.working_directory.startswith("go:")
                or self.working_directory.startswith("ssh://")):
            if lrms_saga_url.username != None and lrms_saga_url.username != "":
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir( )
            else:
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir( )
        elif self.working_directory.startswith("go:"):
            self.bigjob_working_directory_url = os.path.join(
                self.working_directory, self.uuid)
        else:
            # working directory is a valid file staging URL
            self.bigjob_working_directory_url = self.working_directory

        # initialize file manager that takes care of file movement and directory creation
        if self.__filemanager == None:
            self.__initialize_pilot_data(
                self.bigjob_working_directory_url)  # determines the url

        if self.__filemanager != None and not self.working_directory.startswith("/"):
            self.working_directory = self.__filemanager.get_path(
                self.bigjob_working_directory_url)

        # determine working directory of bigjob
        # if a remote sandbox can be created via ssh => create a own dir for each bj job id
        # otherwise use specified working directory
        logger.debug("BigJob working directory: %s"
                     % self.bigjob_working_directory_url)
        if self.__filemanager != None and self.__filemanager.create_remote_directory(
                self.bigjob_working_directory_url) == True:
            self.working_directory = self.__get_bigjob_working_dir()
            self.__stage_files(filetransfers, self.bigjob_working_directory_url)
        else:
            logger.warn("No file staging adaptor found.")

        logger.debug("BJ Working Directory: %s", self.working_directory)

    # Condor stages into its own scratch dir, so no working dir is set
    if lrms_saga_url.scheme.startswith("condor") == False:
        jd.working_directory = self.working_directory
    else:
        jd.working_directory = ""

    ##############################################################################
    # Create and process BJ bootstrap script
    bootstrap_script = self.__generate_bootstrap_script(
        self.coordination.get_address(),
        self.pilot_url,   # Queue 1 used by this BJ object
        external_queue    # Queue 2 used by Pilot Compute Service
                          # or another external scheduler
    )
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    # escape the inline Python bootstrap script for the target submission system
    if is_bliss:
        bootstrap_script = self.__escape_bliss(bootstrap_script)
    else:
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.__escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" or lrms_saga_url.scheme == "torque":
            bootstrap_script = self.__escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.__escape_ssh(bootstrap_script)
    logger.debug(bootstrap_script)

    # Define Agent Executable in Job description
    # in Condor case bootstrap script is staged
    # (Python app cannot be passed inline in Condor job description)
    if lrms_saga_url.scheme.startswith("condor") == True:
        condor_bootstrap_filename = os.path.join(
            "/tmp", "bootstrap-" + str(self.uuid))
        condor_bootstrap_file = open(condor_bootstrap_filename, "w")
        condor_bootstrap_file.write(bootstrap_script)
        condor_bootstrap_file.close()
        logger.debug("Using Condor - bootstrap file: "
                     + condor_bootstrap_filename)

        jd.executable = "/usr/bin/env"
        jd.arguments = [
            "python", os.path.basename(condor_bootstrap_filename)
        ]
        # Condor file-transfer specs: "<local> > <remote>" stage-in,
        # "<local> < <remote>" stage-out
        bj_file_transfers = []
        file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(
            condor_bootstrap_filename)
        bj_file_transfers.append(file_transfer_spec)
        output_file_name = "output-" + str(self.uuid) + ".tar.gz"
        output_file_transfer_spec = os.path.join(
            self.working_directory, output_file_name) + " < " + output_file_name
        #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz"
        logger.debug("Output transfer: " + output_file_transfer_spec)
        bj_file_transfers.append(output_file_transfer_spec)
        if filetransfers != None:
            for t in filetransfers:
                bj_file_transfers.append(t)
        logger.debug("Condor file transfers: " + str(bj_file_transfers))
        jd.file_transfer = bj_file_transfers
    else:
        # non-Condor: run the bootstrap script inline via "python -c"
        if is_bliss:
            jd.total_cpu_count = int(number_nodes)
        else:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
        jd.spmd_variation = "single"
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"

    logger.debug("Working directory: " + jd.working_directory)
    jd.output = os.path.join(self.working_directory,
                             "stdout-" + self.uuid + "-agent.txt")
    jd.error = os.path.join(self.working_directory,
                            "stderr-" + self.uuid + "-agent.txt")

    ##############################################################################
    # Create and submit pilot job to job service
    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = self.js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()

    return self.pilot_url
import threading import time import pdb import Queue import saga sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) from bigjob import logger from pilot.api import PilotData, DataUnit, PilotDataService from pilot.api import State from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor try: from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor except: logger.warn("WebHDFS package not found.") from pilot.coordination.advert import AdvertCoordinationAdaptor as CoordinationAdaptor # generate global application id for this instance #application_id = str(uuid.uuid1()) application_id = "bigdata" class PilotData(PilotData): """ PilotData. Reserves a space of physical storage on the resource specified in the pilot_data_description """ PD_ID_PREFIX="pd-"
#!/usr/bin/env python
"""SLURM-over-SSH plugin for BigJob: Service factory that creates Job objects."""

import textwrap
import re
import os
import pdb

from bigjob import logger
import bigjob

try:
    import bliss.saga as saga
except Exception:
    # narrowed from bare except: — import is best-effort; the plugin logs and
    # continues so other adaptors remain usable
    logger.warn("slurm+ssh://<hostname> plugin not compatible with SAGA Bliss. Use slurm+ssh://<hostname>")


class Service(object):
    """ Plugin for SLURM """

    def __init__(self, resource_url, pilot_compute_description=None):
        """Constructor

        :param resource_url: slurm+ssh URL of the target resource
        :param pilot_compute_description: optional Pilot-Compute description
        """
        self.resource_url = resource_url
        self.pilot_compute_description = pilot_compute_description

    def create_job(self, job_description):
        """Create a Job bound to this service's resource URL.

        :param job_description: description of the job to run
        :return: a new ``Job`` instance (defined elsewhere in this module)
        """
        j = Job(job_description, self.resource_url,
                self.pilot_compute_description)
        return j
except: pass # import other BigJob packages # import API import api.base sys.path.append(os.path.dirname(__file__)) if SAGA_BLISS == False: try: import saga logger.info("Using SAGA C++/Python.") is_bliss = False except: logger.warn("SAGA C++ and Python bindings not found. Using Bliss.") try: import bliss.saga as saga is_bliss = True except: logger.warn("SAGA Bliss not found") else: logger.info("Using SAGA Bliss.") try: import bliss.saga as saga is_bliss = True except: logger.warn("SAGA Bliss not found") """BigJob Job Description is always derived from BLISS Job Description BLISS Job Description behaves compatible to SAGA C++ job description """
#!/usr/bin/env python import textwrap import re from bigjob import logger import bigjob try: import saga except: logger.warn("sge-ssh://<hostname> plugin not compatible with SAGA Bliss.") import os class sgessh: """Constructor""" def __init__(self, bootstrap_script, lrms_saga_url, walltime, number_nodes, processes_per_node, userproxy, project, queue, working_directory=None, bj_working_directory=None): self.job_id = "" self.lrms_saga_url = lrms_saga_url self.lrms_saga_url.scheme="ssh" self.userproxy = userproxy self.working_directory = "" if working_directory == None: self.working_directory = "" else: self.working_directory = working_directory if bj_working_directory==None: bj_working_directory=self.working_directory ### convert walltime in minutes to PBS representation of time ### walltime_sge="1:00:00"
def __init__(self, args):
    """ Initialize the BigJob agent.

        :param args: command-line argument vector; args[1] is the
            coordination backend URL, args[2] the pilot base URL, and
            (optionally) args[3] an external CDS queue URL.

        Reads the agent config file, selects the coordination backend by
        URL scheme, registers the pilot as Running, and starts the
        launcher and monitoring background threads.
    """
    self.coordination_url = args[1]
    # objects to store running jobs and processes
    self.jobs = []
    self.processes = {}
    self.freenodes = []
    self.busynodes = []
    self.restarted = {}

    # read config file (prefer the source-tree copy, fall back to sys.prefix)
    conf_file = os.path.dirname(
        os.path.abspath(__file__)) + "/../" + CONFIG_FILE
    if not os.path.exists(conf_file):
        conf_file = os.path.join(sys.prefix, CONFIG_FILE)
    logging.debug("read configfile: " + conf_file)
    config = ConfigParser.ConfigParser()
    config.read(conf_file)
    default_dict = config.defaults()

    self.CPR = False
    if default_dict.has_key("cpr"):
        self.CPR = default_dict["cpr"]
    self.SHELL = "/bin/bash"
    if default_dict.has_key("shell"):
        self.SHELL = default_dict["shell"]
    self.MPIRUN = "mpirun"
    # On TACC resources the default MPICH is
    # linked under mpirun_rsh
    if default_dict.has_key("mpirun"):
        self.MPIRUN = default_dict["mpirun"]
    # NOTE(review): assigns a LOCAL name; does not update a module-level
    # THREAD_POOL_SIZE unless shadowed intentionally — confirm.
    if default_dict.has_key("number_executor_threads"):
        THREAD_POOL_SIZE = int(default_dict["number_executor_threads"])
    self.OUTPUT_TAR = False
    if default_dict.has_key("create_output_tar"):
        self.OUTPUT_TAR = eval(default_dict["create_output_tar"])
    logger.debug("Create output tar: %r", self.OUTPUT_TAR)

    self.failed_polls = 0

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Redis initialization
    self.base_url = args[2]
    self.cds_queue_url = None
    if len(args) == 4:
        self.cds_queue_url = args[3]
    logger.debug("External queue: " + str(self.cds_queue_url))
    self.id = self.__get_bj_id(self.base_url)
    logger.debug("BigJob Agent arguments: " + str(args))
    logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url)
    logger.debug("BigJob ID: %s" % self.id)

    # create bj directory
    self.work_dir = os.getcwd()
    if self.work_dir.find(
            self.id) == -1:  # working directory already contains BJ id
        self.bj_dir = os.path.join(os.getcwd(), self.id)
        logger.debug("Agent working directory: %s" % self.bj_dir)
        try:
            os.makedirs(self.bj_dir)
        except:
            logger.debug("Directory already exists.")
    else:
        self.bj_dir = os.getcwd()

    os.chdir(self.bj_dir)

    # select the coordination backend implementation by URL scheme
    if (self.coordination_url.startswith("advert://")
            or self.coordination_url.startswith("sqlasyncadvert://")):
        try:
            from coordination.bigjob_coordination_advert import bigjob_coordination
            logging.debug("Utilizing ADVERT Backend: " + self.coordination_url)
        except:
            logger.error("Advert Backend could not be loaded")
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exc(file=sys.stderr)
            traceback.print_tb(exc_traceback, file=sys.stderr)
    elif (self.coordination_url.startswith("redis://")):
        try:
            from coordination.bigjob_coordination_redis import bigjob_coordination
            logger.debug("Utilizing Redis Backend: " + self.coordination_url + ".")
        except:
            logger.error(
                "Error loading pyredis. Check configuration in bigjob_coordination_redis.py."
            )
    elif (self.coordination_url.startswith("tcp://")):
        try:
            from coordination.bigjob_coordination_zmq import bigjob_coordination
            logger.debug("Utilizing ZMQ Backend")
        except:
            logger.error(
                "ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and "
                + "PYZMQ (http://zeromq.github.com/pyzmq/)")

    ###
    # Initiate coordination sub-system of both BJ agent and Pilot Data
    self.coordination = bigjob_coordination(
        server_connect_url=self.coordination_url)
    try:
        # initialize coordination subsystem of pilot data
        self.pilot_data_service = PilotDataService(
            coordination_url=self.coordination_url)
    except:
        logger.warn("Pilot-Data could not be initialized.")

    # update state of pilot job to running
    logger.debug("set state to : " + str(bigjob.state.Running))
    self.coordination.set_pilot_state(self.base_url,
                                      str(bigjob.state.Running), False)
    self.pilot_description = self.coordination.get_pilot_description(
        self.base_url)
    try:
        self.pilot_description = ast.literal_eval(self.pilot_description)
    except:
        logger.warn("Unable to parse pilot description")
        self.pilot_description = None

    ############################################################################
    # Detect launch method
    self.LAUNCH_METHOD = "ssh"
    if default_dict.has_key("launch_method"):
        self.LAUNCH_METHOD = default_dict["launch_method"]
    self.LAUNCH_METHOD = self.__get_launch_method(self.LAUNCH_METHOD)
    logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: "
                  + self.MPIRUN + " shell: " + self.SHELL)

    # init rms (SGE/PBS)
    self.init_rms()

    ##############################################################################
    # start background thread for polling new jobs and monitoring current jobs
    # check whether user requested a certain threadpool size
    if self.pilot_description != None and self.pilot_description.has_key(
            "number_executor_threads"):
        THREAD_POOL_SIZE = int(
            self.pilot_description["number_executor_threads"])
    logger.debug("Creating executor thread pool of size: %d"
                 % (THREAD_POOL_SIZE))
    self.resource_lock = threading.RLock()
    self.threadpool = ThreadPool(THREAD_POOL_SIZE)

    self.launcher_thread = threading.Thread(target=self.dequeue_new_jobs)
    self.launcher_thread.start()

    self.monitoring_thread = threading.Thread(
        target=self.start_background_thread)
    self.monitoring_thread.start()
def start_pilot_job(self, lrms_url, bigjob_agent_executable=None,
                    number_nodes=1, queue=None, project=None,
                    working_directory=None, userproxy=None, walltime=None,
                    processes_per_node=1, filetransfers=None):
    """ Start a batch job (using SAGA Job API) at resource manager.

        Currently, the following resource managers are supported:

            fork://localhost/                  (Default Job Adaptor)
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost                 (PBS Pro Adaptor)

        Returns the pilot URL (or None for the pbs-ssh/sge-ssh plugin
        paths, which return early). Raises BigJobError if a BigJob is
        already active.
    """
    if self.job != None:
        raise BigJobError("One BigJob already active. Please stop BigJob first.")
        return  # unreachable (after raise) — kept from original

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = saga.url(lrms_url)
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    pilot_url_dict[self.pilot_url]=self

    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    logger.debug("set pilot state to: " + str(Unknown))
    ##############################################################################
    self.number_nodes=int(number_nodes)

    # create job description
    jd = saga.job.description()

    # XXX Isn't the working directory about the remote site?
    # Yes, it is: This is to make sure that if fork
    if working_directory != None:
        if not os.path.isdir(working_directory) and lrms_saga_url.scheme=="fork":
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working dir is set assume use home directory
        # will fail if home directory is not the same on remote machine
        # but this is just a guess to avoid failing
        self.working_directory = os.path.expanduser("~")

    # Stage BJ Input files
    # build target url
    # this will also create the remote directory for the BJ
    if lrms_saga_url.username!=None and lrms_saga_url.username!="":
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir()
    else:
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()

    # determine working directory of bigjob
    # if a remote sandbox can be created via ssh => create a own dir for each bj job id
    # otherwise use specified working directory
    if self.__create_remote_directory(bigjob_working_directory_url)==True:
        self.working_directory = self.__get_bigjob_working_dir()
        self.__stage_files(filetransfers, bigjob_working_directory_url)
    else:
        logger.warn("For file staging. SSH (incl. password-less authentication is required.")

    logger.debug("BJ Working Directory: %s", self.working_directory)
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))

    if lrms_saga_url.scheme == "condorg":
        # Condor-G: agent bootstrap script is an executable file, not inline
        jd.arguments = [self.coordination.get_address(), self.pilot_url]
        agent_exe = os.path.abspath(os.path.join(os.path.dirname(__file__),"..","bootstrap","bigjob-condor-bootstrap.py"))
        logger.debug("agent_exe",agent_exe)
        jd.executable = agent_exe
    else:
        # escape the inline bootstrap script for the target submission system
        bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme=="xt5torque" or lrms_saga_url.scheme=="torque":
            bootstrap_script = self.escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
        ############ submit pbs script which launches bigjob agent using ssh adaptors##########
        elif lrms_saga_url.scheme == "pbs-ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
            # PBS specific BJ plugin
            pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltime,
                             number_nodes, processes_per_node, userproxy,
                             self.working_directory, self.working_directory)
            self.job = pbssshj
            self.job.run()
            return
        ############ submit sge script which launches bigjob agent using ssh adaptors##########
        elif lrms_saga_url.scheme == "sge-ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
            # PBS specific BJ plugin
            sgesshj = sgessh(bootstrap_script, lrms_saga_url, walltime,
                             number_nodes, processes_per_node, userproxy,
                             project, queue, self.working_directory,
                             self.working_directory)
            self.job = sgesshj
            self.job.run()
            return
        elif is_bliss:
            bootstrap_script = self.escape_bliss(bootstrap_script)

        #logger.debug(bootstrap_script)
        if is_bliss==False:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host=str(processes_per_node)
        else:
            jd.TotalCPUCount=str(int(number_nodes)*int(processes_per_node))
        jd.spmd_variation = "single"
        #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"
        if queue != None:
            jd.queue = queue
        if project !=None:
            jd.job_project = [project]
        if walltime!=None:
            jd.wall_time_limit=str(walltime)

        jd.working_directory = self.working_directory
        logger.debug("Working directory: " + jd.working_directory)

        jd.output = os.path.join(self.working_directory, "stdout-bigjob_agent.txt")
        jd.error = os.path.join(self.working_directory,"stderr-bigjob_agent.txt")

    # Submit job (with X.509 proxy context if a userproxy was supplied)
    js = None
    if userproxy != None and userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"]=userproxy
        ctx = saga.context("x509")
        ctx.set_attribute ("UserProxy", userproxy)
        s.add_context(ctx)
        logger.debug("use proxy: " + userproxy)
        js = saga.job.service(s, lrms_saga_url)
    else:
        logger.debug("use standard proxy")
        js = saga.job.service(lrms_saga_url)

    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
    return self.pilot_url
import tldextract

# silence tldextract's noisy default logging
tldextract.tldextract.LOG.setLevel(logging.WARNING)

from pilot.api.api import PilotError

sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from bigjob import logger
from pilot.api import PilotData, DataUnit, PilotDataService, State

""" Load file management adaptors """

# Each optional adaptor import is best-effort: a failure is logged and the
# corresponding storage backend is simply unavailable at runtime.
# (Bare excepts narrowed to `except Exception:` so SystemExit/KeyboardInterrupt
# still propagate.)
from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor
try:
    from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor
except Exception:
    logger.warn("WebHDFS package not found.")
try:
    from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor
except Exception:
    logger.warn("Globus Online package not found.")
try:
    from pilot.filemanagement.gs_adaptor import GSFileAdaptor
except Exception:
    # typo fix: "Goggle" -> "Google"
    logger.warn("Google Storage package not found.")
try:
    from pilot.filemanagement.s3_adaptor import S3FileAdaptor
except Exception:
    logger.warn("Amazon S3 package not found.")
def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None,
                    working_directory=None, userproxy=None, walltime=None,
                    processes_per_node=1, filetransfers=None, external_queue="",
                    pilot_compute_description=None):
    """ Start a batch job (using the SAGA Job API) at a resource manager.

        Currently supported resource-manager URL schemes include:
            fork://localhost/                     (default job adaptor)
            gram://qb1.loni.org/jobmanager-pbs    (Globus adaptor)
            pbspro://localhost                    (PBS Pro adaptor)
            gce+ssh, ec2+ssh, euca+ssh, nova+ssh  (cloud services)
            slurm+ssh, condor, ssh, torque, xt5torque

        Registers the pilot with the coordination backend, prepares the
        remote working directory and file staging, generates the agent
        bootstrap script, and submits the pilot job.

        Returns the pilot URL of the submitted pilot.
        Raises BigJobError if a pilot is already active on this instance.
        NOTE(review): `userproxy` is accepted but not used in this code path.
    """
    if self.job != None:
        # BUGFIX: removed unreachable `return` that followed this raise
        raise BigJobError("One BigJob already active. Please stop BigJob first.")

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = SAGAUrl(lrms_url)
    self.url = lrms_saga_url
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    self.number_nodes = int(number_nodes) * int(processes_per_node)

    # Store references to BJ in global dict
    _pilot_url_dict[self.pilot_url] = self
    _pilot_url_dict[external_queue] = self

    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    if pilot_compute_description == None:
        pilot_compute_description = {"service_url": lrms_url,
                                     "number_of_processes": number_nodes,
                                     "processes_per_node": processes_per_node,
                                     "working_directory": working_directory}
    self.coordination.set_pilot_description(self.pilot_url, pilot_compute_description)
    logger.debug("set pilot state to: " + str(Unknown))

    # Create Job Service (Default: SAGA Job Service, alternative Job Services supported)
    self.js = None
    if lrms_saga_url.scheme == "gce+ssh":
        self.js = GCEService(lrms_saga_url, pilot_compute_description)
    elif lrms_saga_url.scheme == "ec2+ssh" or lrms_saga_url.scheme == "euca+ssh" \
        or lrms_saga_url.scheme == "nova+ssh":
        self.js = EC2Service(lrms_saga_url, pilot_compute_description)
    elif lrms_saga_url.scheme == "slurm+ssh":
        self.js = SlurmService(lrms_saga_url, pilot_compute_description)
    else:
        self.js = SAGAJobService(lrms_saga_url)

    ##############################################################################
    # create job description
    jd = SAGAJobDescription()

    # Attempt to create working directory (e.g. in local scenario)
    if working_directory != None:
        if not os.path.isdir(working_directory) \
            and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \
            and working_directory.startswith("go:") == False:
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working dir is set assume use home directory
        # will fail if home directory is not the same on remote machine
        # but this is just a guess to avoid failing
        self.working_directory = ""

    if queue != None:
        jd.queue = queue
    if project != None:
        jd.project = project
    if walltime != None:
        logger.debug("setting walltime to: " + str(walltime))
        # Bliss expects an int walltime; legacy SAGA expects a string
        if is_bliss:
            jd.wall_time_limit = int(walltime)
        else:
            jd.wall_time_limit = str(walltime)

    ##############################################################################
    # File Management and Stage-In
    self.bigjob_working_directory_url = ""
    if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2") \
        or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"):
        logger.debug("File Staging for Cloud Instances currently not supported.")
    elif lrms_saga_url.scheme.startswith("condor") == True:
        logger.debug("Using Condor file staging")
    else:
        # build target url for working directory
        # this will also create the remote directory for the BJ
        if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")):
            # Fallback if working directory is not a valid URL
            if lrms_saga_url.username != None and lrms_saga_url.username != "":
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" \
                    + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir()
            else:
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + "/" \
                    + self.__get_bigjob_working_dir()
        elif self.working_directory.startswith("go:"):
            self.bigjob_working_directory_url = os.path.join(self.working_directory, self.uuid)
        else:
            # working directory is a valid file staging URL
            self.bigjob_working_directory_url = self.working_directory

        # initialize file manager that takes care of file movement and directory creation
        if self.__filemanager == None:
            self.__initialize_pilot_data(self.bigjob_working_directory_url)  # determines the url

        if self.__filemanager != None and not self.working_directory.startswith("/"):
            self.working_directory = self.__filemanager.get_path(self.bigjob_working_directory_url)

        # determine working directory of bigjob:
        # if a remote sandbox can be created via ssh => create an own dir for each bj job id
        # otherwise use specified working directory
        logger.debug("BigJob working directory: %s" % self.bigjob_working_directory_url)
        if self.__filemanager != None and \
            self.__filemanager.create_remote_directory(self.bigjob_working_directory_url) == True:
            self.working_directory = self.__get_bigjob_working_dir()
            self.__stage_files(filetransfers, self.bigjob_working_directory_url)
        else:
            logger.warn("No file staging adaptor found.")
        logger.debug("BJ Working Directory: %s", self.working_directory)

    # Condor manages its own sandbox — leave the working directory empty there
    if lrms_saga_url.scheme.startswith("condor") == False:
        jd.working_directory = self.working_directory
    else:
        jd.working_directory = ""

    ##############################################################################
    # Create and process BJ bootstrap script
    bootstrap_script = self.__generate_bootstrap_script(
        self.coordination.get_address(),
        self.pilot_url,     # Queue 1 used by this BJ object
        external_queue      # Queue 2 used by Pilot Compute Service
                            # or another external scheduler
    )
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    if is_bliss and lrms_saga_url.scheme.startswith("condor") == False:
        bootstrap_script = self.__escape_bliss(bootstrap_script)
    else:
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.__escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" \
            or lrms_saga_url.scheme == "torque":
            bootstrap_script = self.__escape_pbs(bootstrap_script)
        # BUGFIX: condition used "and" between two equality tests on the same
        # variable, which can never be true — ssh escaping was silently skipped
        elif lrms_saga_url.scheme == "ssh" or lrms_saga_url.scheme == "slurm+ssh":
            bootstrap_script = self.__escape_ssh(bootstrap_script)
    logger.debug(bootstrap_script)

    # Define Agent Executable in Job description
    # in Condor case bootstrap script is staged
    # (Python app cannot be passed inline in Condor job description)
    if lrms_saga_url.scheme.startswith("condor") == True:
        bootstrap_script = self.__generate_bootstrap_script_from_binary(
            self.coordination.get_address(),
            self.pilot_url,     # Queue 1 used by this BJ object
            external_queue      # Queue 2 used by Pilot Compute Service
                                # or another external scheduler
        )
        condor_bootstrap_filename = os.path.join("/tmp", "bootstrap-" + str(self.uuid))
        condor_bootstrap_file = open(condor_bootstrap_filename, "w")
        condor_bootstrap_file.write(bootstrap_script)
        condor_bootstrap_file.close()
        logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename)

        jd.executable = "/usr/bin/env"
        jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)]
        if pilot_compute_description.has_key("candidate_hosts"):
            jd.candidate_hosts = pilot_compute_description["candidate_hosts"]
        bj_file_transfers = []
        file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename)
        bj_file_transfers.append(file_transfer_spec)
        output_file_name = "output-" + str(self.uuid) + ".tar.gz"
        # NOTE(review): output staging is currently disabled — the spec is
        # built but intentionally not appended to bj_file_transfers
        output_file_transfer_spec = output_file_name + " < " + output_file_name
        if filetransfers != None:
            for t in filetransfers:
                bj_file_transfers.append(t)
        logger.debug("Condor file transfers: " + str(bj_file_transfers))
        jd.file_transfer = bj_file_transfers
    else:
        if is_bliss:
            jd.total_cpu_count = int(number_nodes)
        else:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
        jd.spmd_variation = "single"
        if pilot_compute_description != None and pilot_compute_description.has_key("spmd_variation"):
            jd.spmd_variation = pilot_compute_description["spmd_variation"]
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"

    logger.debug("Working directory: " + jd.working_directory + " Job Description: " + str(jd))
    jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt")
    jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt")

    ##############################################################################
    # Create and submit pilot job to job service
    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = self.js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
    return self.pilot_url
def __init__(self, args):
    """Initialize the BigJob agent.

    args -- the agent's sys.argv-style vector:
        args[1]: coordination backend URL (advert://, sqlasyncadvert://,
                 redis:// or tcp://)
        args[2]: pilot base URL of this agent on the coordination backend
        args[3]: (optional) external queue URL used by the Pilot Compute
                 Service

    Reads the agent config file, initializes the resource-management
    layer, creates/enters the agent working directory, connects to the
    coordination backend, marks the pilot as Running, and starts the
    launcher and monitoring background threads.
    """
    self.coordination_url = args[1]
    # objects to store running jobs and processes
    self.jobs = []
    self.processes = {}
    self.freenodes = []
    self.busynodes = []
    self.restarted = {}

    # read config file: try next to the package first, fall back to sys.prefix
    conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + CONFIG_FILE
    if not os.path.exists(conf_file):
        conf_file = os.path.join(sys.prefix, CONFIG_FILE)
    logging.debug ("read configfile: " + conf_file)
    config = ConfigParser.ConfigParser()
    config.read(conf_file)
    default_dict = config.defaults()

    self.CPR=False
    if default_dict.has_key("cpr"):
        self.CPR = default_dict["cpr"]
    self.SHELL="/bin/bash"
    if default_dict.has_key("shell"):
        self.SHELL=default_dict["shell"]
    self.MPIRUN="mpirun"
    # On TACC resources the default MPICH is
    # linked under mpirun_rsh
    if default_dict.has_key("mpirun"):
        self.MPIRUN=default_dict["mpirun"]
    self.OUTPUT_TAR=False
    if default_dict.has_key("create_output_tar"):
        # NOTE(review): eval of a config value — expects "True"/"False";
        # a safer boolean parse would avoid executing arbitrary config text
        self.OUTPUT_TAR=eval(default_dict["create_output_tar"])
        logger.debug("Create output tar: %r", self.OUTPUT_TAR)
    self.LAUNCH_METHOD="ssh"
    if default_dict.has_key("launch_method"):
        self.LAUNCH_METHOD=self.__get_launch_method(default_dict["launch_method"])

    logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " + self.MPIRUN + " shell: " + self.SHELL)

    # init rms (SGE/PBS)
    self.init_rms()
    self.failed_polls = 0

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Redis initialization
    self.base_url = args[2]
    self.cds_queue_url = None
    if len(args)==4:
        self.cds_queue_url = args[3]
    logger.debug("External queue: " + str(self.cds_queue_url))
    self.id = self.__get_bj_id(self.base_url)
    logger.debug("BigJob Agent arguments: " + str(args))
    logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url)
    logger.debug("BigJob ID: %s"%self.id)

    # create bj directory
    self.work_dir = os.getcwd()
    if self.work_dir.find(self.id)==-1:
        # BJ id not yet part of the cwd path — create a dedicated subdirectory
        self.bj_dir = os.path.join(os.getcwd(), self.id)
        logger.debug("Agent working directory: %s"%self.bj_dir)
        try:
            os.makedirs(self.bj_dir)
        except:
            logger.debug("Directory already exists.")
    else:
        # working directory already contains BJ id
        self.bj_dir = os.getcwd()

    os.chdir(self.bj_dir)

    # select and import the coordination backend matching the URL scheme;
    # the import is deferred so only the needed backend's deps are required
    if(self.coordination_url.startswith("advert://") or self.coordination_url.startswith("sqlasyncadvert://")):
        try:
            from coordination.bigjob_coordination_advert import bigjob_coordination
            logging.debug("Utilizing ADVERT Backend: " + self.coordination_url)
        except:
            logger.error("Advert Backend could not be loaded")
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exc(file=sys.stderr)
            traceback.print_tb(exc_traceback, file=sys.stderr)
    elif (self.coordination_url.startswith("redis://")):
        try:
            from coordination.bigjob_coordination_redis import bigjob_coordination
            logger.debug("Utilizing Redis Backend: " + self.coordination_url + ". Please make sure Redis server is configured in bigjob_coordination_redis.py")
        except:
            logger.error("Error loading pyredis.")
    elif (self.coordination_url.startswith("tcp://")):
        try:
            from coordination.bigjob_coordination_zmq import bigjob_coordination
            logger.debug("Utilizing ZMQ Backend")
        except:
            logger.error("ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " +"PYZMQ (http://zeromq.github.com/pyzmq/)")

    ###
    # Initiate coordination sub-system of both BJ agent and Pilot Data
    self.coordination = bigjob_coordination(server_connect_url=self.coordination_url)
    try:
        # initialize coordination subsystem of pilot data
        self.pilot_data_service = PilotDataService(coordination_url=self.coordination_url)
    except:
        logger.warn("Pilot-Data could not be initialized.")

    # update state of pilot job to running
    logger.debug("set state to : " + str(bigjob.state.Running))
    self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False)
    self.pilot_description = self.coordination.get_pilot_description(self.base_url)

    ##############################################################################
    # start background thread for polling new jobs and monitoring current jobs
    self.resource_lock=threading.RLock()
    self.threadpool = ThreadPool(THREAD_POOL_SIZE)

    self.launcher_thread=threading.Thread(target=self.dequeue_new_jobs)
    self.launcher_thread.start()

    self.monitoring_thread=threading.Thread(target=self.start_background_thread)
    self.monitoring_thread.start()
import random
import threading
import time
import pdb
import Queue

from pilot.api.api import PilotError

sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from bigjob import logger
from pilot.api import PilotData, DataUnit, PilotDataService, State

""" Load file management adaptors """
# Each adaptor below is optional: if its third-party dependency is missing,
# only that backend is disabled (with a warning) instead of breaking the
# whole module. Only the SSH adaptor is mandatory.
from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor
try:
    from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor
except:
    logger.warn("WebHDFS package not found.")

try:
    from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor
except:
    logger.warn("Globus Online package not found.")

try:
    from pilot.filemanagement.gs_adaptor import GSFileAdaptor
except:
    # BUGFIX: message typo "Goggle" -> "Google"
    logger.warn("Google Storage package not found.")

try:
    from pilot.filemanagement.s3_adaptor import S3FileAdaptor
except:
    logger.warn("Amazon S3 package not found.")