def export(self, target_url): """ simple implementation of export: copies file from first pilot data to local machine """ if self.get_state()!=State.Running: self.wait() if len(self.pilot_data) > 0: # Search for PD that is close to local machine local_hostname=socket.getfqdn() max_score=0 best_pd=None for pd in self.pilot_data: pd_host = SAGAUrl(pd.service_url).host pd_score = difflib.SequenceMatcher(a=pd_host, b=local_hostname).ratio() logger.debug("Export locality compute score: Localhost: %s PD at: %s Score: %s"%(local_hostname, pd_host, pd_score)) if pd_score > max_score: best_pd=pd max_score=pd_score #pd_domain = tldextract.extract(pd.service_url).domain #local_domain = tldextract.extract(socket.getfqdn()).domain if best_pd!=None: logger.debug("Export from: %s"%(best_pd.service_url)) best_pd.export_du(self, target_url) return # No PD found. Utilize default PD logger.debug("Export from random PD") self.pilot_data[0].export_du(self, target_url) else: logger.error("No Pilot Data for PD found")
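# A minimal standalone sketch of the locality heuristic used in export() above:
# candidate Pilot-Data hosts are ranked by fuzzy similarity between their host
# name and the local FQDN via difflib.SequenceMatcher. The host names in the
# usage note are made up for illustration.
import difflib
import socket

def pick_closest_host(service_hosts, local_hostname=None):
    """Return the host whose name is most similar to the local FQDN."""
    if local_hostname is None:
        local_hostname = socket.getfqdn()
    best_host, max_score = None, 0.0
    for host in service_hosts:
        score = difflib.SequenceMatcher(a=host, b=local_hostname).ratio()
        if score > max_score:
            best_host, max_score = host, score
    return best_host

# Usage (hypothetical hosts):
#   pick_closest_host(["login1.stampede.tacc.utexas.edu", "gateway.futuregrid.org"],
#                     local_hostname="login2.stampede.tacc.utexas.edu")
#   -> "login1.stampede.tacc.utexas.edu"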
def __parse_url(self, url): try: surl = saga.url(url) host = surl.host port = surl.port username = surl.username password = surl.password query = surl.query scheme = "%s://"%surl.scheme except: """ Fallback URL parser based on Python urlparse library """ logger.error("URL %s could not be parsed"%url) traceback.print_exc(file=sys.stderr) result = urlparse.urlparse(url) host = result.hostname port = result.port username = result.username password = result.password if url.find("?")>0: query = url[url.find("?")+1:] else: query = None scheme = "%s://"%result.scheme return scheme, username, password, host, port, query
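# Hedged sketch of the urlparse-based fallback above, using the Python 2 urlparse
# module that this code base relies on. The sample URL is illustrative; note that
# host parsing for non-standard schemes differs between Python 2.6 and 2.7, which
# is why the original code also splits the query string manually.
import urlparse

def parse_coordination_url(url):
    result = urlparse.urlparse(url)
    query = url[url.find("?") + 1:] if url.find("?") > 0 else None
    return ("%s://" % result.scheme, result.username, result.password,
            result.hostname, result.port, query)

# On Python 2.7, parse_coordination_url("advert://localhost/?dbtype=sqlite3")
# typically yields ('advert://', None, None, 'localhost', None, 'dbtype=sqlite3');
# on 2.6 the hostname may come back as None for such schemes.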
def get_base_url(cls, application_id): if cls.BASE_URL==None: logger.error("Coordination URL not set. Exiting Pilot-Data.") raise Exception("Coordination URL not set. Exiting Pilot-Data.") surl = saga.Url(cls.BASE_URL) base_url = surl.scheme + "://" + surl.host + "/" + application_id logger.debug(base_url) return base_url
def get_base_url(cls, application_id): if cls.BASE_URL == None: logger.error("Coordination URL not set. Exiting Pilot-Data.") raise Exception("Coordination URL not set. Exiting Pilot-Data.") surl = SAGAUrl(cls.BASE_URL) base_url = surl.scheme + "://" + surl.host + "/" + application_id logger.debug(base_url) return base_url
def export(self, target_url): """ simple implementation of export: copies file from first pilot store to local machine """ if len(self.pilot_stores) > 0: self.pilot_stores[0].export_pd(self, target_url) else: logger.error("No Pilot Store for PD found")
def export(self, target_url): """ simple implementation of export: copies file from first pilot data to local machine """ if len(self.pilot_data) > 0: self.pilot_data[0].export_du(self, target_url) else: logger.error("No Pilot Data for PD found")
def __init__(self, coordination_url="advert://localhost/?dbtype=sqlite3", pilot_url=None): """ Initializes BigJob's coordination system advert://localhost (SAGA/Advert SQLITE) advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) redis://localhost:6379 (Redis at localhost) tcp://localhost (ZMQ) The following formats for pilot_url are supported: 1.) Including root path at distributed coordination service: redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost This path is returned when call bigjob.get_url() 2.) BigJob unique ID: bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost """ self.coordination_url = coordination_url if self.coordination_url == None: logger.error("Coordination URL not set. Exiting BigJob.") #self.launch_method="" self.__filemanager = None self._ocache = ObjectCache() # restore existing BJ or initialize new BJ if pilot_url != None: logger.debug("Reconnect to BJ: %s" % pilot_url) if pilot_url.startswith("bigjob:"): self.pilot_url = pilot_url else: self.coordination_url, self.pilot_url = self.__parse_pilot_url( pilot_url) self.uuid = self.__get_bj_id(pilot_url) self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid) self.job = None self.working_directory = None # Coordination subsystem must be initialized before get_state_detail self.coordination = self.__init_coordination(self.coordination_url) self.state = self.get_state_detail() _pilot_url_dict[self.pilot_url] = self else: self.coordination = self.__init_coordination(self.coordination_url) self.uuid = "bj-" + str(get_uuid()) logger.debug("init BigJob w/: " + coordination_url) self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid) self.state = Unknown self.pilot_url = "" self.job = None self.working_directory = None logger.debug("initialized BigJob: " + self.app_url)
def __init__(self, coordination_url="advert://localhost/?dbtype=sqlite3", pilot_url=None): """ Initializes BigJob's coordination system advert://localhost (SAGA/Advert SQLITE) advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL) redis://localhost:6379 (Redis at localhost) tcp://localhost (ZMQ) The following formats for pilot_url are supported: 1.) Including root path at distributed coordination service: redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost This path is returned when call bigjob.get_url() 2.) BigJob unique ID: bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost """ self.coordination_url = coordination_url if self.coordination_url==None: logger.error("Coordination URL not set. Exiting BigJob.") #self.launch_method="" self.__filemanager=None self._ocache = ObjectCache () # restore existing BJ or initialize new BJ if pilot_url!=None: logger.debug("Reconnect to BJ: %s"%pilot_url) if pilot_url.startswith("bigjob:"): self.pilot_url=pilot_url else: self.coordination_url, self.pilot_url = self.__parse_pilot_url(pilot_url) self.uuid = self.__get_bj_id(pilot_url) self.app_url = self.__APPLICATION_NAME +":" + str(self.uuid) self.job = None self.working_directory = None # Coordination subsystem must be initialized before get_state_detail self.coordination = self.__init_coordination(self.coordination_url) self.state=self.get_state_detail() _pilot_url_dict[self.pilot_url]=self else: self.coordination = self.__init_coordination(self.coordination_url) self.uuid = "bj-" + str(get_uuid()) logger.debug("init BigJob w/: " + coordination_url) self.app_url =self. __APPLICATION_NAME +":" + str(self.uuid) self.state=Unknown self.pilot_url="" self.job = None self.working_directory = None logger.debug("initialized BigJob: " + self.app_url)
def export(self, target_url): """ simple implementation of export: copies file from first pilot data to local machine """ if self.state != State.Running: self.wait() if len(self.pilot_data) > 0: self.pilot_data[0].export_du(self, target_url) else: logger.error("No Pilot Data for PD found")
def add_pilot_compute_service(self, pcs): """ Add a PilotComputeService to this CDS. @param pcs: The PilotComputeService to which this ComputeDataService will connect. """ self.pilot_job_services.append(pcs) CoordinationAdaptor.update_cds(self.url, self) if len(self.pilot_job_services)>1: logger.error("Decentral ComputeDataService only supports 1 PilotComputeService") raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")
def _get_du_id(cls, du_url): try: start = du_url.index(cls.DU_ID_PREFIX) end = du_url.find("/", start) if end == -1: end = du_url.find("?", start) if end == -1: end = len(du_url) return du_url[start:end] except: logger.error("No valid DU URL") return None
def put_pd(self, pd): for i in pd.list_data_units(): remote_path = os.path.join(self.__get_pd_path(pd.id), os.path.basename(i.local_url)) logger.debug("Put file: %s to %s"%(i.local_url, remote_path)) if i.local_url.startswith("file://") or i.local_url.startswith("/"): if stat.S_ISDIR(os.stat(i.local_url).st_mode): logger.warning("Path %s is a directory. Ignored."%i.local_url) continue self.__webhdfs.copyFromLocal(i.local_url, remote_path) else: logger.error("File URLs: %s not supported"%i.local_url)
def __init__(self, server=REDIS_SERVER, server_port=REDIS_SERVER_PORT, server_connect_url=None, username=None, password=None, dbtype=None, url_prefix=None): ''' Constructor ''' if server_port==None: server_port=6379 self.username = None self.password = None self.address = "%s%s:%i"%(REDIS_URL_SCHEME, server, server_port) self.dbtype="" #self.redis_adaptor_start_time = datetime.datetime.utcnow().strftime("%s") self.redis_adaptor_start_time = time.time() if server_connect_url!=None: self.address=server_connect_url start_index = self.address.find(REDIS_URL_SCHEME)+len(REDIS_URL_SCHEME) server_and_port = self.address[start_index:] password_end = server_and_port.find("@") # parse out password if password_end != -1: self.password = server_and_port[:password_end] start_index=password_end server_and_port= server_and_port[(password_end+1):] # port and hostname if server_and_port.find(":")==-1: server=server_and_port server_port = REDIS_SERVER_PORT else: server = server_and_port.split(":")[0] server_port = int(server_and_port.split(":")[1]) else: self.password = username if self.password != None and self.password!="": self.address = "%s%s@%s:%i"%(REDIS_URL_SCHEME, self.password, server, server_port) logger.debug("Connect to Redis: " + server + " Port: " + str(server_port)) if self.password==None: self.redis_client = redis.Redis(host=server, port=server_port, db=0) else: self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0) #self.redis_client_pubsub = self.redis_client.pubsub() # redis pubsub client self.resource_lock = threading.RLock() self.pipe = self.redis_client.pipeline() try: self.redis_client.ping() except Exception, ex: logger.error("Cannot connect to Redis server: %s" % str(ex)) raise Exception("Cannot connect to Redis server: %s" % str(ex))
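# Hedged sketch of the connect-URL forms the constructor above accepts, assuming
# REDIS_URL_SCHEME is "redis://" (as the parsing logic implies). The password, if
# any, precedes the host and is separated by "@"; host/port/password below are
# placeholders.
import redis

# redis://localhost:6379   -> host "localhost", port 6379, no password
# redis://secret@myhost    -> host "myhost", default port, password "secret"
client = redis.Redis(host="localhost", port=6379, db=0)
client_auth = redis.Redis(host="myhost", port=6379, password="secret", db=0)
client.ping()  # raises an exception if the server is unreachable, as in the constructor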
def __get_pd_id(self, pd_url): try: start = pd_url.index(self.PD_ID_PREFIX) end = pd_url.find("/", start) if end==-1: end = pd_url.find("?", start) if end==-1: end = len(pd_url) return pd_url[start:end] except: logger.error("No valid PD URL") return None
def __get_du_id(self, du_url): try: start = du_url.index(self.DU_ID_PREFIX) end = du_url.find("/", start) if end==-1: end = du_url.find("?", start) if end==-1: end = len(du_url) return du_url[start:end] except: logger.error("No valid DU URL") return None
def _scheduler_thread(self): while True and self.stop.isSet() == False: try: #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data") du = self.du_queue.get(True, 1) # check whether this is a real du object if isinstance(du, DataUnit): pd = self._schedule_du(du) if (pd != None): logger.debug("Initiate Transfer to PD.") du.add_pilot_data(pd) logger.debug("Transfer to PD finished.") du._update_state(State.Running) self.du_queue.task_done() else: self.du_queue.task_done() self.du_queue.put(du) except Queue.Empty: pass try: #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Job") cu = self.cu_queue.get(True, 1) if isinstance(cu, ComputeUnit): self.__wait_for_du(cu) pj = self._schedule_cu(cu) if pj != None: cu = self.__expand_working_directory(cu, pj) pj._submit_cu(cu) self.cu_queue.task_done() else: logger.debug("No resource found.") self.cu_queue.task_done() self.cu_queue.put(cu) except Queue.Empty: pass except: exc_type, exc_value, exc_traceback = sys.exc_info() logger.error("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) logger.error("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stderr) if self.cu_queue.empty() and self.du_queue.empty(): time.sleep(5) logger.debug("Re-Scheduler terminated")
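# Minimal sketch of the polling pattern the scheduler thread above uses: block on
# a queue for at most one second, place the item if a resource is available,
# otherwise balance the get() with task_done() and requeue the item for a later
# pass. find_resource() and handle() are hypothetical placeholders.
import Queue

def drain_once(work_queue, find_resource, handle):
    try:
        item = work_queue.get(True, 1)   # block for up to 1 second
    except Queue.Empty:
        return
    resource = find_resource(item)
    if resource is not None:
        handle(item, resource)
        work_queue.task_done()
    else:
        work_queue.task_done()           # balance the get() ...
        work_queue.put(item)             # ... then requeue for a retry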
def __init__(self, service_url): self.service_url = service_url try: result = urlparse.urlparse(service_url) self.host = result.netloc self.path = result.path except: logger.error("Error parsing URL.") self.__state = State.New self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST, self.HDFS_SERVICE_PORT, self.HDFS_USER_NAME)
def __stage_in_data_units(self, input_data=[], target_directory="."): """ stage in data units specified in input_data field """ try: logger.debug("Stage in input files to: %s" % target_directory) for i in input_data: du = DataUnit(du_url=i) logger.debug("Restored DU... call get state()") logger.debug("DU State: " + du.get_state()) du.wait() logger.debug("Reconnected to DU. Exporting it now...") du.export(target_directory) except: logger.error("Stage-in of files failed.") self.__print_traceback()
def __stage_in_data_units(self, input_data=[], target_directory="."): """ stage in data units specified in input_data field """ try: logger.debug("Stage in input files to: %s"%target_directory) for i in input_data: du = DataUnit(du_url=i) logger.debug("Restored DU... call get state()") logger.debug("DU State: " + du.get_state()) du.wait() logger.debug("Reconnected to DU. Exporting it now...") du.export(target_directory) except: logger.error("Stage-in of files failed.") self.__print_traceback()
def __init__(self, pjs_url=None): """ Create a PilotJobService object. Keyword arguments: pjs_url -- If set, do not create a new service but reconnect to an existing one (optional) """ self.__mjs = None self.pilot_computes=[] if pjs_url==None: # new pjs self.id = self.PJS_ID_PREFIX+str(uuid.uuid1()) self.url = "pilotjob://localhost/"+self.id else: logger.error("Reconnect to PJS currently not supported.")
def __init_coordination(self, coordination_url): if(coordination_url.startswith("advert://") or coordination_url.startswith("sqlasyncadvert://")): try: from coordination.bigjob_coordination_advert import bigjob_coordination logger.debug("Utilizing ADVERT Backend") except: logger.error("Advert Backend could not be loaded") elif (coordination_url.startswith("redis://")): try: from coordination.bigjob_coordination_redis import bigjob_coordination logger.debug("Utilizing Redis Backend") except: logger.error("Error loading pyredis.") elif (coordination_url.startswith("tcp://")): try: from coordination.bigjob_coordination_zmq import bigjob_coordination logger.debug("Utilizing ZMQ Backend") except: logger.error("ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " +"PYZMQ (http://zeromq.github.com/pyzmq/)") else: logger.error("No suitable coordination backend found.") logger.debug("Parsing URL: " + coordination_url) scheme, username, password, host, port, dbtype = self.__parse_url(coordination_url) if port == -1: port = None coordination = bigjob_coordination(server=host, server_port=port, username=username, password=password, dbtype=dbtype, url_prefix=scheme) return coordination
def put_pd(self, pd): for i in pd.list_data_units(): remote_path = os.path.join(self.__get_pd_path(pd.id), os.path.basename(i.local_url)) logger.debug("Put file: %s to %s" % (i.local_url, remote_path)) if i.local_url.startswith("file://") or i.local_url.startswith( "/"): if stat.S_ISDIR(os.stat(i.local_url).st_mode): logger.warning("Path %s is a directory. Ignored." % i.local_url) continue self.__webhdfs.copyFromLocal(i.local_url, remote_path) else: logger.error("File URLs: %s not supported" % i.local_url)
def __init__(self, service_url): self.service_url = service_url try: result = urlparse.urlparse(service_url) self.host = result.netloc self.path = result.path except: logger.error("Error parsing URL.") self.__state=State.New self.__webhdfs= WebHDFS(self.HDFS_SERVICE_HOST, self.HDFS_SERVICE_PORT, self.HDFS_USER_NAME)
def _scheduler_thread(self): while True and self.stop.isSet()==False: try: #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data") du = self.du_queue.get(True, 1) # check whether this is a real du object if isinstance(du, DataUnit): pd=self._schedule_du(du) if(pd!=None): logger.debug("Initiate Transfer to PD.") du.add_pilot_data(pd) logger.debug("Transfer to PD finished.") du._update_state(State.Running) self.du_queue.task_done() else: self.du_queue.task_done() self.du_queue.put(du) except Queue.Empty: pass try: #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Job") cu = self.cu_queue.get(True, 1) if isinstance(cu, ComputeUnit): self.__wait_for_du(cu) pj=self._schedule_cu(cu) if pj !=None: cu = self.__expand_working_directory(cu, pj) pj._submit_cu(cu) self.cu_queue.task_done() else: logger.debug("No resource found.") self.cu_queue.task_done() self.cu_queue.put(cu) except Queue.Empty: pass except: exc_type, exc_value, exc_traceback = sys.exc_info() logger.error("*** print_tb:") traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) logger.error("*** print_exception:") traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stderr) if self.cu_queue.empty() and self.du_queue.empty(): time.sleep(5) logger.debug("Re-Scheduler terminated")
def __init__(self, coordination_url=COORDINATION_URL, pcs_url=None): """ Create a PilotComputeService object. Keyword arguments: pcs_url -- If set, do not create a new service but reconnect to an existing one (optional) """ self.pilot_computes=[] self.coordination_url=coordination_url self.coordination_queue="" if pcs_url==None: # new pcs self.id = self.PJS_ID_PREFIX+str(uuid.uuid1()) self.url = os.path.join(self.coordination_url, "pcs", self.id) self.coordination_queue = "PilotComputeServiceQueue-" + str(self.id) logger.debug("Created Pilot Compute Service: %s"%self.url) else: logger.error("Reconnect to PilotComputeService currently not supported.")
def __create_remote_directory(self, target_url): result = urlparse.urlparse(target_url) target_host = result.netloc target_path = result.path try: client = paramiko.SSHClient() client.load_system_host_keys() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) client.connect(target_host) sftp = client.open_sftp() sftp.mkdir(target_path) sftp.close() client.close() except: logger.error("Error creating directory: " + str(target_path) + " at: " + str(target_host)) self.__print_traceback()
def __get_redis_api_client(cls): import redis ''' Initialize Redis API Client ''' saga_url = saga.Url(RedisCoordinationAdaptor.BASE_URL) username = saga_url.username server = saga_url.host server_port = saga_url.port if username==None or username=="": redis_client = redis.Redis(host=server, port=server_port, db=0) else: redis_client = redis.Redis(host=server, port=server_port, password=username, db=0) try: redis_client.ping() except: logger.error("Please start Redis server!") raise Exception("Please start Redis server!") return redis_client
def remove_pilot_compute_service(self, pcs): """ Remove a PilotComputeService from this CDS. Note that this does not cancel the PilotComputeService; it is simply no longer connected to this ComputeDataService. Keyword arguments: pcs -- The PilotComputeService to remove from this ComputeDataService. Return: Result """ self.pilot_job_services.remove(pcs) CoordinationAdaptor.update_cds(self.url, self) if len(self.pilot_job_services)>1: logger.error("Decentral ComputeDataService only supports 1 PilotComputeService") raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")
def __init__(self, coordination_url=COORDINATION_URL, pcs_url=None): """ Create a PilotComputeService object. Keyword arguments: pcs_url -- If set, do not create a new service but reconnect to an existing one (optional) """ self.pilot_computes = [] self.coordination_url = coordination_url self.coordination_queue = "" if pcs_url == None: # new pcs self.id = self.PJS_ID_PREFIX + str(uuid.uuid1()) self.url = os.path.join(self.coordination_url, "pcs", self.id) self.coordination_queue = "PilotComputeServiceQueue-" + str( self.id) logger.debug("Created Pilot Compute Service: %s" % self.url) else: logger.error( "Reconnect to PilotComputeService currently not supported.")
def __stage_files(self, filetransfers, target_url): logger.debug("Stage: %s to %s"%(filetransfers, target_url)) if filetransfers==None: return if self.__filemanager: self.__filemanager.create_remote_directory(target_url) for i in filetransfers: source_file=i if i.find(">")>0: source_file = i[:i.find(">")].strip() if source_file.startswith("ssh://")==False and source_file.startswith("go://")==False: logger.error("Staging of file: %s not supported. Please use URL in form ssh://<filename>"%source_file) continue target_url_full = os.path.join(target_url, os.path.basename(source_file)) logger.debug("Stage: %s to %s"%(source_file, target_url_full)) #self.__third_party_transfer(source_file, target_url_full) if self.__filemanager: self.__filemanager.transfer(source_file, target_url_full)
def __init__(self, redis_url): ''' Constructor ''' server_port = 6379 self.redis_url = redis_url self.password = None start_index = self.redis_url.find(REDIS_URL_SCHEME) + len( REDIS_URL_SCHEME) server_and_port = self.redis_url[start_index:] password_end = server_and_port.find("@") # parse out password if password_end != -1: self.password = server_and_port[:password_end] start_index = password_end server_and_port = server_and_port[(password_end + 1):] # port and hostname if server_and_port.find(":") == -1: server = server_and_port server_port = REDIS_SERVER_PORT else: server = server_and_port.split(":")[0] server_port = int(server_and_port.split(":")[1]) logger.debug("Connect to Redis: " + server + " Port: " + str(server_port)) if self.password == None: self.redis_client = redis.Redis(host=server, port=server_port, db=0) else: self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0) self.pipe = self.redis_client.pipeline() try: self.redis_client.ping() except: logger.error("Please start Redis server!") raise Exception("Please start Redis server!")
def __stage_out_data_units(self, output_data=[], workingdirectory=None): """ stage out data to a specified data unit pilot data """ logger.debug("Stage out output files") """ Parsing output data field of job description: { ... "output_data": [ { output_data_unit.get_url(): ["stdout.txt", "stderr.txt"] } ] } """ try: for data_unit_dict in output_data: logger.debug("Process: " + str(data_unit_dict)) for du_url in data_unit_dict.keys( ): # go through all dicts (each representing 1 PD) #pd_url = self.__get_pd_url(du_url) #pilot_data = PilotData(pd_url=pd_url) #du = pilot_data.get_du(du_url) du = DataUnit(du_url=du_url) file_list = data_unit_dict[du_url] logger.debug("Add files: " + str(file_list)) all_files = [] for output_file in file_list: expanded_files = [output_file] if output_file.find("*") >= 0 or output_file.find( "?") >= 0: expanded_files = self.__expand_file_pattern( output_file, workingdirectory) logger.debug("Expanded files: " + str(expanded_files)) for f in expanded_files: all_files.append(os.path.join(workingdirectory, f)) du.add_files(all_files) for f in all_files: os.remove(f) except: logger.error("Stage out of files failed.") self.__print_traceback()
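# Example of the output_data structure the routine above consumes, following the
# format given in the docstring. The DU URL is a placeholder; file names may use
# shell-style wildcards, which are expanded against the working directory.
output_data = [
    {
        "redis://localhost/bigjob:du-0a1b2c3d:localhost": ["stdout.txt", "stderr.txt", "*.log"]
    }
]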
def __parse_url(self, url): try: surl = SAGAUrl(url) host = surl.host port = surl.port username = surl.username password = surl.password query = surl.query if query!=None and query.endswith("/"): query = query[:-1] scheme = "%s://"%surl.scheme except: """ Fallback URL parser based on Python urlparse library """ logger.error("URL %s could not be parsed"%(url)) traceback.print_exc(file=sys.stderr) result = urlparse.urlparse(url) logger.debug("Result: " + str(result)) host = result.hostname #host = None port = result.port username = result.username password = result.password scheme = "%s://"%result.scheme if host==None: logger.debug("Python 2.6 fallback") if url.find("/", len(scheme)) > 0: host = url[len(scheme):url.find("/", len(scheme))] else: host = url[len(scheme):] if host.find(":")>1: logger.debug(host) comp = host.split(":") host = comp[0] port = int(comp[1]) if url.find("?")>0: query = url[url.find("?")+1:] else: query = None logger.debug("%s %s %s"%(scheme, host, port)) return scheme, username, password, host, port, query
def __parse_url(self, url): try: surl = SAGAUrl(url) host = surl.host port = surl.port username = surl.username password = surl.password query = surl.query if query != None and query.endswith("/"): query = query[:-1] scheme = "%s://" % surl.scheme except: """ Fallback URL parser based on Python urlparse library """ logger.error("URL %s could not be parsed" % (url)) traceback.print_exc(file=sys.stderr) result = urlparse.urlparse(url) logger.debug("Result: " + str(result)) host = result.hostname #host = None port = result.port username = result.username password = result.password scheme = "%s://" % result.scheme if host == None: logger.debug("Python 2.6 fallback") if url.find("/", len(scheme)) > 0: host = url[len(scheme):url.find("/", len(scheme))] else: host = url[len(scheme):] if host.find(":") > 1: logger.debug(host) comp = host.split(":") host = comp[0] port = int(comp[1]) if url.find("?") > 0: query = url[url.find("?") + 1:] else: query = None logger.debug("%s %s %s" % (scheme, host, port)) return scheme, username, password, host, port, query
def start_new_job_in_thread(self, job_url): """evaluates job dir, sanity checks, executes job """ #pdb.set_trace() if job_url != None: failed = False; try: logger.debug("Get job description") job_dict = self.coordination.get_job(job_url) except: logger.error("Failed to get job description") failed=True if job_dict==None or failed==True: self.coordination.queue_job(self.pilot_url, job_url) logger.debug("start job: " + job_url + " data: " + str(job_dict)) if(job_dict["state"]==str(bigjob.state.Unknown)): job_dict["state"]=str(bigjob.state.New) self.coordination.set_job_state(job_url, str(bigjob.state.New)) self.execute_job(job_url, job_dict)
def start_new_job_in_thread(self, job_url): """evaluates job dir, sanity checks, executes job """ #pdb.set_trace() if job_url != None: failed = False try: logger.debug("Get job description") job_dict = self.coordination.get_job(job_url) except: logger.error("Failed to get job description") failed = True if job_dict == None or failed == True: self.coordination.queue_job(self.pilot_url, job_url) logger.debug("start job: " + job_url + " data: " + str(job_dict)) if (job_dict["state"] == str(bigjob.state.Unknown)): job_dict["state"] = str(bigjob.state.New) self.coordination.set_job_state(job_url, str(bigjob.state.New)) self.execute_job(job_url, job_dict)
def copy_pd_to_url(self, pd, local_url, remote_url): if not remote_url.startswith("file://") and not remote_url.startswith("/"): logger.error("Only local URLs supported") return result = urlparse.urlparse(remote_url) path = result.path # create directory try: os.makedirs(path) except: logger.debug("Directory: %s already exists."%path) base_dir = self.__get_pd_path(pd.id) for filename in self.__webhdfs.listdir(base_dir): file_url = local_url + "/" + filename file_remote_url = remote_url + "/" + filename logger.debug("GET " + file_url + " to " + file_remote_url) self.__webhdfs.copyToLocal(file_url, file_remote_url)
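# Hedged sketch of pulling a Pilot-Data directory to local disk with the same
# WebHDFS wrapper calls used above (listdir/copyToLocal, constructor arguments as
# in this class). The import path, namenode host/port, user and paths are
# placeholders/assumptions.
from webhdfs.webhdfs import WebHDFS  # assumed import path of the wrapper

hdfs = WebHDFS("namenode.example.org", 50070, "hdfs-user")
base_dir = "/bigjob/pd-0a1b2c3d"
for filename in hdfs.listdir(base_dir):
    hdfs.copyToLocal(base_dir + "/" + filename, "/tmp/export/" + filename)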
def __stage_out_data_units(self, output_data=[], workingdirectory=None): """ stage out data to a specified data unit pilot data """ logger.debug("Stage out output files") """ Parsing output data field of job description: { ... "output_data": [ { output_data_unit.get_url(): ["stdout.txt", "stderr.txt"] } ] } """ try: for data_unit_dict in output_data: logger.debug("Process: " + str(data_unit_dict)) for du_url in data_unit_dict.keys(): # go through all dicts (each representing 1 PD) #pd_url = self.__get_pd_url(du_url) #pilot_data = PilotData(pd_url=pd_url) #du = pilot_data.get_du(du_url) du = DataUnit(du_url=du_url) file_list = data_unit_dict[du_url] logger.debug("Add files: " + str(file_list)) all_files=[] for output_file in file_list: expanded_files = [output_file] if output_file.find("*")>=0 or output_file.find("?")>=0: expanded_files = self.__expand_file_pattern(output_file, workingdirectory) logger.debug("Expanded files: " + str(expanded_files)) for f in expanded_files: all_files.append(os.path.join(workingdirectory, f)) du.add_files(all_files) for f in all_files: os.remove(f) except: logger.error("Stage out of files failed.") self.__print_traceback()
def __get_redis_api_client(cls): import redis ''' Initialize Redis API Client ''' saga_url = SAGAUrl(RedisCoordinationAdaptor.BASE_URL) username = saga_url.username server = saga_url.host server_port = saga_url.port if username == None or username == "": redis_client = redis.Redis(host=server, port=server_port, db=0) else: redis_client = redis.Redis(host=server, port=server_port, password=username, db=0) try: redis_client.ping() except: logger.error("Please start Redis server!") raise Exception("Please start Redis server!") return redis_client
def copy_pd_to_url(self, pd, local_url, remote_url): if not remote_url.startswith("file://") and not remote_url.startswith( "/"): logger.error("Only local URLs supported") return result = urlparse.urlparse(remote_url) path = result.path # create directory try: os.makedirs(path) except: logger.debug("Directory: %s already exists." % path) base_dir = self.__get_pd_path(pd.id) for filename in self.__webhdfs.listdir(base_dir): file_url = local_url + "/" + filename file_remote_url = remote_url + "/" + filename logger.debug("GET " + file_url + " to " + file_remote_url) self.__webhdfs.copyToLocal(file_url, file_remote_url)
def export(self, target_url): """ simple implementation of export: copies file from first pilot data to local machine """ if self.get_state()!=State.Running: self.wait() if len(self.pilot_data) > 0: # Search for PD that is close to local machine for pd in self.pilot_data: pd_domain = tldextract.extract(pd.service_url).domain local_domain = tldextract.extract(socket.getfqdn()).domain logger.debug("Export to %s... checking PD at: %s"%(local_domain, pd_domain)) if pd_domain == local_domain: logger.debug("Export from: %s"%(pd_domain)) pd.export_du(self, target_url) return # No PD found. Utilize default PD logger.debug("Export from random PD") self.pilot_data[0].export_du(self, target_url) else: logger.error("No Pilot Data for PD found")
def add_subjob(self, jd, job_url, job_id): logger.debug("Stage input files for sub-job") if jd.attribute_exists ("filetransfer"): try: self.__stage_files(jd.filetransfer, self.__get_subjob_working_dir(job_id)) except: logger.error("File Stagein failed. Is Paramiko installed?") logger.debug("add subjob to queue of PJ: " + str(self.pilot_url)) for i in range(0,3): try: logger.debug("create dictionary for job description. Job-URL: " + job_url) # put job description attributes to Redis job_dict = {} #to accomendate current bug in bliss (Number of processes is not returned from list attributes) job_dict["NumberOfProcesses"] = "1" attributes = jd.list_attributes() logger.debug("SJ Attributes: " + str(attributes)) for i in attributes: if jd.attribute_is_vector(i): #logger.debug("Add attribute: " + str(i) + " Value: " + str(jd.get_vector_attribute(i))) vector_attr = [] for j in jd.get_vector_attribute(i): vector_attr.append(j) job_dict[i]=vector_attr else: #logger.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i)) job_dict[i] = jd.get_attribute(i) job_dict["state"] = str(Unknown) job_dict["job-id"] = str(job_id) #logger.debug("update job description at communication & coordination sub-system") self.coordination.set_job(job_url, job_dict) self.coordination.queue_job(self.pilot_url, job_url) break except: traceback.print_exc(file=sys.stdout) time.sleep(2)
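# Illustration of the dictionary add_subjob() stores in the coordination service
# for one sub-job. The key names are those read back by execute_job(); the values
# are placeholders.
job_dict = {
    "Executable": "/bin/echo",
    "Arguments": ["hello", "world"],
    "NumberOfProcesses": "1",
    "SPMDVariation": "single",
    "Environment": ["MY_VAR=42"],
    "WorkingDirectory": "/tmp/sj-example",
    "Output": "stdout.txt",
    "Error": "stderr.txt",
    "state": "Unknown",
    "job-id": "sj-0a1b2c3d",
}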
def __stage_files(self, filetransfers, target_url): logger.debug("Stage: %s to %s" % (filetransfers, target_url)) if filetransfers == None: return if self.__filemanager: self.__filemanager.create_remote_directory(target_url) for i in filetransfers: source_file = i if i.find(">") > 0: source_file = i[:i.find(">")].strip() if source_file.startswith( "ssh://") == False and source_file.startswith( "go://") == False: logger.error( "Staging of file: %s not supported. Please use URL in form ssh://<filename>" % source_file) continue target_url_full = os.path.join(target_url, os.path.basename(source_file)) logger.debug("Stage: %s to %s" % (source_file, target_url_full)) #self.__third_party_transfer(source_file, target_url_full) if self.__filemanager: self.__filemanager.transfer(source_file, target_url_full)
def __init__(self, redis_url): ''' Constructor ''' server_port=6379 self.redis_url=redis_url self.password=None start_index = self.redis_url.find(REDIS_URL_SCHEME)+len(REDIS_URL_SCHEME) server_and_port = self.redis_url[start_index:] password_end = server_and_port.find("@") # parse out password if password_end != -1: self.password = server_and_port[:password_end] start_index=password_end server_and_port= server_and_port[(password_end+1):] # port and hostname if server_and_port.find(":")==-1: server=server_and_port server_port = REDIS_SERVER_PORT else: server = server_and_port.split(":")[0] server_port = int(server_and_port.split(":")[1]) logger.debug("Connect to Redis: " + server + " Port: " + str(server_port)) if self.password==None: self.redis_client = redis.Redis(host=server, port=server_port, db=0) else: self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0) self.pipe = self.redis_client.pipeline() try: self.redis_client.ping() except: logger.error("Please start Redis server!") raise Exception("Please start Redis server!")
def create_remote_directory(self, target_url): result = urlparse.urlparse(target_url) target_host = result.hostname target_path = result.path target_user = result.username try: if not self.__is_remote_directory(target_url): client = paramiko.SSHClient() client.load_system_host_keys() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) logger.debug("Create directory at: %s" % (target_host)) client.connect(target_host, username=target_user) sftp = client.open_sftp() sftp.mkdir(target_path) sftp.close() client.close() return True except KeyboardInterrupt: raise KeyboardInterrupt except: logger.error("Error creating directory: " + str(target_path) + " at: " + str(target_host)) self.__print_traceback() return False
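# Hedged standalone sketch of the paramiko-based remote mkdir used above; host,
# user and path are placeholders. It mirrors the SSHClient/SFTP calls of
# create_remote_directory() without the surrounding existence check and error
# handling.
import paramiko

def remote_mkdir(host, user, path):
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(host, username=user)
    sftp = client.open_sftp()
    try:
        sftp.mkdir(path)
    finally:
        sftp.close()
        client.close()

# remote_mkdir("login.example.org", "alice", "/tmp/bigjob-staging")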
def export(self, target_url): """ simple implementation of export: copies file from first pilot data to local machine """ if self.get_state() != State.Running: self.wait() if len(self.pilot_data) > 0: # Search for PD that is close to local machine local_hostname = socket.getfqdn() max_score = 0 best_pd = None for pd in self.pilot_data: pd_host = SAGAUrl(pd.service_url).host pd_score = difflib.SequenceMatcher(a=pd_host, b=local_hostname).ratio() logger.debug( "Export locality compute score: Localhost: %s PD at: %s Score: %s" % (local_hostname, pd_host, pd_score)) if pd_score > max_score: best_pd = pd max_score = pd_score #pd_domain = tldextract.extract(pd.service_url).domain #local_domain = tldextract.extract(socket.getfqdn()).domain if best_pd != None: logger.debug("Export from: %s" % (best_pd.service_url)) best_pd.export_du(self, target_url) return # No PD found. Utilize default PD logger.debug("Export from random PD") self.pilot_data[0].export_du(self, target_url) else: logger.error("No Pilot Data for PD found")
def __init_coordination(self, coordination_url): bigjob_coordination = None if (coordination_url.startswith("advert://") or coordination_url.startswith("sqlasyncadvert://")): try: from coordination.bigjob_coordination_advert import bigjob_coordination logger.debug("Utilizing ADVERT Backend") except: logger.error("Advert Backend could not be loaded") elif (coordination_url.startswith("redis://")): try: from coordination.bigjob_coordination_redis import bigjob_coordination logger.debug("Utilizing Redis Backend") except: logger.error("Error loading pyredis.") self.__print_traceback() elif (coordination_url.startswith("tcp://")): try: from coordination.bigjob_coordination_zmq import bigjob_coordination logger.debug("Utilizing ZMQ Backend") except: logger.error( "ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " + "PYZMQ (http://zeromq.github.com/pyzmq/)") else: logger.error("No suitable coordination backend found.") # check whether coordination subsystem could be initialized if bigjob_coordination == None: raise BigJobError( "Could not initialize coordination subsystem (Redis)") logger.debug("Parsing URL: " + coordination_url) scheme, username, password, host, port, dbtype = self.__parse_url( coordination_url) if port == -1: port = None coordination = bigjob_coordination(server=host, server_port=port, username=username, password=password, dbtype=dbtype, url_prefix=scheme) return coordination
def create_remote_directory(self, target_url): result = urlparse.urlparse(target_url) target_host = result.hostname target_path = result.path target_user = result.username try: if not self.__is_remote_directory(target_url): client = paramiko.SSHClient() client.load_system_host_keys() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) logger.debug("Create directory at: %s"%(target_host)) client.connect(target_host, username=target_user) sftp = client.open_sftp() sftp.mkdir(target_path) sftp.close() client.close() return True except KeyboardInterrupt: raise KeyboardInterrupt except: logger.error("Error creating directory: " + str(target_path) + " at: " + str(target_host)) self.__print_traceback() return False
def __init__(self, args): self.coordination_url = args[1] # objects to store running jobs and processes self.jobs = [] self.processes = {} self.freenodes = [] self.busynodes = [] self.restarted = {} # read config file conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + CONFIG_FILE if not os.path.exists(conf_file): conf_file = os.path.join(sys.prefix, CONFIG_FILE) logging.debug ("read configfile: " + conf_file) config = ConfigParser.ConfigParser() config.read(conf_file) default_dict = config.defaults() self.CPR=False if default_dict.has_key("cpr"): self.CPR = default_dict["cpr"] self.SHELL="/bin/bash" if default_dict.has_key("shell"): self.SHELL=default_dict["shell"] self.MPIRUN="mpirun" # On TACC resources the default MPICH is # linked under mpirun_rsh if default_dict.has_key("mpirun"): self.MPIRUN=default_dict["mpirun"] self.OUTPUT_TAR=False if default_dict.has_key("create_output_tar"): self.OUTPUT_TAR=eval(default_dict["create_output_tar"]) logger.debug("Create output tar: %r", self.OUTPUT_TAR) self.LAUNCH_METHOD="ssh" if default_dict.has_key("launch_method"): self.LAUNCH_METHOD=self.__get_launch_method(default_dict["launch_method"]) logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " + self.MPIRUN + " shell: " + self.SHELL) # init rms (SGE/PBS) self.init_rms() self.failed_polls = 0 ############################################################################## # initialization of coordination and communication subsystem # Redis initialization self.base_url = args[2] self.cds_queue_url = None if len(args)==4: self.cds_queue_url = args[3] logger.debug("External queue: " + str(self.cds_queue_url)) self.id = self.__get_bj_id(self.base_url) logger.debug("BigJob Agent arguments: " + str(args)) logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url) logger.debug("BigJob ID: %s"%self.id) # create bj directory self.work_dir = os.getcwd() if self.work_dir.find(self.id)==-1: # working directory already contains BJ id self.bj_dir = os.path.join(os.getcwd(), self.id) logger.debug("Agent working directory: %s"%self.bj_dir) try: os.makedirs(self.bj_dir) except: logger.debug("Directory already exists.") else: self.bj_dir = os.getcwd() os.chdir(self.bj_dir) if(self.coordination_url.startswith("advert://") or self.coordination_url.startswith("sqlasyncadvert://")): try: from coordination.bigjob_coordination_advert import bigjob_coordination logging.debug("Utilizing ADVERT Backend: " + self.coordination_url) except: logger.error("Advert Backend could not be loaded") exc_type, exc_value, exc_traceback = sys.exc_info() traceback.print_exc(file=sys.stderr) traceback.print_tb(exc_traceback, file=sys.stderr) elif (self.coordination_url.startswith("redis://")): try: from coordination.bigjob_coordination_redis import bigjob_coordination logger.debug("Utilizing Redis Backend: " + self.coordination_url + ". Please make sure Redis server is configured in bigjob_coordination_redis.py") except: logger.error("Error loading pyredis.") elif (self.coordination_url.startswith("tcp://")): try: from coordination.bigjob_coordination_zmq import bigjob_coordination logger.debug("Utilizing ZMQ Backend") except: logger.error("ZMQ Backend not found. 
Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " +"PYZMQ (http://zeromq.github.com/pyzmq/)") ### # Initiate coordination sub-system of both BJ agent and Pilot Data self.coordination = bigjob_coordination(server_connect_url=self.coordination_url) try: # initialize coordination subsystem of pilot data self.pilot_data_service = PilotDataService(coordination_url=self.coordination_url) except: logger.warn("Pilot-Data could not be initialized.") # update state of pilot job to running logger.debug("set state to : " + str(bigjob.state.Running)) self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False) self.pilot_description = self.coordination.get_pilot_description(self.base_url) ############################################################################## # start background thread for polling new jobs and monitoring current jobs self.resource_lock=threading.RLock() self.threadpool = ThreadPool(THREAD_POOL_SIZE) self.launcher_thread=threading.Thread(target=self.dequeue_new_jobs) self.launcher_thread.start() self.monitoring_thread=threading.Thread(target=self.start_background_thread) self.monitoring_thread.start()
def execute_job(self, job_url, job_dict): """ obtain job attributes from c&c and execute process """ state = str(job_dict["state"]) if (state == str(bigjob.state.Unknown) or state == str(bigjob.state.New)): try: #job_dict["state"]=str(saga.job.New) job_id = job_dict["job-id"] logger.debug("Start job id %s specification %s: " % (job_id, str(job_dict))) numberofprocesses = "1" try: if (job_dict.has_key("NumberOfProcesses") == True): numberofprocesses = job_dict["NumberOfProcesses"] except: pass # ignore in particular if Bliss is used spmdvariation = "single" try: if (job_dict.has_key("SPMDVariation") == True): spmdvariation = job_dict["SPMDVariation"] except: pass # ignore in particular if Bliss is used arguments = "" if (job_dict.has_key("Arguments") == True): arguments_raw = job_dict['Arguments'] if type(arguments_raw) == types.ListType: arguments_list = arguments_raw else: arguments_list = eval(job_dict["Arguments"]) for i in arguments_list: arguments = arguments + " " + str(i) environment = os.environ envi = "" self.number_subjobs = 1 if (job_dict.has_key("Environment") == True): env_raw = job_dict['Environment'] if type(env_raw) == types.ListType: env_list = env_raw else: env_list = eval(job_dict["Environment"]) logger.debug("Environment: " + str(env_list)) for i in env_list: logger.debug("Eval " + i) # Hack for conduction experiments on Kraken # Kraken specific support for running n sub-jobs at a time if i.startswith("NUMBER_SUBJOBS"): self.number_subjobs = int(i.split("=")[1].strip()) logger.debug("NUMBER_SUBJOBS: " + str(self.number_subjobs)) else: envi_1 = "export " + i + "; " envi = envi + envi_1 logger.debug(envi) executable = job_dict["Executable"] executable = self.__expand_directory(executable) workingdirectory = os.path.join(os.getcwd(), job_id) if (job_dict.has_key("WorkingDirectory") == True): workingdirectory = job_dict["WorkingDirectory"] workingdirectory = self.__expand_directory( workingdirectory) try: os.makedirs(workingdirectory) except: logger.debug("Directory %s already exists." 
% workingdirectory) logging.debug("Sub-Job: %s, Working_directory: %s" % (job_id, workingdirectory)) output = "stdout" if (job_dict.has_key("Output") == True): output = job_dict["Output"] if not os.path.isabs(output): output = os.path.join(workingdirectory, output) error = os.path.join(workingdirectory, "stderr") if (job_dict.has_key("Error") == True): error = job_dict["Error"] if not os.path.isabs(error): error = os.path.join(workingdirectory, error) # append job to job list self.jobs.append(job_url) ####################################################################################################### # special setup for MPI NAMD jobs machinefile = self.allocate_nodes(job_dict) host = "localhost" try: machine_file_handler = open(machinefile, "r") node = machine_file_handler.readlines() machine_file_handler.close() host = node[0].strip() except: pass if (machinefile == None): logger.debug("Not enough resources to run: " + job_url) self.coordination.set_job_state(job_url, str(bigjob.state.New)) self.coordination.queue_job(self.base_url, job_url) return # job cannot be run at the moment ####################################################################################################### # File Stage-In of dependent data units if job_dict.has_key("InputData"): self.coordination.set_job_state(job_url, str(bigjob.state.Staging)) self.__stage_in_data_units(eval(job_dict["InputData"]), workingdirectory) # File Stage-In - Move pilot-level files to working directory of sub-job if self.pilot_description != None: try: if self.pilot_description.has_key("description"): file_list = eval( self.pilot_description["description"]) if file_list != None and len(file_list) > 0: logger.debug("Copy %d files to SJ work dir" % len(file_list) > 0) for i in file_list: logger.debug("Process file: %s" % i) if i.find(">") > 0: base_filename = os.path.basename( i[:i.index(">")].strip()) if environment.has_key( "_CONDOR_SCRATCH_DIR"): source_filename = os.path.join( environment[ "_CONDOR_SCRATCH_DIR"], base_filename) else: source_filename = os.path.join( self.work_dir, base_filename) target_filename = os.path.join( workingdirectory, base_filename) try: logger.debug("Copy: %s to %s" % (source_filename, target_filename)) shutil.copyfile( source_filename, target_filename) except: logger.error( "Error copy: %s to %s" % (source_filename, target_filename)) except: logger.debug("Moving of stage-in files failed.") # create stdout/stderr file descriptors output_file = os.path.abspath(output) error_file = os.path.abspath(error) logger.debug("stdout: " + output_file + " stderr: " + error_file) stdout = open(output_file, "w") stderr = open(error_file, "w") # build execution command if self.LAUNCH_METHOD == "aprun": if (spmdvariation.lower() == "mpi"): command = envi + "aprun -n " + str( numberofprocesses ) + " " + executable + " " + arguments else: #env_strip = envi.strip() #env_command = env_strip[:(len(env_strip)-1)] command = envi + "aprun -n " + str( self.number_subjobs ) + " -d " + numberofprocesses + " " + executable + " " + arguments # MPMD Mode => all subjobs on Kraken fail because aprun returns 1 as returncode #command = "aprun" #for i in range(0, self.number_subjobs): # command = command + " -d " + numberofprocesses + " " + executable + " " + arguments # # + " 1 > "+ str(i)+ "-out.txt " + " 2 > "+ str(i)+ "-err.txt" # if i != self.number_subjobs-1: # command = command + " : " elif self.LAUNCH_METHOD == "ibrun" and spmdvariation.lower( ) == "mpi": # Non MPI launch is handled via standard SSH command = envi + "mpirun_rsh 
-np " + str( numberofprocesses ) + " -hostfile " + machinefile + " `build_env.pl` " + executable + " " + arguments elif (spmdvariation.lower() != "mpi"): command = envi + executable + " " + arguments # In particular for Condor - if executable is staged x flag is not set #command ="chmod +x " + executable +";export PATH=$PATH:" + workingdirectory + ";" +command else: # Environment variables need to be handled later! command = envi + executable + " " + arguments # add working directory and ssh command if self.LAUNCH_METHOD == "aprun" or ( self.LAUNCH_METHOD == "ibrun" and spmdvariation.lower() == "mpi"): command = "cd " + workingdirectory + "; " + command elif self.LAUNCH_METHOD == "local": command = "cd " + workingdirectory + "; " + command else: # ssh launch is default if (spmdvariation.lower() == "mpi"): command = "cd " + workingdirectory + "; " + envi + self.MPIRUN + " -np " + numberofprocesses + " -machinefile " + machinefile + " " + executable + " " + arguments elif host == "localhost": command = "cd " + workingdirectory + "; " + command else: command = "ssh " + host + " \'cd " + workingdirectory + "; " + command + "\'" # start application process shell = self.SHELL logger.debug("execute: " + command + " in " + workingdirectory + " from: " + str(socket.gethostname()) + " (Shell: " + shell + ")") # bash works fine for launching on QB but fails for Abe :-( p = subprocess.Popen(args=command, executable=shell, stderr=stderr, stdout=stdout, cwd=workingdirectory, env=environment, shell=True) logger.debug("started " + command) self.processes[job_url] = p self.coordination.set_job_state(job_url, str(bigjob.state.Running)) except: traceback.print_exc(file=sys.stderr)
def __init__(self, args): self.coordination_url = args[1] # objects to store running jobs and processes self.jobs = [] self.processes = {} self.freenodes = [] self.busynodes = [] self.restarted = {} # read config file conf_file = os.path.dirname( os.path.abspath(__file__)) + "/../" + CONFIG_FILE if not os.path.exists(conf_file): conf_file = os.path.join(sys.prefix, CONFIG_FILE) logging.debug("read configfile: " + conf_file) config = ConfigParser.ConfigParser() config.read(conf_file) default_dict = config.defaults() self.CPR = False if default_dict.has_key("cpr"): self.CPR = default_dict["cpr"] self.SHELL = "/bin/bash" if default_dict.has_key("shell"): self.SHELL = default_dict["shell"] self.MPIRUN = "mpirun" # On TACC resources the default MPICH is # linked under mpirun_rsh if default_dict.has_key("mpirun"): self.MPIRUN = default_dict["mpirun"] if default_dict.has_key("number_executor_threads"): THREAD_POOL_SIZE = int(default_dict["number_executor_threads"]) self.OUTPUT_TAR = False if default_dict.has_key("create_output_tar"): self.OUTPUT_TAR = eval(default_dict["create_output_tar"]) logger.debug("Create output tar: %r", self.OUTPUT_TAR) self.failed_polls = 0 ############################################################################## # initialization of coordination and communication subsystem # Redis initialization self.base_url = args[2] self.cds_queue_url = None if len(args) == 4: self.cds_queue_url = args[3] logger.debug("External queue: " + str(self.cds_queue_url)) self.id = self.__get_bj_id(self.base_url) logger.debug("BigJob Agent arguments: " + str(args)) logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url) logger.debug("BigJob ID: %s" % self.id) # create bj directory self.work_dir = os.getcwd() if self.work_dir.find( self.id) == -1: # working directory already contains BJ id self.bj_dir = os.path.join(os.getcwd(), self.id) logger.debug("Agent working directory: %s" % self.bj_dir) try: os.makedirs(self.bj_dir) except: logger.debug("Directory already exists.") else: self.bj_dir = os.getcwd() os.chdir(self.bj_dir) if (self.coordination_url.startswith("advert://") or self.coordination_url.startswith("sqlasyncadvert://")): try: from coordination.bigjob_coordination_advert import bigjob_coordination logging.debug("Utilizing ADVERT Backend: " + self.coordination_url) except: logger.error("Advert Backend could not be loaded") exc_type, exc_value, exc_traceback = sys.exc_info() traceback.print_exc(file=sys.stderr) traceback.print_tb(exc_traceback, file=sys.stderr) elif (self.coordination_url.startswith("redis://")): try: from coordination.bigjob_coordination_redis import bigjob_coordination logger.debug("Utilizing Redis Backend: " + self.coordination_url + ".") except: logger.error( "Error loading pyredis. Check configuration in bigjob_coordination_redis.py." ) elif (self.coordination_url.startswith("tcp://")): try: from coordination.bigjob_coordination_zmq import bigjob_coordination logger.debug("Utilizing ZMQ Backend") except: logger.error( "ZMQ Backend not found. 
Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " + "PYZMQ (http://zeromq.github.com/pyzmq/)") ### # Initiate coordination sub-system of both BJ agent and Pilot Data self.coordination = bigjob_coordination( server_connect_url=self.coordination_url) try: # initialize coordination subsystem of pilot data self.pilot_data_service = PilotDataService( coordination_url=self.coordination_url) except: logger.warn("Pilot-Data could not be initialized.") # update state of pilot job to running logger.debug("set state to : " + str(bigjob.state.Running)) self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False) self.pilot_description = self.coordination.get_pilot_description( self.base_url) try: self.pilot_description = ast.literal_eval(self.pilot_description) except: logger.warn("Unable to parse pilot description") self.pilot_description = None ############################################################################ # Detect launch method self.LAUNCH_METHOD = "ssh" if default_dict.has_key("launch_method"): self.LAUNCH_METHOD = default_dict["launch_method"] self.LAUNCH_METHOD = self.__get_launch_method(self.LAUNCH_METHOD) logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " + self.MPIRUN + " shell: " + self.SHELL) # init rms (SGE/PBS) self.init_rms() ############################################################################## # start background thread for polling new jobs and monitoring current jobs # check whether user requested a certain threadpool size if self.pilot_description != None and self.pilot_description.has_key( "number_executor_threads"): THREAD_POOL_SIZE = int( self.pilot_description["number_executor_threads"]) logger.debug("Creating executor thread pool of size: %d" % (THREAD_POOL_SIZE)) self.resource_lock = threading.RLock() self.threadpool = ThreadPool(THREAD_POOL_SIZE) self.launcher_thread = threading.Thread(target=self.dequeue_new_jobs) self.launcher_thread.start() self.monitoring_thread = threading.Thread( target=self.start_background_thread) self.monitoring_thread.start()