def __init__(self, cds_url=None):
    """ Create a ComputeDataService (Decentral) object.

        @param cds_url: Reconnect to an existing CDS (optional).
    """
    # Pilot Compute bookkeeping
    self.compute_units = {}
    self.pilot_job_services = []

    # Pilot Data bookkeeping
    self.data_units = {}
    self.pilot_data_services = []

    if cds_url is None:
        # Fresh CDS: mint an id and register it with the coordination backend.
        self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
        application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
        self.url = CoordinationAdaptor.add_cds(application_url, self)
    else:
        # Reconnect to an already registered CDS entry.
        self.id = self.__get_cds_id(cds_url)
        self.url = cds_url

    # Background scheduler thread (places DataUnits onto Pilot Data).
    self.scheduler = Scheduler()
    self.du_queue = Queue.Queue()
    self.stop = threading.Event()
    self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
    self.scheduler_thread.daemon = True
    self.scheduler_thread.start()
    logger.debug("Created ComputeDataServiceDecentral")
def __init__(self, cds_url=None):
    """ Create a Work Data Service object.

        Keyword arguments:
        cds_url -- Reconnect to an existing WDS (optional).
    """
    # Pilot Data state
    self.data_units = {}
    self.pilot_data_services = []
    # Pilot Job state
    self.compute_units = {}
    self.pilot_job_services = []

    if cds_url is None:
        # New service: generate an id and publish it.
        self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
        base_url = CoordinationAdaptor.get_base_url(pilot.application_id)
        self.url = CoordinationAdaptor.add_cds(base_url, self)
    else:
        # Attach to an already published service.
        self.id = self.__get_cds_id(cds_url)
        self.url = cds_url

    # Background scheduling machinery for CUs and DUs.
    self.scheduler = Scheduler()
    self.cu_queue = Queue.Queue()
    self.du_queue = Queue.Queue()
    self.stop = threading.Event()
    self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
    self.scheduler_thread.start()
class ComputeDataService(ComputeDataService):
    """ TROY WorkDataService.

        The WorkDataService is the application's interface to submit
        ComputeUnits and PilotData/DataUnit to the Pilot-Manager in the
        P* Model.
    """
    CDS_ID_PREFIX = "cds-"

    def __init__(self, cds_url=None):
        """ Create a Work Data Service object.

            Keyword arguments:
            cds_url -- Reconnect to an existing WDS (optional).
        """
        # Pilot Data
        self.data_units = {}
        self.pilot_data_services = []
        # Pilot Job
        self.compute_units = {}
        self.pilot_job_services = []

        if cds_url is None:
            self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
            application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
            self.url = CoordinationAdaptor.add_cds(application_url, self)
        else:
            self.id = self.__get_cds_id(cds_url)
            self.url = cds_url

        # Background thread that places queued CUs/DUs onto pilots.
        self.scheduler = Scheduler()
        self.cu_queue = Queue.Queue()
        self.du_queue = Queue.Queue()
        self.stop = threading.Event()
        self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
        # Daemonize so an un-cancelled CDS cannot block interpreter shutdown
        # (consistent with the other CDS implementations in this file).
        self.scheduler_thread.daemon = True
        self.scheduler_thread.start()

    def __get_cds_id(self, cds_url):
        """ Extract the "cds-<uuid>" id component out of a CDS URL. """
        start = cds_url.index(self.CDS_ID_PREFIX)
        end = cds_url.index("/", start)
        return cds_url[start:end]

    ###########################################################################
    # Pilot Job

    def add_pilot_compute_service(self, pjs):
        """ Add a PilotJobService to this CDS.

            Keyword arguments:
            pjs -- The PilotJob Service(s) to which this Work Unit Service
                   will connect.
        """
        self.pilot_job_services.append(pjs)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_compute_service(self, pjs):
        """ Remove a PilotJobService from this CDS.

            Note that it won't cancel the PilotJobService, it will just
            no longer be connected to this WUS.

            Keyword arguments:
            pjs -- The PilotJob Service(s) to remove from this Work Unit
                   Service.
        """
        self.pilot_job_services.remove(pjs)
        CoordinationAdaptor.update_cds(self.url, self)

    def submit_compute_unit(self, compute_unit_description):
        """ Submit a WU to this Work Unit Service.

            Keyword argument:
            compute_unit_description -- The ComputeUnitDescription from the
                                        application

            Return:
            ComputeUnit object
        """
        cu = ComputeUnit(compute_unit_description, self)
        self.compute_units[cu.id] = cu
        self.cu_queue.put(cu)
        CoordinationAdaptor.update_cds(self.url, self)
        return cu

    ###########################################################################
    # Pilot Data

    def add_pilot_data_service(self, pds):
        """ Add a PilotDataService.

            Keyword arguments:
            pds -- The PilotDataService to add.
        """
        self.pilot_data_services.append(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_data_service(self, pds):
        """ Remove a PilotDataService.

            Keyword arguments:
            pds -- The PilotDataService to remove.
        """
        self.pilot_data_services.remove(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def list_pilot_compute(self):
        """ List all pilot compute of CDS """
        # BUGFIX: the original returned "self.pilot_job_service" (a
        # non-existent attribute), raising AttributeError on every call.
        return self.pilot_job_services

    def list_pilot_data(self):
        """ List all pilot data of CDS """
        return self.pilot_data_services

    def list_data_units(self):
        """ List all DUs of CDS """
        return self.data_units.items()

    def get_data_unit(self, du_id):
        """ Return the DataUnit with the given id, or None. """
        if du_id in self.data_units:
            return self.data_units[du_id]
        return None

    def submit_data_unit(self, data_unit_description):
        """ Creates a data unit object and binds it to a physical
            resource (a pilotdata).
        """
        du = DataUnit(pilot_data_service=self,
                      data_unit_description=data_unit_description)
        self.data_units[du.id] = du
        self.du_queue.put(du)
        # queue currently not persisted
        CoordinationAdaptor.update_cds(self.url, self)
        return du

    def cancel(self):
        """ Cancel the PDS.

            All associated PD objects are deleted and removed from the
            associated pilot stores.
        """
        # terminate background thread
        self.stop.set()
        CoordinationAdaptor.delete_cds(self.url)

    def wait(self):
        """ Waits for CUs and DUs.

            Returns once all DUs are running AND all CUs are done.
        """
        self.cu_queue.join()
        self.du_queue.join()
        for i in self.data_units.values():
            i.wait()
        for i in self.compute_units.values():
            i.wait()

    def get_state(self):
        """ Return the state of the CDS. """
        return self.state

    def get_id(self):
        """ Return the id of the CDS. """
        return str(self.id)

    ###########################################################################
    # Internal Scheduling

    def __update_scheduler_resources(self):
        """ Push the current set of pilots to the scheduler. """
        logger.debug("__update_scheduler_resources")
        pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
        self.scheduler.set_pilot_data(pd)
        pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
        logger.debug("Pilot-Jobs: " + str(pj))
        self.scheduler.set_pilot_jobs(pj)

    def _schedule_du(self, du):
        """ Schedule DU to a suitable pilot data.

            Currently one level of scheduling is used:
            1.) Add all resources managed by the contained PDS
            2.) Select one resource
        """
        logger.debug("Schedule PD")
        self.__update_scheduler_resources()
        selected_pilot_data = self.scheduler.schedule_pilot_data(du.data_unit_description)
        return selected_pilot_data

    def _schedule_cu(self, cu):
        """ Schedule CU to a suitable pilot job. """
        logger.debug("Schedule CU")
        self.__update_scheduler_resources()
        selected_pilot_job = self.scheduler.schedule_pilot_job(cu.compute_unit_description)
        return selected_pilot_job

    def _scheduler_thread(self):
        """ Background loop: drain du_queue/cu_queue and place units
            until stop is set.
        """
        while self.stop.isSet() == False:
            # --- DataUnit placement ------------------------------------
            try:
                logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data")
                du = self.du_queue.get(True, 1)
                # check whether this is a real du object
                if isinstance(du, DataUnit):
                    pd = self._schedule_du(du)
                    if pd != None:
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        logger.debug("Transfer to PD finished.")
                        du.update_state(State.Running)
                        self.du_queue.task_done()
                    else:
                        # no suitable resource yet -> re-queue for retry
                        self.du_queue.task_done()
                        self.du_queue.put(du)
            except Queue.Empty:
                pass

            # --- ComputeUnit placement ---------------------------------
            try:
                logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Job")
                cu = self.cu_queue.get(True, 1)
                if isinstance(cu, ComputeUnit):
                    pj = self._schedule_cu(cu)
                    if pj != None:
                        cu = self.__expand_working_directory(cu, pj)
                        pj._submit_cu(cu)
                        self.cu_queue.task_done()
                    else:
                        # BUGFIX: the original re-queued "du" here, which
                        # silently dropped the unscheduled ComputeUnit.
                        self.cu_queue.task_done()
                        self.cu_queue.put(cu)
            except Queue.Empty:
                pass
            except:
                # log and keep the scheduler alive
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.error("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
                logger.error("*** print_exception:")
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stderr)
            # Back off only when idle (the original slept 5 s on every pass,
            # throttling scheduling of already-queued work).
            if self.cu_queue.empty() and self.du_queue.empty():
                time.sleep(5)
        logger.debug("Re-Scheduler terminated")

    def __expand_working_directory(self, compute_unit, pilot_job):
        """ Expand pilotdata:// url specified in the compute_unit_description
            to a local url on the machine of the PJ.

            pilotdata://localhost/434bfc5c-23fd-11e1-a43f-00264a13ca4c
            becomes
            /tmp/pilotstore//434bfc5c-23fd-11e1-a43f-00264a13ca4c
            on the machine running pilot_job.
        """
        if "working_directory" in compute_unit.compute_unit_description:
            working_directory = compute_unit.compute_unit_description["working_directory"]
            if working_directory.find(DataUnit.DU_ID_PREFIX) != -1:
                pilot_data_url = working_directory
                pj_description = pilot_job.pilot_compute_description
                pj_dc_affinity = pj_description["affinity_datacenter_label"]
                pj_machine_affinity = pj_description["affinity_machine_label"]
                pilots = [s for i in self.pilot_data_services for s in i.list_pilots()]

                # find all pilot stores with the same affinity
                candidate_pd = []
                for i in pilots:
                    pd_description = i.pilot_data_description
                    pd_dc_affinity = pd_description["affinity_datacenter_label"]
                    pd_machine_affinity = pd_description["affinity_machine_label"]
                    if pd_dc_affinity == pj_dc_affinity and pd_machine_affinity == pj_machine_affinity:
                        candidate_pd.append(i)

                # check whether required pilot_data is part of pilot_data
                # (renamed loop var from "pd" to avoid shadowing the list above)
                target_pd = None
                target_du = None
                for candidate in candidate_pd:
                    for du in candidate.list_data_units():
                        logger.debug("DU URL:%s" % (du.url))
                        if du.url == pilot_data_url:
                            logger.debug("Found PD %s at %s" % (du.url, candidate.service_url))
                            target_pd = candidate
                            target_du = du
                            break
                if target_du == None:
                    self.__stage_du_to_pj(pilot_data_url, pilot_job)

                if target_pd != None:
                    pd_url = target_pd.url_for_du(target_du)
                    components = urlparse.urlparse(pd_url)
                    compute_unit.compute_unit_description["working_directory"] = components.path
                    compute_unit._update_compute_unit_description(compute_unit.compute_unit_description)
                    logger.debug("__expand_working_directory %s: Set working directory to %s"
                                 % (pilot_data_url, compute_unit.compute_unit_description["working_directory"]))
                    return compute_unit
        return compute_unit

    def __stage_du_to_pj(self, pilotdata, pilotjob):
        """ Stage required files to machine of pilot job.
            (not implemented)
        """
        pass

    def __find_pd_at_pj_resource(self, pilotjob):
        # not implemented
        pass
class ComputeDataServiceDecentral(ComputeDataService):
    """ B{ComputeDataServiceDecentral.}

        The ComputeDataService is the application's interface to submit
        ComputeUnits and PilotData/DataUnit to the Pilot-Manager in the
        P* Model.

        The decentral ComputeDataService will only work with Redis!
    """
    CDS_ID_PREFIX = "cds-"

    def __init__(self, cds_url=None):
        """ Create a ComputeDataService (Decentral) object.

            @param cds_url: Reconnect to an existing CDS (optional).
        """
        # Pilot Data
        self.data_units = {}
        self.pilot_data_services = []
        # Pilot Compute
        self.compute_units = {}
        self.pilot_job_services = []

        if cds_url is None:
            self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
            application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
            self.url = CoordinationAdaptor.add_cds(application_url, self)
        else:
            self.id = self.__get_cds_id(cds_url)
            self.url = cds_url

        # Background thread for scheduling DUs only (CUs are pushed
        # directly into the bigjob queue, see __submit_cu).
        self.scheduler = Scheduler()
        self.du_queue = Queue.Queue()
        self.stop = threading.Event()
        self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
        self.scheduler_thread.daemon = True
        self.scheduler_thread.start()
        logger.debug("Created ComputeDataServiceDecentral")

    def __get_cds_id(self, cds_url):
        """ Extract the "cds-<uuid>" id component out of a CDS URL. """
        start = cds_url.index(self.CDS_ID_PREFIX)
        end = cds_url.index("/", start)
        return cds_url[start:end]

    ###########################################################################
    # Pilot Compute

    def add_pilot_compute_service(self, pcs):
        """ Add a PilotComputeService to this CDS.

            @param pcs: The PilotComputeService to which this
                        ComputeDataService will connect.
        """
        self.pilot_job_services.append(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services) > 1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")

    def remove_pilot_compute_service(self, pcs):
        """ Remove a PilotJobService from this CDS.

            Note that it won't cancel the PilotJobService, it will just no
            longer be connected to this WUS.

            Keyword arguments:
            pcs -- The PilotJob Service to remove from this Work Unit Service.

            Return:
            Result
        """
        self.pilot_job_services.remove(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services) > 1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")

    def submit_compute_unit(self, compute_unit_description):
        """ Submit a CU to this Compute Data Service.

            @param compute_unit_description: The L{ComputeUnitDescription}
                   from the application
            @return: L{ComputeUnit} object
        """
        cu = ComputeUnit(compute_unit_description, self)
        self.compute_units[cu.id] = cu
        self.__submit_cu(cu)
        return cu

    def list_pilot_compute(self):
        """ List all pilot compute of CDS """
        # BUGFIX: the original returned "self.pilot_job_service" (a
        # non-existent attribute), raising AttributeError on every call.
        return self.pilot_job_services

    def get_details(self):
        """ Return a list of dicts containing the details of the Pilot
            Computes, e.g. job state and description.
        """
        pilot_details = []
        for pcs in self.pilot_job_services:
            for pc in pcs.list_pilots():
                pilot_details.append(pc.get_details())
        return pilot_details

    ###########################################################################
    # Compute Data Service private methods

    def __submit_cu(self, compute_unit):
        """ Submits compute unit to Bigjob. """
        if len(self.pilot_job_services) != 1:
            raise PilotError("No PilotComputeService found. Please start a PCS before submitting ComputeUnits.")
        # make sure the CU's input DUs are in place before submission
        self.__wait_for_du(compute_unit)
        sj = subjob()
        self.pcs_coordination_namespace = self.pilot_job_services[0].coordination_queue
        logger.debug("Submit CU to big-job via external queue: %s" % self.pcs_coordination_namespace)
        sj.submit_job(self.pcs_coordination_namespace, compute_unit.subjob_description)
        compute_unit._update_subjob(sj)
        return compute_unit

    def __wait_for_du(self, compute_unit):
        """ Wait for Data Units that are required for the Compute Unit. """
        cu_description = compute_unit.compute_unit_description
        if "input_data" in cu_description and len(cu_description["input_data"]) > 0:
            for input_du_url in cu_description["input_data"]:
                for du in self.data_units.values():
                    if input_du_url == du.get_url():
                        logger.debug("Wait for DU: %s" % du.get_url())
                        du.wait()

    ###########################################################################
    # Pilot Data

    def add_pilot_data_service(self, pds):
        """ Add a PilotDataService.

            @param pds: The PilotDataService to add.
        """
        self.pilot_data_services.append(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_data_service(self, pds):
        """ Remove a PilotDataService.

            @param pds: The PilotDataService to remove.
        """
        self.pilot_data_services.remove(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def list_pilot_data(self):
        """ List all pilot data of CDS """
        return self.pilot_data_services

    def list_data_units(self):
        """ List all DUs of CDS """
        return self.data_units.items()

    def get_data_unit(self, du_id):
        """ Return the DataUnit with the given id, or None. """
        if du_id in self.data_units:
            return self.data_units[du_id]
        return None

    def submit_data_unit(self, data_unit_description):
        """ Creates a data unit object and binds it to a physical
            resource (a pilotdata).
        """
        du = DataUnit(pilot_data=None,
                      data_unit_description=data_unit_description)
        self.data_units[du.id] = du
        self.du_queue.put(du)
        # queue currently not persisted
        CoordinationAdaptor.update_cds(self.url, self)
        return du

    ###########################################################################
    # General

    def cancel(self):
        """ Cancel the CDS.
            All associated PD and PC objects are canceled.
        """
        # terminate background thread
        self.stop.set()
        CoordinationAdaptor.delete_cds(self.url)

    def wait(self):
        """ Waits for CUs and DUs.

            Return after all DUs have been placed (i.e. in state Running)
            and all CUs have been completed (i.e. in state Done) or if a
            fault has occurred or the user has cancelled a CU or DU.
        """
        try:
            dus = self.data_units.values()
            cus = self.compute_units.values()
            pilots = []
            for i in self.pilot_job_services:
                pilots.extend(i.list_pilots())
            number_dus = len(dus)
            number_cus = len(cus)
            number_pilots = len(pilots)
            completed_dus = 0
            completed_cus = 0
            completed_pilots = 0
            logger.debug("### ComputeDataService wait for completion of %d CUs/ %d DUs ###" % (len(cus), len(dus)))
            while not (completed_dus == number_dus and completed_cus == number_cus):
                completed_dus = 0
                completed_cus = 0
                completed_pilots = 0
                for p in pilots:
                    state = p.get_state()
                    if state == State.Done or state == State.Failed:
                        completed_pilots = completed_pilots + 1
                if completed_pilots == number_pilots:
                    logger.debug("All pilots done/failed. No more active pilots. Exit.")
                    break
                for cu in cus:
                    state = cu.get_state()
                    if state == State.Done or state == State.Failed:
                        completed_cus = completed_cus + 1
                for du in dus:
                    state = du.get_state()
                    # a DU counts as placed once it is Running (or Failed)
                    if state == State.Running or state == State.Failed:
                        completed_dus = completed_dus + 1
                logger.debug("Compute Data Service Completion Status: %d/%d CUs %d/%d DUs %d/%d Pilots" %
                             (completed_cus, number_cus, completed_dus, number_dus,
                              completed_pilots, number_pilots))
                logger.debug("exit? " + str((completed_dus == number_dus and completed_cus == number_cus)))
                # BUGFIX: the original slept only when BOTH categories were
                # incomplete ("and"), busy-spinning once either CUs or DUs
                # finished first.
                if completed_dus < number_dus or completed_cus < number_cus:
                    time.sleep(2)
            logger.debug("### END WAIT ###")
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            # consistency: use the module logger instead of Py2 print statements
            logger.error("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
            logger.error("*** print_exception:")
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stderr)
            logger.debug("Ctrl-c detected. Terminating ComputeDataService...")
            self.cancel()
            raise KeyboardInterrupt

    def get_state(self):
        """ Return the state of the CDS. """
        return self.state

    def get_id(self):
        """ Return the id of the CDS. """
        return str(self.id)

    def __del__(self):
        """ Make sure that background thread terminates. """
        self.cancel()

    ###########################################################################
    # Internal Scheduling

    def __update_scheduler_resources(self):
        """ Push the current set of pilots to the scheduler. """
        logger.debug("__update_scheduler_resources")
        pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
        self.scheduler.set_pilot_data(pd)
        pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
        logger.debug("Pilot-Jobs: " + str(pj))
        self.scheduler.set_pilot_jobs(pj)

    def _schedule_du(self, du):
        """ Schedule DU to a suitable pilot data.

            Currently one level of scheduling is used:
            1.) Add all resources managed by the contained PDS
            2.) Select one resource
        """
        logger.debug("Schedule PD")
        self.__update_scheduler_resources()
        selected_pilot_data = self.scheduler.schedule_pilot_data(du.data_unit_description)
        return selected_pilot_data

    def _scheduler_thread(self):
        """ Background loop: place queued DataUnits until stop is set. """
        while self.stop.isSet() == False:
            try:
                du = self.du_queue.get(True, 1)
                # check whether this is a real du object
                if isinstance(du, DataUnit):
                    pd = self._schedule_du(du)
                    if pd != None:
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        # NOTE(review): the explicit state update to Running is
                        # intentionally disabled in this variant (it was
                        # commented out in the original).
                        self.du_queue.task_done()
                    else:
                        # no suitable resource yet -> re-queue for retry
                        self.du_queue.task_done()
                        self.du_queue.put(du)
            except Queue.Empty:
                pass
            if self.du_queue.empty():
                time.sleep(5)
        logger.debug("Re-Scheduler terminated")
class ComputeDataServiceDecentral(ComputeDataService):
    """ B{ComputeDataServiceDecentral.}

        The ComputeDataService is the application's interface to submit
        ComputeUnits and PilotData/DataUnit to the Pilot-Manager in the
        P* Model.

        The decentral ComputeDataService will only work with Redis!
    """
    CDS_ID_PREFIX = "cds-"

    def __init__(self, cds_url=None):
        """ Create a ComputeDataService (Decentral) object.

            @param cds_url: Reconnect to an existing CDS (optional).
        """
        # Pilot Data
        self.data_units = {}
        self.pilot_data_services = []
        # Pilot Compute
        self.compute_units = {}
        self.pilot_job_services = []

        if cds_url is None:
            self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
            application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
            self.url = CoordinationAdaptor.add_cds(application_url, self)
        else:
            self.id = self.__get_cds_id(cds_url)
            self.url = cds_url

        # Background thread for scheduling DUs only (CUs are pushed
        # directly into the bigjob queue, see __submit_cu).
        self.scheduler = Scheduler()
        self.du_queue = Queue.Queue()
        self.stop = threading.Event()
        self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
        self.scheduler_thread.daemon = True
        self.scheduler_thread.start()
        logger.debug("Created ComputeDataServiceDecentral")

    def __get_cds_id(self, cds_url):
        """ Extract the "cds-<uuid>" id component out of a CDS URL. """
        start = cds_url.index(self.CDS_ID_PREFIX)
        end = cds_url.index("/", start)
        return cds_url[start:end]

    ###########################################################################
    # Pilot Compute

    def add_pilot_compute_service(self, pcs):
        """ Add a PilotComputeService to this CDS.

            @param pcs: The PilotComputeService to which this
                        ComputeDataService will connect.
        """
        self.pilot_job_services.append(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services) > 1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")

    def remove_pilot_compute_service(self, pcs):
        """ Remove a PilotJobService from this CDS.

            Note that it won't cancel the PilotJobService, it will just no
            longer be connected to this WUS.

            Keyword arguments:
            pcs -- The PilotJob Service to remove from this Work Unit Service.

            Return:
            Result
        """
        self.pilot_job_services.remove(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services) > 1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")

    def submit_compute_unit(self, compute_unit_description):
        """ Submit a CU to this Compute Data Service.

            @param compute_unit_description: The L{ComputeUnitDescription}
                   from the application
            @return: L{ComputeUnit} object
        """
        cu = ComputeUnit(compute_unit_description, self)
        self.compute_units[cu.id] = cu
        self.__submit_cu(cu)
        return cu

    def list_pilot_compute(self):
        """ List all pilot compute of CDS """
        # BUGFIX: the original returned "self.pilot_job_service" (a
        # non-existent attribute), raising AttributeError on every call.
        return self.pilot_job_services

    ###########################################################################
    # Compute Data Service private methods

    def __submit_cu(self, compute_unit):
        """ Submits compute unit to Bigjob. """
        if len(self.pilot_job_services) != 1:
            raise PilotError("No PilotComputeService found. Please start a PCS before submitting ComputeUnits.")
        sj = subjob()
        self.pcs_coordination_namespace = self.pilot_job_services[0].coordination_queue
        logger.debug("Submit CU to big-job via external queue: %s" % self.pcs_coordination_namespace)
        sj.submit_job(self.pcs_coordination_namespace, compute_unit.subjob_description)
        compute_unit._update_subjob(sj)
        return compute_unit

    ###########################################################################
    # Pilot Data

    def add_pilot_data_service(self, pds):
        """ Add a PilotDataService.

            @param pds: The PilotDataService to add.
        """
        self.pilot_data_services.append(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_data_service(self, pds):
        """ Remove a PilotDataService.

            @param pds: The PilotDataService to remove.
        """
        self.pilot_data_services.remove(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def list_pilot_data(self):
        """ List all pilot data of CDS """
        return self.pilot_data_services

    def list_data_units(self):
        """ List all DUs of CDS """
        return self.data_units.items()

    def get_data_unit(self, du_id):
        """ Return the DataUnit with the given id, or None. """
        if du_id in self.data_units:
            return self.data_units[du_id]
        return None

    def submit_data_unit(self, data_unit_description):
        """ Creates a data unit object and binds it to a physical
            resource (a pilotdata).
        """
        du = DataUnit(pilot_data=None,
                      data_unit_description=data_unit_description)
        self.data_units[du.id] = du
        self.du_queue.put(du)
        # queue currently not persisted
        CoordinationAdaptor.update_cds(self.url, self)
        return du

    ###########################################################################
    # General

    def cancel(self):
        """ Cancel the CDS.
            All associated PD and PC objects are canceled.
        """
        # terminate background thread
        self.stop.set()
        CoordinationAdaptor.delete_cds(self.url)

    def wait(self):
        """ Waits for CUs and DUs.

            Return after all DUs have been placed (i.e. in state Running)
            and all CUs have been completed (i.e. in state Done) or if a
            fault has occurred or the user has cancelled a CU or DU.
        """
        try:
            logger.debug("### START WAIT ###")
            for i in self.data_units.values():
                i.wait()
            logger.debug("DUs done")
            for i in self.compute_units.values():
                i.wait()
            logger.debug("CUs done")
            logger.debug("### END WAIT ###")
        except:
            logger.debug("Ctrl-c detected. Terminating ComputeDataService...")
            self.cancel()
            raise KeyboardInterrupt

    def get_state(self):
        """ Return the state of the CDS. """
        return self.state

    def get_id(self):
        """ Return the id of the CDS. """
        return str(self.id)

    def __del__(self):
        """ Make sure that background thread terminates. """
        self.cancel()

    ###########################################################################
    # Internal Scheduling

    def __update_scheduler_resources(self):
        """ Push the current set of pilots to the scheduler. """
        logger.debug("__update_scheduler_resources")
        pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
        self.scheduler.set_pilot_data(pd)
        pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
        logger.debug("Pilot-Jobs: " + str(pj))
        self.scheduler.set_pilot_jobs(pj)

    def _schedule_du(self, du):
        """ Schedule DU to a suitable pilot data.

            Currently one level of scheduling is used:
            1.) Add all resources managed by the contained PDS
            2.) Select one resource
        """
        logger.debug("Schedule PD")
        self.__update_scheduler_resources()
        selected_pilot_data = self.scheduler.schedule_pilot_data(du.data_unit_description)
        return selected_pilot_data

    def _scheduler_thread(self):
        """ Background loop: place queued DataUnits until stop is set. """
        while self.stop.isSet() == False:
            try:
                du = self.du_queue.get(True, 1)
                # check whether this is a real du object
                if isinstance(du, DataUnit):
                    pd = self._schedule_du(du)
                    if pd != None:
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        logger.debug("Transfer to PD finished.")
                        du._update_state(State.Running)
                        self.du_queue.task_done()
                    else:
                        # no suitable resource yet -> re-queue for retry
                        self.du_queue.task_done()
                        self.du_queue.put(du)
            except Queue.Empty:
                pass
            if self.du_queue.empty():
                time.sleep(5)
        logger.debug("Re-Scheduler terminated")
class ComputeDataService(ComputeDataService):
    """ B{ComputeDataService (CDS).}

        The ComputeDataService is the application's interface to submit
        ComputeUnits and PilotData/DataUnit to the Pilot-Manager in the
        P* Model.
    """
    CDS_ID_PREFIX = "cds-"

    def __init__(self, cds_url=None):
        """ Create a ComputeDataService object.

            Keyword arguments:
            cds_url -- Reconnect to an existing CDS (optional).
        """
        # Pilot Data
        self.data_units = {}
        self.pilot_data_services = []
        # Pilot Job
        self.compute_units = {}
        self.pilot_job_services = []

        if cds_url is None:
            self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
            application_url = CoordinationAdaptor.get_base_url(
                pilot.application_id)
            self.url = CoordinationAdaptor.add_cds(application_url, self)
        else:
            self.id = self.__get_cds_id(cds_url)
            self.url = cds_url

        # Background thread that places queued CUs/DUs onto pilots.
        self.scheduler = Scheduler()
        self.cu_queue = Queue.Queue()
        self.du_queue = Queue.Queue()
        self.stop = threading.Event()
        self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
        self.scheduler_thread.daemon = True
        self.scheduler_thread.start()

    def __get_cds_id(self, cds_url):
        """ Extract the "cds-<uuid>" id component out of a CDS URL. """
        start = cds_url.index(self.CDS_ID_PREFIX)
        end = cds_url.index("/", start)
        return cds_url[start:end]

    ###########################################################################
    # Pilot Compute

    def add_pilot_compute_service(self, pcs):
        """ Add a PilotComputeService to this CDS.

            @param pcs: The PilotComputeService to which this
                        ComputeDataService will connect.
        """
        self.pilot_job_services.append(pcs)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_compute_service(self, pcs):
        """ Remove a PilotJobService from this CDS.

            Note that it won't cancel the PilotComputeService, it will just
            no longer be connected to this CDS.

            @param pcs: The PilotComputeService to remove from this
                        ComputeDataService.
        """
        self.pilot_job_services.remove(pcs)
        CoordinationAdaptor.update_cds(self.url, self)

    def submit_compute_unit(self, compute_unit_description):
        """ Submit a CU to this Compute Data Service.

            @param compute_unit_description: The ComputeUnitDescription from
                   the application
            @return: ComputeUnit object
        """
        cu = ComputeUnit(compute_unit_description, self)
        self.compute_units[cu.id] = cu
        self.cu_queue.put(cu)
        CoordinationAdaptor.update_cds(self.url, self)
        return cu

    def list_pilot_compute(self):
        """ List all pilot compute of CDS """
        # BUGFIX: the original returned "self.pilot_job_service" (a
        # non-existent attribute), raising AttributeError on every call.
        return self.pilot_job_services

    ###########################################################################
    # Pilot Data

    def add_pilot_data_service(self, pds):
        """ Add a PilotDataService.

            @param pds: The PilotDataService to add.
        """
        self.pilot_data_services.append(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_data_service(self, pds):
        """ Remove a PilotDataService.

            @param pds: The PilotDataService to remove.
        """
        self.pilot_data_services.remove(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def list_pilot_data(self):
        """ List all pilot data of CDS """
        return self.pilot_data_services

    def list_data_units(self):
        """ List all DUs of CDS """
        return self.data_units.items()

    def get_data_unit(self, du_id):
        """ Return the DataUnit with the given id, or None. """
        if du_id in self.data_units:
            return self.data_units[du_id]
        return None

    def submit_data_unit(self, data_unit_description):
        """ Creates a data unit object and binds it to a physical
            resource (a pilotdata).
        """
        du = DataUnit(pilot_data=None,
                      data_unit_description=data_unit_description)
        self.data_units[du.id] = du
        self.du_queue.put(du)
        # queue currently not persisted
        CoordinationAdaptor.update_cds(self.url, self)
        return du

    def cancel(self):
        """ Cancel the CDS.
            All associated PD and PC objects are canceled.
        """
        # terminate background thread
        self.stop.set()
        CoordinationAdaptor.delete_cds(self.url)

    def wait(self):
        """ Waits for CUs and DUs.

            Return after all DUs have been placed (i.e. in state Running)
            and all CUs have been completed (i.e. in state Done) or if a
            fault has occurred or the user has cancelled a CU or DU.
        """
        try:
            logger.debug("### START WAIT ###")
            self.cu_queue.join()
            logger.debug("CU queue empty")
            self.du_queue.join()
            logger.debug("DU queue empty")
            for i in self.data_units.values():
                i.wait()
            logger.debug("DUs done")
            for i in self.compute_units.values():
                i.wait()
            logger.debug("CUs done")
            logger.debug("### END WAIT ###")
        except:
            logger.debug("Ctrl-c detected. Terminating ComputeDataService...")
            self.cancel()
            raise KeyboardInterrupt

    def get_state(self):
        """ @return: State of the ComputeDataService """
        return self.state

    def get_id(self):
        """ @return: id of ComputeDataService """
        return str(self.id)

    def __del__(self):
        """ Make sure that background thread terminates. """
        self.cancel()

    ###########################################################################
    # Internal Scheduling

    def __update_scheduler_resources(self):
        """ Push the current set of pilots to the scheduler. """
        logger.debug("__update_scheduler_resources")
        pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
        self.scheduler.set_pilot_data(pd)
        pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
        logger.debug("Pilot-Jobs: " + str(pj))
        self.scheduler.set_pilot_jobs(pj)

    def _schedule_du(self, du):
        """ Schedule DU to a suitable pilot data.

            Currently one level of scheduling is used:
            1.) Add all resources managed by the contained PDS
            2.) Select one resource
        """
        logger.debug("Schedule PD")
        self.__update_scheduler_resources()
        selected_pilot_data = self.scheduler.schedule_pilot_data(
            du.data_unit_description)
        return selected_pilot_data

    def _schedule_cu(self, cu):
        """ Schedule CU to a suitable pilot job. """
        logger.debug("Schedule CU")
        self.__update_scheduler_resources()
        selected_pilot_job = self.scheduler.schedule_pilot_job(
            cu.compute_unit_description)
        return selected_pilot_job

    def _scheduler_thread(self):
        """ Background loop: drain du_queue/cu_queue and place units
            until stop is set.
        """
        while self.stop.isSet() == False:
            # --- DataUnit placement ------------------------------------
            try:
                du = self.du_queue.get(True, 1)
                # check whether this is a real du object
                if isinstance(du, DataUnit):
                    pd = self._schedule_du(du)
                    if pd != None:
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        logger.debug("Transfer to PD finished.")
                        du._update_state(State.Running)
                        self.du_queue.task_done()
                    else:
                        # no suitable resource yet -> re-queue for retry
                        self.du_queue.task_done()
                        self.du_queue.put(du)
            except Queue.Empty:
                pass

            # --- ComputeUnit placement ---------------------------------
            try:
                cu = self.cu_queue.get(True, 1)
                if isinstance(cu, ComputeUnit):
                    pj = self._schedule_cu(cu)
                    if pj != None:
                        cu = self.__expand_working_directory(cu, pj)
                        pj._submit_cu(cu)
                        self.cu_queue.task_done()
                    else:
                        logger.debug("No resource found.")
                        self.cu_queue.task_done()
                        self.cu_queue.put(cu)
            except Queue.Empty:
                pass
            except:
                # log and keep the scheduler alive
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.error("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
                logger.error("*** print_exception:")
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stderr)
            # back off only when both queues are idle
            if self.cu_queue.empty() and self.du_queue.empty():
                time.sleep(5)
        logger.debug("Re-Scheduler terminated")

    def __expand_working_directory(self, compute_unit, pilot_job):
        """ DEPRECATED capability!

            Expand pilotdata:// url specified in the compute_unit_description
            to a local url on the machine of the PJ.

            The former implementation (affinity-based PD lookup and working
            directory rewrite) was commented out and has been removed; this
            is now a no-op that returns the CU unchanged.
        """
        return compute_unit

    def __stage_du_to_pj(self, pilotdata, pilotjob):
        """ Stage required files to machine of pilot job.
            (not implemented)
        """
        pass

    def __find_pd_at_pj_resource(self, pilotjob):
        # not implemented
        pass
class ComputeDataService(ComputeDataService):
    """ B{ComputeDataService (CDS).}

        The ComputeDataService is the application's interface to submit
        ComputeUnits and PilotData/DataUnit to the Pilot-Manager in the
        P* Model.

        A background scheduler thread drains two queues (compute units and
        data units) and binds each queued unit to a suitable pilot managed
        by the attached PilotComputeService/PilotDataService instances.
    """
    CDS_ID_PREFIX = "cds-"

    def __init__(self, cds_url=None):
        """ Create a ComputeDataService object.

            Keyword arguments:
            cds_url -- Reconnect to an existing CDS (optional).
        """
        # Pilot Data bookkeeping: du-id -> DataUnit, and attached PDS list.
        self.data_units = {}
        self.pilot_data_services = []

        # Pilot Compute bookkeeping: cu-id -> ComputeUnit, and attached PCS list.
        self.compute_units = {}
        self.pilot_job_services = []

        if cds_url is None:
            # Fresh CDS: mint an id and register with the coordination service.
            self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
            application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
            self.url = CoordinationAdaptor.add_cds(application_url, self)
        else:
            # Reconnect to an existing CDS entry.
            self.id = self.__get_cds_id(cds_url)
            self.url = cds_url

        # Background thread for scheduling; daemonized so it cannot keep
        # the interpreter alive after the application exits.
        self.scheduler = Scheduler()
        self.cu_queue = Queue.Queue()
        self.du_queue = Queue.Queue()
        self.stop = threading.Event()
        self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
        self.scheduler_thread.daemon = True
        self.scheduler_thread.start()

    def __get_cds_id(self, cds_url):
        """ Extract the "cds-<uuid>" component from a full CDS URL.

            Raises ValueError if the URL does not contain a CDS id
            followed by a "/" (str.index semantics).
        """
        start = cds_url.index(self.CDS_ID_PREFIX)
        end = cds_url.index("/", start)
        return cds_url[start:end]

    ###########################################################################
    # Pilot Compute

    def add_pilot_compute_service(self, pcs):
        """ Add a PilotComputeService to this CDS.

            @param pcs: The PilotComputeService to which this
                        ComputeDataService will connect.
        """
        self.pilot_job_services.append(pcs)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_compute_service(self, pcs):
        """ Remove a PilotComputeService from this CDS.

            Note that it won't cancel the PilotComputeService, it will just
            no longer be connected to this CDS.

            @param pcs: The PilotComputeService to remove from this
                        ComputeDataService.
        """
        self.pilot_job_services.remove(pcs)
        CoordinationAdaptor.update_cds(self.url, self)

    def submit_compute_unit(self, compute_unit_description):
        """ Submit a CU to this Compute Data Service.

            @param compute_unit_description: The ComputeUnitDescription from
                                             the application.
            @return: ComputeUnit object
        """
        cu = ComputeUnit(compute_unit_description, self)
        self.compute_units[cu.id] = cu
        self.cu_queue.put(cu)
        CoordinationAdaptor.update_cds(self.url, self)
        return cu

    def list_pilot_compute(self):
        """ List all pilot compute of CDS """
        # BUGFIX: was "self.pilot_job_service" (no trailing "s"), an
        # attribute that never exists -> AttributeError on every call.
        return self.pilot_job_services

    ###########################################################################
    # Pilot Data

    def add_pilot_data_service(self, pds):
        """ Add a PilotDataService.

            @param pds: The PilotDataService to add.
        """
        self.pilot_data_services.append(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def remove_pilot_data_service(self, pds):
        """ Remove a PilotDataService.

            @param pds: The PilotDataService to remove.
        """
        self.pilot_data_services.remove(pds)
        CoordinationAdaptor.update_cds(self.url, self)

    def list_pilot_data(self):
        """ List all pilot data of CDS """
        return self.pilot_data_services

    def list_data_units(self):
        """ List all DUs of CDS """
        return self.data_units.items()

    def get_data_unit(self, du_id):
        """ Return the DataUnit with the given id, or None if unknown. """
        if du_id in self.data_units:
            return self.data_units[du_id]
        return None

    def submit_data_unit(self, data_unit_description):
        """ Create a data unit object and bind it to a physical resource
            (a pilotdata) via the background scheduler.
        """
        du = DataUnit(pilot_data=None,
                      data_unit_description=data_unit_description)
        self.data_units[du.id] = du
        self.du_queue.put(du)
        # queue currently not persisted
        CoordinationAdaptor.update_cds(self.url, self)
        return du

    def cancel(self):
        """ Cancel the CDS.
            All associated PD and PC objects are canceled.
        """
        # Terminate background thread (it polls self.stop between queue reads).
        self.stop.set()
        CoordinationAdaptor.delete_cds(self.url)

    def wait(self):
        """ Waits for CUs and DUs.

            Return after all DUs have been placed (i.e. in state Running) and
            all CUs have been completed (i.e. in state Done) or if a fault has
            occurred or the user has cancelled a CU or DU.
        """
        try:
            logger.debug("### START WAIT ###")
            self.cu_queue.join()
            logger.debug("CU queue empty")
            self.du_queue.join()
            logger.debug("DU queue empty")

            for i in self.data_units.values():
                i.wait()
            logger.debug("DUs done")

            for i in self.compute_units.values():
                i.wait()
            logger.debug("CUs done")

            logger.debug("### END WAIT ###")
        except KeyboardInterrupt:
            # BUGFIX: was a bare "except:" that converted *any* exception
            # (including programming errors) into KeyboardInterrupt. Only an
            # actual Ctrl-C now triggers the cancel-and-reraise path.
            logger.debug("Ctrl-c detected. Terminating ComputeDataService...")
            self.cancel()
            raise

    def get_state(self):
        "@return: State of the ComputeDataService"
        # NOTE(review): no visible code ever assigns self.state on this class;
        # presumably a subclass or mixin sets it -- confirm before relying on it.
        return self.state

    def get_id(self):
        "@return: id of ComputeDataService"
        return str(self.id)

    def __del__(self):
        """ Make sure that background thread terminates """
        self.cancel()

    ###########################################################################
    # Internal Scheduling

    def __update_scheduler_resources(self):
        """ Push the currently known pilots (data and compute) into the
            scheduler so placement decisions see fresh resource state.
        """
        logger.debug("__update_scheduler_resources")
        pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
        self.scheduler.set_pilot_data(pd)
        pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
        logger.debug("Pilot-Jobs: " + str(pj))
        self.scheduler.set_pilot_jobs(pj)

    def _schedule_du(self, du):
        """ Schedule DU to a suitable pilot data.

            Currently one level of scheduling is used:
            1.) Add all resources managed by the contained PDS
            2.) Select one resource
        """
        logger.debug("Schedule PD")
        self.__update_scheduler_resources()
        selected_pilot_data = self.scheduler.schedule_pilot_data(du.data_unit_description)
        return selected_pilot_data

    def _schedule_cu(self, cu):
        """ Schedule CU to a suitable pilot job via the scheduler. """
        logger.debug("Schedule CU")
        self.__update_scheduler_resources()
        selected_pilot_job = self.scheduler.schedule_pilot_job(cu.compute_unit_description)
        return selected_pilot_job

    def _scheduler_thread(self):
        """ Background loop: drain du_queue and cu_queue, binding each unit
            to a pilot. Unplaceable units are re-queued; the loop exits when
            self.stop is set (see cancel()).
        """
        while not self.stop.is_set():
            # --- Data Units -------------------------------------------------
            try:
                du = self.du_queue.get(True, 1)
                # check whether this is a real du object
                if isinstance(du, DataUnit):
                    pd = self._schedule_du(du)
                    if pd is not None:
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        logger.debug("Transfer to PD finished.")
                        du._update_state(State.Running)
                        self.du_queue.task_done()
                    else:
                        # No suitable pilot data yet -- retry later.
                        self.du_queue.task_done()
                        self.du_queue.put(du)
            except Queue.Empty:
                pass

            # --- Compute Units ----------------------------------------------
            try:
                cu = self.cu_queue.get(True, 1)
                if isinstance(cu, ComputeUnit):
                    pj = self._schedule_cu(cu)
                    if pj is not None:
                        cu = self.__expand_working_directory(cu, pj)
                        pj._submit_cu(cu)
                        self.cu_queue.task_done()
                    else:
                        logger.debug("No resource found.")
                        self.cu_queue.task_done()
                        self.cu_queue.put(cu)
            except Queue.Empty:
                pass
            except Exception:
                # BUGFIX: was a bare "except:" which would also swallow
                # SystemExit/KeyboardInterrupt. Log and keep the thread alive.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.error("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
                logger.error("*** print_exception:")
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stderr)

            # Back off when there is nothing to schedule.
            if self.cu_queue.empty() and self.du_queue.empty():
                time.sleep(5)
        logger.debug("Re-Scheduler terminated")

    def __expand_working_directory(self, compute_unit, pilot_job):
        """ DEPRECATED capability!
            Expand pilotdata:// url specified in the compute_unit_description
            to a local url on the machine of the PJ.

            pilotdata://localhost/434bfc5c-23fd-11e1-a43f-00264a13ca4c
            is mapped to
            /tmp/pilotstore//434bfc5c-23fd-11e1-a43f-00264a13ca4c
            on the machine running pilot_job.

            The previous implementation was commented out and has been
            removed; the CU passes through unchanged.
        """
        return compute_unit

    def __stage_du_to_pj(self, pilotdata, pilotjob):
        """ Stage required files to machine of pilot job.
            Placeholder -- not implemented.
        """
        pass

    def __find_pd_at_pj_resource(self, pilotjob):
        """ Find a pilot data co-located with the pilot job's resource.
            Placeholder -- not implemented.
        """
        pass