def __init__(self, job, momClient, ltaClient,
             exchange=DEFAULT_BUSNAME, broker=DEFAULT_BROKER,
             user=getpass.getuser(), globus_timeout=GLOBUS_TIMEOUT,
             minimal_SIP=False):
    self.status = IngestPipeline.STATUS_INITIALIZING
    self.hostname = socket.gethostname()
    self.job = job
    self.momClient = momClient
    self.ltaClient = ltaClient
    self.user = user
    if not self.user:
        self.user = getpass.getuser()
    self.globus_timeout = globus_timeout
    self.minimal_SIP = minimal_SIP
    self.event_bus = ToBus(exchange, broker=broker, connection_log_level=logging.DEBUG)

    # bookkeeping for the dataproduct being ingested, taken from the job dict
    self.Project = job['Project']
    self.DataProduct = job['DataProduct']
    self.FileName = job['FileName']
    self.JobId = job['JobId']
    self.ArchiveId = int(job['ArchiveId'])
    self.ObsId = int(job['ObservationId'])
    self.ExportID = job['ExportID']
    self.Type = job["Type"]
    self.HostLocation = job['Location'].partition(':')[0]
    self.Location = job['Location'].partition(':')[2]

    # transfer state, filled in as the pipeline progresses
    self.ticket = ''
    self.FileSize = '0'
    self.MD5Checksum = ''
    self.Adler32Checksum = ''
    self.ChecksumResult = False
    self.SIP = ''
    self.PrimaryUri = ''
    self.SecondaryUri = ''
    self.lta_site = ''

def __init__(self, exchange=DEFAULT_BUSNAME, dbcreds=None, broker=DEFAULT_BROKER):
    super(RADBPGListener, self).__init__(dbcreds=dbcreds)

    self.event_bus = ToBus(exchange=exchange, broker=broker)
    self.radb = RADatabase(dbcreds=dbcreds)

    self.subscribe('task_update', self.onTaskUpdated)
    self.subscribe('task_insert', self.onTaskInserted)
    self.subscribe('task_delete', self.onTaskDeleted)

    self.subscribe('task_predecessor_insert_column_task_id', self.onTaskPredecessorChanged)
    self.subscribe('task_predecessor_update_column_task_id', self.onTaskPredecessorChanged)
    self.subscribe('task_predecessor_delete_column_task_id', self.onTaskPredecessorChanged)

    self.subscribe('task_predecessor_insert_column_predecessor_id', self.onTaskSuccessorChanged)
    self.subscribe('task_predecessor_update_column_predecessor_id', self.onTaskSuccessorChanged)
    self.subscribe('task_predecessor_delete_column_predecessor_id', self.onTaskSuccessorChanged)

    # when the specification starttime and endtime are updated, that affects the task as well
    self.subscribe('specification_update', self.onSpecificationUpdated)

    self.subscribe('resource_claim_update', self.onResourceClaimUpdated)
    self.subscribe('resource_claim_insert', self.onResourceClaimInserted)
    self.subscribe('resource_claim_delete', self.onResourceClaimDeleted)

    self.subscribe('resource_availability_update', self.onResourceAvailabilityUpdated)
    self.subscribe('resource_capacity_update', self.onResourceCapacityUpdated)

def _send_notification(user, host, project, trigger_id, metadata):
    try:
        content = {"user": user,
                   "host": host,
                   "project": project,
                   "trigger_id": trigger_id,
                   "metadata": metadata}
        msg = EventMessage(subject=DEFAULT_TRIGGER_NOTIFICATION_SUBJECT, content=content)
        with ToBus(exchange=DEFAULT_BUSNAME, broker=DEFAULT_BROKER) as notification_bus:
            notification_bus.send(msg)
    except Exception as err:
        logger.error("Could not send notification ->" + str(err))

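# A minimal usage sketch of _send_notification() above. All argument values
# below are hypothetical placeholders; in the real trigger service they come
# from the request context.
_send_notification(user='lofarsys',
                   host='lcs023.control.lofar',
                   project='LC9_001',
                   trigger_id=42,
                   metadata='<trigger>...</trigger>')
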
class RATaskSpecifiedOTDBEventMessageHandler(OTDBEventMessageHandler):
    def __init__(self, exchange=DEFAULT_BUSNAME, broker=DEFAULT_BROKER):
        """
        :param exchange: name of the exchange to listen on.
        :param broker: name of the broker to connect to.
        """
        super().__init__()
        self.otdbrpc = OTDBRPC.create(exchange=exchange, broker=broker)
        self.radbrpc = RADBRPC.create(exchange=exchange, broker=broker)
        self.momrpc = MoMQueryRPC.create(exchange=exchange, broker=broker)
        self.send_bus = ToBus(exchange=exchange, broker=broker)

    def start_handling(self):
        self.otdbrpc.open()
        self.momrpc.open()
        self.radbrpc.open()
        self.send_bus.open()

    def stop_handling(self):
        self.otdbrpc.close()
        self.momrpc.close()
        self.radbrpc.close()
        self.send_bus.close()

    # This is mainly to trigger the propagation of misc field values through read_from_mom
    # and then sending them to the RA to OTDB Service in the resource assigner.
    # Might need to be a separate service if we take on more mom-otdb-adapter function.
    def onObservationApproved(self, main_id, modification_time):
        self.createAndSendSpecifiedTask(main_id, 'approved')

    def onObservationPrescheduled(self, main_id, modification_time):
        self.createAndSendSpecifiedTask(main_id, 'prescheduled')

    def createAndSendSpecifiedTask(self, main_id, status):
        spec = Specification(self.otdbrpc, self.momrpc, self.radbrpc)
        spec.status = status
        spec.read_from_OTDB_with_predecessors(main_id, "otdb", {})
        spec.read_from_mom()
        spec.update_start_end_times()
        # spec.insert_into_radb() is still done in resource_assigner for now.

        result_tree = spec.as_dict()
        if spec.status == status:
            logger.info("Sending result: %s" % result_tree)
            # Put result on bus
            msg = EventMessage(subject=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_SUBJECT, content=result_tree)
            with self.send_bus:
                self.send_bus.send(msg)
            logger.info("Result sent")
        else:
            logger.warning("Problem retrieving task %i" % main_id)

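# Sketch of how a handler class like the one above is typically wired into a
# bus listener. The OTDBBusListener(handler_type, handler_kwargs=..., ...) call
# signature is an assumption, inferred from the CacheManager snippet further below.
listener = OTDBBusListener(RATaskSpecifiedOTDBEventMessageHandler,
                           handler_kwargs={},
                           exchange=DEFAULT_BUSNAME,
                           broker=DEFAULT_BROKER,
                           num_threads=1)
listener.start_listening()
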
import os
import logging
import traceback

from lofar.triggerservices.trigger_service_rpc import TriggerRPC
from lofar.specificationservices.specification_service_rpc import SpecificationRPC
from lofar.mom.momqueryservice.momqueryrpc import MoMQueryRPC
from lofar.messaging import ToBus, EventMessage, RPCException, DEFAULT_BROKER, DEFAULT_BUSNAME

from rest_framework import views  # assumed import: this snippet references views.APIView but did not show where 'views' comes from

from .config import TRIGGER_SUBMISSION_NOTIFICATION_SUBJECT

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

triggerrpc = TriggerRPC()
specrpc = SpecificationRPC()
momrpc = MoMQueryRPC()

notification_bus = ToBus()

# The base URL for trigger specifications
base_url = 'https://lofar.astron.nl/mom3/user/main/explorer/setUpExplorer.do?action=open&object=FP1_CF'
if "LOFARENV" in os.environ:
    lofar_environment = os.environ['LOFARENV']
    if lofar_environment != "PRODUCTION":
        base_url = 'http://lofartest.control.lofar:8080/mom3/user/main/explorer/setUpExplorer.do?action=open&object=FP1_CF'


class TriggerListView(views.APIView):
    def __init__(self, **kwargs):
        super(TriggerListView, self).__init__(**kwargs)
        notification_bus.open()

class TestPipelineControl(unittest.TestCase):
    def setUp(self):
        # Create a random bus
        self.busname = "%s-%s" % (sys.argv[0], str(uuid.uuid4())[:8])
        self.bus = ToBus(self.busname, {"create": "always", "delete": "always", "node": {"type": "topic"}})
        self.bus.open()
        self.addCleanup(self.bus.close)

        # Patch SLURM
        class MockSlurm(object):
            def __init__(self, *args, **kwargs):
                self.scheduled_jobs = {}

            def submit(self, jobName, *args, **kwargs):
                print("Schedule SLURM job '%s': %s, %s" % (jobName, args, kwargs))
                self.scheduled_jobs[jobName] = (args, kwargs)
                # Return job ID
                return "42"

            def jobid(self, jobname):
                if jobname in ["1", "2", "3"]:
                    return jobname
                # "4" is an observation, so no SLURM job
                return None

        patcher = patch('lofar.mac.PipelineControl.Slurm')
        patcher.start().side_effect = MockSlurm
        self.addCleanup(patcher.stop)

        # Catch functions to prevent running executables
        patcher = patch('lofar.mac.PipelineControl.Parset.dockerImage')
        patcher.start().return_value = "lofar-pipeline:trunk"
        self.addCleanup(patcher.stop)

        # ================================
        # Setup mock otdb service
        # ================================

        class MockOTDBService(MessageHandlerInterface):
            def __init__(self, notification_bus):
                """
                notification_bus: bus to send state changes to
                """
                super(MockOTDBService, self).__init__()
                self.service2MethodMap = {
                    "TaskGetSpecification": self.TaskGetSpecification,
                    "TaskSetStatus": self.TaskSetStatus,
                }
                self.notification_bus = notification_bus

            def TaskGetSpecification(self, OtdbID):
                print("***** TaskGetSpecification(%s) *****" % (OtdbID,))

                if OtdbID == 1:
                    predecessors = "[2,3,4]"
                elif OtdbID == 2:
                    predecessors = "[3]"
                elif OtdbID == 3:
                    predecessors = "[]"
                elif OtdbID == 4:
                    return {"TaskSpecification": {
                        "Version.number": "1",
                        PARSET_PREFIX + "Observation.otdbID": str(OtdbID),
                        PARSET_PREFIX + "Observation.Scheduler.predecessors": "[]",
                        PARSET_PREFIX + "Observation.processType": "Observation",
                        PARSET_PREFIX + "Observation.Cluster.ProcessingCluster.clusterName": "CEP4",
                        PARSET_PREFIX + "Observation.stopTime": "2016-01-01 01:00:00",
                    }}
                else:
                    raise Exception("Invalid OtdbID: %s" % OtdbID)

                return {"TaskSpecification": {
                    "Version.number": "1",
                    PARSET_PREFIX + "Observation.otdbID": str(OtdbID),
                    PARSET_PREFIX + "Observation.Scheduler.predecessors": predecessors,
                    PARSET_PREFIX + "Observation.processType": "Pipeline",
                    PARSET_PREFIX + "Observation.Cluster.ProcessingCluster.clusterName": "CEP4",
                }}

            def TaskSetStatus(self, OtdbID, NewStatus, UpdateTimestamps):
                print("***** TaskSetStatus(%s,%s) *****" % (OtdbID, NewStatus))

                # Broadcast the state change
                content = {"treeID": OtdbID,
                           "state": NewStatus,
                           "time_of_change": datetime.datetime.utcnow()}
                msg = EventMessage(context=DEFAULT_OTDB_NOTIFICATION_SUBJECT, content=content)
                self.notification_bus.send(msg)

                return {'OtdbID': OtdbID, 'MomID': None, 'Success': True}

        service = Service(DEFAULT_OTDB_SERVICENAME,
                          MockOTDBService,
                          busname=self.busname,
                          use_service_methods=True,
                          handler_args={"notification_bus": self.bus})
        service.start_listening()
        self.addCleanup(service.stop_listening)

        # ================================
        # Setup listener to catch result
        # of our service
        # ================================

        listener = OTDBBusListener(busname=self.busname)
        listener.start_listening()
        self.addCleanup(listener.stop_listening)

        self.trigger = MethodTrigger(listener, "onObservationQueued")

    def test_setStatus(self):
        with PipelineControl(otdb_notification_busname=self.busname, otdb_service_busname=self.busname) as pipelineControl:
            pipelineControl._setStatus(12345, "queued")

            # Wait for the status to propagate
            self.assertTrue(self.trigger.wait())
            self.assertEqual(self.trigger.args[0], 12345)

    def testNoPredecessors(self):
        """
        Request to start a simulated obsid 3, with the following predecessor tree:

          3 requires nothing
        """
        with PipelineControl(otdb_notification_busname=self.busname, otdb_service_busname=self.busname) as pipelineControl:
            # Send fake status update
            pipelineControl._setStatus(3, "scheduled")

            # Wait for message to arrive
            self.assertTrue(self.trigger.wait())

            # Verify message
            self.assertEqual(self.trigger.args[0], 3)  # otdbId

            # Check if job was scheduled
            self.assertIn("3", pipelineControl.slurm.scheduled_jobs)
            self.assertIn("3-aborted", pipelineControl.slurm.scheduled_jobs)

    def testPredecessors(self):
        """
        Request to start a simulated obsid 1, with the following predecessor tree:

          1 requires 2, 3, 4
          2 requires 3
          4 is an observation
        """
        with PipelineControl(otdb_notification_busname=self.busname, otdb_service_busname=self.busname) as pipelineControl:
            # Send fake status update
            pipelineControl._setStatus(1, "scheduled")

            # Wait for message to arrive
            self.assertTrue(self.trigger.wait())

            # Verify message
            self.assertEqual(self.trigger.args[0], 1)  # otdbId

            # Check if job was scheduled
            self.assertIn("1", pipelineControl.slurm.scheduled_jobs)
            self.assertIn("1-aborted", pipelineControl.slurm.scheduled_jobs)

            # Earliest start of this job > stop time of observation
            for p in pipelineControl.slurm.scheduled_jobs["1"][1]["sbatch_params"]:
                if p.startswith("--begin="):
                    begin = datetime.datetime.strptime(p, "--begin=%Y-%m-%dT%H:%M:%S")
                    self.assertGreater(begin, datetime.datetime(2016, 1, 1, 1, 0, 0))
                    break
            else:
                self.assertTrue(False, "--begin parameter not given to SLURM job")

class RATaskSpecified(OTDBBusListener):
    def __init__(self, servicename, otdb_busname=None, my_busname=None, **kwargs):
        super(RATaskSpecified, self).__init__(busname=otdb_busname, subject="TaskStatus", **kwargs)
        self.parset_rpc = RPC(service="TaskSpecification", busname=otdb_busname)
        self.send_bus = ToBus("%s/%s" % (my_busname, servicename))

    def start_listening(self, **kwargs):
        self.parset_rpc.open()
        self.send_bus.open()
        super(RATaskSpecified, self).start_listening(**kwargs)

    def stop_listening(self, **kwargs):
        super(RATaskSpecified, self).stop_listening(**kwargs)
        self.send_bus.close()
        self.parset_rpc.close()

    def onObservationPrescheduled(self, treeId, modificationTime):
        logger.info("Processing obs ID %s", treeId)

        # Request the parset
        main_obsID = treeId
        main_parset, _ = self.parset_rpc(OtdbID=main_obsID)

        # Construct a dict of all the parsets we retrieved
        parsets = {main_obsID: main_parset}

        logger.info("Processing predecessors")

        # Collect the initial set of predecessors
        request_obsIDs = set(predecessors(main_parset))

        logger.info("Processing %s", request_obsIDs)

        # Iterate recursively over all known predecessor obsIDs, and request their parsets
        while request_obsIDs:
            obsID = request_obsIDs.pop()

            if obsID in parsets:
                # Predecessor lists can overlap -- we already have this one
                continue

            logger.info("Fetching predecessor %s", obsID)

            # Request predecessor parset
            parsets[obsID], _ = self.parset_rpc(OtdbID=obsID)

            # Add the list of predecessors
            request_obsIDs = request_obsIDs.union(predecessors(parsets[obsID]))

        # Convert parsets to resource indicators
        logger.info("Extracting resource indicators")
        resourceIndicators = dict([(str(obsID), resourceIndicatorsFromParset(parset))
                                   for (obsID, parset) in parsets.items()])

        # Construct and send result message
        logger.info("Sending result")
        result = {
            "sasID": main_obsID,
            "state": "prescheduled",
            "time_of_change": modificationTime,
            "resource_indicators": resourceIndicators,
        }

        # Put result on bus
        msg = EventMessage(content=result)
        self.send_bus.send(msg)

        logger.info("Result sent")

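# A self-contained sketch of the predecessor-closure loop in
# onObservationPrescheduled() above, with a stubbed predecessor table instead
# of the real parset_rpc. The IDs are invented.
fake_predecessors = {1: [2, 3, 4], 2: [3], 3: [], 4: []}

def fetch_all(main_id):
    known = {main_id: fake_predecessors[main_id]}
    todo = set(known[main_id])
    while todo:
        obs_id = todo.pop()
        if obs_id in known:
            continue  # predecessor lists can overlap -- we already have this one
        known[obs_id] = fake_predecessors[obs_id]
        todo |= set(known[obs_id])
    return known

print(fetch_all(1))  # all four obs IDs end up in the dict
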
# Check the invocation arguments
parser = OptionParser("%prog [options]")
parser.add_option("-B", "--busname", dest="busname", type="string",
                  default=DEFAULT_OTDB_NOTIFICATION_BUSNAME,
                  help="Busname or queue-name the status changes are published on. [default: %default]")
parser.add_option_group(dbcredentials.options_group(parser))
(options, args) = parser.parse_args()

dbcreds = dbcredentials.parse_options(options)

# Set signal handler to stop the program in a neat way.
signal.signal(signal.SIGINT, signal_handler)

alive = True
connected = False
otdb_connection = None
with ToBus(options.busname) as send_bus:
    while alive:
        while alive and not connected:
            # Connect to the database
            try:
                otdb_connection = pg.connect(**dbcreds.pg_connect_options())
                connected = True
                logger.info("Connected to database %s" % (dbcreds,))

                # Get list of allowed tree states
                allowed_states = {}
                for (state_nr, name) in otdb_connection.query("select id,name from treestate").getresult():
                    allowed_states[state_nr] = name
            except (TypeError, SyntaxError, pg.InternalError) as e:
                connected = False
                logger.error("Not connected to database %s, retry in 5 seconds: %s" % (dbcreds, e))

class RADBPGListener(PostgresListener):
    def __init__(self,
                 busname=DEFAULT_NOTIFICATION_BUSNAME,
                 notification_prefix=DEFAULT_NOTIFICATION_PREFIX,
                 dbcreds=None,
                 broker=None):
        super(RADBPGListener, self).__init__(dbcreds.host, dbcreds.database, dbcreds.user, dbcreds.password)

        self.notification_prefix = notification_prefix
        self.event_bus = ToBus(busname, broker=broker)

        self.subscribe('task_update_with_task_view', self.onTaskUpdated)
        self.subscribe('task_insert_with_task_view', self.onTaskInserted)
        self.subscribe('task_delete', self.onTaskDeleted)

        # when the specification starttime and endtime are updated, that affects the task as well,
        # so subscribe to specification_update, and use task_view as view_for_row
        self.subscribe('specification_update_with_task_view', self.onSpecificationUpdated)

        self.subscribe('resource_claim_update_with_resource_claim_view', self.onResourceClaimUpdated)
        self.subscribe('resource_claim_insert_with_resource_claim_view', self.onResourceClaimInserted)
        self.subscribe('resource_claim_delete', self.onResourceClaimDeleted)

        self.subscribe('resource_availability_update', self.onResourceAvailabilityUpdated)
        self.subscribe('resource_capacity_update', self.onResourceCapacityUpdated)

    def onTaskUpdated(self, payload=None):
        self._sendNotification('TaskUpdated', payload, ['starttime', 'endtime'])

    def onTaskInserted(self, payload=None):
        self._sendNotification('TaskInserted', payload, ['starttime', 'endtime'])

    def onTaskDeleted(self, payload=None):
        self._sendNotification('TaskDeleted', payload)

    def onSpecificationUpdated(self, payload=None):
        # when the specification starttime and endtime are updated, that affects the task as well,
        # so send a TaskUpdated notification
        self._sendNotification('TaskUpdated', payload, ['starttime', 'endtime'])

    def onResourceClaimUpdated(self, payload=None):
        self._sendNotification('ResourceClaimUpdated', payload, ['starttime', 'endtime'])

    def onResourceClaimInserted(self, payload=None):
        self._sendNotification('ResourceClaimInserted', payload, ['starttime', 'endtime'])

    def onResourceClaimDeleted(self, payload=None):
        self._sendNotification('ResourceClaimDeleted', payload)

    def onResourceAvailabilityUpdated(self, payload=None):
        self._sendNotification('ResourceAvailabilityUpdated', payload)

    def onResourceCapacityUpdated(self, payload=None):
        self._sendNotification('ResourceCapacityUpdated', payload)

    def __enter__(self):
        super(RADBPGListener, self).__enter__()
        self.event_bus.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super(RADBPGListener, self).__exit__(exc_type, exc_val, exc_tb)
        self.event_bus.close()

    def _formatTimestampsAsIso(self, fields, contentDict):
        '''Convert all requested fields in the contentDict to proper datetime objects.
        In postgres we use timestamps without timezone; by convention we only enter utc values.
        But when they are json-encoded by postgres, they are not properly formatted in isoformat
        with a 'Z' at the end. So, parse the requested fields, and return them as datetime.
        '''
        try:
            for state in ('old', 'new'):
                if state in contentDict:
                    for field in fields:
                        try:
                            if contentDict[state] and field in contentDict[state]:
                                timestampStr = contentDict[state][field]
                                formatStr = '%Y-%m-%dT%H:%M:%S' if 'T' in timestampStr else '%Y-%m-%d %H:%M:%S'
                                if timestampStr.rfind('.') > -1:
                                    formatStr += '.%f'
                                timestamp = datetime.strptime(timestampStr, formatStr)
                                contentDict[state][field] = timestamp
                        except Exception as e:
                            logger.error('Could not convert field \'%s\' to datetime: %s' % (field, e))

            return contentDict
        except Exception as e:
            logger.error('Could not parse payload: %s\n%s' % (contentDict, e))

    def _sendNotification(self, subject, payload, timestampFields=None):
        try:
            content = json.loads(payload)

            if 'new' in content and content['new'] and 'old' in content and content['old']:
                # check if new and old are equal.
                # however, new and old can be based on different views,
                # so, only check the values for the keys they have in common
                new_keys = set(content['new'].keys())
                old_keys = set(content['old'].keys())
                common_keys = new_keys & old_keys
                equal_valued_keys = [k for k in common_keys if content['new'][k] == content['old'][k]]
                if len(equal_valued_keys) == len(common_keys):
                    logger.info('new and old values are equal, not sending notification. %s' % (content['new']))
                    return

            if timestampFields:
                content = self._formatTimestampsAsIso(timestampFields, content)
        except Exception as e:
            logger.error('Could not parse payload: %s\n%s' % (payload, e))
            content = None

        try:
            msg = EventMessage(context=self.notification_prefix + subject, content=content)
            logger.info('Sending notification %s: %s' % (subject, str(content).replace('\n', ' ')))
            self.event_bus.send(msg)
        except Exception as e:
            logger.error(str(e))

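# A small standalone illustration of the old/new dedup check in
# _sendNotification() above: only keys common to both rows are compared,
# because 'old' and 'new' can be based on different views. Sample payload is made up.
old = {'id': 7, 'status': 'scheduled', 'starttime': '2016-01-01 00:00:00'}
new = {'id': 7, 'status': 'scheduled'}  # based on a different view: fewer keys

common_keys = set(new) & set(old)
if all(new[k] == old[k] for k in common_keys):
    print('new and old values are equal, notification would be suppressed')
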
class RATaskSpecified(OTDBBusListener):
    def __init__(self,
                 otdb_notification_busname=DEFAULT_OTDB_NOTIFICATION_BUSNAME,
                 otdb_notification_subject=DEFAULT_OTDB_NOTIFICATION_SUBJECT,
                 otdb_service_busname=DEFAULT_OTDB_SERVICE_BUSNAME,
                 otdb_service_subject=DEFAULT_OTDB_SERVICENAME,
                 notification_busname=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_BUSNAME,
                 notification_subject=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_SUBJECT,
                 broker=None, **kwargs):
        super(RATaskSpecified, self).__init__(busname=otdb_notification_busname,
                                              subject=otdb_notification_subject, **kwargs)
        self.otdbrpc = OTDBRPC(busname=otdb_service_busname,
                               servicename=otdb_service_subject,
                               broker=broker)  ## , ForwardExceptions=True hardcoded in RPCWrapper right now
        self.send_bus = ToBus("%s/%s" % (notification_busname, notification_subject))

    def start_listening(self, **kwargs):
        self.otdbrpc.open()
        self.send_bus.open()
        super(RATaskSpecified, self).start_listening(**kwargs)

    def stop_listening(self, **kwargs):
        super(RATaskSpecified, self).stop_listening(**kwargs)
        self.send_bus.close()
        self.otdbrpc.close()

    def get_predecessors(self, parset):
        """ Extract the list of predecessor obs IDs from the given parset. """
        key = PARSET_PREFIX + "Observation.Scheduler.predecessors"
        stringlist = PyParameterValue(str(parset[key]), True).getStringVector()

        # Key contains values starting with 'S' = Scheduler, 'L'/'T' = OTDB, 'M' = MoM
        # 'S' we can probably ignore? Might be only internal in the Scheduler?
        result = []
        for s in stringlist:
            try:
                # Made the source a string for readability, but it's not efficient
                if s.startswith('M'):
                    result.append({'source': 'mom', 'id': int(s[1:])})
                elif s.startswith('L') or s.startswith('T'):
                    result.append({'source': 'otdb', 'id': int(s[1:])})
                else:  # 'S'
                    logger.info("found a predecessor ID I can't handle: %s" % s)
                    result.append({'source': 'other', 'id': int(s[1:])})
            except ValueError:
                logger.warning("found a predecessor ID that I can't parse %s" % s)

        return result

    def get_specification_with_predecessors(self, id, id_source, state, found_parsets):
        logger.info("Processing ID %s from %s" % (id, id_source))
        if id_source == "other":
            return None
        elif id_source == "mom":
            otdb_id = self.otdbrpc.taskGetIDs(mom_id=id)['otdb_id']
        elif id_source == "otdb":
            otdb_id = id
        else:
            logger.warning("Error in understanding id %s", id)

        logger.info("Processing OTDB ID %s", otdb_id)
        result = {"otdb_id": otdb_id, "predecessors": []}
        if state:
            result["state"] = state  # TODO should be status not state
        else:
            pass  # otdbrpc.taskGetStatus not implemented and maybe not needed?

        if otdb_id in found_parsets:
            parset = found_parsets[otdb_id]
        else:
            parset = self.otdbrpc.taskGetSpecification(otdb_id=otdb_id)['specification']
            found_parsets[otdb_id] = parset

        logger.info("parset [%s]: %s" % (otdb_id, parset))
        result['specification'] = resourceIndicatorsFromParset(parset)

        key = PARSET_PREFIX + "Observation.processSubtype"
        result['task_type'], result['task_subtype'] = convertSchedulerProcessSubtype(parset.get(key, ""))

        logger.info("Processing predecessors")
        predecessor_ids = self.get_predecessors(parset)
        for id in predecessor_ids:
            predecessor_result = self.get_specification_with_predecessors(id['id'], id['source'], "", found_parsets)
            if predecessor_result:
                result["predecessors"].append(predecessor_result)

        return result

    def onObservationApproved(self, main_id, modificationTime):
        self.createAndSendSpecifiedTask(main_id, 'approved')

    def onObservationPrescheduled(self, main_id, modificationTime):
        self.createAndSendSpecifiedTask(main_id, 'prescheduled')

    def createAndSendSpecifiedTask(self, main_id, status):
        # Construct root node of tree
        resultTree = self.get_specification_with_predecessors(main_id, "otdb", status, {})
        logger.info("Sending result: %s" % resultTree)

        # Put result on bus
        msg = EventMessage(content=resultTree)
        self.send_bus.send(msg)
        logger.info("Result sent")

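# A standalone illustration of the predecessor-ID prefix convention that
# get_predecessors() above relies on ('M' = MoM, 'L'/'T' = OTDB, anything
# else, e.g. 'S', = other). The sample IDs are invented.
def classify(pred_id):
    source = {'M': 'mom', 'L': 'otdb', 'T': 'otdb'}.get(pred_id[0], 'other')
    return {'source': source, 'id': int(pred_id[1:])}

print([classify(s) for s in ['M422064', 'L563411', 'T101', 'S12']])
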
def create_service(busname, dbcreds, state_file_path='~/.lofar/otdb_treestatusevent_state'):
    alive = True
    connected = False
    otdb_connection = None
    with ToBus(busname) as send_bus:
        while alive:
            while alive and not connected:
                # Connect to the database
                try:
                    otdb_connection = pg.connect(**dbcreds.pg_connect_options())
                    connected = True
                    logger.info("Connected to database %s" % (dbcreds,))

                    # Get list of allowed tree states
                    allowed_states = {}
                    for (state_nr, name) in otdb_connection.query("select id,name from treestate").getresult():
                        allowed_states[state_nr] = name
                except (TypeError, SyntaxError, pg.InternalError) as e:
                    connected = False
                    logger.error("Not connected to database %s, retry in 5 seconds: %s" % (dbcreds, e))
                    time.sleep(5)

            # When we are connected we can poll the database
            if connected:
                # Get start_time (= creation time of last retrieved record if any)
                try:
                    treestatuseventfilename = os.path.expanduser(state_file_path)
                    with open(treestatuseventfilename, 'r') as f:
                        line = f.readline()
                        if line.rfind('.') > 0:
                            start_time = datetime.datetime.strptime(line, "%Y-%m-%d %H:%M:%S.%f")
                        else:
                            start_time = datetime.datetime.strptime(line, "%Y-%m-%d %H:%M:%S")
                except Exception as e:
                    logger.warning(e)
                    # start scanning from events since 'now'
                    # this timestamp will be stored in the treestatuseventfilename file
                    start_time = datetime.datetime.utcnow()
                    try:
                        logger.info("creating %s" % (treestatuseventfilename,))
                        if not os.path.exists(os.path.dirname(treestatuseventfilename)):
                            os.makedirs(os.path.dirname(treestatuseventfilename))
                        with open(treestatuseventfilename, 'w') as f:
                            f.write(start_time.strftime("%Y-%m-%d %H:%M:%S"))
                    except Exception as e:
                        logger.error(e)

                try:
                    logger.debug("start_time=%s, polling database" % (start_time,))
                    record_list = PollForStatusChanges(start_time, otdb_connection)

                    for (treeid, state, modtime, creation) in record_list:
                        content = {"treeID": treeid,
                                   "state": allowed_states.get(state, "unknown_state"),
                                   "time_of_change": modtime}
                        msg = EventMessage(subject=DEFAULT_OTDB_NOTIFICATION_SUBJECT, content=content)
                        logger.info("sending message treeid %s state %s modtime %s" %
                                    (treeid, allowed_states.get(state, "unknown_state"), modtime))
                        send_bus.send(msg)

                        logger.debug("new start_time:=%s" % (creation,))
                        try:
                            with open(treestatuseventfilename, 'w') as f:
                                f.write(creation)
                        except Exception as e:
                            logger.error(e)
                except FunctionError as exc_info:
                    logger.error(exc_info)
                except Exception as e:
                    logger.error(e)

                # Redetermine the database status.
                connected = (otdb_connection and otdb_connection.status == 1)

                time.sleep(2)

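# A quick standalone illustration of the state-file convention used by
# create_service() above: a single line holding a UTC timestamp, with or
# without fractional seconds.
import datetime

for line in ("2016-01-01 01:00:00", "2016-01-01 01:00:00.123456"):
    fmt = "%Y-%m-%d %H:%M:%S.%f" if line.rfind('.') > 0 else "%Y-%m-%d %H:%M:%S"
    print(datetime.datetime.strptime(line, fmt))
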
class IngestTransferServer:
    def __init__(self,
                 exchange=DEFAULT_BUSNAME,
                 mom_credentials=None,
                 lta_credentials=None,
                 user=None,
                 broker=None,
                 max_nr_of_parallel_jobs=MAX_NR_OF_JOBS):
        self.user = user
        if not self.user:
            self.user = getpass.getuser()
        self.mom_credentials = mom_credentials
        self.lta_credentials = lta_credentials
        self.event_bus = ToBus(exchange=exchange, broker=broker)
        self.max_nr_of_parallel_jobs = max_nr_of_parallel_jobs

        self.__running_jobs = {}
        self.__lock = Lock()
        self.__prev_bytes_sent = _getBytesSent()
        self.__prev_bytes_sent_timestamp = datetime.utcnow()
        self.__prev_used_bandwidth = 0.0
        self.__running_jobs_log_timestamp = datetime.utcnow()

    def start_job(self, job_dict):
        if not self.enoughResourcesAvailable():
            raise ResourceWarning("Not enough resources available to start new job: %s" % job_dict)

        job_id = job_dict['JobId']

        if job_id in self.__running_jobs:
            raise ValueError('job %s is already running. Discarding this new job copy, and keeping the current one running...' % job_id)

        def threaded_pipeline_func(job):
            logger.info('starting job %s in the background', job_id)
            with LTAClient(self.lta_credentials.user, self.lta_credentials.password) as ltaClient, \
                 MoMClient(self.mom_credentials.user, self.mom_credentials.password) as momClient:
                jobPipeline = IngestPipeline(job, momClient, ltaClient,
                                             exchange=self.event_bus.exchange,
                                             broker=self.event_bus.broker,
                                             user=self.user)
                with self.__lock:
                    self.__running_jobs[job_id]['pipeline'] = jobPipeline

                jobPipeline.run()

        with self.__lock:
            thread = Thread(target=threaded_pipeline_func, args=[job_dict], name="transfer_thread_%s" % (job_id,))
            thread.daemon = True
            self.__running_jobs[job_id] = {'thread': thread}
            thread.start()

    def __clearFinishedJobs(self):
        try:
            with self.__lock:
                finished_job_ids = [job_id for job_id, job_thread_dict in list(self.__running_jobs.items())
                                    if not job_thread_dict['thread'].is_alive()]

                for job_id in finished_job_ids:
                    logger.info('removing finished job %s', job_id)
                    del self.__running_jobs[job_id]
        except Exception as e:
            logger.error('__clearFinishedJobs: %s', e)

    def enoughResourcesAvailable(self):
        try:
            now = datetime.utcnow()
            bytes_sent = _getBytesSent()

            if bytes_sent >= self.__prev_bytes_sent:  # bytes_sent might wrap around zero
                # compute current speed in Gbps
                speed = 8 * (bytes_sent - self.__prev_bytes_sent) / totalSeconds(now - self.__prev_bytes_sent_timestamp)

                # running average for used_bandwidth
                used_bandwidth = 0.5 * speed + 0.5 * self.__prev_used_bandwidth

                logger.debug("resources: current used_bandwidth = %s", humanreadablesize(used_bandwidth, 'bps'))

                # store for next iteration
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now
                self.__prev_used_bandwidth = used_bandwidth

                # only start new jobs if we have some bandwidth available
                # note that this is a 'soft' limit:
                # we cannot control the actual bandwidth used by the running transfers,
                # we can only refrain from starting new jobs if we already exceed MAX_USED_BANDWITH_TO_START_NEW_JOBS
                if used_bandwidth > MAX_USED_BANDWITH_TO_START_NEW_JOBS:
                    logger.warning('resources: not enough bandwidth available to start new jobs, using %s, max %s' %
                                   (humanreadablesize(used_bandwidth, 'bps'),
                                    humanreadablesize(MAX_USED_BANDWITH_TO_START_NEW_JOBS, 'bps')))
                    return False
            else:
                # wrapped around 0, just store for next iteration, do not compute anything
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now

            # only start new jobs if we have some cpu time available
            idle_cpu_percentage = psutil.cpu_times_percent().idle
            logger.debug("resources: current idle_cpu_percentage = %s%%", idle_cpu_percentage)
            if idle_cpu_percentage < 5:
                logger.warning('resources: not enough cpu power available to start new jobs, cpu_idle %s%%' %
                               idle_cpu_percentage)
                return False

            # only start new jobs if system load is not too high
            short_load_avg = os.getloadavg()[0]
            cpu_count = psutil.cpu_count()
            allowed_load = 1.5 * cpu_count
            logger.debug("resources: current short term load = %s #cpu's = %s allowed_load = %s",
                         short_load_avg, cpu_count, allowed_load)
            if short_load_avg > allowed_load:
                logger.warning('resources: system load too high (%s > %s), cannot start new jobs' %
                               (short_load_avg, allowed_load))
                return False

            # only allow 1 job at the time if swapping
            swap_memory_percentage = psutil.swap_memory().percent
            logger.debug("resources: current swap_memory_percentage = %s%%", swap_memory_percentage)
            if swap_memory_percentage > 5 and len(self.__running_jobs) > 0:
                logger.warning('resources: system swapping. not enough memory available to start new jobs')
                return False

            # only start new jobs if number of processes is not too high
            try:
                current_user = getpass.getuser()
                current_user_procs = [p for p in psutil.process_iter() if p.username() == current_user]
                current_num_user_procs = len(current_user_procs)
                allowed_num_user_procs = 64 * cpu_count

                logger.debug("resources: current num_user_procs = %s allowed_num_user_procs = %s",
                             current_num_user_procs, allowed_num_user_procs)

                if current_num_user_procs > allowed_num_user_procs:
                    logger.warning('resources: number of processes by %s too high (%s > %s), cannot start new jobs' %
                                   (current_user, current_num_user_procs, allowed_num_user_procs))
                    return False
            except Exception as e:
                logger.exception(e)

            # limit total number of parallel transferring jobs to self.max_nr_of_parallel_jobs
            with self.__lock:
                starting_threads = [job_thread_dict['thread'] for job_thread_dict in list(self.__running_jobs.values())
                                    if 'pipeline' not in job_thread_dict]
                pipelines = [job_thread_dict['pipeline'] for job_thread_dict in list(self.__running_jobs.values())
                             if 'pipeline' in job_thread_dict]
                initializing_pipelines = [pipeline for pipeline in pipelines
                                          if pipeline.status == IngestPipeline.STATUS_INITIALIZING]
                transferring_pipelines = [pipeline for pipeline in pipelines
                                          if pipeline.status == IngestPipeline.STATUS_TRANSFERRING]
                finalizing_pipelines = [pipeline for pipeline in pipelines
                                        if pipeline.status == IngestPipeline.STATUS_FINALIZING]

                num_busy_transfers = len(starting_threads) + len(initializing_pipelines) + len(transferring_pipelines)
                num_finalizing_transfers = len(finalizing_pipelines)

                logger.debug("resources: current num_busy_transfers = %s num_finalizing_transfers = %s max_nr_of_parallel_jobs = %s",
                             num_busy_transfers, num_finalizing_transfers, self.max_nr_of_parallel_jobs)

                if num_busy_transfers >= self.max_nr_of_parallel_jobs:
                    logger.warning('resources: already running %d parallel jobs (#starting=%d, #transferring=%d) limiting the total number of transferring jobs to %d' %
                                   (len(self.__running_jobs),
                                    len(initializing_pipelines) + len(starting_threads),
                                    len(transferring_pipelines),
                                    self.max_nr_of_parallel_jobs))
                    return False

                if num_finalizing_transfers >= 2 * self.max_nr_of_parallel_jobs:
                    logger.warning('resources: already waiting for %d jobs to finish (updating status/SIP to MoM and LTA). not starting new jobs until some jobs finished...' %
                                   (len(finalizing_pipelines),))
                    return False
        except Exception as e:
            logger.exception("error while checking for available resources: %s", e)

            num_running_jobs = len(self.__running_jobs)
            if num_running_jobs <= 4:
                logger.info("running %d jobs, assuming we can run 1 more: ", num_running_jobs)
                return True
            else:
                logger.warning("already running %d jobs, assuming for safety we cannot run more jobs...", num_running_jobs)
                return False

        return True

    def run(self):
        with self.event_bus:
            while True:
                try:
                    self.__clearFinishedJobs()

                    with self.__lock:
                        starting_threads = [job_thread_dict['thread'] for job_thread_dict in list(self.__running_jobs.values())
                                            if 'pipeline' not in job_thread_dict]
                        pipelines = [job_thread_dict['pipeline'] for job_thread_dict in list(self.__running_jobs.values())
                                     if 'pipeline' in job_thread_dict]
                        initializing_pipelines = [pipeline for pipeline in pipelines
                                                  if pipeline.status == IngestPipeline.STATUS_INITIALIZING]
                        transferring_pipelines = [pipeline for pipeline in pipelines
                                                  if pipeline.status == IngestPipeline.STATUS_TRANSFERRING]
                        finalizing_pipelines = [pipeline for pipeline in pipelines
                                                if pipeline.status == IngestPipeline.STATUS_FINALIZING]
                        finished_pipelines = [pipeline for pipeline in pipelines
                                              if pipeline.status == IngestPipeline.STATUS_FINISHED]

                        log_interval = 5 if self.__running_jobs else 60
                        if datetime.utcnow() - self.__running_jobs_log_timestamp > timedelta(seconds=log_interval):
                            status_log_line = ("status: running %s jobs: #starting=%d, #transferring=%d, #finalizing=%d, #finished=%d, bandwidth used on network interface(s) %s %s (%s), load=%.1f" %
                                               (len(self.__running_jobs),
                                                len(initializing_pipelines) + len(starting_threads),
                                                len(transferring_pipelines),
                                                len(finalizing_pipelines),
                                                len(finished_pipelines),
                                                NET_IF_TO_MONITOR,
                                                humanreadablesize(self.__prev_used_bandwidth, 'bps'),
                                                humanreadablesize(self.__prev_used_bandwidth / 8, 'Bps'),
                                                os.getloadavg()[0]))
                            logger.info(status_log_line)
                            self.__running_jobs_log_timestamp = datetime.utcnow()

                            msg = EventMessage(subject="%s.%s" % (INGEST_NOTIFICATION_PREFIX, 'TransferServiceStatus'),
                                               content={'ingest_server': socket.gethostname(),
                                                        'message': status_log_line})
                            msg.ttl = 3600  # remove message from queues when not picked up within 1 hour
                            self.event_bus.send(msg)

                    time.sleep(5)
                except KeyboardInterrupt:
                    break
                except Exception as e:
                    logger.error(e)

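# A standalone sketch of the bandwidth bookkeeping in enoughResourcesAvailable()
# above: speed is derived from a monotonically increasing byte counter and then
# smoothed with a 50/50 running average. The byte counts are invented.
prev_bytes = 0
prev_used_bandwidth = 0.0

def update(bytes_sent, seconds_elapsed):
    global prev_bytes, prev_used_bandwidth
    speed = 8 * (bytes_sent - prev_bytes) / seconds_elapsed  # bits per second
    prev_used_bandwidth = 0.5 * speed + 0.5 * prev_used_bandwidth  # running average
    prev_bytes = bytes_sent
    return prev_used_bandwidth

for total_bytes in (1250000, 2500000, 5000000):  # cumulative bytes at 1s ticks
    print(update(total_bytes, 1.0))
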
class CacheManager: def __init__(self, cache_path='.du_cache.py', mountpoint=CEP4_DATA_MOUNTPOINT, exchange=DEFAULT_BUSNAME, broker=DEFAULT_BROKER): self._cache_path = cache_path self.otdb_listener = OTDBBusListener( _CacheManagerOTDBEventMessageHandler, handler_kwargs={'cache_manager': self}, exchange=exchange, broker=broker, num_threads=1) self.dm_listener = DataManagementBusListener( _CacheManagerDataManagementEventMessageHandler, handler_kwargs={'cache_manager': self}, exchange=exchange, broker=broker, num_threads=1) self.event_bus = ToBus(exchange=exchange, broker=broker) self._updateCacheThread = None self._running = False self._cacheLock = RLock() self._cache = {'path_du_results': {}, 'otdb_id2path': {}} self._last_cache_write_timestamp = datetime.datetime(1970, 1, 1) self._readCacheFromDisk() # dict to hold threading Events per path to prevent expensive multiple du calls for each path self._du_threading_events = {} self._du_threading_events_lock = RLock() self.disk_usage = DiskUsage(mountpoint=mountpoint, exchange=exchange, broker=broker) def _sendDiskUsageChangedNotification(self, path, disk_usage, otdb_id=None): try: msg = EventMessage(subject='%s.DiskUsageChanged' % DEFAULT_DM_NOTIFICATION_PREFIX, content={ 'path': path, 'disk_usage': disk_usage, 'disk_usage_readable': humanreadablesize(disk_usage), 'otdb_id': otdb_id }) logger.info('Sending notification with subject %s to %s: %s', msg.subject, self.event_bus.exchange, msg.content) self.event_bus.send(msg) except Exception as e: logger.error(str(e)) def _readCacheFromDisk(self): # maybe this cache on disk is slow, if so, revert to proper db solution try: if os.path.exists(self._cache_path): with open(self._cache_path, 'r') as file: cache_from_disk = eval(file.read().strip()) #slow! with self._cacheLock: self._cache = cache_from_disk if not isinstance(self._cache, dict): self._cache = { 'path_du_results': {}, 'otdb_id2path': {} } if 'path_du_results' not in self._cache: self._cache['path_du_results'] = {} if 'otdb_id2path' not in self._cache: self._cache['otdb_id2path'] = {} except Exception as e: logger.error("Error while reading in du cache: %s", e) with self._cacheLock: self._cache = {'path_du_results': {}, 'otdb_id2path': {}} def _writeCacheToDisk(self): try: # only persist (a subset of) the cache to disk every once in a while. if datetime.datetime.utcnow( ) - self._last_cache_write_timestamp > datetime.timedelta( minutes=5): cache_str = '' with self._cacheLock: # Take a subset of the entire cache # only the path_du_results for paths at project level (like /data/projects, /data/projects/LC9_001) # Do not store path_du_results for deeper levels on disk, because that makes the disk read/write too slow, # and the deeper levels can be obtained via rhb-du calls quite fast anyway. # Furthermore, once a deeper level du results is stored in the memory cache, then it is also available for fast lookup. # We just don't store these deep levels on disk. 
                    sub_cache = {path: du_result
                                 for path, du_result in list(self._cache['path_du_results'].items())
                                 if self.getDepthToProjectsDir(path) <= 1 and du_result.get('found')}
                    cache_str = str(sub_cache)

                tmp_path = '/tmp/tmp_storagequery_cache.py'
                with open(tmp_path, 'w') as file:
                    file.write(cache_str)

                dir_path = os.path.dirname(self._cache_path)
                if dir_path:
                    os.makedirs(dir_path, exist_ok=True)
                shutil.move(tmp_path, self._cache_path)

                self._last_cache_write_timestamp = datetime.datetime.utcnow()
        except Exception as e:
            logger.error("Error while writing du cache: %s", e)

    def _updateCache(self, du_result, send_notification=True):
        if 'path' not in du_result:
            return

        path = du_result['path']
        otdb_id = du_result.get('otdb_id')

        with self._cacheLock:
            path_cache = self._cache['path_du_results']
            otdb_id2path_cache = self._cache['otdb_id2path']

            if otdb_id is None:
                # try to look up the otdb_id in the path cache
                if path in path_cache:
                    otdb_id = path_cache[path].get('otdb_id')

                # if still None, try to get the id from the path
                if otdb_id is None:
                    otdb_id = getOTDBIdFromPath(path)

            if path not in path_cache or path_cache[path]['disk_usage'] != du_result['disk_usage']:
                # update the cache entry, even when no du result found,
                # because that will save disk queries next time.
                logger.info('updating cache entry: %s', du_result)
                path_cache[path] = du_result

            if otdb_id is not None:
                otdb_id2path_cache[otdb_id] = path

            if not du_result['found']:
                # even when the du for the path is not found,
                # keep a copy in the cache for fast lookup by clients.
                # Make sure the size is 0
                du_result['disk_usage'] = 0
                du_result['disk_usage_readable'] = humanreadablesize(0)

            path_cache[path]['cache_timestamp'] = datetime.datetime.utcnow()
            path_cache[path]['needs_update'] = False

        self._writeCacheToDisk()

        if send_notification:
            self._sendDiskUsageChangedNotification(path, du_result['disk_usage'], otdb_id)

    def _invalidateCacheEntryForPath(self, path):
        with self._cacheLock:
            path_cache = self._cache['path_du_results']
            if path in path_cache:
                path_cache[path]['needs_update'] = True

    def getOtdbIdsFoundOnDisk(self):
        with self._cacheLock:
            otdb_id2path_cache = self._cache['otdb_id2path']
            return sorted(list(otdb_id2path_cache.keys()))

    def getDiskUsagesForAllOtdbIds(self, force_update=False):
        otdb_ids = self.getOtdbIdsFoundOnDisk()
        result = {}
        for otdb_id in otdb_ids:
            result[otdb_id] = self.getDiskUsageForOTDBId(otdb_id, force_update=force_update)
        return result

    def getDepthToProjectsDir(self, path):
        return len(path.replace(self.disk_usage.path_resolver.projects_path, '').strip('/').split('/'))

    def _scanProjectsTree(self):
        try:
            def addSubDirectoriesToCache(directory):
                if not self._running:
                    return

                depth = self.getDepthToProjectsDir(directory)
                MAX_SCAN_DEPTH = 2
                # depth=0 : projects
                # depth=1 : projects/<project>
                # depth=2 : projects/<project>/<obs>
                # depth=3 : projects/<project>/<obs>/<sub_dir>
                if depth > MAX_SCAN_DEPTH:
                    return

                if depth < MAX_SCAN_DEPTH:
                    logger.info('tree scan: scanning \'%s\'', directory)
                    sd_result = self.disk_usage.path_resolver.getSubDirectories(directory)

                    if sd_result['found']:
                        subdir_paths = [os.path.join(directory, sd)
                                        for sd in sd_result['sub_directories']]

                        for subdir_path in subdir_paths:
                            # recurse
                            addSubDirectoriesToCache(subdir_path)

                with self._cacheLock:
                    path_cache = self._cache['path_du_results']
                    add_empty_du_result_to_cache = directory not in path_cache

                if add_empty_du_result_to_cache:
                    logger.info('tree scan: adding \'%s\' with empty disk_usage to cache which will be du\'ed later',
                                directory)
                    empty_du_result = {'found': True,
                                       'disk_usage': None,
                                       'path': directory,
                                       'name': directory.split('/')[-1]}
                    self._updateCache(empty_du_result, send_notification=False)

                with self._cacheLock:
                    path_cache = self._cache['path_du_results']
                    if directory in path_cache:
                        # mark cache entry for directory to be updated
                        path_cache[directory]['needs_update'] = True

            addSubDirectoriesToCache(self.disk_usage.path_resolver.projects_path)
            logger.info('tree scan complete')
        except Exception as e:
            logger.exception(str(e))

    def _updateOldEntriesInCache(self):
        logger.info('starting updating old cache entries')
        while self._running:
            try:
                now = datetime.datetime.utcnow()
                with self._cacheLock:
                    path_cache = self._cache['path_du_results']
                    old_entries = [cache_entry for cache_entry in list(path_cache.values())
                                   if now - cache_entry['cache_timestamp'] > MAX_CACHE_ENTRY_AGE]
                    needs_update_entries = [cache_entry for cache_entry in list(path_cache.values())
                                            if cache_entry.get('needs_update', False)]
                    updateable_entries = old_entries + needs_update_entries

                logger.info('%s old cache entries need to be updated, #age:%s #needs_update:%s',
                            len(updateable_entries), len(old_entries), len(needs_update_entries))

                if updateable_entries:
                    # sort them oldest to newest, 'needs_update' paths first
                    def compareFunc(entry1, entry2):
                        if entry1.get('needs_update') and not entry2.get('needs_update'):
                            return -1
                        if not entry1.get('needs_update') and entry2.get('needs_update'):
                            return 1

                        #depth1 = self.getDepthToProjectsDir(entry1['path'])
                        #depth2 = self.getDepthToProjectsDir(entry2['path'])
                        #if depth1 != depth2:
                        #    # lower level dirs are sorted in front of higher level dirs
                        #    return depth1 - depth2

                        if entry1['cache_timestamp'] < entry2['cache_timestamp']:
                            return -1
                        if entry1['cache_timestamp'] > entry2['cache_timestamp']:
                            return 1
                        return 0

                    updateable_entries = sorted(updateable_entries, key=cmp_to_key(compareFunc))

                    cacheUpdateStart = datetime.datetime.utcnow()

                    for i, cache_entry in enumerate(updateable_entries):
                        try:
                            # it might be that the cache_entry was already updated via another way
                            # so only update it if still too old or needs_update
                            now = datetime.datetime.utcnow()
                            if now - cache_entry['cache_timestamp'] > MAX_CACHE_ENTRY_AGE or cache_entry.get('needs_update', False):
                                path = cache_entry.get('path')
                                if path:
                                    logger.info('_updateOldEntriesInCache: examining entry %s/%s. timestamp:%s age:%s needs_update:%s path: \'%s\'',
                                                i, len(updateable_entries),
                                                cache_entry['cache_timestamp'],
                                                format_timedelta(now - cache_entry['cache_timestamp']),
                                                cache_entry.get('needs_update', False),
                                                path)

                                    # do a full update from disk, which might be (really) slow.
                                    result = du_getDiskUsageForPath(path)
                                    logger.debug('trying to update old entry in cache: %s', result)
                                    self._updateCache(result)
                        except Exception as e:
                            logger.error(str(e))

                        if not self._running:
                            logger.info('exiting _updateCacheThread')
                            return

                        if datetime.datetime.utcnow() - cacheUpdateStart > datetime.timedelta(minutes=10):
                            # break out of the cache update loop if the full update takes more than 10 minutes;
                            # next loop we'll start with the oldest cache entries again
                            logger.info('skipping remaining %s old cache entries updates, they will be updated next time',
                                        len(updateable_entries) - i)
                            break

                # update the CEP4 capacities in the RADB once in a while...
self._updateCEP4CapacitiesInRADB() #sleep for a while, (or stop if requested) for i in range(60): sleep(1) if not self._running: logger.info('exiting _updateCacheThread') return except Exception as e: logger.exception(str(e)) def _updateCEP4CapacitiesInRADB(self): try: df_result = self.disk_usage.getDiskFreeSpace() if df_result['found']: #get the total used space, and update the resource availability in the radb radbrpc = self.disk_usage.path_resolver.radbrpc storage_resources = radbrpc.getResources( resource_types='storage', include_availability=True) cep4_storage_resource = next(x for x in storage_resources if 'CEP4' in x['name']) total_capacity = df_result.get('disk_size') used_capacity = df_result.get('disk_usage') available_capacity = df_result.get('disk_free') logger.info( 'updating capacities for resource \'%s\' (id=%s) in the RADB: total=%s, used=%s, available=%s', cep4_storage_resource['name'], cep4_storage_resource['id'], humanreadablesize(total_capacity), humanreadablesize(used_capacity), humanreadablesize(available_capacity)) radbrpc.updateResourceAvailability( cep4_storage_resource['id'], available_capacity=available_capacity, total_capacity=total_capacity) except Exception as e: logger.error('_updateCEP4CapacitiesInRADB: %s', e) def open(self): logger.info("opening storagequeryservice cache...") self._running = True self.disk_usage.open() self.event_bus.open() self._scanProjectsTree() self._updateCacheThread = Thread(target=self._updateOldEntriesInCache) self._updateCacheThread.daemon = True self._updateCacheThread.start() self.otdb_listener.start_listening() self.dm_listener.start_listening() logger.info("opened storagequeryservice cache") def close(self): logger.info("closing storagequeryservice cache...") self._running = False self.otdb_listener.stop_listening() self.dm_listener.stop_listening() self._updateCacheThread.join() self.event_bus.close() self.disk_usage.close() logger.info("closed storagequeryservice cache") def __enter__(self): self.open() return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def onObservationFinished(self, otdb_id, modificationTime): self._onDiskActivityForOTDBId(otdb_id) def onObservationAborted(self, otdb_id, modificationTime): self._onDiskActivityForOTDBId(otdb_id) def onTaskDeleted(self, otdb_id, deleted, paths, message=''): self._onDiskActivityForOTDBId(otdb_id) with self._cacheLock: if deleted and otdb_id != None and otdb_id in self._cache[ 'otdb_id2path']: del self._cache['otdb_id2path'][otdb_id] def _onDiskActivityForOTDBId(self, otdb_id): result = self.disk_usage.getDiskUsageForOTDBId(otdb_id) self._updateCache(result) task_path = result.get('path') projects_path = self.disk_usage.path_resolver.projects_path # update all paths up the tree up to the projects_path # update the resource availability in the radb as well path = task_path while path: parent_path = '/'.join(path.split('/')[:-1]) if projects_path.startswith( parent_path) and len(parent_path) < len(projects_path): break logger.info( 'invalidating cache entry for %s because disk usage for task %s in %s changed', parent_path, otdb_id, task_path) self._invalidateCacheEntryForPath(parent_path) path = parent_path def getDiskUsageForOTDBId(self, otdb_id, include_scratch_paths=True, force_update=False): return self.getDiskUsageForTask( otdb_id=otdb_id, include_scratch_paths=include_scratch_paths, force_update=force_update) def getDiskUsageForMoMId(self, mom_id, include_scratch_paths=True, force_update=False): return self.getDiskUsageForTask( mom_id=mom_id, 
include_scratch_paths=include_scratch_paths, force_update=force_update) def getDiskUsageForRADBId(self, radb_id, include_scratch_paths=True, force_update=False): return self.getDiskUsageForTask( radb_id=radb_id, include_scratch_paths=include_scratch_paths, force_update=force_update) def getDiskUsageForTask(self, radb_id=None, mom_id=None, otdb_id=None, include_scratch_paths=True, force_update=False): logger.info( "cache.getDiskUsageForTask(radb_id=%s, mom_id=%s, otdb_id=%s, include_scratch_paths=%s, force_update=%s)", radb_id, mom_id, otdb_id, include_scratch_paths, force_update) if otdb_id != None and not include_scratch_paths: with self._cacheLock: path = self._cache['otdb_id2path'].get(otdb_id) if path: logger.info('Using path from cache for otdb_id %s %s', otdb_id, path) return self.getDiskUsageForPath(path, force_update=force_update) logger.info( "cache.getDiskUsageForTask could not find path in cache, determining path..." ) path_result = self.disk_usage.path_resolver.getPathForTask( radb_id=radb_id, mom_id=mom_id, otdb_id=otdb_id, include_scratch_paths=include_scratch_paths) if path_result['found']: task_path = path_result['path'] scratch_paths = path_result.get('scratch_paths', []) # get all du's in parallel over all paths paths = [task_path] + scratch_paths paths_du_result = self.getDiskUsageForPaths( paths, force_update=force_update) # split into project and subdir path_du_result = paths_du_result.pop(task_path) scratch_du_result = paths_du_result task_du_result = dict(path_du_result) # yield id's for if available, or None for id in ['radb_id', 'otdb_id', 'mom_id']: task_du_result[id] = path_result.get(id) if scratch_du_result: task_du_result['scratch_paths'] = scratch_du_result return task_du_result # still no path(s) found for otdb_id, now try from cache and ignore possible scratch paths if otdb_id != None: with self._cacheLock: path = self._cache['otdb_id2path'].get(otdb_id) if path: logger.info( 'Using path from cache for otdb_id %s %s (ignoring possible scratch/share paths)', otdb_id, path) return self.getDiskUsageForPath(path, force_update=force_update) return {'found': False, 'path': path_result['path']} def getDiskUsageForTasks(self, radb_ids=None, mom_ids=None, otdb_ids=None, include_scratch_paths=True, force_update=False): logger.info( "cache.getDiskUsageForTasks(radb_ids=%s, mom_ids=%s, otdb_ids=%s)" % (radb_ids, mom_ids, otdb_ids)) tasks_result = {'radb_ids': {}, 'mom_ids': {}, 'otdb_ids': {}} if radb_ids is None: radb_ids = [] if mom_ids is None: mom_ids = [] if otdb_ids is None: otdb_ids = [] with futures.ThreadPoolExecutor( max_workers=os.cpu_count()) as executor: # helper function to expand parallel p_kwarg dict into kwargs def parallel_getDiskUsageForTask(p_kwarg): return self.getDiskUsageForTask(**p_kwarg) parallel_kwargs = [{ 'radb_id': radb_id, 'include_scratch_paths': include_scratch_paths, 'force_update': force_update } for radb_id in radb_ids] parallel_kwargs += [{ 'mom_id': mom_id, 'include_scratch_paths': include_scratch_paths, 'force_update': force_update } for mom_id in mom_ids] parallel_kwargs += [{ 'otdb_id': otdb_id, 'include_scratch_paths': include_scratch_paths, 'force_update': force_update } for otdb_id in otdb_ids] results = list( executor.map(parallel_getDiskUsageForTask, parallel_kwargs)) # collect results in a dict grouped by id_type for result in results: if result.get('radb_id') in radb_ids: tasks_result['radb_ids'][result['radb_id']] = result if result.get('mom_id') in mom_ids: tasks_result['mom_ids'][result['mom_id']] = result if 
result.get('otdb_id') in otdb_ids: tasks_result['otdb_ids'][result['otdb_id']] = result logger.info( "cache.getDiskUsageForTasks(radb_ids=%s, mom_ids=%s, otdb_ids=%s) returning: %s" % (radb_ids, mom_ids, otdb_ids, tasks_result)) return tasks_result def getDiskUsageForPaths(self, paths, force_update=False): with futures.ThreadPoolExecutor( max_workers=os.cpu_count()) as executor: # helper function to expand parallel p_arg tuple into list of args def parallel_getDiskUsageForPath(p_arg): return self.getDiskUsageForPath(*p_arg) parallel_args = [(path, force_update) for path in paths] results = list( executor.map(parallel_getDiskUsageForPath, parallel_args)) return {result['path']: result for result in results} def getDiskUsageForPath(self, path, force_update=False): logger.info("cache.getDiskUsageForPath('%s', force_update=%s)", path, force_update) needs_cache_update = False if not force_update: with self._cacheLock: needs_cache_update |= path not in self._cache[ 'path_du_results'] if needs_cache_update or force_update: logger.info("cache update needed for %s", path) # check if some other thread is already doing a du call for this path... with self._du_threading_events_lock: path_threading_event = self._du_threading_events.get(path) need_to_do_du_call = path_threading_event is None if need_to_do_du_call: # no other thread is currently du'ing/updating this path # so create a threading Event and store it in the dict, # so other threads can wait for this event. logger.info("updating the cache for %s current_thread=%s", path, current_thread().name) path_threading_event = Event() self._du_threading_events[path] = path_threading_event if need_to_do_du_call: # no other thread is currently du'ing/updating this path # so we need to do it here. result = du_getDiskUsageForPath(path) self._updateCache(result) # signal threads waiting for this same path du call # and do bookkeeping with self._du_threading_events_lock: logger.info( "signaling other threads that the cache was updated for %s current_thread=%s", path, current_thread().name) path_threading_event.set() del self._du_threading_events[path] else: logger.info( "waiting for du call on other thread that will update the cache for %s current_thread=%s", path, current_thread().name) path_threading_event.wait() logger.info( "another thread just updated the cache for %s current_thread=%s", path, current_thread().name) with self._cacheLock: if path in self._cache['path_du_results']: result = self._cache['path_du_results'][path] else: result = { 'found': False, 'path': path, 'message': 'unknown error' } if not self.disk_usage.path_resolver.pathExists(path): result['message'] = 'No such path: %s' % path result['disk_usage_readable'] = humanreadablesize( result.get('disk_usage', 0)) logger.info('cache.getDiskUsageForPath(\'%s\') result: %s', path, result) return result def getDiskUsageForTaskAndSubDirectories(self, radb_id=None, mom_id=None, otdb_id=None, force_update=False): logger.info( "cache.getDiskUsageForTaskAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s)" % (radb_id, mom_id, otdb_id)) task_du_result = self.getDiskUsageForTask(radb_id=radb_id, mom_id=mom_id, otdb_id=otdb_id, force_update=force_update) if task_du_result['found']: task_sd_result = self.disk_usage.path_resolver.getSubDirectories( task_du_result['path']) if task_sd_result['found']: subdir_paths = [ os.path.join(task_du_result['path'], sd) for sd in task_sd_result['sub_directories'] ] subdirs_du_result = self.getDiskUsageForPaths( subdir_paths, force_update=force_update) result = { 
'found': True, 'task_directory': task_du_result, 'sub_directories': subdirs_du_result } logger.info( "result for cache.getDiskUsageForTaskAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s): %s", radb_id, mom_id, otdb_id, result) return result logger.warning( "result for cache.getDiskUsageForTaskAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s): %s", radb_id, mom_id, otdb_id, task_du_result) return task_du_result def getDiskUsageForProjectDirAndSubDirectories(self, radb_id=None, mom_id=None, otdb_id=None, project_name=None, force_update=False): logger.info( "cache.getDiskUsageForProjectDirAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s)" % (radb_id, mom_id, otdb_id)) path_result = self.disk_usage.path_resolver.getProjectDirAndSubDirectories( radb_id=radb_id, mom_id=mom_id, otdb_id=otdb_id, project_name=project_name) if path_result['found']: projectdir_path = path_result['path'] subdir_paths = [ os.path.join(path_result['path'], sd) for sd in path_result['sub_directories'] ] # get all du's in parallel over all paths paths = [projectdir_path] + subdir_paths paths_du_result = self.getDiskUsageForPaths( paths, force_update=force_update) # split into project and subdir projectdir_du_result = paths_du_result.pop(projectdir_path) subdirs_du_result = paths_du_result # create total result dict result = { 'found': True, 'projectdir': projectdir_du_result, 'sub_directories': subdirs_du_result } logger.info( 'cache.getDiskUsageForProjectDirAndSubDirectories result: %s' % result) return result return path_result def getDiskUsageForProjectsDirAndSubDirectories(self, force_update=False): logger.info("cache.getDiskUsageForProjectsDirAndSubDirectories") projects_path = self.disk_usage.path_resolver.projects_path project_subdirs_result = self.disk_usage.path_resolver.getSubDirectories( projects_path) subdir_paths = [ os.path.join(projects_path, sd) for sd in project_subdirs_result['sub_directories'] ] if project_subdirs_result['found'] else [] # get all du's in parallel over all paths paths = [projects_path] + subdir_paths paths_du_result = self.getDiskUsageForPaths(paths, force_update=force_update) # split into project and subdir projectsdir_du_result = paths_du_result.pop(projects_path) subdirs_du_result = paths_du_result # create total result dict result = { 'found': True, 'projectdir': projectsdir_du_result, 'sub_directories': subdirs_du_result } logger.info( 'cache.getDiskUsageForProjectsDirAndSubDirectories result: %s' % result) return result
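# Illustrative sketch (not part of the original source): CacheManager.getDiskUsageForPath
# above coalesces concurrent expensive 'du' calls for the same path via a per-path
# threading.Event. A minimal, self-contained version of that pattern, where the
# hypothetical slow_lookup() stands in for du_getDiskUsageForPath:
from threading import Event, RLock, Thread
import time

_events = {}
_events_lock = RLock()
_results = {}

def slow_lookup(path):
    time.sleep(1)  # stand-in for an expensive recursive 'du' over the network
    return {'path': path, 'disk_usage': 42}

def coalesced_lookup(path):
    with _events_lock:
        event = _events.get(path)
        i_do_the_work = event is None
        if i_do_the_work:
            # no other thread is working on this path; register an Event others can wait on
            event = Event()
            _events[path] = event

    if i_do_the_work:
        _results[path] = slow_lookup(path)
        with _events_lock:
            event.set()        # wake up any threads waiting on this path
            del _events[path]  # bookkeeping: the next call starts fresh
    else:
        event.wait()           # another thread is already doing the du call

    return _results[path]

# ten concurrent calls for the same path result in a single slow_lookup
threads = [Thread(target=coalesced_lookup, args=('/data/projects/LC9_001',)) for _ in range(10)]
for t in threads: t.start()
for t in threads: t.join()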
class RADBPGListener(PostgresListener):
    def __init__(self,
                 exchange=DEFAULT_BUSNAME,
                 dbcreds=None,
                 broker=DEFAULT_BROKER):
        super(RADBPGListener, self).__init__(dbcreds=dbcreds)

        self.event_bus = ToBus(exchange=exchange, broker=broker)
        self.radb = RADatabase(dbcreds=dbcreds)

        self.subscribe('task_update', self.onTaskUpdated)
        self.subscribe('task_insert', self.onTaskInserted)
        self.subscribe('task_delete', self.onTaskDeleted)

        self.subscribe('task_predecessor_insert_column_task_id', self.onTaskPredecessorChanged)
        self.subscribe('task_predecessor_update_column_task_id', self.onTaskPredecessorChanged)
        self.subscribe('task_predecessor_delete_column_task_id', self.onTaskPredecessorChanged)

        self.subscribe('task_predecessor_insert_column_predecessor_id', self.onTaskSuccessorChanged)
        self.subscribe('task_predecessor_update_column_predecessor_id', self.onTaskSuccessorChanged)
        self.subscribe('task_predecessor_delete_column_predecessor_id', self.onTaskSuccessorChanged)

        # when the specification starttime and endtime are updated, then that affects the task as well
        self.subscribe('specification_update', self.onSpecificationUpdated)

        self.subscribe('resource_claim_update', self.onResourceClaimUpdated)
        self.subscribe('resource_claim_insert', self.onResourceClaimInserted)
        self.subscribe('resource_claim_delete', self.onResourceClaimDeleted)

        self.subscribe('resource_availability_update', self.onResourceAvailabilityUpdated)
        self.subscribe('resource_capacity_update', self.onResourceCapacityUpdated)

    def onTaskUpdated(self, payload=None):
        # Send notification for the given updated task
        task_id = payload
        task = self.radb.getTask(task_id)
        self._sendNotification('TaskUpdated', task)

        # The "blocked_by_ids" property of the given task's successors might have been updated due to the given task
        # status being updated. Therefore also send a notification for these successors - lazily ignoring that they
        # might not have changed.
        suc_sched_tasks = self.radb.getTasks(task_ids=task['successor_ids'], task_status='scheduled')
        for suc_sched_task in suc_sched_tasks:
            self._sendNotification('TaskUpdated', suc_sched_task)

    def onTaskInserted(self, payload=None):
        self._sendNotification('TaskInserted', self.radb.getTask(payload))

    def onTaskDeleted(self, payload=None):
        self._sendNotification('TaskDeleted', payload)

    def onTaskPredecessorChanged(self, task_id):
        logger.info('onTaskPredecessorChanged(task_id=%s)', task_id)
        self._sendNotification('TaskUpdated', self.radb.getTask(task_id))

    def onTaskSuccessorChanged(self, task_id):
        logger.info('onTaskSuccessorChanged(task_id=%s)', task_id)
        self._sendNotification('TaskUpdated', self.radb.getTask(task_id))

    def onSpecificationUpdated(self, payload=None):
        # when the specification starttime and endtime are updated, then that affects the task as well
        self._sendNotification('TaskUpdated', self.radb.getTask(specification_id=payload))

    def onResourceClaimUpdated(self, payload=None):
        self._sendNotification('ResourceClaimUpdated', self.radb.getResourceClaim(payload))

    def onResourceClaimInserted(self, payload=None):
        self._sendNotification('ResourceClaimInserted', self.radb.getResourceClaim(payload))

    def onResourceClaimDeleted(self, payload=None):
        self._sendNotification('ResourceClaimDeleted', payload)

    def onResourceAvailabilityUpdated(self, payload=None):
        r = self.radb.getResources(resource_ids=[payload], include_availability=True)[0]
        r = {k: r[k] for k in ['id', 'active']}
        self._sendNotification('ResourceAvailabilityUpdated', r)

    def onResourceCapacityUpdated(self, payload=None):
        r = self.radb.getResources(resource_ids=[payload], include_availability=True)[0]
        r = {k: r[k] for k in ['id', 'total_capacity', 'available_capacity', 'used_capacity']}
        self._sendNotification('ResourceCapacityUpdated', r)

    def __enter__(self):
        super(RADBPGListener, self).__enter__()
        self.radb.connect()
        self.event_bus.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super(RADBPGListener, self).__exit__(exc_type, exc_val, exc_tb)
        self.radb.disconnect()
        self.event_bus.close()

    def _formatTimestampsAsIso(self, fields, contentDict):
        '''convert all requested fields in the contentDict to datetime objects.
        In postgres we use timestamps without timezone; by convention we only enter utc values.
        But when they are json-encoded by postgres, they are not in proper isoformat with a 'Z' at the end.
        So, parse the requested fields and return them as datetime objects.
        '''
        try:
            for field in fields:
                try:
                    if field in contentDict:
                        timestampStr = contentDict[field]
                        formatStr = '%Y-%m-%dT%H:%M:%S' if 'T' in timestampStr else '%Y-%m-%d %H:%M:%S'
                        if timestampStr.rfind('.') > -1:
                            formatStr += '.%f'

                        timestamp = datetime.strptime(timestampStr, formatStr)
                        contentDict[field] = timestamp
                except Exception as e:
                    logger.error('Could not convert field \'%s\' to datetime: %s' % (field, e))

            return contentDict
        except Exception as e:
            logger.error('Error while converting timestamp fields \'%s\' in %s\n%s' % (fields, contentDict, e))

    def _sendNotification(self, subject, contentDict):
        try:
            if subject and contentDict:
                msg = EventMessage(subject="%s.%s" % (DEFAULT_RADB_NOTIFICATION_PREFIX, subject), content=contentDict)
                logger.info('Sending notification %s to %s: %s',
                            subject, self.event_bus.exchange, str(contentDict).replace('\n', ' '))
                self.event_bus.send(msg)
        except Exception as e:
            logger.error(str(e))
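# Illustrative sketch (not part of the original source): the timestamp parsing
# convention used by _formatTimestampsAsIso above. Postgres may json-encode a
# 'timestamp without time zone' with either a 'T' or a space as separator, and
# with or without fractional seconds, so the format string is chosen per value.
# A self-contained equivalent of that per-value format selection:
from datetime import datetime

def parse_pg_timestamp(timestampStr):
    formatStr = '%Y-%m-%dT%H:%M:%S' if 'T' in timestampStr else '%Y-%m-%d %H:%M:%S'
    if timestampStr.rfind('.') > -1:
        formatStr += '.%f'
    return datetime.strptime(timestampStr, formatStr)

assert parse_pg_timestamp('2016-01-01T01:00:00') == datetime(2016, 1, 1, 1)
assert parse_pg_timestamp('2016-01-01 01:00:00.123456') == datetime(2016, 1, 1, 1, 0, 0, 123456)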
class TestPipelineControl(unittest.TestCase):
    def setUp(self):
        # Create a random bus
        self.busname = "%s-%s" % (sys.argv[0], str(uuid.uuid4())[:8])
        self.bus = ToBus(self.busname, {"create": "always", "delete": "always", "node": {"type": "topic"}})
        self.bus.open()
        self.addCleanup(self.bus.close)

        # Patch SLURM
        class MockSlurm(object):
            def __init__(self, *args, **kwargs):
                self.scheduled_jobs = {}

            def submit(self, jobName, *args, **kwargs):
                print("Schedule SLURM job '%s': %s, %s" % (jobName, args, kwargs))
                self.scheduled_jobs[jobName] = (args, kwargs)

                # Return job ID
                return "42"

            def jobid(self, jobname):
                if jobname in ["1", "2", "3"]:
                    return jobname

                # "4" is an observation, so no SLURM job
                return None

        patcher = patch('lofar.mac.PipelineControl.Slurm')
        patcher.start().side_effect = MockSlurm
        self.addCleanup(patcher.stop)

        # Catch functions to prevent running executables
        patcher = patch('lofar.mac.PipelineControl.Parset.dockerImage')
        patcher.start().return_value = "lofar-pipeline:trunk"
        self.addCleanup(patcher.stop)

        # ================================
        # Setup mock otdb service
        # ================================

        class MockOTDBService(MessageHandlerInterface):
            def __init__(self, notification_bus):
                """
                notification_bus: bus to send state changes to
                """
                super(MockOTDBService, self).__init__()
                self.service2MethodMap = {
                    "TaskGetSpecification": self.TaskGetSpecification,
                    "TaskSetStatus": self.TaskSetStatus,
                }
                self.notification_bus = notification_bus

            def TaskGetSpecification(self, OtdbID):
                print("***** TaskGetSpecification(%s) *****" % (OtdbID,))

                if OtdbID == 1:
                    predecessors = "[2,3,4]"
                elif OtdbID == 2:
                    predecessors = "[3]"
                elif OtdbID == 3:
                    predecessors = "[]"
                elif OtdbID == 4:
                    return {"TaskSpecification": {
                        "Version.number": "1",
                        PARSET_PREFIX + "Observation.otdbID": str(OtdbID),
                        PARSET_PREFIX + "Observation.Scheduler.predecessors": "[]",
                        PARSET_PREFIX + "Observation.processType": "Observation",
                        PARSET_PREFIX + "Observation.Cluster.ProcessingCluster.clusterName": "CEP4",
                        PARSET_PREFIX + "Observation.stopTime": "2016-01-01 01:00:00",
                    }}
                else:
                    raise Exception("Invalid OtdbID: %s" % OtdbID)

                return {"TaskSpecification": {
                    "Version.number": "1",
                    PARSET_PREFIX + "Observation.otdbID": str(OtdbID),
                    PARSET_PREFIX + "Observation.Scheduler.predecessors": predecessors,
                    PARSET_PREFIX + "Observation.processType": "Pipeline",
                    PARSET_PREFIX + "Observation.Cluster.ProcessingCluster.clusterName": "CEP4",
                }}

            def TaskSetStatus(self, OtdbID, NewStatus, UpdateTimestamps):
                print("***** TaskSetStatus(%s,%s) *****" % (OtdbID, NewStatus))

                # Broadcast the state change
                content = {"treeID": OtdbID, "state": NewStatus, "time_of_change": datetime.datetime.utcnow()}
                msg = EventMessage(context=DEFAULT_OTDB_NOTIFICATION_SUBJECT, content=content)
                self.notification_bus.send(msg)

                return {'OtdbID': OtdbID, 'MomID': None, 'Success': True}

        service = Service(DEFAULT_OTDB_SERVICENAME,
                          MockOTDBService,
                          busname=self.busname,
                          use_service_methods=True,
                          handler_args={"notification_bus": self.bus})
        service.start_listening()
        self.addCleanup(service.stop_listening)

        # ================================
        # Setup listener to catch result
        # of our service
        # ================================

        listener = OTDBBusListener(busname=self.busname)
        listener.start_listening()
        self.addCleanup(listener.stop_listening)

        self.trigger = MethodTrigger(listener, "onObservationQueued")

    def test_setStatus(self):
        with PipelineControl(otdb_notification_busname=self.busname,
                             otdb_service_busname=self.busname) as pipelineControl:
            pipelineControl._setStatus(12345, "queued")

            # Wait for the status to propagate
            self.assertTrue(self.trigger.wait())
            self.assertEqual(self.trigger.args[0], 12345)

    def testNoPredecessors(self):
        """
        Request to start a simulated obsid 3, with the following predecessor tree:

        3 requires nothing
        """
        with PipelineControl(otdb_notification_busname=self.busname,
                             otdb_service_busname=self.busname) as pipelineControl:
            # Send fake status update
            pipelineControl._setStatus(3, "scheduled")

            # Wait for message to arrive
            self.assertTrue(self.trigger.wait())

            # Verify message
            self.assertEqual(self.trigger.args[0], 3)  # otdbId

            # Check if job was scheduled
            self.assertIn("3", pipelineControl.slurm.scheduled_jobs)
            self.assertIn("3-aborted", pipelineControl.slurm.scheduled_jobs)

    def testPredecessors(self):
        """
        Request to start a simulated obsid 1, with the following predecessor tree:

        1 requires 2, 3, 4
        2 requires 3
        4 is an observation
        """
        with PipelineControl(otdb_notification_busname=self.busname,
                             otdb_service_busname=self.busname) as pipelineControl:
            # Send fake status update
            pipelineControl._setStatus(1, "scheduled")

            # Wait for message to arrive
            self.assertTrue(self.trigger.wait())

            # Verify message
            self.assertEqual(self.trigger.args[0], 1)  # otdbId

            # Check if job was scheduled
            self.assertIn("1", pipelineControl.slurm.scheduled_jobs)
            self.assertIn("1-aborted", pipelineControl.slurm.scheduled_jobs)

            # Earliest start of this job > stop time of observation
            for p in pipelineControl.slurm.scheduled_jobs["1"][1]["sbatch_params"]:
                if p.startswith("--begin="):
                    begin = datetime.datetime.strptime(p, "--begin=%Y-%m-%dT%H:%M:%S")
                    self.assertGreater(begin, datetime.datetime(2016, 1, 1, 1, 0, 0))
                    break
            else:
                self.assertTrue(False, "--begin parameter not given to SLURM job")
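# Illustrative sketch (not part of the original source): MethodTrigger is used in
# the test above but not defined in this file. A helper with the behaviour the
# test relies on (wait() blocks until the named method is invoked, and the call's
# positional arguments are recorded in .args) could look roughly like this; this
# is an assumption about the helper, not the actual lofar implementation:
from threading import Event

class MethodTrigger(object):
    def __init__(self, obj, method_name):
        self.event = Event()
        self.args = None
        self.kwargs = None
        original = getattr(obj, method_name)

        def wrapper(*args, **kwargs):
            # record the call and wake up any waiter, then run the original handler
            self.args = args
            self.kwargs = kwargs
            self.event.set()
            return original(*args, **kwargs)

        setattr(obj, method_name, wrapper)

    def wait(self, timeout=10):
        return self.event.wait(timeout)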
class RATaskSpecified(OTDBBusListener):
    def __init__(self, servicename, otdb_busname=None, my_busname=None, **kwargs):
        super(RATaskSpecified, self).__init__(busname=otdb_busname, subject="TaskStatus", **kwargs)
        self.parset_rpc = RPC(service="TaskSpecification", busname=otdb_busname)
        self.send_bus = ToBus("%s/%s" % (my_busname, servicename))

    def start_listening(self, **kwargs):
        self.parset_rpc.open()
        self.send_bus.open()
        super(RATaskSpecified, self).start_listening(**kwargs)

    def stop_listening(self, **kwargs):
        super(RATaskSpecified, self).stop_listening(**kwargs)
        self.send_bus.close()
        self.parset_rpc.close()

    def onObservationPrescheduled(self, treeId, modificationTime):
        logger.info("Processing obs ID %s", treeId)

        # Request the parset
        main_obsID = treeId
        main_parset, _ = self.parset_rpc(OtdbID=main_obsID)

        # Construct a dict of all the parsets we retrieved
        parsets = {}
        parsets[main_obsID] = main_parset

        logger.info("Processing predecessors")

        # Collect the initial set of predecessors
        request_obsIDs = set(predecessors(main_parset))

        logger.info("Processing %s", request_obsIDs)

        # Iterate recursively over all known predecessor obsIDs, and request their parsets
        while request_obsIDs:
            obsID = request_obsIDs.pop()

            if obsID in parsets:
                # Predecessor lists can overlap -- we already have this one
                continue

            logger.info("Fetching predecessor %s", obsID)

            # Request predecessor parset
            parsets[obsID], _ = self.parset_rpc(OtdbID=obsID)

            # Add the list of predecessors
            request_obsIDs = request_obsIDs.union(predecessors(parsets[obsID]))

        # Convert parsets to resource indicators
        logger.info("Extracting resource indicators")
        resourceIndicators = dict([(str(obsID), resourceIndicatorsFromParset(parset))
                                   for (obsID, parset) in parsets.items()])

        # Construct and send result message
        logger.info("Sending result")

        result = {
            "sasID": main_obsID,
            "state": "prescheduled",
            "time_of_change": modificationTime,
            "resource_indicators": resourceIndicators,
        }

        # Put result on bus
        msg = EventMessage(content=result)
        self.send_bus.send(msg)

        logger.info("Result sent")
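# Illustrative sketch (not part of the original source): the worklist loop in
# onObservationPrescheduled above computes the transitive closure over predecessor
# obsIDs while fetching each parset only once. The same logic, with an in-memory
# dict standing in for the TaskSpecification RPC (the toy tree mirrors the one
# used in the tests above: 1 requires 2, 3, 4 and 2 requires 3):
TOY_PREDECESSORS = {1: [2, 3, 4], 2: [3], 3: [], 4: []}

def fetch_parset(obs_id):
    # stand-in for self.parset_rpc(OtdbID=obs_id)
    return {'obsID': obs_id, 'predecessors': TOY_PREDECESSORS[obs_id]}

def collect_parsets(main_obs_id):
    parsets = {main_obs_id: fetch_parset(main_obs_id)}
    request_obs_ids = set(parsets[main_obs_id]['predecessors'])

    while request_obs_ids:
        obs_id = request_obs_ids.pop()
        if obs_id in parsets:
            continue  # predecessor lists can overlap; already fetched
        parsets[obs_id] = fetch_parset(obs_id)
        request_obs_ids |= set(parsets[obs_id]['predecessors'])

    return parsets

assert sorted(collect_parsets(1).keys()) == [1, 2, 3, 4]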
class IngestPipeline(): STATUS_INITIALIZING = 1 STATUS_TRANSFERRING = 2 STATUS_FINALIZING = 3 STATUS_FINISHED = 4 def __init__(self, job, momClient, ltaClient, exchange=DEFAULT_BUSNAME, broker=DEFAULT_BROKER, user=getpass.getuser(), globus_timeout=GLOBUS_TIMEOUT, minimal_SIP=False): self.status = IngestPipeline.STATUS_INITIALIZING self.hostname = socket.gethostname() self.job = job self.momClient = momClient self.ltaClient = ltaClient self.user = user if not self.user: self.user=getpass.getuser() self.globus_timeout = globus_timeout self.minimal_SIP = minimal_SIP self.event_bus = ToBus(exchange, broker=broker, connection_log_level=logging.DEBUG) self.Project = job['Project'] self.DataProduct = job['DataProduct'] self.FileName = job['FileName'] self.JobId = job['JobId'] self.ArchiveId = int(job['ArchiveId']) self.ObsId = int(job['ObservationId']) self.ExportID = job['ExportID'] self.Type = job["Type"] self.HostLocation = job['Location'].partition(':')[0] self.Location = job['Location'].partition(':')[2] self.ticket = '' self.FileSize = '0' self.MD5Checksum = '' self.Adler32Checksum = '' self.ChecksumResult = False self.SIP = '' self.PrimaryUri = '' self.SecondaryUri = '' self.lta_site = '' def GetStorageTicket(self): do_check_already_in_lta=isProductionEnvironment() result = self.ltaClient.GetStorageTicket(self.Project, self.FileName, self.FileSize, self.ArchiveId, self.JobId, self.ObsId, do_check_already_in_lta, self.Type) error = result.get('error') if error: if 'StorageTicket with mom ID "%i"' % (self.ArchiveId) in error: if 'existing_ticket_id' in result and 'existing_ticket_state' in result: logger.warning("Got a Tier 1 GetStorageTicket error for an incomplete storage ticket %s with status %s" % (result['existing_ticket_id'],result['existing_ticket_state'])) if result['existing_ticket_state'] < IngestSuccessful: try: self.ticket = result['existing_ticket_id'] logger.warning("trying to repair status of StorageTicket %s" % self.ticket) self.SendStatusToLTA(IngestFailed) except Exception as e: logger.exception('ResettingStatus IngestFailed failed for %s' % self.ticket) raise Exception ('Had to reset state for %s' % self.ticket) else: raise PipelineError('GetStorageTicket error: Dataproduct already in LTA for %s' % (self.JobId), PipelineAlreadyInLTAError) else: raise Exception('GetStorageTicket error I can''t interpret: %s' % result) if 'no storage resources defined for project' in error or "project does not exists" in error: raise PipelineError('GetStorageTicket error for project not known in LTA: %s' % error, PipelineNoProjectInLTAError) raise Exception('GetStorageTicket error: %s' % error) else: self.ticket = result.get('ticket') self.PrimaryUri = result.get('primary_uri_rnd') self.SecondaryUri = result.get('secondary_uri_rnd') if 'sara' in self.PrimaryUri: self.lta_site = 'sara' elif 'juelich' in self.PrimaryUri: self.lta_site = 'juelich' elif 'psnc' in self.PrimaryUri: self.lta_site = 'poznan' def TransferFile(self): try: logger.info('Starting file transfer for %s ' % self.JobId) start = time.time() self.status = IngestPipeline.STATUS_TRANSFERRING self.__sendNotification('JobProgress', message='transfer starting', percentage_done=0.0, total_bytes_transfered=0) local_ip = hostnameToIp(self.hostname) if 'cep4' in self.HostLocation.lower() or 'cpu' in self.HostLocation.lower(): self.HostLocation = 'localhost' if 'locus' in self.HostLocation.lower() and not '.' 
in self.HostLocation.lower(): self.HostLocation += '.cep2.lofar' def progress_callback(percentage_done, current_speed, total_bytes_transfered): self.__sendNotification('JobProgress', percentage_done=min(100.0, round(10.0*percentage_done)/10.0), current_speed=current_speed, total_bytes_transfered=total_bytes_transfered) if (os.path.splitext(self.Location)[-1] == '.h5' and os.path.splitext(os.path.basename(self.Location))[0].endswith('_bf')): logger.info('dataproduct is a beamformed h5 file. adding raw file to the transfer') self.Location = [self.Location, self.Location.replace('.h5', '.raw')] if self.DataProduct not in self.Location and 'Source' in self.job: # old hack, is needed to support dynspec / pulsar archiving scripts self.Location = os.path.join(self.Location, self.job['Source']) cp = LtaCp(self.HostLocation, self.Location, self.PrimaryUri, self.user, local_ip=local_ip, globus_timeout=self.globus_timeout, progress_callback=progress_callback) transfer_result = cp.transfer(force=True) self.status = IngestPipeline.STATUS_FINALIZING if not transfer_result: msg = 'error while transferring %s with ltacp' % (self.JobId) logger.error(msg) raise Exception(msg) self.MD5Checksum = transfer_result[0] self.Adler32Checksum = transfer_result[1] self.FileSize = transfer_result[2] if self.MD5Checksum and self.Adler32Checksum and self.FileSize: logger.debug('valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s', self.JobId, self.FileSize, humanreadablesize(int(self.FileSize), 'B'), self.MD5Checksum, self.Adler32Checksum) else: msg = 'no valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s' % (self.JobId, self.FileSize, humanreadablesize(int(self.FileSize), 'B'), self.MD5Checksum, self.Adler32Checksum) logger.error(msg) raise Exception(msg) try: self.__sendNotification('JobProgress', message='transfer finished', percentage_done=100.0, total_bytes_transfered=int(self.FileSize)) except ValueError: pass elapsed = time.time() - start try: if int(self.FileSize) > 0: avgSpeed = float(self.FileSize) / elapsed logger.info("Finished file transfer for %s in %d sec with an average speed of %s for %s including ltacp overhead" % (self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B'))) except Exception: logger.info('Finished file transfer of %s in %s' % (self.JobId, elapsed)) except Exception as exp: if isinstance(exp, LtacpException): if '550 File not found' in exp.value: logger.error('Destination directory does not exist. 
Creating %s in LTA for %s' % (self.PrimaryUri, self.JobId)) if create_missing_directories(self.PrimaryUri) == 0: logger.info('Created path %s in LTA for %s' % (self.PrimaryUri, self.JobId)) elif 'source path' in exp.value and 'does not exist' in exp.value: raise PipelineError(exp.value, PipelineNoSourceError) raise Exception('transfer failed for %s: %s' % (self.JobId, str(exp))) def SendChecksumsToLTA(self): result = self.ltaClient.SendChecksums(self.JobId, self.Project, self.ticket, self.FileSize, self.PrimaryUri, self.SecondaryUri, self.MD5Checksum, self.Adler32Checksum) if not result.get('error'): #store final uri's self.PrimaryUri = result['primary_uri'] self.SecondaryUri = result.get('secondary_uri') def SendStatusToLTA(self, lta_state_id): if self.ticket: self.ltaClient.UpdateUriState(self.JobId, self.Project, self.ticket, self.PrimaryUri, lta_state_id) def CheckForValidSIP(self): if self.Type == "MoM": try: with self.momClient: self.momClient.getSIP(self.ArchiveId, validate=True, log_prefix=self.JobId) except Exception as e: logger.log(logging.WARNING if self.minimal_SIP else logging.ERROR, 'CheckForValidSIP: Getting SIP from MoM failed for %s: %s', self.JobId, e) if not self.minimal_SIP: raise elif 'SIPLocation' in self.job: # job file might know where the sip is when it is not a MoM job try: sip_host = self.job['SIPLocation'].split(':')[0] sip_path = self.job['SIPLocation'].split(':')[1] cmd = ['ssh', '-tt', '-n', '-x', '-q', '%s@%s' % (self.user, sip_host), 'cat %s' % sip_path] logger.info("GetSIP for %s at SIPLocation %s - cmd %s" % (self.JobId, self.job['SIPLocation'], ' ' .join(cmd))) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = communicate_returning_strings(p) if p.returncode != 0: raise PipelineError('GetSIP error getting EoR SIP for %s: %s' % (self.JobId, out + err)) tmp_SIP = out tmp_SIP = addIngestInfoToSIP(tmp_SIP, self.ticket, self.FileSize, self.MD5Checksum, self.Adler32Checksum) tmp_SIP = tmp_SIP.replace('<stationType>Europe</stationType>','<stationType>International</stationType>') #make sure the source in the SIP is the same as the type of the storageticket tmp_SIP = re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>%s</source>' % (self.Type,), tmp_SIP) if not validateSIPAgainstSchema(tmp_SIP): logger.error('CheckForValidSIP: Invalid SIP:\n%s', tmp_SIP) raise Exception('SIP for %s does not validate against schema' % self.JobId) except: logger.exception('CheckForValidSIP: Getting SIP from SIPLocation %s failed', self.job['SIPLocation']) raise logger.info('SIP for %s is valid, can proceed with transfer' % (self.JobId,)) def GetSIP(self): try: if self.Type == "MoM": with self.momClient: self.SIP = self.momClient.uploadDataAndGetSIP(self.ArchiveId, self.ticket, self.FileName, self.PrimaryUri, self.FileSize, self.MD5Checksum, self.Adler32Checksum, validate=True) elif 'SIPLocation' in self.job: # job file might know where the sip is when it is not a MoM job try: sip_host = self.job['SIPLocation'].split(':')[0] sip_path = self.job['SIPLocation'].split(':')[1] cmd = ['ssh', '-tt', '-n', '-x', '-q', '%s@%s' % (self.user, sip_host), 'cat %s' % sip_path] logger.info("GetSIP for %s at SIPLocation %s - cmd %s" % (self.JobId, self.job['SIPLocation'], ' ' .join(cmd))) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = communicate_returning_strings(p) if p.returncode != 0: raise PipelineError('GetSIP error getting EoR SIP for %s: %s' % (self.JobId, out + err)) self.SIP = out self.SIP = 
addIngestInfoToSIP(self.SIP, self.ticket, self.FileSize, self.MD5Checksum, self.Adler32Checksum) self.SIP = self.SIP.replace('<stationType>Europe</stationType>','<stationType>International</stationType>') #make sure the source in the SIP is the same as the type of the storageticket self.SIP = re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>%s</source>' % (self.Type,), self.SIP) if not validateSIPAgainstSchema(self.SIP): logger.error('Invalid SIP:\n%s', self.SIP) raise Exception('SIP for %s does not validate against schema' % self.JobId) except: logger.exception('Getting SIP from SIPLocation %s failed', self.job['SIPLocation']) raise logger.info('SIP received for %s from SIPLocation %s with size %d (%s): \n%s' % (self.JobId, self.job['SIPLocation'], len(self.SIP), humanreadablesize(len(self.SIP)), self.SIP[0:1024])) else: self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) self.FileType = FILE_TYPE_UNSPECIFIED except Exception as e: if self.minimal_SIP: logger.info('making minimal SIP for %s', self.JobId) self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) logger.info('minimal SIP for %s: \n%s', self.JobId, self.SIP) self.FileType = FILE_TYPE_UNSPECIFIED else: raise def SendSIPToLTA(self): try: self.ltaClient.SendSIP(self.JobId, self.SIP, self.ticket) except Exception as e: logger.error('SendSIPToLTA exception: %s', e) raise PipelineError(str(e), PipelineJobFailedError) def RollBack(self): try: logger.info('rolling back file transfer for %s', self.JobId) start = time.time() if self.PrimaryUri: srmrm(self.PrimaryUri, log_prefix=self.JobId, timeout=300) if self.SecondaryUri: srmrm(self.SecondaryUri, log_prefix=self.JobId, timeout=300) logger.debug("rollBack for %s took %ds", self.JobId, time.time() - start) except Exception as e: logger.exception('rollback failed for %s: %s', self.JobId, e) def __sendNotification(self, subject, message='', **kwargs): try: contentDict = { 'job_id': self.JobId, 'export_id': self.job.get('job_group_id'), 'archive_id': self.ArchiveId, 'project': self.Project, 'type': self.Type, 'ingest_server': self.hostname, 'dataproduct': self.DataProduct, 'srm_url': self.PrimaryUri } if 'ObservationId' in self.job: contentDict['otdb_id'] = self.job['ObservationId'] if self.lta_site: contentDict['lta_site'] = self.lta_site if message: contentDict['message'] = message for k,v in list(kwargs.items()): contentDict[k] = v msg = EventMessage(subject="%s.%s" % (INGEST_NOTIFICATION_PREFIX, subject), content=contentDict) msg.ttl = 48*3600 #remove message from queue's when not picked up within 48 hours logger.info('Sending notification %s: %s' % (subject, str(contentDict).replace('\n', ' '))) self.event_bus.send(msg) except Exception as e: logger.error(str(e)) def run(self): with self.event_bus: try: logger.info("starting ingestpipeline for %s" % self.JobId) start = time.time() self.__sendNotification('JobStarted') self.GetStorageTicket() self.CheckForValidSIP() self.TransferFile() self.SendChecksumsToLTA() self.GetSIP() self.SendSIPToLTA() self.SendStatusToLTA(IngestSuccessful) avgSpeed = 0 elapsed = time.time() - start try: avgSpeed = float(self.FileSize) / elapsed logger.info("Ingest Pipeline finished for %s in %d sec with average speed of %s for %s including all overhead", self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), 
humanreadablesize(float(self.FileSize), 'B')) except Exception: logger.info("Ingest Pipeline finished for %s in %d sec", self.JobId, elapsed) self.__sendNotification('JobFinished', average_speed=avgSpeed, total_bytes_transfered=int(self.FileSize)) except PipelineError as pe: logger.log(logging.WARNING if pe.type == PipelineAlreadyInLTAError else logging.ERROR, 'Encountered PipelineError for %s : %s', self.JobId, str(pe)) if pe.type == PipelineNoSourceError: self.__sendNotification('JobTransferFailed', 'data not transferred because it was not on disk') elif pe.type == PipelineAlreadyInLTAError: self.__sendNotification('JobFinished', 'data was already in the LTA', average_speed=0, total_bytes_transfered=0) else: self.RollBack() # by default the error_message for the notification is the exception error_message = str(pe) # for known messages in the exception, make a nice readable error_message if 'MoM returned login screen instead of SIP' in error_message: error_message = 'MoM returned login screen instead of SIP' self.__sendNotification('JobTransferFailed', error_message) try: if pe.type != PipelineAlreadyInLTAError: self.SendStatusToLTA(IngestFailed) except Exception as e: logger.error('SendStatusToLTA failed for %s: %s', self.JobId, e) except Exception as e: logger.error('Encountered unexpected error for %s: %s', self.JobId, e) # by default the error_message for the notification is the exception error_message = str(e) # for known messages in the exception, make a nice readable error_message if 'ltacp' in error_message and ('file listing failed' in error_message or 'du failed' in error_message): error_message = 'dataproduct %s not found at location %s:%s' % (self.DataProduct, self.HostLocation, self.Location) elif 'does not validate against schema' in error_message: error_message = 'invalid SIP does not validate against schema' try: self.RollBack() except Exception as rbe: logger.error('RollBack failed for %s: %s', self.JobId, rbe) try: self.SendStatusToLTA(IngestFailed) except Exception as sse: logger.error('SendStatusToLTA failed for %s: %s', self.JobId, sse) try: self.__sendNotification('JobTransferFailed', error_message) except Exception as sne: logger.error('sendNotification failed for %s: %s', self.JobId, sne) finally: self.status = IngestPipeline.STATUS_FINISHED
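# Illustrative sketch (not part of the original source): the pipeline above
# receives an (md5, adler32, filesize) triple from the ltacp transfer and stores
# them in MD5Checksum/Adler32Checksum/FileSize. How ltacp computes these is not
# shown in this file; a minimal local equivalent using hashlib and zlib, with the
# adler32 rendered as a zero-padded hex string (an assumption about the expected
# formatting), would be:
import hashlib
import zlib

def checksums_for_file(path, chunk_size=64 * 1024):
    md5 = hashlib.md5()
    adler = 1  # zlib.adler32 seed value
    size = 0
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
            adler = zlib.adler32(chunk, adler)
            size += len(chunk)
    # return the same shape of triple the pipeline stores: hex md5, hex adler32, size as string
    return md5.hexdigest(), '%08x' % (adler & 0xffffffff), str(size)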