def test_communicator_manager(self):
    """
    Make sure that es communicator manager thread works as expected.

    The test starts a CommunicationManager, requests jobs from it, reports
    the received jobs as running, marks one event range as failed, marks two
    further event ranges as finished and finally verifies that the manager
    thread is no longer alive after stop() has been called.
    """
    communicator_manager = None
    try:
        args = {
            'workflow': 'eventservice_hpc',
            'queue': 'BNL_CLOUD_MCORE',
            'site': 'BNL_CLOUD_MCORE',
            'port': 25443,
            'url': 'https://aipanda007.cern.ch',
            'job_label': 'ptest',
            'pilot_user': '******',
            'node': socket.getfqdn(),
            'mem': 16000,
            'disk_space': 160000,
            'working_group': '',
            'cpu': 2601.0,
            'info': None
        }

        communicator_manager = CommunicationManager()
        communicator_manager.start()
        self.assertTrue(communicator_manager.is_alive())

        # Ask for two jobs, then for a single one; the single job is used below.
        jobs = communicator_manager.get_jobs(njobs=2, args=args)
        self.assertEqual(len(jobs), 2)

        jobs = communicator_manager.get_jobs(njobs=1, args=args)
        self.assertEqual(len(jobs), 1)

        # Report every received job as running.
        job_list = []
        for job in jobs:
            job_data = {
                'node': socket.getfqdn(),
                'pilotErrorCode': 0,
                'startTime': time.time(),
                'jobMetrics': 'coreCount=8',
                'schedulerID': 'unknown',
                'timestamp': time_stamp(),
                'exeErrorCode': 0,
                'pilotID': 'unknown|PR|2.0.0 (80)',
                'transExitCode': 0,
                'pilotErrorDiag': '',
                'exeErrorDiag': '',
                'jobId': job['PandaID'],
                'siteName': 'BNL_CLOUD_MCORE',
                'state': 'running',
                'attemptNr': job['attemptNr'] + 1
            }
            job_list.append(job_data)
        status = communicator_manager.update_jobs(jobs=job_list)
        self.assertEqual(status[0], True)

        # Fetch one event range and report it as failed (message version 0).
        events = communicator_manager.get_event_ranges(num_event_ranges=1, job=jobs[0])
        self.assertEqual(len(events), 1)

        for event in events:
            event_range_status = {
                "errorCode": 1220,
                "eventRangeID": event['eventRangeID'],
                "eventStatus": 'failed'
            }
            event_range_message = {
                'version': 0,
                'eventRanges': json.dumps(event_range_status)
            }
            res = communicator_manager.update_events(update_events=event_range_message)
            self.assertEqual(res['StatusCode'], 0)

        # Fetch two event ranges and report them as finished in a single
        # zip-file style message (message version 1).
        events = communicator_manager.get_event_ranges(num_event_ranges=2, job=jobs[0])
        self.assertEqual(len(events), 2)

        update_events = [
            {"eventRangeID": event['eventRangeID'], "eventStatus": 'finished'}
            for event in events
        ]
        event_range_status = [{
            "zipFile": {
                "numEvents": len(update_events),
                "objstoreID": 1318,
                "adler32": '000000',
                "lfn": 'test_file',
                "fsize": 100,
                "pathConvention": 1000
            },
            "eventRanges": update_events
        }]
        event_range_message = {
            'version': 1,
            'eventRanges': json.dumps(event_range_status)
        }
        res = communicator_manager.update_events(update_events=event_range_message)
        self.assertEqual(res['StatusCode'], 0)

        communicator_manager.stop()
        # Give the manager thread time to notice the stop request.
        time.sleep(2)
        self.assertFalse(communicator_manager.is_alive())
    except Exception:
        # Do not leave the manager thread running when a call or assertion fails.
        if communicator_manager is not None:
            communicator_manager.stop()
        # Bare raise keeps the original traceback (``raise ex`` would lose it on Python 2).
        raise
class BaseExecutor(threading.Thread, PluginFactory):
    """
    Base thread class for payload executors.

    The executor keeps the payload (and the job inside it), buffers event
    ranges locally and forwards job/event requests to a CommunicationManager
    which is created when the executor is started. Subclasses must implement
    run().
    """

    def __init__(self, **kwargs):
        """
        Initialise the executor.

        :param kwargs: arbitrary keyword arguments; every item is stored as an
                       attribute on the instance (e.g. queue, payload, args).
        """
        super(BaseExecutor, self).__init__()
        # Thread.setName() is deprecated; the name property works on Python 2 and 3.
        self.name = "BaseExecutor"
        self.queue = None
        self.payload = None
        self.args = None
        for key, value in kwargs.items():
            setattr(self, key, value)

        self.__stop = threading.Event()
        self.__event_ranges = []
        self.__is_set_payload = False
        self.__is_retrieve_payload = False

        # NOTE: these are assigned after the kwargs loop, so values passed
        # through kwargs for these two names are overwritten.
        self.communication_manager = None
        self.proc = None

    def get_pid(self):
        """
        Return the pid of the payload process.

        :return: self.proc.pid if a process object is set, otherwise None.
        """
        return self.proc.pid if self.proc else None

    def __del__(self):
        """Request a stop of the executor and of its communication manager."""
        self.stop()
        if self.communication_manager:
            self.communication_manager.stop()

    def is_payload_started(self):
        """
        Tell whether the payload has started.

        :return: always False in this base class.
        """
        return False

    def start(self):
        """
        Start the communication manager and then the executor thread.

        The manager is created before the thread is started so that run()
        never sees self.communication_manager as None.

        :raises Exception: whatever the manager or Thread.start() raises; if the
                           thread cannot be started the new manager is stopped
                           again and the previous manager reference restored.
        """
        previous_manager = self.communication_manager
        manager = CommunicationManager()
        manager.start()
        self.communication_manager = manager
        try:
            super(BaseExecutor, self).start()
        except Exception:
            # e.g. RuntimeError when start() is called twice: do not leak the new manager.
            manager.stop()
            self.communication_manager = previous_manager
            raise

    def stop(self):
        """Set the internal stop event (no-op when it is already set)."""
        if not self.is_stop():
            self.__stop.set()

    def is_stop(self):
        """
        Tell whether a stop has been requested.

        :return: True if the internal stop event is set.
        """
        return self.__stop.is_set()

    def stop_communicator(self):
        """
        Stop the communication manager and wait until it is no longer alive.
        """
        import time  # function-scope import: only needed for the polling pause

        logger.info("Stopping communication manager")
        manager = self.communication_manager
        if manager is not None:
            while manager.is_alive():
                if not manager.is_stop():
                    manager.stop()
                # Pause between polls instead of spinning at full CPU.
                time.sleep(0.1)
        logger.info("Communication manager stopped")

    def set_payload(self, payload):
        """
        Store the payload and change into the job work directory, if any.

        :param payload: payload object; get_job() reads its 'job' entry.
        """
        self.payload = payload
        self.__is_set_payload = True
        job = self.get_job()
        if job and job.workdir:
            os.chdir(job.workdir)

    def is_set_payload(self):
        """
        :return: True once set_payload() has been called.
        """
        return self.__is_set_payload

    def set_retrieve_payload(self):
        """Flag that the payload is to be retrieved."""
        self.__is_retrieve_payload = True

    def is_retrieve_payload(self):
        """
        :return: True once set_retrieve_payload() has been called.
        """
        return self.__is_retrieve_payload

    def retrieve_payload(self):
        """
        Ask the communication manager for one job and build a payload from it.

        The payload command comes from the user module
        pilot.user.<PILOT_USER>.common (PILOT_USER defaults to 'atlas').

        :return: dictionary with the keys 'executable', 'workdir' and 'job',
                 or None when no job was received.
        """
        import importlib  # function-scope import: only needed here

        logger.info("Retrieving payload: %s", self.args)
        jobs = self.communication_manager.get_jobs(njobs=1, args=self.args)
        logger.info("Received jobs: %s", jobs)
        if not jobs:
            return None

        job = create_job(jobs[0], queue=self.queue)

        # get the payload command from the user specific code
        pilot_user = os.environ.get('PILOT_USER', 'atlas').lower()
        user = importlib.import_module('pilot.user.%s.common' % pilot_user)
        cmd = user.get_payload_command(job)
        logger.info("payload execution command: %s", cmd)

        payload = {'executable': cmd, 'workdir': job.workdir, 'job': job}
        logger.info("Retrieved payload: %s", payload)
        return payload

    def get_payload(self):
        """
        :return: the stored payload if set_payload() has been called, else None.
        """
        if self.__is_set_payload:
            return self.payload
        return None

    def get_job(self):
        """
        :return: the 'job' entry of the payload, or None when there is no
                 payload or it has no 'job' entry.
        """
        if self.payload and 'job' in self.payload:
            return self.payload['job']
        return None

    def get_event_ranges(self, num_event_ranges=1, queue_factor=2):
        """
        Return up to num_event_ranges event ranges.

        Event ranges are served from a local buffer. When the buffer holds
        fewer than num_event_ranges entries, num_event_ranges * queue_factor
        new ranges are requested from the communication manager first.

        :param num_event_ranges: number of event ranges wanted.
        :param queue_factor: multiplier for the number of ranges requested
                             from the communication manager (forced to 1 when
                             the configured executor type is 'raythena').
        :return: list of event ranges (may be shorter than requested).
        """
        if config.Payload.executor_type.lower() == 'raythena':
            old_queue_factor = queue_factor
            queue_factor = 1
            logger.info("raythena - Changing queue_factor from %s to %s", old_queue_factor, queue_factor)

        logger.info("Getting event ranges: (num_ranges: %s) (queue_factor: %s)", num_event_ranges, queue_factor)
        if len(self.__event_ranges) < num_event_ranges:
            fetched = self.communication_manager.get_event_ranges(
                num_event_ranges=num_event_ranges * queue_factor,
                job=self.get_job())
            self.__event_ranges.extend(fetched)

        # Take the oldest entries in one step; clamp at 0 so a negative
        # request yields an empty list.
        count = max(num_event_ranges, 0)
        ret = self.__event_ranges[:count]
        del self.__event_ranges[:count]

        logger.info("Received event ranges(num:%s): %s", len(ret), ret)
        return ret

    def update_events(self, messages):
        """
        Forward event range updates to the communication manager.

        :param messages: event range update message(s).
        :return: whatever the communication manager returns.
        """
        logger.info("Updating event ranges: %s", messages)
        ret = self.communication_manager.update_events(messages)
        logger.info("Updated event ranges status: %s", ret)
        return ret

    def update_jobs(self, jobs):
        """
        Forward job updates to the communication manager.

        :param jobs: job update(s).
        :return: whatever the communication manager returns.
        """
        logger.info("Updating jobs: %s", jobs)
        ret = self.communication_manager.update_jobs(jobs)
        logger.info("Updated jobs status: %s", ret)
        return ret

    def run(self):
        """
        Main run process

        :raises NotImplementedError: always; subclasses must override run().
        """
        raise NotImplementedError()