class GwEmMad(object):
    """
    Execution manager MAD

    GridWay uses a Middleware Access Driver (MAD) module to submit,
    control and monitor the execution of jobs.

    The format to send a request to the Execution MAD, through its
    standard input, is:

        OPERATION JID HOST/JM RSL

    Where:

    -OPERATION: Can be one of the following:
        -INIT:     Initializes the MAD (i.e. INIT - - -).
        -SUBMIT:   Submits a job (i.e. SUBMIT JID HOST/JM RSL).
        -POLL:     Polls a job to obtain its state (i.e. POLL JID - -).
        -RECOVER:  Re-attaches to an already submitted job
                   (i.e. RECOVER JID HOST:REMOTE_JOB_ID -).
        -CANCEL:   Cancels a job (i.e. CANCEL JID - -).
        -FINALIZE: Finalizes the MAD (i.e. FINALIZE - - -).
    -JID: Is a job identifier, chosen by GridWay.
    -HOST: If the operation is SUBMIT, it specifies the resource contact
        to submit the job. Otherwise it is ignored.
    -JM: If the operation is SUBMIT, it specifies the job manager to
        submit the job. Otherwise it is ignored.
    -RSL: If the operation is SUBMIT, it specifies the resource
        specification to submit the job. Otherwise it is ignored.

    The format to receive a response from the MAD, through its standard
    output, is:

        OPERATION JID RESULT INFO

    Where:

    -OPERATION: Is the operation specified in the request that
        originated the response or CALLBACK, in the case of an
        asynchronous notification of a state change.
    -JID: It is the job identifier, as provided in the submission
        request.
    -RESULT: It is the result of the operation. Could be SUCCESS or
        FAILURE.
    -INFO: If RESULT is FAILURE, it contains the cause of failure.
        Otherwise, if OPERATION is POLL or CALLBACK, it contains the
        state of the job.
    """

    logger = logging.getLogger(__name__)
    message = Send()

    def __init__(self):
        self._callback_interval = 30  # seconds between CALLBACK sweeps
        self._max_thread = 10  # upper bound handed to ThreadPool
        self._min_thread = 3  # lower bound handed to ThreadPool
        self._job_list = List()  # JID -> job object
        self._configure = None  # Configuration, created in processLine
        self._communicators = dict()  # resource name -> communicator
        self._lock = threading.Lock()  # serializes _update_resource

    def do_INIT(self, args):
        """
        Initializes the MAD (i.e. INIT - - -)
        @param args : arguments of operation
        @type args : string
        """
        out = 'INIT - SUCCESS -'
        self.message.stdout(out)
        self.logger.debug(out)

    def do_SUBMIT(self, args):
        """
        Submits a job (i.e. SUBMIT JID HOST/JM RSL).

        On success the reply carries 'HOST:REMOTE_JOB_ID' as INFO; on any
        error the reply is a FAILURE with the exception text.
        @param args : arguments of operation
        @type args : string
        """
        OPERATION, JID, HOST_JM, RSL = args.split()
        try:
            # The job manager is whatever follows the last '/'.
            HOST, JM = HOST_JM.rsplit('/', 1)
            # Init Job class
            job, communicator = self._update_resource(HOST)
            job.Communicator = communicator
            # Parse rsl
            rsl = Rsl2Parser(RSL).parser()
            for feature in ('project', 'parallel_env'):
                if feature in job.resfeatures:
                    rsl[feature] = job.resfeatures[feature]
            if 'vo' in job.resfeatures and "::" in HOST:
                # HOST has the form 'RESOURCE::HOST' here.
                job.resfeatures['host'] = HOST.split('::', 1)[1]
                job.resfeatures['jm'] = JM
                job.resfeatures['env_file'] = join(dirname(RSL), "job.env")
                job.resfeatures['queue'] = rsl['queue']
            # Update remote directories
            ABS_REMOTE_JOBS_DIR = job.get_abs_directory(
                job.resfeatures.get('scratch', REMOTE_JOBS_DIR))
            for key in ("stdout", "stderr", "executable"):
                rsl[key] = join(ABS_REMOTE_JOBS_DIR, rsl[key])
            # Create and copy wrapper_drm4g file; the local wrapper takes
            # the suffix found after the last '.' of the RSL path.
            local_file = join(dirname(RSL),
                              "wrapper_drm4g.%s" % RSL.split('.')[-1])
            remote_file = join(dirname(rsl['executable']), 'wrapper_drm4g')
            job.createWrapper(local_file, job.jobTemplate(rsl))
            job.copyWrapper(local_file, remote_file)
            # Execute wrapper_drm4g
            job.JobId = job.jobSubmit(remote_file)
            self._job_list.put(JID, job)
            out = 'SUBMIT %s SUCCESS %s:%s' % (JID, HOST, job.JobId)
        except Exception as err:
            out = 'SUBMIT %s FAILURE %s' % (JID, str(err))
            self.logger.error(err, exc_info=1)
        self.message.stdout(out)
        self.logger.debug(out)

    def do_FINALIZE(self, args):
        """
        Finalizes the MAD (i.e. FINALIZE - - -) and exits the process.
        @param args : arguments of operation
        @type args : string
        """
        out = 'FINALIZE - SUCCESS -'
        self.message.stdout(out)
        self.logger.debug(out)
        sys.exit(0)

    def do_POLL(self, args):
        """
        Polls a job to obtain its state (i.e. POLL JID - -).
        @param args : arguments of operation
        @type args : string
        """
        OPERATION, JID, HOST_JM, RSL = args.split()
        try:
            if self._job_list.has_key(JID):
                job = self._job_list.get(JID)
                status = job.getStatus()
                out = 'POLL %s SUCCESS %s' % (JID, status)
            else:
                out = 'POLL %s FAILURE Job not submitted' % (JID)
        except Exception as err:
            out = 'POLL %s FAILURE %s' % (JID, str(err))
            self.logger.error(err, exc_info=1)
        self.message.stdout(out)
        self.logger.debug(out)

    def do_RECOVER(self, args):
        """
        Re-attaches to a job that was already submitted and reports its
        state (i.e. RECOVER JID HOST:REMOTE_JOB_ID -).
        @param args : arguments of operation
        @type args : string
        """
        OPERATION, JID, HOST_JM, RSL = args.split()
        try:
            # NOTE(review): the split happens at the FIRST ':'; a host of
            # the form 'RESOURCE::HOST' would be cut inside its '::'
            # separator -- confirm what GridWay sends here.
            HOST, remote_job_id = HOST_JM.split(':', 1)
            job, communicator = self._update_resource(HOST)
            job.Communicator = communicator
            job.JobId = remote_job_id
            job.refreshJobStatus()
            self._job_list.put(JID, job)
            out = 'RECOVER %s SUCCESS %s' % (JID, job.getStatus())
        except Exception as err:
            out = 'RECOVER %s FAILURE %s' % (JID, str(err))
            self.logger.error(err, exc_info=1)
        self.message.stdout(out)
        self.logger.debug(out)

    def do_CALLBACK(self):
        """
        Periodically refresh every tracked job and report state changes.

        Runs forever: sleeps for the callback interval, then refreshes
        each job. A CALLBACK line is written when the state changed or
        when the job is DONE or FAILED; finished jobs are dropped from
        the job list.
        """
        finished = ('DONE', 'FAILED')
        while True:
            time.sleep(self._callback_interval)
            self.logger.debug("CALLBACK new iteration ...")
            for JID, job in self._job_list.items():
                try:
                    self.logger.debug("CALLBACK checking job '%s'" % JID)
                    oldStatus = job.getStatus()
                    job.refreshJobStatus()
                    newStatus = job.getStatus()
                    if oldStatus != newStatus or newStatus in finished:
                        if newStatus in finished:
                            self._job_list.delete(JID)
                        time.sleep(0.1)
                        out = 'CALLBACK %s SUCCESS %s' % (JID, newStatus)
                        self.message.stdout(out)
                        self.logger.debug(out)
                except Exception as err:
                    out = 'CALLBACK %s FAILURE %s' % (JID, str(err))
                    self.logger.error(err, exc_info=1)
                    self.message.stdout(out)

    def do_CANCEL(self, args):
        """
        Cancels a job (i.e. CANCEL JID - -).
        @param args : arguments of operation
        @type args : string
        """
        OPERATION, JID, HOST_JM, RSL = args.split()
        try:
            if self._job_list.has_key(JID):
                self._job_list.get(JID).jobCancel()
                out = 'CANCEL %s SUCCESS -' % (JID)
            else:
                out = 'CANCEL %s FAILURE Job not submitted' % (JID)
        except Exception as err:
            out = 'CANCEL %s FAILURE %s' % (JID, str(err))
            self.logger.error(err, exc_info=1)
        self.message.stdout(out)
        self.logger.debug(out)

    # Dispatch table: request keyword -> plain function, called with the
    # instance passed explicitly as first argument.
    methods = {
        'INIT': do_INIT,
        'SUBMIT': do_SUBMIT,
        'POLL': do_POLL,
        'RECOVER': do_RECOVER,
        'CANCEL': do_CANCEL,
        'FINALIZE': do_FINALIZE,
    }

    def processLine(self):
        """
        Read requests from standard input and dispatch them.

        Starts the CALLBACK daemon thread, then loops over stdin lines.
        INIT, SUBMIT, RECOVER and FINALIZE run in the reading thread;
        the remaining operations are queued on the thread pool. Lines
        that are not four fields long, or whose operation is unknown
        (including blank lines), are answered with 'WRONG COMMAND'.
        The loop ends at end of input.
        """
        try:
            worker = threading.Thread(target=self.do_CALLBACK)
            # Attribute form; Thread.setDaemon() is deprecated.
            worker.daemon = True
            worker.start()
            self._configure = Configuration()
            pool = ThreadPool(self._min_thread, self._max_thread)
            while True:
                line = sys.stdin.readline()
                if not line:
                    # readline() returns '' only at EOF: nobody is left
                    # to talk to, so stop instead of failing on an
                    # empty field list.
                    self.logger.debug('End of input, leaving command loop')
                    break
                fields = line.split()
                command = ' '.join(fields)
                self.logger.debug(command)
                # A blank line has no fields; treat it as a bad command.
                OPERATION = fields[0].upper() if fields else ''
                if len(fields) == 4 and OPERATION in self.methods:
                    handler = self.methods[OPERATION]
                    if OPERATION in ('FINALIZE', 'INIT', 'SUBMIT', 'RECOVER'):
                        handler(self, command)
                    else:
                        pool.add_task(handler, self, command)
                else:
                    out = 'WRONG COMMAND'
                    self.message.stdout(out)
                    self.logger.debug(out)
        except Exception as err:
            self.logger.warning(str(err), exc_info=1)

    def _update_resource(self, host):
        """
        Return the (job, communicator) pair for the resource serving host.

        The configuration is (re)loaded when it reports a change or holds
        no resources. The communicator of each resource is created once
        and reused on later calls.
        @param host : resource name, or 'RESOURCE::HOST'
        @type host : string
        @raise Exception : if the configuration has errors or no usable
                           resource matches host
        """
        with self._lock:
            if (not self._configure.check_update()
                    or not self._configure.resources):
                self._configure.load()
                errors = self._configure.check()
                if errors:
                    self.logger.error(' '.join(errors))
                    raise Exception(' '.join(errors))
            # Only the part before '::' names the resource.
            wanted = host.split('::', 1)[0]
            for resname, resdict in list(self._configure.resources.items()):
                if 'cloud_provider' in resdict.keys():
                    continue
                if resname != wanted:
                    continue
                if resname not in self._communicators:
                    self._communicators[resname] = (
                        self._configure.make_communicators()[resname])
                job = self._configure.make_resources()[resname]['Job']
                communicator = self._communicators[resname]
                return job, communicator
            # Fail with a readable cause rather than an unpacking error
            # in the caller.
            raise Exception(
                "Resource '%s' not found in the configuration" % wanted)
class GwImMad(object):
    """
    Information manager MAD

    The format to send a request to the Information MAD, through its
    standard input, is:

        OPERATION HID HOST ARGS

    Where:

    -OPERATION: Can be one of the following:
        -INIT:     Initializes the MAD (i.e. INIT - - -).
        -DISCOVER: Discovers hosts (i.e. DISCOVER - - -).
        -MONITOR:  Monitors a host (i.e. MONITOR HID HOST -).
        -FINALIZE: Finalizes the MAD (i.e. FINALIZE - - -).
    -HID: If the operation is MONITOR, it is a host identifier, chosen
        by GridWay. Otherwise it is ignored.
    -HOST: If the operation is MONITOR it specifies the host to monitor.
        Otherwise it is ignored.

    The format to receive a response from the MAD, through its standard
    output, is:

        OPERATION HID RESULT INFO

    Where:

    -OPERATION: Is the operation specified in the request that
        originated the response.
    -HID: It is the host identifier, as provided in the submission
        request.
    -RESULT: It is the result of the operation. Could be SUCCESS or
        FAILURE.
    -INFO: If RESULT is FAILURE, it contains the cause of failure.
        Otherwise, if OPERATION is DISCOVER, it contains a list of
        discovered hosts, or if OPERATION is MONITOR, it contains a list
        of host attributes.
    """

    logger = logging.getLogger(__name__)
    message = Send()

    def __init__(self):
        self._resources = dict()  # filled by do_DISCOVER
        self._config = None  # Configuration, created by do_DISCOVER

    def do_INIT(self, args):
        """
        Initializes the MAD (i.e. INIT - - -)
        @param args : arguments of operation
        @type args : string
        """
        out = 'INIT - SUCCESS -'
        self.message.stdout(out)
        self.logger.debug(out)

    def do_DISCOVER(self, args, output=True):
        """
        Discovers hosts (i.e. DISCOVER - - -)

        Reloads the configuration and asks every enabled, non-cloud
        resource for its hosts. A resource that fails is logged and
        skipped; the operation still succeeds with the remaining hosts.
        @param args : arguments of operation
        @type args : string
        @param output : when false, the reply is not written to stdout
        @type output : boolean
        """
        OPERATION, HID, HOST, ARGS = args.split()
        try:
            self._config = Configuration()
            self._config.load()
            errors = self._config.check()
            if errors:
                # Explicit raise: an assert would vanish under -O.
                raise Exception(' '.join(errors))
            self._resources = self._config.make_resources()
            communicators = self._config.make_communicators()
            hosts = ""
            for resname in sorted(self._resources.keys()):
                settings = self._config.resources[resname]
                if settings['enable'].lower() == 'false':
                    continue
                if 'cloud_provider' in settings.keys():
                    continue
                try:
                    resource = self._resources[resname]['Resource']
                    resource.Communicator = communicators[resname]
                    resource.Communicator.connect()
                    try:
                        hosts = hosts + " " + resource.hosts()
                    finally:
                        # Release the connection even if hosts() fails.
                        resource.Communicator.close()
                except Exception as err:
                    self.logger.error(err, exc_info=1)
            out = 'DISCOVER %s SUCCESS %s' % (HID, hosts)
            self.logger.debug(out)
        except Exception as err:
            out = 'DISCOVER - FAILURE %s' % str(err)
            # Log here, while the exception is still active, so that the
            # traceback is really attached to the record.
            self.logger.debug(out, exc_info=1)
        if output:
            self.message.stdout(out)

    def do_MONITOR(self, args, output=True):
        """
        Monitors a host (i.e. MONITOR HID HOST -)

        Looks HOST up in the resources found by the last DISCOVER and
        replies with its properties, or with a FAILURE when the host is
        not listed by any of them.
        @param args : arguments of operation
        @type args : string
        @param output : when false, the reply is not written to stdout
        @type output : boolean
        """
        OPERATION, HID, HOST, ARGS = args.split()
        try:
            info = ""
            for resname, resdict in list(self._resources.items()):
                # NOTE(review): a disabled resource met before the one
                # holding HOST aborts the whole request -- confirm this
                # is intended rather than skipping it.
                if self._config.resources[resname]['enable'].lower(
                ) == 'false':
                    raise Exception("Resource '%s' is not enable" % resname)
                resource = resdict['Resource']
                if HOST in resource.host_list:
                    info = resource.host_properties(HOST)
                    resource.Communicator.close()
                    break
            if not info:
                # Explicit raise: an assert would vanish under -O.
                raise Exception("Host '%s' is not available" % HOST)
            out = 'MONITOR %s SUCCESS %s' % (HID, info)
            self.logger.debug(out)
        except Exception as err:
            out = 'MONITOR %s FAILURE %s' % (HID, str(err))
            # Log here, while the exception is still active, so that the
            # traceback is really attached to the record.
            self.logger.debug(out, exc_info=1)
        if output:
            self.message.stdout(out)

    def do_FINALIZE(self, args):
        """
        Finalizes the MAD (i.e. FINALIZE - - -) and exits the process.
        @param args : arguments of operation
        @type args : string
        """
        out = 'FINALIZE - SUCCESS -'
        self.message.stdout(out)
        self.logger.debug(out)
        sys.exit(0)

    # Dispatch table: request keyword -> plain function, called with the
    # instance passed explicitly as first argument.
    methods = {
        'INIT': do_INIT,
        'DISCOVER': do_DISCOVER,
        'MONITOR': do_MONITOR,
        'FINALIZE': do_FINALIZE,
    }

    def processLine(self):
        """
        Read requests from standard input and dispatch them.

        Every line must hold four fields and start with a known
        operation; anything else (including a blank line) is answered
        with 'WRONG COMMAND'. The loop ends at end of input.
        """
        try:
            while True:
                line = sys.stdin.readline()
                if not line:
                    # readline() returns '' only at EOF: stop instead of
                    # failing on an empty field list.
                    self.logger.debug('End of input, leaving command loop')
                    break
                fields = line.split()
                command = ' '.join(fields)
                self.logger.debug(command)
                # A blank line has no fields; treat it as a bad command.
                OPERATION = fields[0].upper() if fields else ''
                if len(fields) == 4 and OPERATION in self.methods:
                    self.methods[OPERATION](self, command)
                else:
                    out = 'WRONG COMMAND'
                    self.message.stdout(out)
                    self.logger.debug(out)
        except Exception as e:
            self.logger.warning(str(e), exc_info=1)