def _validate_response(response):
    """Check a reply received from the QCG PJM and extract its payload.

    The reply must be a dictionary carrying a 'code' entry; a zero code
    means success, in which case the 'data' entry is handed back.

    Args:
        response (dict): deserialized JSON response

    Returns:
        the value stored under the 'data' key of the response

    Raises:
        InternalError: if the reply is not a dict, or lacks the 'code' key,
            or (for a successful reply) lacks the 'data' key
        ConnectionError: if the reply carries a non zero exit code
    """
    well_formed = isinstance(response, dict) and 'code' in response
    if not well_formed:
        raise errors.InternalError('Invalid reply from the service')

    if response['code'] != 0:
        # Attach the service supplied explanation whenever there is one.
        reason = 'Request failed'
        if 'message' in response:
            reason = 'Request failed - {}'.format(response['message'])
        raise errors.ConnectionError(reason)

    if 'data' in response:
        return response['data']

    raise errors.InternalError('Invalid reply from the service')
def _assure_connected(self):
    """Verify that the connection has already been opened.

    Raises:
        ConnectionError: if the instance is not marked as connected
    """
    if self._connected:
        return

    raise errors.ConnectionError('Not connected')
def wait4(self, names):
    """Wait for finish of specific jobs.

    This method waits until all specified jobs finish their execution
    (successfully or not). The manager is polled through ``self.status()``
    for the jobs that have not finished yet, sleeping ``self._poll_delay``
    seconds between polls. According to the client configuration contract the
    delay is 2 seconds by default and can be changed with the 'poll_delay'
    configuration key.

    Args:
        names (str|list(str)): name, or list of names, of the jobs to wait for

    Returns:
        dict - a map with job names and their terminal status

    Raises:
        ConnectionError: on any failure while polling. Every exception raised
            inside the polling loop (errors coming from ``self.status()``, as
            well as the InternalError raised here for a malformed job entry)
            is re-raised as ConnectionError with the original exception
            attached as its cause.
    """
    job_names = [names] if isinstance(names, str) else list(names)

    _logger.info("waiting for finish of %d jobs", len(job_names))

    result = {}
    not_finished = job_names
    while not_finished:
        try:
            jobs_status = self.status(not_finished)

            # The list is rebuilt from the reply, so only the jobs reported
            # by the manager as still running are polled again.
            not_finished = []
            for job_name, job_data in jobs_status['jobs'].items():
                # Test for the presence of 'data' BEFORE indexing into it, so
                # an incomplete entry yields the descriptive message below
                # instead of a bare KeyError text.
                if ('data' not in job_data
                        or job_data.get('status') != 0
                        or 'status' not in job_data['data']):
                    raise errors.InternalError(
                        "Missing job's {} data".format(job_name))

                job_status = job_data['data']['status']
                if Manager.is_status_finished(job_status):
                    result[job_name] = job_status
                else:
                    not_finished.append(job_name)

            if not_finished:
                _logger.info("still %d jobs not finished", len(not_finished))
                time.sleep(self._poll_delay)
        except Exception as exc:
            # Exceptions created without arguments have empty ``args``;
            # fall back to str() so the handler itself cannot fail.
            reason = exc.args[0] if exc.args else str(exc)
            raise errors.ConnectionError(reason) from exc

    _logger.info("all jobs finished")

    return result
def _disconnect(self):
    """Close the connection to the QCG-PJM, if one is open.

    When the instance is marked as connected the ZMQ socket is closed and
    the connected flag is cleared; otherwise nothing happens.

    Raises:
        ConnectionError: if there was an error during closing the connection.
    """
    try:
        if not self._connected:
            return

        self._zmq_socket.close()
        self._connected = False
    except Exception as exc:
        reason = exc.args[0]
        raise errors.ConnectionError('Failed to disconnect {}'.format(reason))
def _connect(self):
    """Open a connection to the QCG-PJM.

    Any existing connection is closed first, then a new ZMQ REQ socket is
    created and connected to the address stored in the instance. Success of
    this method does not mean that communication with the QCG-PilotJob
    manager instance has been established: with ZMQ the real communication
    takes place only when messages are sent and received.

    Raises:
        ConnectionError: in case of error during establishing connection.
    """
    self._disconnect()

    address = self._address
    _logger.info("connecting to the PJM @ %s", address)
    try:
        sock = self._zmq_ctx.socket(zmq.REQ)  # pylint: disable=maybe-no-member
        self._zmq_socket = sock
        sock.connect(address)
        self._connected = True
        _logger.info("connection established")
    except Exception as exc:
        reason = exc.args[0]
        raise errors.ConnectionError(
            'Failed to connect to {} - {}'.format(address, reason))
def __init__(self, server_args=None, cfg=None):
    """Initialize instance.

    Launch a QCG-PilotJob manager instance in a background process, wait
    until it reports its configuration through a queue and connect to the
    first ZMQ address it announces. The port number for the ZMQ interface of
    the QCG-PilotJob manager instance is randomly selected.

    Args:
        server_args (list(str)): the command line arguments for QCG-PilotJob
            manager instance; '--net' is always added when missing. The
            passed list is not modified.

              --net                 enable network interface
              --net-port NET_PORT   port to listen for network interface
                                    (implies --net)
              --net-port-min NET_PORT_MIN
                                    minimum port range to listen for network
                                    interface if exact port number is not
                                    defined (implies --net)
              --net-port-max NET_PORT_MAX
                                    maximum port range to listen for network
                                    interface if exact port number is not
                                    defined (implies --net)
              --file                enable file interface
              --file-path FILE_PATH
                                    path to the request file (implies --file)
              --wd WD               working directory for the service
              --envschema ENVSCHEMA
                                    job environment schema [auto|slurm]
              --resources RESOURCES
                                    source of information about available
                                    resources [auto|slurm|local] as well as a
                                    method of job execution (through local
                                    processes or as a Slurm sub jobs)
              --report-format REPORT_FORMAT
                                    format of job report file [text|json]
              --report-file REPORT_FILE
                                    name of the job report file
              --nodes NODES         configuration of available resources
                                    (implies --resources local)
              --log {critical,error,warning,info,debug,notset}
                                    log level
              --system-core         reserve one of the core for the QCG-PJM
              --disable-nl          disable custom launching method
              --show-progress       print information about executing tasks
              --governor            run manager in the governor mode, where
                                    jobs will be scheduled to execute to the
                                    dependent managers
              --parent PARENT       address of the parent manager, current
                                    instance will receive jobs from the
                                    parent manager
              --id ID               optional manager instance identifier -
                                    will be generated automatically when not
                                    defined
              --tags TAGS           optional manager instance tags separated
                                    by commas
              --slurm-partition-nodes SLURM_PARTITION_NODES
                                    split Slurm allocation by given number of
                                    nodes, where each group will be
                                    controlled by separate manager (implies
                                    --governor)
              --slurm-limit-nodes-range-begin SLURM_LIMIT_NODES_RANGE_BEGIN
                                    limit Slurm allocation to specified range
                                    of nodes (starting node)
              --slurm-limit-nodes-range-end SLURM_LIMIT_NODES_RANGE_END
                                    limit Slurm allocation to specified range
                                    of nodes (ending node)

            each command line argument and (optionally) its value should be
            passed as separate entry in the list

        cfg (dict) - the configuration; currently the following keys are
            supported:
              'init_timeout' - the timeout (in seconds) client should wait
                 for QCG-PilotJob manager start until it raises an error,
                 300 by default
              'poll_delay' - the delay between following status polls in
                 wait methods
              'log_file' - the location of the log file
              'log_level' - the log level ('DEBUG'); by default the log
                 level is set to INFO

    Raises:
        ServiceError: if the qcg.pilotjob library cannot be imported, if the
            service process dies or does not report its configuration within
            the timeout, or if it reports an error
        ConnectionError: if the reported configuration has no ZMQ address
    """
    client_cfg = cfg or {}
    self._setup_logging(client_cfg)

    _logger.debug('initializing MP start method with "fork"')
    mp.set_start_method("fork", force=True)
    mp.freeze_support()

    if LocalManager.is_notebook():
        _logger.debug(
            'Creating a new event loop due to run in an interactive environment'
        )
        import asyncio
        asyncio.set_event_loop(asyncio.new_event_loop())

    try:
        from qcg.pilotjob.service import QCGPMServiceProcess
    except ImportError as exc:
        raise errors.ServiceError(
            'qcg.pilotjob library is not available') from exc

    # Work on a private, stringified copy so the caller's list is never
    # mutated when '--net' has to be added.
    server_args = [str(arg) for arg in server_args or []]
    if '--net' not in server_args:
        server_args.append('--net')

    self.qcgpm_queue = mp.Queue()
    self.qcgpm_process = QCGPMServiceProcess(server_args, self.qcgpm_queue)
    self.qcgpm_conf = None
    _logger.debug('manager process created')

    self.qcgpm_process.start()
    _logger.debug('manager process started')

    try:
        # timeout of single iteration
        wait_single_timeout = 2

        # number of iterations
        wait_iters = int(
            client_cfg.get('init_timeout', 300) / wait_single_timeout) + 1

        _logger.debug(
            f'waiting {wait_iters * wait_single_timeout} secs for service start ...'
        )

        service_wait_start = datetime.now()
        for _ in range(wait_iters):
            # Stop waiting as soon as the service process has died.
            if not self.qcgpm_process.is_alive():
                raise errors.ServiceError('Service not started')

            try:
                self.qcgpm_conf = self.qcgpm_queue.get(
                    block=True, timeout=wait_single_timeout)
                break
            except queue.Empty:
                # Nothing reported yet - check the process and wait again.
                continue
            except Exception as exc:
                raise errors.ServiceError('Service not started: {}'.format(
                    str(exc))) from exc

        # Either the wait loop timed out or an empty configuration was sent.
        if not self.qcgpm_conf:
            raise errors.ServiceError('Service not started')

        if self.qcgpm_conf.get('error', None):
            raise errors.ServiceError(self.qcgpm_conf['error'])

        _logger.info(
            f'service started after {(datetime.now() - service_wait_start).total_seconds()} secs'
        )
        _logger.debug('got manager configuration: %s', str(self.qcgpm_conf))

        # Checked inside the guarded section so that a service without a
        # usable network address is terminated instead of being left running.
        if not self.qcgpm_conf.get('zmq_addresses', None):
            raise errors.ConnectionError(
                'Missing QCGPM network interface address')
    except Exception:
        if self.qcgpm_process:
            try:
                _logger.debug(
                    'killing pilotjob service process as not started properly'
                )
                self.qcgpm_process.terminate()
            except Exception:
                # Best effort cleanup - the original failure is re-raised below.
                _logger.exception('failed to kill pilotjob service')
        raise

    zmq_iface_address = self.qcgpm_conf['zmq_addresses'][0]
    _logger.info('manager zmq iface address: %s', zmq_iface_address)

    super(LocalManager, self).__init__(zmq_iface_address, cfg)