def send_command(self, command, **kwargs):
    # Test double: assert the exact qstat call composed by the plugin,
    # then return canned `qstat` output plus a zero exit code.
    # `job_names` is expected to be defined in the enclosing test module.
    expected_cmd = 'qstat -i'\
                   ' `echo {} '\
                   '| xargs -n 1 qselect -N` '\
                   '| tail -n+6 '\
                   '| awk \'{{ print $4 "|" $10 }}\''.format(
                       shlex_quote(' '.join(
                           map(shlex_quote, job_names))))
    self._test_case.assertEqual(command, expected_cmd)
    return """
test_1 | S
test 2 | R\n""", 0
def _get_states_detailed(self, job_names):
    """
    Get job states by job names.

    This function uses the `qstat` command to query PBSPro.
    Please don't launch this call very frequently. Polling it
    frequently, especially across all users on the cluster, will
    slow down response times and may bring scheduling to a crawl.

    It allows a precise mapping of Torque states to Slurm states by
    taking `exit_code` into account. Unlike `get_states_tabular` it
    parses the output on the host and uses several SSH commands.
    """
    # identify job ids
    # Read environment, required by some HPC (e.g. HLRS Hawk)
    read_environment = "source /etc/profile > /dev/null 2>&1; "
    call = read_environment + "echo {} | xargs -n 1 qselect -x -N".format(
        shlex_quote(' '.join(map(shlex_quote, job_names))))

    client = SshClient(self.credentials)

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=self.workdir,
                                                     wait_result=True)

    job_ids = Pbspro._parse_qselect(output)
    if not job_ids:
        return {}

    # get detailed information about jobs
    call = read_environment + "qstat -x -f {}".format(
        ' '.join(map(str, job_ids)))

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=self.workdir,
                                                     wait_result=True)
    client.close_connection()
    try:
        job_states, audits = Pbspro._parse_qstat_detailed(output)
    except SyntaxError as e:
        self.logger.warning(
            "cannot parse state response for job ids=[{}]".format(
                ','.join(map(str, job_ids))))
        self.logger.warning(
            "{err}\n`qstat -x -f` output to parse:\n\\[\n{text}\n\\]".format(
                err=str(e), text=output))
        # TODO: think whether error ignoring is better
        #       for the correct lifecycle
        raise e

    return job_states, audits
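# `Pbspro._parse_qselect` is used above but not shown in this section. A
# minimal sketch, assuming `qselect -x -N <name>` prints one job id per
# line (e.g. "1234.pbs-server") and that the numeric prefix is what the
# follow-up `qstat -x -f` call needs; illustrative, not necessarily the
# plugin's actual implementation.
@staticmethod
def _parse_qselect(qselect_output):
    """Parse `qselect` output into a list of integer job ids."""
    jobs = qselect_output.splitlines()
    if not jobs or (len(jobs) == 1 and jobs[0] == ''):
        return []
    # keep only the numeric part before the first dot
    return [int(job.split('.')[0]) for job in jobs]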
def test_identifying_job_ids_call(self):
    """ Call for revealing job ids by job names. """
    job_names = ('test_1', 'test 2')
    # @TODO: replace by _get_jobids_by_name() as soon as the dependency on
    #        SSH client is removed.
    from croupier_plugin.utilities import shlex_quote
    response = "qstat -i `echo {} | xargs -n 1 qselect -N` |"\
               " tail -n+6 | awk '{{ print $4 \" \" $1 }}'".format(
                   shlex_quote(' '.join(map(shlex_quote, job_names))))
    self.assertEqual(
        response,
        'qstat -i '
        '`echo \'\'"\'"\'test 2\'"\'"\' test_1\' |'
        ' xargs -n 1 qselect -N` |'
        ' tail -n+6 | awk \'{ print $4 " " $1 }\'')
@staticmethod
def _get_states_tabular(ssh_client, job_names, logger):
    """
    Get job states by job names.

    This function uses the `qstat` command to query Torque.
    Please don't launch this call very frequently. Polling it
    frequently, especially across all users on the cluster, will
    slow down response times and may bring scheduling to a crawl.

    It invokes `tail`/`awk` to do the simple parsing on the remote HPC.
    """
    # TODO:(emepetres) set start day of consulting
    # @caution This code fails to manage the situation
    #          if several jobs have the same name
    call = "qstat -i `echo {} | xargs -n 1 qselect -N` "\
           "| tail -n+6 | awk '{{ print $4 \"|\" $10 }}'".format(
               shlex_quote(' '.join(map(shlex_quote, job_names))))
    output, exit_code = ssh_client.send_command(call, wait_result=True)
    return Torque._parse_qstat_tabular(output) if exit_code == 0 else {}
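# `Torque._parse_qstat_tabular` is used above but not shown in this
# section. A minimal sketch, assuming the awk output above: one
# "name|state" pair per line, possibly padded with spaces (as in the test
# double earlier in this section); illustrative, not necessarily the
# plugin's actual implementation.
@staticmethod
def _parse_qstat_tabular(qstat_output):
    """Parse `qstat ... | awk` output into a {job_name: state} dict."""
    parsed = {}
    for line in qstat_output.splitlines():
        if not line.strip():
            continue
        # split on the last '|' so job names containing spaces
        # (e.g. "test 2") survive intact
        name, state = line.rsplit('|', 1)
        parsed[name.strip()] = state.strip()
    return parsed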
def send_command(self,
                 command,
                 exec_timeout=3000,
                 read_chunk_timeout=500,
                 wait_result=False):
    """Sends a command and returns stdout, stderr and exitcode"""

    # Check if connection was made previously
    if self._client is not None:
        if self._login_shell:
            cmd = "bash -l -c {}".format(shlex_quote(command))
        else:
            cmd = command

        # there is one channel per command
        stdin, stdout, stderr = self._client.exec_command(
            cmd, timeout=exec_timeout)

        if wait_result:
            # get the shared channel for stdout/stderr/stdin
            channel = stdout.channel

            # we do not need stdin
            stdin.close()
            # indicate that we're not going to write to that channel
            channel.shutdown_write()

            # read stdout/stderr in order to prevent read block hangs
            stdout_chunks = []
            stdout_chunks.append(stdout.channel.recv(
                len(stdout.channel.in_buffer)))

            # chunked read to prevent stalls
            while (not channel.closed
                   or channel.recv_ready()
                   or channel.recv_stderr_ready()):
                # Stop if channel was closed prematurely,
                # and there is no data in the buffers.
                got_chunk = False
                readq, _, _ = select.select([stdout.channel], [], [],
                                            read_chunk_timeout)
                for c in readq:
                    if c.recv_ready():
                        stdout_chunks.append(
                            stdout.channel.recv(len(c.in_buffer)))
                        got_chunk = True
                    if c.recv_stderr_ready():
                        # make sure to read stderr to prevent stall
                        stderr.channel.recv_stderr(
                            len(c.in_stderr_buffer))
                        got_chunk = True
                '''
                1) make sure that there are at least 2 cycles with no
                   data in the input buffers in order to not exit too
                   early (i.e. cat on a >200k file).
                2) if no data arrived in the last loop, check if we
                   already received the exit code
                3) check if input buffers are empty
                4) exit the loop
                '''
                if (not got_chunk
                        and stdout.channel.exit_status_ready()
                        and not stderr.channel.recv_stderr_ready()
                        and not stdout.channel.recv_ready()):
                    # Indicate that we're not going to read from
                    # this channel anymore
                    stdout.channel.shutdown_read()
                    # close the channel
                    stdout.channel.close()
                    # Remote side is finished & our buffers are empty
                    break

        # close all the pseudofiles
        stdout.close()
        stderr.close()

        if wait_result:
            # exit code is always ready at this point
            exit_code = stdout.channel.recv_exit_status()
            # TODO on a non-zero exit code, stderr should be returned too
            output = ''.join(stdout_chunks)
            return (output, exit_code)
        else:
            return True
    else:
        if wait_result:
            return (None, None)
        else:
            return False
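# Minimal usage sketch for send_command above. The shape of the
# `credentials` dict is assumed from how SshClient is constructed
# elsewhere in this section; the command itself is just an example.
#
#     client = SshClient(credentials)
#     output, exit_code = client.send_command('qstat -Q', wait_result=True)
#     if exit_code == 0:
#         print(output)
#     client.close_connection()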
def _build_job_submission_call(self, name, job_settings, logger):
    # check input information correctness
    if not isinstance(job_settings, dict) or \
            not isinstance(name, basestring):
        return {'error': "Incorrect inputs"}

    if 'type' not in job_settings or 'command' not in job_settings:
        return {
            'error': "'type' and 'command' "
                     "must be defined in job settings"
        }

    if 'type' in job_settings and job_settings['type'] != 'SBATCH':
        return {
            'error': "Job type '" + job_settings['type'] +
                     "' not supported. Torque supports only batch jobs."
        }

    # Build single line command
    torque_call = ''

    # NOTE an uploaded script could also be interesting to execute
    if 'pre' in job_settings:
        for entry in job_settings['pre']:
            torque_call += entry + '; '

    # ################### Torque settings ###################
    # qsub command plus job name
    torque_call += "qsub -V -N {}".format(shlex_quote(name))

    resources_request = ""
    if 'nodes' in job_settings:
        resources_request = "nodes={}".format(job_settings['nodes'])

        # number of cores requested per node
        if 'tasks_per_node' in job_settings:
            resources_request += ':ppn={}'.format(
                job_settings['tasks_per_node'])
    else:
        if 'tasks_per_node' in job_settings:
            logger.error(
                "Specified 'tasks_per_node' while 'nodes' is not specified")

    if 'max_time' in job_settings:
        if len(resources_request) > 0:
            resources_request += ','
        resources_request += 'walltime={}'.format(job_settings['max_time'])

    if len(resources_request) > 0:
        torque_call += ' -l {}'.format(resources_request)

    # more precisely it is a destination [queue][@server]
    if 'queue' in job_settings:
        torque_call += " -q {}".format(shlex_quote(job_settings['queue']))

    if 'rerunnable' in job_settings:  # same as requeue in SLURM
        torque_call += " -r {}".format(
            'y' if job_settings['rerunnable'] else 'n')

    if 'work_dir' in job_settings:
        torque_call += " -w {}".format(
            shlex_quote(job_settings['work_dir']))

    additional_attributes = {}
    if 'group_name' in job_settings:
        additional_attributes["group_list"] = shlex_quote(
            job_settings['group_name'])

    if len(additional_attributes) > 0:
        torque_call += " -W {}".format(','.join(
            "{0}={1}".format(k, v)
            for k, v in additional_attributes.iteritems()))

    # if 'tasks' in job_settings:
    #     torque_call += ' -n ' + str(job_settings['tasks'])
    # #######################################################

    response = {}
    if 'scale' in job_settings and \
            int(job_settings['scale']) > 1:
        # set the max of parallel jobs
        scale_max = int(job_settings['scale'])
        # set the job array (Torque uses -t, cf. $PBS_ARRAYID below)
        torque_call += ' -t 0-{}'.format(scale_max - 1)
        if 'scale_max_in_parallel' in job_settings and \
                int(job_settings['scale_max_in_parallel']) > 0:
            torque_call += '%{}'.format(
                job_settings['scale_max_in_parallel'])
            scale_max = int(job_settings['scale_max_in_parallel'])
        # map the orchestrator variables after last sbatch
        scale_env_mapping_call = \
            "sed -i '/# DYNAMIC VARIABLES/a\\" \
            "SCALE_INDEX=$PBS_ARRAYID\\n" \
            "SCALE_COUNT={scale_count}\\n" \
            "SCALE_MAX={scale_max}' {command}".format(
                scale_count=job_settings['scale'],
                scale_max=scale_max,
                command=job_settings['command'].split()[0])  # file only
        response['scale_env_mapping_call'] = scale_env_mapping_call

    # add executable and arguments
    torque_call += ' {}'.format(job_settings['command'])

    # NOTE an uploaded script could also be interesting to execute
    if 'post' in job_settings:
        torque_call += '; '
        for entry in job_settings['post']:
            torque_call += entry + '; '

    response['call'] = torque_call
    return response
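# Illustrative sketch of what the builder above emits. The settings below
# use only keys the function actually reads; the resulting line is derived
# by hand from the string construction above, not from a captured run, and
# `wm` is a placeholder for a Torque workload-manager instance.
#
#     settings = {
#         'type': 'SBATCH',
#         'command': 'run_sim.sh input.dat',
#         'nodes': 2,
#         'tasks_per_node': 4,
#         'max_time': '00:30:00',
#         'queue': 'batch',
#     }
#     response = wm._build_job_submission_call('sim_job', settings, logger)
#     # response['call'] ==
#     #   "qsub -V -N sim_job -l nodes=2:ppn=4,walltime=00:30:00 -q batch"
#     #   " run_sim.sh input.dat"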
def _build_job_cancellation_call(self, name, job_settings, logger):
    return r"qselect -N {} | xargs qdel".format(shlex_quote(name))
def _parse_job_settings(self, job_id, job_settings, script=False):
    _settings = {'data': ''}

    if script:
        _prefix = '#PBS'
        _suffix = '\n'
    else:
        _prefix = ''
        _suffix = ''

    # TODO write for script (prefix, suffix ??)
    if not script:
        # qsub command plus job name
        _settings['data'] += "qsub -V -N {}".format(shlex_quote(job_id))

    # Check if exists and has content
    def _check_job_settings_key(key):
        return key in job_settings and str(job_settings[key]).strip()

    def _add_setting(option, value, op_separator=' '):
        _settings['data'] += '{} {}{}{}{}'.format(
            _prefix, option, op_separator, value, _suffix)

    if not _check_job_settings_key('nodes') and \
            _check_job_settings_key('tasks_per_node'):
        return {
            'error': "Specified 'tasks_per_node' while "
                     "'nodes' is not specified"
        }

    if _check_job_settings_key('nodes'):
        node_request = "nodes={}".format(job_settings['nodes'])

        # number of cores requested per node
        # TODO If tasks and no tasks_per_node, then
        #      tasks_per_node = tasks/nodes
        if _check_job_settings_key('tasks_per_node'):
            node_request += ':ppn={}'.format(
                job_settings['tasks_per_node'])

        _add_setting('-l', node_request)

    if _check_job_settings_key('max_time'):
        _add_setting('-l', 'walltime={}'.format(job_settings['max_time']))

    if _check_job_settings_key('queue') or \
            _check_job_settings_key('partition'):
        if _check_job_settings_key('queue'):
            queue = job_settings['queue']
        else:
            queue = job_settings['partition']
        _add_setting('-q', shlex_quote(queue))

    if _check_job_settings_key('memory'):
        _add_setting('-l', 'mem={}'.format(job_settings['memory']))

    if _check_job_settings_key('mail_user'):
        _add_setting('-M', job_settings['mail_user'])

    # FIXME make slurm and torque compatible
    # a (aborted)
    # b (when it begins)
    # e (when it ends)
    # f (when it terminates with a non-zero exit code)
    if _check_job_settings_key('mail_type'):
        _add_setting('-m', job_settings['mail_type'])

    if _check_job_settings_key('account'):
        _add_setting('-A', job_settings['account'])

    if _check_job_settings_key('stderr_file'):
        _add_setting('-e', job_settings['stderr_file'])
    else:
        _add_setting('-e', job_id + '.err')

    if _check_job_settings_key('stdout_file'):
        _add_setting('-o', job_settings['stdout_file'])
    else:
        _add_setting('-o', job_id + '.out')

    additional_attributes = {}
    if 'group_name' in job_settings:
        additional_attributes["group_list"] = shlex_quote(
            job_settings['group_name'])
    # NOTE: additional_attributes is built but not yet appended ('-W')

    # add scale, executable and arguments
    if not script:
        if 'scale' in job_settings and \
                int(job_settings['scale']) > 1:
            # set the job array
            _settings['data'] += ' -t 0-{}'.format(
                int(job_settings['scale']) - 1)
            if 'scale_max_in_parallel' in job_settings and \
                    int(job_settings['scale_max_in_parallel']) > 0:
                _settings['data'] += '%{}'.format(
                    job_settings['scale_max_in_parallel'])
        _settings['data'] += ' ' + job_settings['script']
        if _check_job_settings_key('arguments'):
            args = ''
            for arg in job_settings['arguments']:
                args += arg + ' '
            _settings['data'] += ' -F "{}"'.format(args)
        _settings['data'] += '; '

    return _settings
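# Illustrative sketch of the script mode of _parse_job_settings above;
# the settings values are assumptions made up for the example, and the
# result is derived by hand from the construction above:
#
#     self._parse_job_settings(
#         'job1', {'nodes': 1, 'max_time': '01:00:00'}, script=True)
#     # -> {'data': '#PBS -l nodes=1\n'
#     #             '#PBS -l walltime=01:00:00\n'
#     #             '#PBS -e job1.err\n'
#     #             '#PBS -o job1.out\n'}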