Exemplo n.º 1
0
 def __do_acct(self, job, cmd, parse):
     """Execute the accounting command `cmd`, feed its standard output
     to `parse`, and update `job` with the extracted information.

     :param job: execution object, updated in place.
     :param cmd: accounting command run over `self.transport`.
     :param parse: callable turning the command's stdout into a dict.
     :return: the (possibly updated) `job.state`.
     :raises gc3libs.exceptions.AuxiliaryCommandError:
         if `cmd` exits with a nonzero status.
     """
     exit_code, stdout, stderr = self.transport.execute_command(cmd)
     # bail out early when the accounting command itself failed
     if exit_code != 0:
         raise gc3libs.exceptions.AuxiliaryCommandError(
             "Failed running accounting command `%s`:"
             " exit code: %d, stderr: '%s'" % (cmd, exit_code, stderr),
             do_log=True)
     acct = parse(stdout)
     job.update(acct)
     if 'exitcode' in acct:
         if 'signal' in acct:
             job.returncode = (acct['signal'], acct['exitcode'])
         else:
             # XXX: we're assuming the batch system executes the
             # job through a shell, and collects the shell exit
             # code -- IOW, a job is never exec()'d directly from
             # the batch system daemon.  I'm not sure this is
             # actually true in all cases.
             job.returncode = Run.shellexit_to_returncode(
                 int(acct['exitcode']))
         # an exit code is only reported once the job has finished
         job.state = Run.State.TERMINATING
     return job.state
Exemplo n.º 2
0
Arquivo: sge.py Projeto: uzh/gc3pie
 def _parse_acct_output(self, stdout, stderr):
     """
     Parse the output of SGE's `qacct` command into a dictionary.

     Each non-empty, non-header line is split into a key/value pair;
     keys known to `self._qacct_keyval_mapping` are converted and
     stored under their mapped name, unknown keys are kept verbatim
     as strings under an ``sge_``-prefixed key.  The extracted shell
     exit code is converted to a GC3Pie termination status and
     returned under key ``termstatus``.

     :raises AssertionError: if no exit code is found in `stdout`.
     """
     acctinfo = {}
     for line in stdout.split("\n"):
         # skip empty and header lines
         line = line.strip()
         if line == "" or "===" in line:
             continue
         # extract key/value pairs from `qacct` output
         key, value = line.split(" ", 1)
         value = value.strip()
         if key == "failed":
             # value may be, e.g., "100 : assumedly after job"
             value = value.split()[0]
         try:
             dest, conv = self._qacct_keyval_mapping[key]
             acctinfo[dest] = conv(value)
         except KeyError:
             # no conversion by default -- keep it a string
             acctinfo["sge_" + key] = value
         except (ValueError, TypeError) as err:
             log.error(
                 "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                 value,
                 key,
                 err.__class__.__name__,
                 str(err),
             )
             acctinfo[dest] = None
     # FIX: the assertion message previously said `tracejob`, but this
     # method parses `qacct` output.
     assert "exitcode" in acctinfo, "Could not extract exit code from `qacct` output"
     acctinfo["termstatus"] = Run.shellexit_to_returncode(acctinfo.pop("exitcode"))
     return acctinfo
Exemplo n.º 3
0
 def _parse_acct_output(self, stdout, stderr):
     """Parse `tracejob` output.

     Each line is matched against the "queued", "run" and "last"
     regexps; every named group of a matching pattern is converted via
     `self._tracejob_keyval_mapping` and stored in the result dict.
     The extracted shell exit code is converted to a GC3Pie
     termination status and returned under key ``termstatus``.

     :raises AssertionError: if no exit code is found in `stdout`.
     """
     acctinfo = {}
     for line in stdout.split('\n'):
         for pattern, carry_on in [
                 # regexp                   exit loop?
                 # =====================    ==========
             (self._tracejob_queued_re, True),
             (self._tracejob_run_re, True),
             (self._tracejob_last_re, False),
         ]:
             match = pattern.match(line)
             if match:
                 # each named group maps to an (attribute, converter)
                 # pair in `self._tracejob_keyval_mapping`
                 for key, value in match.groupdict().items():
                     attr, conv = self._tracejob_keyval_mapping[key]
                     acctinfo[attr] = conv(value)
                 # NOTE(review): `continue`/`break` act on the *pattern*
                 # loop, not the line loop -- `continue` tries the
                 # remaining patterns on the same line (presumably
                 # harmless, as they will not re-match), while `break`
                 # stops scanning this line; confirm intent.
                 if carry_on:
                     continue
                 else:
                     break
     assert 'exitcode' in acctinfo, (
         "Could not extract exit code from `tracejob` output")
     # convert the shell exit status to GC3Pie's returncode encoding
     acctinfo['termstatus'] = Run.shellexit_to_returncode(
         acctinfo.pop('exitcode'))
     return acctinfo
Exemplo n.º 4
0
Arquivo: pbs.py Projeto: fliem/gc3pie
 def _parse_acct_output(self, stdout, stderr):
     """Parse `tracejob` output.

     Each line is matched against the "queued", "run" and "last"
     regexps; every named group of a matching pattern is converted via
     `self._tracejob_keyval_mapping` and stored in the result dict.
     The extracted shell exit code is converted to a GC3Pie
     termination status and returned under key ``termstatus``.

     :raises AssertionError: if no exit code is found in `stdout`.
     """
     acctinfo = {}
     for line in stdout.split('\n'):
         for pattern, carry_on in [
                 # regexp                   exit loop?
                 # =====================    ==========
                 (self._tracejob_queued_re, True),
                 (self._tracejob_run_re,    True),
                 (self._tracejob_last_re,   False),
         ]:
             match = pattern.match(line)
             if match:
                 # FIX: use `.items()` instead of the Python 2-only
                 # `.iteritems()`, which does not exist in Python 3.
                 for key, value in match.groupdict().items():
                     attr, conv = self._tracejob_keyval_mapping[key]
                     acctinfo[attr] = conv(value)
                 if carry_on:
                     continue
                 else:
                     break
     assert 'exitcode' in acctinfo, (
         "Could not extract exit code from `tracejob` output")
     # convert the shell exit status to GC3Pie's returncode encoding
     acctinfo['termstatus'] = Run.shellexit_to_returncode(
         acctinfo.pop('exitcode'))
     return acctinfo
Exemplo n.º 5
0
Arquivo: sge.py Projeto: imcf/gc3pie
 def _parse_acct_output(self, stdout, stderr):
     """
     Parse the output of SGE's `qacct` command into a dictionary.

     Each non-empty, non-header line is split into a key/value pair;
     keys known to `self._qacct_keyval_mapping` are converted and
     stored under their mapped name, unknown keys are kept verbatim
     as strings under an ``sge_``-prefixed key.  The extracted shell
     exit code is converted to a GC3Pie termination status and
     returned under key ``termstatus``.

     :raises AssertionError: if no exit code is found in `stdout`.
     """
     acctinfo = {}
     for line in stdout.split('\n'):
         # skip empty and header lines
         line = line.strip()
         if line == '' or '===' in line:
             continue
         # extract key/value pairs from `qacct` output
         key, value = line.split(' ', 1)
         value = value.strip()
         if key == 'failed':
             # value may be, e.g., "100 : assumedly after job"
             value = value.split()[0]
         try:
             dest, conv = self._qacct_keyval_mapping[key]
             acctinfo[dest] = conv(value)
         except KeyError:
             # no conversion by default -- keep it a string
             acctinfo['sge_' + key] = value
         except (ValueError, TypeError) as err:
             log.error(
                 "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                 value, key, err.__class__.__name__, str(err))
             acctinfo[dest] = None
     # FIX: the assertion message previously said `tracejob`, but this
     # method parses `qacct` output.
     assert 'exitcode' in acctinfo, (
         "Could not extract exit code from `qacct` output")
     acctinfo['termstatus'] = Run.shellexit_to_returncode(
         acctinfo.pop('exitcode'))
     return acctinfo
Exemplo n.º 6
0
 def __do_acct(self, job, cmd, parse):
     """Run `cmd` to get accounting information and update `job` state
     accordingly.

     :param job: execution object, updated in place.
     :param cmd: accounting command run over `self.transport`.
     :param parse: callable turning the command's stdout into a dict.
     :return: the (possibly updated) `job.state`.
     :raises gc3libs.exceptions.AuxiliaryCommandError:
         if `cmd` exits with a nonzero status.
     """
     exit_code, stdout, stderr = self.transport.execute_command(cmd)
     if exit_code == 0:
         jobstatus = parse(stdout)
         job.update(jobstatus)
         if 'exitcode' in jobstatus:
             if 'signal' in jobstatus:
                 # a `signal` entry is present: encode the returncode
                 # as the (signal, exitcode) pair
                 job.returncode = (jobstatus['signal'],
                                   jobstatus['exitcode'])
             else:
                 # XXX: we're assuming the batch system executes the
                 # job through a shell, and collects the shell exit
                 # code -- IOW, a job is never exec()'d directly from
                 # the batch system daemon.  I'm not sure this is
                 # actually true in all cases.
                 job.returncode = Run.shellexit_to_returncode(
                     int(jobstatus['exitcode']))
             # an exit code is only reported once the job has finished
             job.state = Run.State.TERMINATING
         return job.state
     else:
         raise gc3libs.exceptions.AuxiliaryCommandError(
             "Failed running accounting command `%s`:"
             " exit code: %d, stderr: '%s'"
             % (cmd, exit_code, stderr),
             do_log=True)
Exemplo n.º 7
0
    def _parse_stat_output(self, stdout, stderr):
        """
        Parse the output of LSF's ``bjobs -l`` command and return a
        `self._stat_result` carrying the GC3Pie run state and -- for
        finished jobs -- the termination status.
        """
        # LSF's `bjobs` can only report info for terminated jobs, if
        # they finished no longer than ``CLEAN_PERIOD`` seconds
        # before; for older jobs it just prints ``Job XXX is not
        # found`` to STDERR.  However, it does the same when passed a
        # non-existent job ID.  We cannot distinguish the two cases
        # here; let's just be optimistic and presume that if a job ID
        # is not found, it must have been terminated since (at least)
        # we have it in our records so it *was* submitted...  See
        # issue #513 for details.
        if self._job_not_found_re.match(stderr):
            return self._stat_result(Run.State.TERMINATING, None)

        # LSF `bjobs -l` uses a LDIF-style continuation lines, wherein
        # a line is truncated at 79 characters and continues upon the
        # next one; continuation lines start with a fixed amount of
        # whitespace.  However, the amount of whitespace varies with
        # LSF release and possibly other factors, so we need to guess
        # or have users configure it...
        if self._CONTINUATION_LINE_START is None:
            # cache the guessed prefix so later calls reuse it
            self._CONTINUATION_LINE_START = ' ' \
                * self._guess_continuation_line_prefix_len(stdout)

        # Join continuation lines, so that we can work on a single
        # block of text.
        lines = []
        for line in stdout.split('\n'):
            if len(line) == 0:
                continue
            if line.startswith(self._CONTINUATION_LINE_START):
                lines[-1] += line[len(self._CONTINUATION_LINE_START):]
            else:
                lines.append(line)

        # now rebuild stdout by joining the reconstructed lines
        stdout = '\n'.join(lines)

        # defaults, in case no status line is found below
        state = Run.State.UNKNOWN
        termstatus = None

        # XXX: this only works if the current status is the first one
        # reported in STDOUT ...
        match = LsfLrms._status_re.search(stdout)
        if match:
            lsf_job_state = match.group('state')
            state = LsfLrms._lsf_state_to_gc3pie_state(lsf_job_state)
            if lsf_job_state == 'DONE':
                # DONE = success
                termstatus = (0, 0)
            elif lsf_job_state == 'EXIT':
                # EXIT = job exited with exit code != 0
                match = LsfLrms._unsuccessful_exit_re.search(stdout)
                if match:
                    exit_status = int(match.group('exit_status'))
                    termstatus = Run.shellexit_to_returncode(exit_status)

        return self._stat_result(state, termstatus)
Exemplo n.º 8
0
    def _parse_stat_output(self, stdout, stderr):
        """
        Parse the output of LSF's ``bjobs -l`` command and return a
        `self._stat_result` carrying the GC3Pie run state and -- for
        finished jobs -- the termination status.
        """
        # LSF's `bjobs` can only report info for terminated jobs, if
        # they finished no longer than ``CLEAN_PERIOD`` seconds
        # before; for older jobs it just prints ``Job XXX is not
        # found`` to STDERR.  However, it does the same when passed a
        # non-existent job ID.  We cannot distinguish the two cases
        # here; let's just be optimistic and presume that if a job ID
        # is not found, it must have been terminated since (at least)
        # we have it in our records so it *was* submitted...  See
        # issue #513 for details.
        if self._job_not_found_re.match(stderr):
            return self._stat_result(Run.State.TERMINATING, None)

        # LSF `bjobs -l` uses a LDIF-style continuation lines, wherein
        # a line is truncated at 79 characters and continues upon the
        # next one; continuation lines start with a fixed amount of
        # whitespace.  However, the amount of whitespace varies with
        # LSF release and possibly other factors, so we need to guess
        # or have users configure it...
        if self._CONTINUATION_LINE_START is None:
            # cache the guessed prefix so later calls reuse it
            self._CONTINUATION_LINE_START = ' ' \
                * self._guess_continuation_line_prefix_len(stdout)

        # Join continuation lines, so that we can work on a single
        # block of text.
        lines = []
        for line in stdout.split('\n'):
            if len(line) == 0:
                continue
            if line.startswith(self._CONTINUATION_LINE_START):
                lines[-1] += line[len(self._CONTINUATION_LINE_START):]
            else:
                lines.append(line)

        # now rebuild stdout by joining the reconstructed lines
        # (FIX: use the idiomatic `str.join` method-call form, for
        # consistency with the rest of the codebase)
        stdout = '\n'.join(lines)

        # defaults, in case no status line is found below
        state = Run.State.UNKNOWN
        termstatus = None

        # XXX: this only works if the current status is the first one
        # reported in STDOUT ...
        match = LsfLrms._status_re.search(stdout)
        if match:
            lsf_job_state = match.group('state')
            state = LsfLrms._lsf_state_to_gc3pie_state(lsf_job_state)
            if lsf_job_state == 'DONE':
                # DONE = success
                termstatus = (0, 0)
            elif lsf_job_state == 'EXIT':
                # EXIT = job exited with exit code != 0
                match = LsfLrms._unsuccessful_exit_re.search(stdout)
                if match:
                    exit_status = int(match.group('exit_status'))
                    termstatus = Run.shellexit_to_returncode(exit_status)

        return self._stat_result(state, termstatus)
Exemplo n.º 9
0
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.

        :return: the updated `app.execution.state`.
        :raises gc3libs.exceptions.InvalidValue:
            if the process is gone but its wrapper output file cannot
            be opened.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        # a zero exit code from `grep` means the PID was listed by `ps`
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug(
                "Process with PID %s found."
                " Checking its running status", pid)
            # Process exists. Check the status
            # (assumes `ps ax` columns are PID TTY STAT TIME CMD, so
            # field [2] is the status -- TODO confirm on all platforms)
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
        else:
            log.debug(
                "Process with PID %d not found."
                " Checking wrapper file ...", pid)
            app.execution.state = Run.State.TERMINATING
            if pid in self.job_infos:
                # mark job as done and return its memory reservation
                # to the resource's available pool
                self.job_infos[pid]['terminated'] = True
                assert (app.requested_memory == self.job_infos[pid]
                        ['requested_memory'])
                if app.requested_memory:
                    self.available_memory += app.requested_memory
            # the wrapper script records the job's outcome in a file
            # under the execution directory
            wrapper_filename = posixpath.join(
                app.execution.lrms_execdir, ShellcmdLrms.WRAPPER_DIR,
                ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
            try:
                wrapper_file = self.transport.open(wrapper_filename, 'r')
            except Exception as err:
                self._delete_job_resource_file(pid)
                raise gc3libs.exceptions.InvalidValue(
                    "Could not open wrapper file '%s' for task '%s': %s" %
                    (wrapper_filename, app, err),
                    do_log=True)
            try:
                outcome = self._parse_wrapper_output(wrapper_file)
                app.execution.returncode = \
                    Run.shellexit_to_returncode(int(outcome.ReturnCode))
                self._delete_job_resource_file(pid)
            finally:
                # always close the file, even if parsing fails
                wrapper_file.close()

        self._get_persisted_resource_state()
        return app.execution.state
Exemplo n.º 10
0
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.

        :return: the updated `app.execution.state`.
        :raises gc3libs.exceptions.InvalidValue:
            if the process is gone but its wrapper output file cannot
            be opened.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        # a zero exit code from `grep` means the PID was listed by `ps`
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug("Process with PID %s found."
                      " Checking its running status", pid)
            # Process exists. Check the status
            # (assumes `ps ax` columns are PID TTY STAT TIME CMD, so
            # field [2] is the status -- TODO confirm on all platforms)
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
        else:
            log.debug(
                "Process with PID %d not found."
                " Checking wrapper file ...", pid)
            app.execution.state = Run.State.TERMINATING
            if pid in self.job_infos:
                # mark job as done and return its memory reservation
                # to the resource's available pool
                self.job_infos[pid]['terminated'] = True
                assert (app.requested_memory
                        == self.job_infos[pid]['requested_memory'])
                if app.requested_memory:
                    self.available_memory += app.requested_memory
            # the wrapper script records the job's outcome in a file
            # under the execution directory
            wrapper_filename = posixpath.join(
                app.execution.lrms_execdir,
                ShellcmdLrms.WRAPPER_DIR,
                ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
            try:
                wrapper_file = self.transport.open(wrapper_filename, 'r')
            except Exception as err:
                self._delete_job_resource_file(pid)
                raise gc3libs.exceptions.InvalidValue(
                    "Could not open wrapper file '%s' for task '%s': %s"
                    % (wrapper_filename, app, err), do_log=True)
            try:
                outcome = self._parse_wrapper_output(wrapper_file)
                app.execution.returncode = \
                    Run.shellexit_to_returncode(int(outcome.ReturnCode))
                self._delete_job_resource_file(pid)
            finally:
                # always close the file, even if parsing fails
                wrapper_file.close()

        self._get_persisted_resource_state()
        return app.execution.state
Exemplo n.º 11
0
 def _parse_secondary_acct_output(self, stdout, stderr):
     """Parse `qstat -x -f` output (PBSPro only).

     Scan every output line for each ``key = value`` pair listed in
     `self._pbspro_keyval_mapping`; matching values are converted and
     stored under the mapped attribute name.  The extracted exit code
     is converted to a GC3Pie termination status (``termstatus``).

     :raises AssertionError: if no exit code is found in `stdout`.
     """
     acctinfo = {}
     # FIXME: could be a bit smarter and not use a dumb quadratic
     # complexity algo...
     for line in stdout.split('\n'):
         for key, (attr, conv) in self._pbspro_keyval_mapping:
             if (key + ' = ') in line:
                 # FIX: split on the *first* `=` only, so values that
                 # themselves contain `=` are not truncated
                 value = line.split('=', 1)[1].strip()
                 acctinfo[attr] = conv(value)
     assert 'exitcode' in acctinfo, (
         "Could not extract exit code from `qstat -x -f` output")
     acctinfo['termstatus'] = Run.shellexit_to_returncode(
         acctinfo.pop('exitcode'))
     return acctinfo
Exemplo n.º 12
0
Arquivo: pbs.py Projeto: fliem/gc3pie
 def _parse_secondary_acct_output(self, stdout, stderr):
     """Parse `qstat -x -f` output (PBSPro only).

     Scan every output line for each ``key = value`` pair listed in
     `self._pbspro_keyval_mapping`; matching values are converted and
     stored under the mapped attribute name.  The extracted exit code
     is converted to a GC3Pie termination status (``termstatus``).

     :raises AssertionError: if no exit code is found in `stdout`.
     """
     acctinfo = {}
     # FIXME: could be a bit smarter and not use a dumb quadratic
     # complexity algo...
     for line in stdout.split('\n'):
         for key, (attr, conv) in self._pbspro_keyval_mapping:
             if (key + ' = ') in line:
                 # FIX: split on the *first* `=` only, so values that
                 # themselves contain `=` are not truncated
                 value = line.split('=', 1)[1].strip()
                 acctinfo[attr] = conv(value)
     assert 'exitcode' in acctinfo, (
         "Could not extract exit code from `qstat -x -f` output")
     acctinfo['termstatus'] = Run.shellexit_to_returncode(
         acctinfo.pop('exitcode'))
     return acctinfo
Exemplo n.º 13
0
def _parse_returncode_string(val):
    """Convert a shell exit-status string into a GC3Pie return code."""
    shell_exit = int(val)
    return Run.shellexit_to_returncode(shell_exit)
Exemplo n.º 14
0
    def update_job_state(self, app):
        """
        Query the batch system for the current status of `app`'s job
        and update `app.execution` accordingly.

        The ``stat`` command is tried first; if it yields no usable
        state, the accounting command(s) are consulted; if those fail
        too, the time of first failure is recorded and compared to
        `self.accounting_delay` on subsequent calls before giving up.

        :return: the updated `Run.State` of the job.
        :raises gc3libs.exceptions.InvalidArgument:
            if `app.execution` has no `lrms_jobid`.
        :raises gc3libs.exceptions.LRMSError:
            if accounting info is still missing after the grace period.
        """
        try:
            job = app.execution
            # mere attribute access: raises AttributeError if unset
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning("Unknown batch job status,"
                                " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error("Failed while running the `qstat`/`bjobs` command."
                          " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug("Retrieving accounting information using command"
                          " '%s' ..." % cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug(
                                "The primary job accounting command"
                                " returned no information; trying"
                                " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            try:
                if (time.time() - job.stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    # NOTE(review): the concatenated message below lacks
                    # a space before "exit status"
                    log.critical(
                        "Failed executing remote command: '%s';"
                        "exit status %d", cmd, exit_code)
                    log.debug("  remote command returned stdout: '%s'", stdout)
                    log.debug("  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                else:
                    # do nothing, let's try later...
                    return job.state
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()

        except Exception as ex:
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state
Exemplo n.º 15
0
def _parse_returncode_string(val):
    """Turn the textual shell exit status `val` into a GC3Pie returncode."""
    exitcode = int(val)
    return Run.shellexit_to_returncode(exitcode)
Exemplo n.º 16
0
    def update_job_state(self, app):
        """
        Query the batch system for the current status of `app`'s job
        and update `app.execution` accordingly.

        The ``stat`` command is tried first; if it yields no usable
        state, the accounting command(s) are consulted; if those fail
        too, the time of first failure is recorded and compared to
        `self.accounting_delay` on subsequent calls before giving up.

        :return: the updated `Run.State` of the job.
        :raises gc3libs.exceptions.InvalidArgument:
            if `app.execution` has no `lrms_jobid`.
        :raises gc3libs.exceptions.LRMSError:
            if accounting info is still missing after the grace period.
        """
        try:
            job = app.execution
            # mere attribute access: raises AttributeError if unset
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning(
                        "Unknown batch job status,"
                        " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error(
                    "Failed while running the `qstat`/`bjobs` command."
                    " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug(
                    "Retrieving accounting information using command"
                    " '%s' ..." % cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug("The primary job accounting command"
                                      " returned no information; trying"
                                      " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            try:
                if (time.time() - job.stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    # NOTE(review): the concatenated message below lacks
                    # a space before "exit status"
                    log.critical(
                        "Failed executing remote command: '%s';"
                        "exit status %d", cmd, exit_code)
                    log.debug(
                        "  remote command returned stdout: '%s'", stdout)
                    log.debug(
                        "  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                else:
                    # do nothing, let's try later...
                    return job.state
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()

        except Exception as ex:
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state