Example #1
    def __dirs_content_equal(self, dir1, dir2):
        if dir1 == '' or dir2 == '' or dir1 is None or dir2 is None:
            return -1
        _, stdout1, stderr1 = execute(' '.join(['ls', dir1, '|', 'grep', '-v', 'dest']))
        _, stdout2, stderr2 = execute(' '.join(['ls', dir2, '|', 'grep', '-v', 'dest']))
        if stdout1 != stdout2:
            return -2
        return 0
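Every example on this page calls a shared execute() helper from the pilot codebase whose implementation is not shown here. A minimal sketch of the assumed interface, a subprocess wrapper that returns an (exit_code, stdout, stderr) tuple and accepts a mute flag to suppress logging, could look like the following; the extra keyword arguments seen in later examples (returnproc, workdir, usecontainer, ...) are ignored in this illustration:

import subprocess

def execute(cmd, mute=False, **kwargs):
    """Sketch of the assumed helper: run a shell command, return (exit_code, stdout, stderr)."""
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = process.communicate()
    if not mute:
        print('executed: %s (exit code %d)' % (cmd, process.returncode))
    return process.returncode, stdout.strip(), stderr.strip()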
Example #2
    def __dirs_content_valid(self, dir1, dir2, dir1_expected_content=None, dir2_expected_content=None):
        # currently this fails: need to fix
        if dir1 == '' or dir2 == '' or dir1 is None or dir2 is None:
            return -1
        _, stdout1, stderr1 = execute(' '.join(['ls', dir1, '|', 'grep', '-v', 'dest']))
        if dir1_expected_content is not None and stdout1 != dir1_expected_content:
            return -3
        _, stdout2, stderr2 = execute(' '.join(['ls', dir2, '|', 'grep', '-v', 'dest']))
        if dir2_expected_content is not None and stdout2 != dir2_expected_content:
            return -4
        return 0
Example #3
def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: (updated job object.)
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (
        job.jobid, time_stamp())
    logger.fatal(diagnostics)

    cmd = 'ps -fwu %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ls -ltr %s' % (job.workdir)
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'pstree -g -a'
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
Example #4
def set_core_counts(job):
    """
    Set the number of used cores.

    :param job: job object.
    :return:
    """

    if job.pgrp:
        cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (
            job.pgrp, job.pgrp)
        exit_code, stdout, stderr = execute(cmd, mute=True)
        logger.debug('%s: %s' % (cmd, stdout))
        try:
            job.actualcorecount = int(stdout)
        except Exception as e:
            logger.warning(
                'failed to convert number of actual cores to int: %s' % e)
        else:
            logger.debug('set number of actual cores to: %d' %
                         job.actualcorecount)

            # overwrite the original core count and add it to the list
            job.corecount = job.actualcorecount
            job.corecounts = add_core_count(job.actualcorecount)
            logger.debug('current core counts list: %s' % str(job.corecounts))

    else:
        logger.debug(
            'payload process group not set - cannot check number of cores used by payload'
        )
Example #5
def has_instruction_sets(instruction_sets):
    """
    Determine whether a given list of CPU instruction sets is available.
    The function will use grep to search in /proc/cpuinfo (both in upper and lower case).
    Example: instruction_sets = ['AVX', 'AVX2', 'SSE4_2', 'XXX'] -> "AVX|AVX2|SSE4_2"
    :param instruction_sets: list of instruction set names (e.g. ['AVX', 'AVX2']) (list).
    :return: '|'-separated string of the instruction sets that were found (string, empty if none).
    """

    ret = ''
    r = ''

    for i in instruction_sets:
        r += r'\|%s[^ ]*\|%s[^ ]*' % (
            i.lower(), i.upper()) if r else r'%s[^ ]*\|%s[^ ]*' % (i.lower(),
                                                                   i.upper())
    cmd = "grep -o \'%s\' /proc/cpuinfo" % r

    exit_code, stdout, stderr = execute(cmd)
    if not exit_code and not stderr:
        for i in instruction_sets:
            if i.lower() in stdout.split() or i.upper() in stdout.split():
                ret += '|%s' % i.upper() if ret else i.upper()

    return ret
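A hypothetical call, assuming a Linux host where /proc/cpuinfo exists; note that the function returns a string rather than a Boolean:

# illustrative usage; the unknown 'XXX' entry simply will not be found
available = has_instruction_sets(['AVX', 'AVX2', 'SSE4_2', 'XXX'])
if available:
    print('CPU supports: %s' % available)  # e.g. "AVX|AVX2|SSE4_2"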
Example #6
    def execute_utility_command(self, cmd, job, label):
        """
        Execute a utility command (e.g. pre/postprocess commands; label=preprocess etc).

        :param cmd: full command to be executed (string).
        :param job: job object.
        :param label: command label (string).
        :return: exit code (int).
        """

        exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False)
        if exit_code:
            ignored_exit_codes = [160, 161, 162]
            logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details', cmd, exit_code)
            if label == 'preprocess':
                err = errors.PREPROCESSFAILURE
            elif label == 'postprocess':
                err = errors.POSTPROCESSFAILURE
            else:
                err = 0  # ie ignore
            if err and exit_code not in ignored_exit_codes:  # ignore no-more-data-points exit codes
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err)
            if exit_code in ignored_exit_codes:
                job.transexitcode = exit_code

        # write output to log files
        self.write_utility_output(job.workdir, label, stdout, stderr)

        return exit_code
Example #7
    def utility_after_payload_started(self, job):
        """
        Run any utility commands after the payload has started.

        :param job: job object.
        """

        # get the payload command from the user specific code
        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3

        # should any additional commands be executed after the payload?
        cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job)
        if cmd_dictionary:
            cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args'))
            logger.info('utility command to be executed after the payload: %s', cmd)

            # how should this command be executed?
            utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job)
            if not utilitycommand:
                logger.warning('empty utility command - nothing to run')
                return
            try:
                proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False,
                                stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job)
            except Exception as error:
                logger.error('could not execute: %s', error)
            else:
                # store process handle in job object, and keep track on how many times the command has been launched
                # also store the full command in case it needs to be restarted later (by the job_monitor() thread)
                job.utilities[cmd_dictionary.get('command')] = [proc1, 1, utilitycommand]
Example #8
def verify_arcproxy(envsetup, limit):
    """
    Verify the proxy using arcproxy.

    :param envsetup: general setup string for proxy commands (string).
    :param limit: time limit in hours (int).
    :return: exit code (int), error diagnostics (string).
    """

    ec = 0
    diagnostics = ""

    cmd = "%sarcproxy -i vomsACvalidityLeft" % (envsetup)

    exit_code, stdout, stderr = execute(cmd, shell=True)
    if stdout is not None:
        if 'command not found' in stdout:
            logger.warning(
                "arcproxy is not available on this queue,"
                "this can lead to memory issues with voms-proxy-info on SL6: %s"
                % (stdout))
        else:
            ec, diagnostics = interpret_proxy_info(exit_code, stdout, stderr,
                                                   limit)
            if ec == 0:
                logger.info("voms proxy verified using arcproxy")
                return 0, diagnostics
            elif ec == errors.NOVOMSPROXY:
                return ec, diagnostics
            else:
                logger.info("will try voms-proxy-info instead")
    else:
        logger.warning('command execution failed')

    return ec, diagnostics
Example #9
    def test_copy_in_symlink(self):
        copy_in(self.indata, copy_type='symlink', workdir=self.tmp_dst_dir)
        # here check files linked
        self.assertEqual(self.__dirs_content_equal(self.tmp_src_dir, self.tmp_dst_dir), 0)
        # check dst files are links
        _, stdout, _ = execute(r'find %s -type l -exec echo -n l \;' % self.tmp_dst_dir)  # Python 3 (added r)
        self.assertEqual(stdout, ''.join('l' for i in range(self.numfiles)))
Example #10
def get_time_for_last_touch(job, mt, looping_limit):
    """
    Return the time when the files in the workdir were last touched.
    In case no file was touched since the last check, the returned value will be the same as the previous time.

    :param job: job object.
    :param mt: `MonitoringTime` object.
    :param looping_limit: looping limit in seconds.
    :return: time in seconds since epoch (int) (or None in case of failure).
    """

    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    loopingjob_definitions = __import__(
        'pilot.user.%s.loopingjob_definitions' % pilot_user, globals(),
        locals(), [pilot_user], 0)  # Python 2/3

    # locate all files that were modified the last N minutes
    cmd = "find %s -type f -mmin -%d" % (job.workdir, int(looping_limit / 60))
    exit_code, stdout, stderr = execute(cmd)
    if exit_code == 0:
        if stdout != "":
            files = stdout.split(
                "\n")  # find might add a \n even for single entries

            # remove unwanted list items (*.py, *.pyc, workdir, ...)
            files = loopingjob_definitions.remove_unwanted_files(
                job.workdir, files)
            if files:
                logger.info('found %d files that were recently updated',
                            len(files))
                #logger.debug('recent files:\n%s', files)
                updated_files = verify_file_list(files)

                # now get the mod times for these files, and identify the most recently updated file
                latest_modified_file, mtime = find_latest_modified_file(
                    updated_files)
                if latest_modified_file:
                    logger.info(
                        "file %s is the most recently updated file (at time=%d)",
                        latest_modified_file, mtime)
                else:
                    logger.warning(
                        'looping job algorithm failed to identify latest updated file'
                    )
                    return mt.ct_looping_last_touched

                # store the time of the last file modification
                mt.update('ct_looping_last_touched', modtime=mtime)
            else:
                logger.warning("found no recently updated files!")
        else:
            logger.warning('found no recently updated files')
    else:
        # cut the output if too long
        stdout = cut_output(stdout)
        stderr = cut_output(stderr)
        logger.warning('find command failed: %d, %s, %s', exit_code, stdout,
                       stderr)

    return mt.ct_looping_last_touched
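A hedged sketch of how the returned last-touch time presumably feeds the looping check; the monitor wiring and the threshold comparison below are assumptions based on the docstrings of this function and of kill_looping_job() (Example #3):

import time

def check_looping(job, mt, looping_limit):
    # sketch only: kill the job if no workdir file was touched within the limit
    last_touched = get_time_for_last_touch(job, mt, looping_limit)
    if last_touched and int(time.time()) - last_touched > looping_limit:
        kill_looping_job(job)  # see Example #3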
Example #11
    def run_command(self, cmd, label=None):
        """
        Execute the given command and return the process handle.

        :param cmd: command (string).
        :param label: command label (string).
        :return: process handle (or None on failure).
        """

        if label:
            logger.info('\n\n%s:\n\n%s\n', label, cmd)
        if label == 'coprocess':
            try:
                out = open(os.path.join(self.__job.workdir, self.__coprocess_stdout_name), 'wb')
                err = open(os.path.join(self.__job.workdir, self.__coprocess_stderr_name), 'wb')
            except Exception as error:
                logger.warning('failed to open coprocess stdout/err: %s', error)
                out = None
                err = None
        else:
            out = None
            err = None
        try:
            proc = execute(cmd, workdir=self.__job.workdir, returnproc=True, stdout=out, stderr=err,
                           usecontainer=False, cwd=self.__job.workdir, job=self.__job)
        except Exception as error:
            logger.error('could not execute: %s', error)
            return None
        if isinstance(proc, tuple) and not proc[0]:
            logger.error('failed to execute command')
            return None

        logger.info('started %s -- pid=%s executable=%s', label, proc.pid, cmd)

        return proc
Example #12
def find_processes_in_group(cpids, pid):
    """
    Find all processes that belong to the same group.
    Recursively search for the children processes belonging to pid and return their pid's.
    pid is the parent pid and cpids is a list that has to be initialized before calling this function and it contains
    the pids of the children AND the parent.

    :param cpids: list of pid's for all child processes to the parent pid, as well as the parent pid itself (int).
    :param pid: parent process id (int).
    :return: (updated cpids input parameter list).
    """

    if not pid:
        return

    cpids.append(pid)

    cmd = "ps -eo pid,ppid -m | grep %d" % pid
    exit_code, psout, stderr = execute(cmd, mute=True)

    lines = psout.split("\n")
    if lines != ['']:
        for i in range(0, len(lines)):
            try:
                thispid = int(lines[i].split()[0])
                thisppid = int(lines[i].split()[1])
            except Exception as e:
                logger.warning('exception caught: %s' % e)
                continue  # skip malformed lines instead of reusing stale pid values
            if thisppid == pid:
                find_processes_in_group(cpids, thispid)
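A usage sketch: cpids must be initialized by the caller and is updated in place with the parent pid plus all of its recursively found children:

import os

cpids = []  # filled in place by the function
find_processes_in_group(cpids, os.getpid())
print('process group members: %s' % cpids)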
Example #13
def create_core_dump(pid=None, workdir=None):
    """
    Create core dump and copy it to work directory
    """

    if not pid or not workdir:
        logger.warning(
            'cannot create core file since pid or workdir is unknown')
        return

    cmd = 'gdb --pid %d -ex \'generate-core-file\'' % pid
    exit_code, stdout, stderr = execute(cmd)
    if not exit_code:
        path = locate_core_file(pid=pid)
        if path:
            try:
                copy(path, workdir)
            except Exception as error:
                logger.warning('failed to copy core file: %s', error)
            else:
                logger.debug('copied core dump to workdir')

    else:
        logger.warning('failed to execute command: %s, stdout+err=%s', cmd,
                       stdout + stderr)
Example #14
def verify_vomsproxy(envsetup, limit):
    """
    Verify proxy using voms-proxy-info command.

    :param envsetup: general setup string for proxy commands (string).
    :param limit: time limit in hours (int).
    :return: exit code (int), error diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    if os.environ.get('X509_USER_PROXY', '') != '':
        cmd = "%svoms-proxy-info -actimeleft --file $X509_USER_PROXY" % envsetup
        logger.info('executing command: %s', cmd)
        _exit_code, stdout, stderr = execute(cmd, shell=True)
        if stdout is not None:
            if "command not found" in stdout:
                logger.info(
                    "skipping voms proxy check since command is not available")
            else:
                exit_code, diagnostics, validity_end = interpret_proxy_info(
                    _exit_code, stdout, stderr, limit)
                if exit_code == 0:
                    logger.info("voms proxy verified using voms-proxy-info")
                    return 0, diagnostics
        else:
            logger.warning('command execution failed')
    else:
        logger.warning('X509_USER_PROXY is not set')

    return exit_code, diagnostics
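Examples #8 and #14 hint at a fallback chain ("will try voms-proxy-info instead"); a hedged sketch of how a caller might combine them, noting that the actual wrapper in the source may differ:

def verify_proxy(envsetup='', limit=48):
    # sketch only: arcproxy first, voms-proxy-info as fallback unless arcproxy
    # already gave a definitive answer (verified, or no proxy found at all)
    exit_code, diagnostics = verify_arcproxy(envsetup, limit)
    if exit_code != 0 and exit_code != errors.NOVOMSPROXY:
        exit_code, diagnostics = verify_vomsproxy(envsetup, limit)
    return exit_code, diagnostics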
Example #15
    def untar_file(self, lfn, job):
        """
        Untar the given file in the job work directory.

        :param lfn: local file name (string).
        :param job: job object.
        """
        pfn = os.path.join(job.workdir, lfn)
        command = "tar -xf %s -C %s" % (pfn, job.workdir)
        logger.info("untar file: %s", command)
        exit_code, stdout, stderr = execute(command)
        logger.info("exit_code: %s, stdout: %s, stderr: %s\n", exit_code, stdout, stderr)
Example #16
def check_number_used_cores(job):
    """
    Check the number of cores used by the payload.
    The number of actual used cores is reported with job metrics (if set).

    :param job: job object.
    :return:
    """

    if job.pgrp:
        cmd = "ps axo pgid,psr | sort | grep %d | uniq | wc -l" % job.pgrp
        exit_code, stdout, stderr = execute(cmd, mute=True)
        logger.debug('%s:\n%s' % (cmd, stdout))
        try:
            job.actualcorecount = int(stdout)
        except Exception as e:
            logger.warning(
                'failed to convert number of actual cores to int: %s' % e)
        else:
            logger.debug('set number of actual cores to: %d' %
                         job.actualcorecount)
    else:
        logger.debug(
            'payload process group not set - cannot check number of cores used by payload'
        )
Example #17
def create_log(job, logfile, tarball_name):
    """

    :param job:
    :param logfile:
    :param tarball_name:
    :raises LogFileCreationFailure: in case of log file creation problem
    :return:
    """

    log = get_logger(job.jobid)
    log.debug('preparing to create log file')

    # perform special cleanup (user specific) prior to log file creation
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(),
                      [pilot_user], 0)  # Python 2/3 (level -1 is Python 2 only)
    user.remove_redundant_files(job.workdir)

    input_files = [e.lfn for e in job.indata]
    output_files = [e.lfn for e in job.outdata]

    # remove any present input/output files before tarring up workdir
    for f in input_files + output_files:
        path = os.path.join(job.workdir, f)
        if os.path.exists(path):
            log.info('removing file: %s' % path)
            remove(path)

    # rename the workdir for the tarball creation
    newworkdir = os.path.join(os.path.dirname(job.workdir), tarball_name)
    orgworkdir = job.workdir
    log.debug('renaming %s to %s' % (job.workdir, newworkdir))
    os.rename(job.workdir, newworkdir)
    job.workdir = newworkdir

    fullpath = os.path.join(job.workdir,
                            logfile.lfn)  # /some/path/to/dirname/log.tgz

    log.info('will create archive %s' % fullpath)
    try:
        #newdirnm = "tarball_PandaJob_%s" % job.jobid
        #tarballnm = "%s.tar.gz" % newdirnm
        #os.rename(job.workdir, newdirnm)
        cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" % (
            fullpath, tarball_name)
        exit_code, stdout, stderr = execute(cmd)
        #with closing(tarfile.open(name=fullpath, mode='w:gz', dereference=True)) as archive:
        #    archive.add(os.path.basename(job.workdir), recursive=True)
    except Exception as e:
        raise LogFileCreationFailure(e)
    else:
        log.debug('stdout = %s' % stdout)
    log.debug('renaming %s back to %s' % (job.workdir, orgworkdir))
    try:
        os.rename(job.workdir, orgworkdir)
    except Exception as e:
        log.debug('exception caught: %s' % e)
    job.workdir = orgworkdir
Example #18
def kill_orphans():
    """
    Find and kill all orphan processes belonging to current pilot user.

    :return:
    """

    # exception for BOINC
    if 'BOINC' in os.environ.get('PILOT_SITENAME', ''):
        logger.info("Do not look for orphan processes in BOINC jobs")
        return

    if 'PILOT_NOKILL' in os.environ:
        return

    logger.info("searching for orphan processes")

    cmd = "ps -o pid,ppid,args -u %s" % whoami()
    exit_code, _processes, stderr = execute(cmd)
    #pattern = re.compile(r'(\d+)\s+(\d+)\s+(\S+)')  # Python 3 (added r)
    pattern = re.compile(r'(\d+)\s+(\d+)\s+([\S\s]+)')  # Python 3 (added r)

    count = 0
    for line in _processes.split('\n'):
        ids = pattern.search(line)
        if ids:
            pid = ids.group(1)
            ppid = ids.group(2)
            args = ids.group(3)
            try:
                pid = int(pid)
            except Exception as error:
                logger.warning('failed to convert pid to int: %s', error)
                continue
            if 'cvmfs2' in args:
                logger.info(
                    "ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'",
                    pid, ppid, args)
            elif 'pilots_starter.py' in args or 'runpilot2-wrapper.sh' in args:
                logger.info(
                    "ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid,
                    ppid, args)
            elif ppid == '1':
                count += 1
                logger.info("found orphan process: pid=%s, ppid=%s, args='%s'",
                            pid, ppid, args)
                if 'bash' in args or ('python' in args and 'pilot.py' in args):
                    logger.info("will not kill bash process")
                else:
                    killpg(pid, signal.SIGTERM, args)
                    _t = 10
                    logger.info("sleeping %d s to allow processes to exit", _t)
                    time.sleep(_t)
                    killpg(pid, signal.SIGKILL, args)

    if count == 0:
        logger.info("did not find any orphan processes")
    else:
        logger.info("found %d orphan process(es)", count)
Example #19
def check_for_lsm(dst_in=True):
    """Return True if the required lsm tool (lsm-get for stage-in, lsm-put for stage-out) is available."""
    cmd = 'which lsm-get' if dst_in else 'which lsm-put'
    exit_code, _, _ = execute(cmd)
    return exit_code == 0
Example #20
def set_core_counts(job):
    """
    Set the number of used cores.

    :param job: job object.
    :return:
    """

    # something like this could be used if prmon also gave info about ncores
    # (change nprocs -> ncores and add ncores to list in utilities module, get_average_summary_dictionary_prmon())

    #summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor)
    #if summary_dictionary:
    #    if 'nprocs' in summary_dictionary["Other"]:
    #        try:
    #            job.actualcorecount = int(summary_dictionary["Other"]["nprocs"])
    #        except Exception as exc:
    #            logger.warning('exception caught: %s', exc)
    #        else:
    #            job.corecounts = add_core_count(job.actualcorecount)
    #            logger.debug('current core counts list: %s', str(job.corecounts))
    #    else:
    #        logger.debug('summary_dictionary[Other]=%s', summary_dictionary["Other"])
    #else:
    #    logger.debug('no summary_dictionary')

    if job.pgrp:
        # for debugging
        #cmd = "ps axo pgid,psr,comm,args | grep %d" % job.pgrp
        #exit_code, stdout, stderr = execute(cmd, mute=True)
        #logger.debug('%s:\n%s\n', cmd, stdout)

        # ps axo pgid,psr -> 154628   8 \n 154628   9 \n 1546280 1 ..
        # sort is redundant; uniq removes any duplicate lines; wc -l gives the final count
        # awk is added to get the pgrp list only and then grep -x makes sure that false positives are removed, e.g. 1546280
        cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (
            job.pgrp, job.pgrp)
        _, stdout, _ = execute(cmd, mute=True)
        logger.debug('%s: %s', cmd, stdout)
        try:
            job.actualcorecount = int(stdout)
        except ValueError as exc:
            logger.warning(
                'failed to convert number of actual cores to int: %s', exc)
        else:
            job.corecounts = add_core_count(
                job.actualcorecount)  #, core_counts=job.corecounts)
            #logger.debug('current core counts list: %s', str(job.corecounts))
            # check suspicious values
            #if job.actualcorecount > 5:
            #    logger.warning('detected large actualcorecount: %d', job.actualcorecount)
            #    cmd = "ps axo pgid,stat,euid,ruid,tty,tpgid,sess,pgrp,ppid,pid,pcpu,comm | sort | uniq | grep %d" % job.pgrp
            #    exit_code, stdout, stderr = execute(cmd, mute=True)
            #    logger.debug('%s (pgrp=%d): %s', cmd, job.pgrp, stdout)
    else:
        logger.debug(
            'payload process group not set - cannot check number of cores used by payload'
        )
Example #21
def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False, debugmode=False):
    """
    Create the tarball for the job.

    :param workdir: work directory for the job (string).
    :param logfile_name: log file name (string).
    :param tarball_name: tarball name (string).
    :param cleanup: perform cleanup (Boolean).
    :param input_files: list of input files to remove (list).
    :param output_files: list of output files to remove (list).
    :param is_looping: True for looping jobs, False by default (Boolean).
    :param debugmode: True if debug mode has been switched on (Boolean).
    :raises LogFileCreationFailure: in case of log file creation problem.
    :return:
    """

    logger.debug('preparing to create log file (debug mode=%s)', str(debugmode))

    # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir)
    pilot_home = os.environ.get('PILOT_HOME', os.getcwd())
    current_dir = os.getcwd()
    if pilot_home != current_dir:
        os.chdir(pilot_home)

    # perform special cleanup (user specific) prior to log file creation
    if cleanup:
        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
        user.remove_redundant_files(workdir, islooping=is_looping, debugmode=debugmode)

    # remove any present input/output files before tarring up workdir
    for fname in input_files + output_files:
        path = os.path.join(workdir, fname)
        if os.path.exists(path):
            logger.info('removing file: %s', path)
            remove(path)

    # rename the workdir for the tarball creation
    newworkdir = os.path.join(os.path.dirname(workdir), tarball_name)
    orgworkdir = workdir
    os.rename(workdir, newworkdir)
    workdir = newworkdir

    fullpath = os.path.join(workdir, logfile_name)  # /some/path/to/dirname/log.tgz
    logger.info('will create archive %s', fullpath)
    try:
        cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" % (fullpath, tarball_name)
        _, stdout, _ = execute(cmd)
    except Exception as error:
        raise LogFileCreationFailure(error)
    else:
        if pilot_home != current_dir:
            os.chdir(pilot_home)
        logger.debug('stdout = %s', stdout)
    try:
        os.rename(workdir, orgworkdir)
    except Exception as error:
        logger.debug('exception caught: %s', error)
Example #22
def download_transform(url, transform_name, workdir):
    """
    Download the transform from the given URL.

    :param url: download URL with path to transform (string).
    :param transform_name: trf name (string).
    :param workdir: work directory (string).
    :return: status (Boolean), diagnostics (string).
    """

    status = False
    diagnostics = ""
    path = os.path.join(workdir, transform_name)
    cmd = 'curl -sS \"%s\" > %s' % (url, path)
    trial = 1
    max_trials = 3

    # test if $HARVESTER_WORKDIR is set
    harvester_workdir = os.environ.get('HARVESTER_WORKDIR')
    if harvester_workdir is not None:
        # skip curl by setting max_trials = 0
        max_trials = 0
        source_path = os.path.join(harvester_workdir, transform_name)
        try:
            copy(source_path, path)
            status = True
        except Exception as error:
            status = False
            diagnostics = "Failed to copy file %s to %s : %s" % (source_path,
                                                                 path, error)
            logger.error(diagnostics)

    # try to download the trf a maximum of 3 times
    while trial <= max_trials:
        logger.info("executing command [trial %d/%d]: %s" %
                    (trial, max_trials, cmd))

        exit_code, stdout, stderr = execute(cmd, mute=True)
        if not stdout:
            stdout = "(None)"
        if exit_code != 0:
            # Analyze exit code / output
            diagnostics = "curl command failed: %d, %s, %s" % (exit_code,
                                                               stdout, stderr)
            logger.warning(diagnostics)
            if trial == max_trials:
                logger.fatal('could not download transform: %s' % stdout)
                status = False
                break
            else:
                logger.info("will try again after 60 s")
                sleep(60)
        else:
            logger.info("curl command returned: %s" % stdout)
            status = True
            break
        trial += 1

    return status, diagnostics
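A hypothetical call site; the URL, transform name and work directory below are illustrative only:

status, diagnostics = download_transform(
    'https://example.org/trf/my_transform.py',  # illustrative URL
    'my_transform.py', '/tmp/job')
if not status:
    logger.warning('transform download failed: %s', diagnostics)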
Example #23
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)

        if os.path.exists(job.workdir):
            workdirsize = get_directory_size(directory=job.workdir)

            # is user dir within allowed size limit?
            if workdirsize > maxwdirsize:
                exit_code = errors.USERDIRTOOLARGE
                diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                              (job.workdir, workdirsize, maxwdirsize)
                log.fatal("%s" % diagnostics)

                cmd = 'ls -altrR %s' % job.workdir
                _ec, stdout, stderr = execute(cmd, mute=True)  # keep the USERDIRTOOLARGE exit code intact
                log.info('%s:\n%s' % (cmd, stdout))

                # kill the job
                # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED")
                kill_processes(job.pid)

                # remove any lingering input files from the work dir
                lfns, guids = job.get_lfns_and_guids()
                if lfns:
                    remove_files(job.workdir, lfns)

                    # remeasure the size of the workdir at this point since the value is stored below
                    workdirsize = get_directory_size(directory=job.workdir)
            else:
                log.info(
                    "size of work directory %s: %d B (within %d B limit)" %
                    (job.workdir, workdirsize, maxwdirsize))

            # Store the measured disk space (the max value will later be sent with the job metrics)
            if workdirsize > 0:
                job.add_workdir_size(workdirsize)
        else:
            log.warning('job work dir does not exist: %s' % job.workdir)
    else:
        log.warning(
            'skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics
Example #24
def get_memory_usage(pid):
    """
    Return the memory usage string (ps aux -q <pid>) for the given process.

    :param pid: process id (int).
    :return: ps exit code (int), stdout (string), stderr (string).
    """

    return execute('ps aux -q %d' % pid)
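Since execute() returns an (exit_code, stdout, stderr) tuple, the report can be unpacked directly; the pid below is illustrative:

exit_code, stdout, stderr = get_memory_usage(1234)  # illustrative pid
if exit_code == 0:
    logger.debug('memory usage:\n%s', stdout)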
Example #25
    def test_copy_in_mv(self):
        _, stdout1, stderr1 = execute(' '.join(
            ['ls', self.tmp_src_dir, '|', 'grep', '-v', 'dest']))
        copy_in(self.indata, copy_type='mv', workdir=self.tmp_dst_dir)
        # here check files moved
        self.assertEqual(
            self.__dirs_content_valid(self.tmp_src_dir,
                                      self.tmp_dst_dir,
                                      dir2_expected_content=stdout1), 0)
Example #26
def copy_in(files, **kwargs):
    """
        Download given files using rucio copytool.

        :param files: list of `FileSpec` objects
        :raise: PilotException in case of controlled error
    """

    # don't spoil the output, we depend on stderr parsing
    os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]'

    ddmconf = kwargs.pop('ddmconf', {})
    activity = kwargs.pop('activity', None)
    # trace_report = kwargs.get('trace_report')

    for fspec in files:

        cmd = []
        logger.info("To transfer file: %s" % fspec)
        ddm = ddmconf.get(fspec.ddmendpoint)
        if ddm:
            protocol = resolve_protocol(fspec, activity, ddm)
            surls = resolve_surl(fspec, protocol, ddmconf)
            if 'surl' in surls:
                fspec.surl = surls['surl']
            ddm_special_setup = ddm.get_special_setup(protocol.get('id', None))
            if ddm_special_setup:
                cmd += [ddm_special_setup]

        dst = fspec.workdir or kwargs.get('workdir') or '.'
        cmd += [
            '/usr/bin/env', 'rucio', '-v', 'download', '--no-subdir', '--dir',
            dst
        ]
        if require_replicas:
            cmd += ['--rse', fspec.replicas[0][0]]
        if fspec.surl:
            if fspec.ddmendpoint:
                cmd.extend(['--rse', fspec.ddmendpoint])
            cmd.extend(['--pfn', fspec.surl])
        cmd += ['%s:%s' % (fspec.scope, fspec.lfn)]

        rcode, stdout, stderr = execute(" ".join(cmd), **kwargs)

        if rcode:  ## error occurred
            error = resolve_common_transfer_errors(stderr, is_stagein=True)
            fspec.status = 'failed'
            fspec.status_code = error.get('rcode')
            raise PilotException(error.get('error'),
                                 code=error.get('rcode'),
                                 state=error.get('state'))

        fspec.status_code = 0
        fspec.status = 'transferred'

    return files
Example #27
def move(source, destination, recursive=False):
    if recursive:
        cmd = "gfal-copy -r %s %s" % (source, destination)
    else:
        cmd = "gfal-copy %s %s" % (source, destination)
    print(cmd)
    exit_code, stdout, stderr = execute(cmd)

    return exit_code, stdout, stderr
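A usage sketch with illustrative gfal endpoints:

exit_code, stdout, stderr = move('file:///tmp/data.root',
                                 'gsiftp://se.example.org/data/data.root')
if exit_code != 0:
    print('gfal-copy failed: %s' % stderr)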
Example #28
def whoami():
    """
    Return the name of the pilot user.

    :return: whoami output (string).
    """

    exit_code, who_am_i, stderr = execute('whoami', mute=True)

    return who_am_i
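A usage sketch; depending on how execute() captures output, the returned string may carry a trailing newline, so stripping defensively is reasonable:

user = whoami().strip()  # illustrative; strip() guards against a trailing newline
cmd = 'ps -fwu %s' % user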
Example #29
def _resolve_checksum_option(setup, **kwargs):
    """Determine which checksum option the local xrdcp client supports (--cksum, -adler or -md5)."""

    cmd = "%s --version" % copy_command
    if setup:
        cmd = "source %s; %s" % (setup, cmd)

    logger.info("Execute command (%s) to check xrdcp client version" % cmd)

    rcode, stdout, stderr = execute(cmd, **kwargs)
    logger.info("return code: %s" % rcode)
    logger.info("return output: %s" % (stdout + stderr))

    cmd = "%s -h" % copy_command
    if setup:
        cmd = "source %s; %s" % (setup, cmd)

    logger.info(
        "Execute command (%s) to decide which option should be used to calculate/verify the file checksum"
        % cmd)

    rcode, stdout, stderr = execute(cmd, **kwargs)
    output = stdout + stderr
    logger.info("return code: %s" % rcode)
    logger.debug("return output: %s" % output)

    coption = ""
    checksum_type = 'adler32'  ## consider only adler32 for now

    if rcode:
        logger.error('FAILED to execute command=%s: %s' % (cmd, output))
    else:
        if "--cksum" in output:
            coption = "--cksum %s:print" % checksum_type
        elif "-adler" in output and checksum_type == 'adler32':
            coption = "-adler"
        elif "-md5" in output and checksum_type == 'md5':
            coption = "-md5"

    if coption:
        logger.info("Use %s option to get the checksum for %s command" %
                    (coption, copy_command))

    return coption
Example #30
def _stagefile(coption,
               source,
               destination,
               filesize,
               is_stagein,
               setup=None,
               **kwargs):
    """
        Stage the file (stagein or stageout)
        :return: destination file details (checksum, checksum_type) in case of success, throw exception in case of failure
        :raise: PilotException in case of controlled error
    """

    filesize_cmd, checksum_cmd, checksum_type = None, None, None

    cmd = '%s -np -f %s %s %s' % (copy_command, coption, source, destination)
    if setup:
        cmd = "source %s; %s" % (setup, cmd)

    #timeout = get_timeout(filesize)
    #logger.info("Executing command: %s, timeout=%s" % (cmd, timeout))

    rcode, stdout, stderr = execute(cmd, **kwargs)
    logger.info('rcode=%d, stdout=%s, stderr=%s' % (rcode, stdout, stderr))

    if rcode:  ## error occurred
        error = resolve_common_transfer_errors(stdout + stderr,
                                               is_stagein=is_stagein)

        #rcode = error.get('rcode')  ## TO BE IMPLEMENTED
        #if not is_stagein and rcode == PilotErrors.ERR_CHKSUMNOTSUP: ## stage-out, on fly checksum verification is not supported .. ignore
        #    logger.info('stage-out: ignore ERR_CHKSUMNOTSUP error .. will explicitly verify uploaded file')
        #    return None, None

        raise PilotException(error.get('error'),
                             code=error.get('rcode'),
                             state=error.get('state'))

    # extract filesize and checksum values from output
    if coption != "":
        filesize_cmd, checksum_cmd, checksum_type = get_file_info_from_output(
            stdout + stderr)

    ## verify transfer by returned checksum or call remote checksum calculation
    ## to be moved at the base level

    is_verified = True  ## TO BE IMPLEMENTED LATER

    if not is_verified:
        rcode = ErrorCodes.GETADMISMATCH if is_stagein else ErrorCodes.PUTADMISMATCH
        raise PilotException("Copy command failed",
                             code=rcode,
                             state='AD_MISMATCH')

    return filesize_cmd, checksum_cmd, checksum_type
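A hedged usage sketch for stage-out; coption would come from _resolve_checksum_option() (Example #29), and the paths and file size below are illustrative:

try:
    filesize, checksum, checksum_type = _stagefile(
        coption, '/local/file.root', 'root://se.example.org//file.root',
        1024, is_stagein=False)
except PilotException as error:
    logger.error('stage-out failed: %s', error)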