Example #1
    def submit(self,
               job_file,
               ce=None,
               delegation_id=None,
               retries=0,
               retry_delay=3,
               silent=False):
        # default arguments
        ce = ce or self.ce
        delegation_id = delegation_id or self.delegation_id

        # check arguments
        if not ce:
            raise ValueError("ce must not be empty")

        # prepare random selection over ces and delegations
        ce = make_list(ce)
        if delegation_id:
            delegation_id = make_list(delegation_id)
            if len(ce) != len(delegation_id):
                raise Exception(
                    "numbers of CEs ({}) and delegation ids ({}) do not match".
                    format(len(ce), len(delegation_id)))

        # get the job file location as the submission command is run in the same directory
        job_file_dir, job_file_name = os.path.split(os.path.abspath(job_file))

        # define the actual submission in a loop to simplify retries
        while True:
            # build the command
            i = random.randint(0, len(ce) - 1)
            cmd = ["glite-ce-job-submit", "-r", ce[i]]
            if delegation_id:
                cmd += ["-D", delegation_id[i]]
            cmd += [job_file_name]

            # run the command
            # glite prints everything to stdout
            logger.debug("submit glite job with command '{}'".format(cmd))
            code, out, _ = interruptable_popen(cmd,
                                               stdout=subprocess.PIPE,
                                               stderr=sys.stderr,
                                               cwd=job_file_dir)

            # in some cases, the return code is 0 but the ce did not respond with a valid id
            if code == 0:
                job_id = out.strip().split("\n")[-1].strip()
                if not self.submission_job_id_cre.match(job_id):
                    code = 1
                    out = "bad job id '{}' from output:\n{}".format(
                        job_id, out)

            # retry or done?
            if code == 0:
                return job_id
            else:
                logger.debug("submission of glite job '{}' failed:\n{}".format(
                    job_file, out))
                if retries > 0:
                    retries -= 1
                    time.sleep(retry_delay)
                    continue
                elif silent:
                    return None
                else:
                    raise Exception(
                        "submission of glite job '{}' failed:\n{}".format(
                            job_file, out))
Example #2
    def submit(self,
               job_file,
               job_list=None,
               ce=None,
               retries=0,
               retry_delay=3,
               silent=False):
        # default arguments
        if job_list is None:
            job_list = self.job_list
        if ce is None:
            ce = self.ce

        # check arguments
        if not ce:
            raise ValueError("ce must not be empty")
        ce = make_list(ce)

        # arc supports multiple jobs to be submitted with a single arcsub call,
        # so job_file can be a sequence of files
        # when this is the case, we have to assume that their input files are either all absolute,
        # or relative and all located in the same directory
        chunking = isinstance(job_file, (list, tuple))
        job_files = make_list(job_file)
        job_file_dir = os.path.dirname(os.path.abspath(job_files[0]))
        job_file_names = [os.path.basename(jf) for jf in job_files]

        # define the actual submission in a loop to simplify retries
        while True:
            # build the command
            cmd = ["arcsub", "-c", random.choice(ce)]
            if job_list:
                cmd += ["-j", job_list]
            cmd += job_file_names
            cmd = quote_cmd(cmd)

            # run the command
            logger.debug("submit arc job(s) with command '{}'".format(cmd))
            code, out, _ = interruptable_popen(cmd,
                                               shell=True,
                                               executable="/bin/bash",
                                               stdout=subprocess.PIPE,
                                               stderr=sys.stderr,
                                               cwd=job_file_dir)

            # in some cases, the return code is 0 but the ce did not respond with valid job ids
            job_ids = []
            if code == 0:
                for line in out.strip().split("\n"):
                    m = self.submission_job_id_cre.match(line.strip())
                    if m:
                        job_id = m.group(1)
                        job_ids.append(job_id)

                if not job_ids:
                    code = 1
                    out = "cannot find job id(s) in output:\n{}".format(out)
                elif len(job_ids) != len(job_files):
                    raise Exception(
                        "number of job ids in output ({}) does not match number of "
                        "jobs to submit ({}) in output:\n{}".format(
                            len(job_ids), len(job_files), out))

            # retry or done?
            if code == 0:
                return job_ids if chunking else job_ids[0]
            else:
                logger.debug(
                    "submission of arc job(s) '{}' failed with code {}:\n{}".
                    format(job_files, code, out))
                if retries > 0:
                    retries -= 1
                    time.sleep(retry_delay)
                    continue
                elif silent:
                    return None
                else:
                    raise Exception(
                        "submission of arc job(s) '{}' failed:\n{}".format(
                            job_files, out))
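A short usage sketch of the chunked submission path described in the comments above; "manager", the CE name, and the job file names are placeholders rather than values from the projects listed here, and arcsub is assumed to be available.

# hedged sketch: "manager" stands for an instance of the class this submit() method belongs to
job_ids = manager.submit(
    ["job_0.xrsl", "job_1.xrsl"],   # a sequence of job files results in a single arcsub call
    ce=["arc-ce.example.org"],      # placeholder CE; one entry is chosen at random per attempt
    retries=2,                      # rerun the arcsub command up to two times on failure
)
# with a sequence as input, a list of job ids is returned (one per job file)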
Example #3
    def env(self):
        # strategy: unlike docker, singularity might not allow binding of paths that do not exist
        # in the container, so create a tmp directory on the host system and bind it as /tmp, let
        # python dump its full env into a file, and read the file again on the host system
        if self.image not in self._envs:
            tmp_dir = LocalDirectoryTarget(is_tmp=True)
            tmp_dir.touch()

            tmp = tmp_dir.child("env", type="f")
            tmp.touch()

            # determine whether volume binding is allowed
            allow_binds_cb = getattr(self.task, "singularity_allow_binds",
                                     None)
            if callable(allow_binds_cb):
                allow_binds = allow_binds_cb()
            else:
                cfg = Config.instance()
                allow_binds = cfg.get_expanded(self.get_config_section(),
                                               "allow_binds")

            # arguments to configure the environment
            args = ["-e"]
            if allow_binds:
                args.extend(["-B", "{}:/tmp".format(tmp_dir.path)])
                env_file = "/tmp/{}".format(tmp.basename)
            else:
                env_file = tmp.path

            # get the singularity exec command
            singularity_exec_cmd = self._singularity_exec_cmd() + args

            # build commands to setup the environment
            setup_cmds = self._build_setup_cmds(self._get_env())

            # build the python command that dumps the environment
            py_cmd = "import os,pickle;" \
                + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_file)

            # build the full command
            cmd = quote_cmd(singularity_exec_cmd + [
                self.image,
                "bash",
                "-l",
                "-c",
                "; ".join(
                    flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd]))),
            ])

            # run it
            code, out, _ = interruptable_popen(cmd,
                                               shell=True,
                                               executable="/bin/bash",
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.STDOUT)
            if code != 0:
                raise Exception(
                    "singularity sandbox env loading failed:\n{}".format(out))

            # load the environment from the tmp file
            env = tmp.load(formatter="pickle")

            # cache
            self._envs[self.image] = env

        return self._envs[self.image]
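The strategy in the leading comment above (bind a host directory as /tmp, have Python inside the container pickle its environment to a file there, then unpickle that file on the host) can be sketched outside of law roughly as follows; the image name is a placeholder and the singularity executable is assumed to be on PATH.

import os
import pickle
import subprocess
import tempfile

tmp_dir = tempfile.mkdtemp()  # host directory that gets bound as /tmp inside the container
py_cmd = "import os,pickle;pickle.dump(dict(os.environ),open('/tmp/env','wb'),protocol=2)"
cmd = [
    "singularity", "exec",
    "-e",                             # start with a clean environment, as in the args above
    "-B", "{}:/tmp".format(tmp_dir),  # bind the host directory as /tmp
    "docker://python:3.9-slim",       # placeholder image
    "python", "-c", py_cmd,
]
subprocess.check_call(cmd)
with open(os.path.join(tmp_dir, "env"), "rb") as f:
    container_env = pickle.load(f)    # environment dict as seen inside the container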
Example #4
File: util.py Project: riga/law
def delegate_voms_proxy_glite(endpoint,
                              proxy_file=None,
                              stdout=None,
                              stderr=None,
                              cache=True):
    """
    Delegates the voms proxy via gLite to an *endpoint*, e.g.
    ``grid-ce.physik.rwth-aachen.de:8443``. When *proxy_file* is *None*, it defaults to the result
    of :py:func:`get_voms_proxy_file`. *stdout* and *stderr* are passed to the *Popen* constructor
    for executing the ``glite-ce-delegate-proxy`` command. When *cache* is *True*, a json file is
    created alongside the proxy file, which stores the delegation ids per endpoint. The next time
    the exact same proxy should be delegated to the same endpoint, the cached delegation id is
    returned.
    """
    # get the proxy file
    if not proxy_file:
        proxy_file = get_voms_proxy_file()
    proxy_file = os.path.expandvars(os.path.expanduser(proxy_file))
    if not os.path.exists(proxy_file):
        raise Exception("proxy file '{}' does not exist".format(proxy_file))

    if cache:
        if isinstance(cache, six.string_types):
            cache_file = cache
        else:
            cache_file = proxy_file + "_delegation_cache.json"

        def remove_cache():
            try:
                if os.path.exists(cache_file):
                    os.remove(cache_file)
            except OSError:
                pass

        # create the hash of the proxy file content
        with open(proxy_file, "r") as f:
            proxy_hash = create_hash(f.read())

        # already delegated?
        cache_data = {}
        if os.path.exists(cache_file):
            with open(cache_file, "r") as f:
                try:
                    cache_data = json.load(f)
                except:
                    remove_cache()

        # is the hash up-to-date?
        if cache_data.get("hash") != proxy_hash:
            remove_cache()
            cache_data = {}

        # proxy already delegated to that endpoint?
        elif endpoint in cache_data.get("ids", []):
            return str(cache_data["ids"][endpoint])

    # do the actual delegation
    delegation_id = uuid.uuid4().hex
    cmd = ["glite-ce-delegate-proxy", "-e", endpoint, delegation_id]
    code = interruptable_popen(cmd, stdout=stdout, stderr=stderr)[0]
    if code != 0:
        raise Exception(
            "glite proxy delegation to endpoint {} failed".format(endpoint))

    if cache:
        # write the id back to the delegation file
        cache_data["hash"] = proxy_hash
        cache_data.setdefault("ids", {})[endpoint] = delegation_id
        with open(cache_file, "w") as f:
            json.dump(cache_data, f, indent=4)
        os.chmod(cache_file, 0o0600)

    return delegation_id
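A short usage sketch of the helper above; the endpoint is the one quoted in the docstring, and a valid VOMS proxy plus the glite-ce-delegate-proxy command are assumed to be available.

import sys

# hedged sketch: delegate_voms_proxy_glite is the function defined above (import from law assumed)
delegation_id = delegate_voms_proxy_glite(
    "grid-ce.physik.rwth-aachen.de:8443",
    stdout=sys.stdout,
    stderr=sys.stderr,
    cache=True,  # a repeated call with the same proxy and endpoint returns the cached id
)
print(delegation_id)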
Example #5
File: job.py Project: bfis/law
    def query(self,
              job_id,
              pool=None,
              scheduler=None,
              user=None,
              silent=False):
        # default arguments
        if pool is None:
            pool = self.pool
        if scheduler is None:
            scheduler = self.scheduler
        if user is None:
            user = self.user

        chunking = isinstance(job_id, (list, tuple))
        job_ids = make_list(job_id)

        # default ClassAds to fetch
        ads = "ClusterId,ProcId,JobStatus,ExitCode,ExitStatus,HoldReason,RemoveReason"

        # build the condor_q command
        cmd = ["condor_q"] + job_ids
        if pool:
            cmd += ["-pool", pool]
        if scheduler:
            cmd += ["-name", scheduler]
        cmd += ["-long"]
        # since v8.3.3 one can limit the number of jobs to query
        if self.htcondor_v833:
            cmd += ["-limit", str(len(job_ids))]
        # since v8.5.6 one can define the attributes to fetch
        if self.htcondor_v856:
            cmd += ["-attributes", ads]
        cmd = quote_cmd(cmd)

        logger.debug("query htcondor job(s) with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd,
                                             shell=True,
                                             executable="/bin/bash",
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE)

        # handle errors
        if code != 0:
            if silent:
                return None
            else:
                raise Exception(
                    "queue query of htcondor job(s) '{}' failed with code {}:"
                    "\n{}".format(job_id, code, err))

        # parse the output and extract the status per job
        query_data = self.parse_long_output(out)

        # some jobs might already be in the condor history, so query for missing job ids
        missing_ids = [
            _job_id for _job_id in job_ids if _job_id not in query_data
        ]
        if missing_ids:
            # build the condor_history command, which is fairly similar to the condor_q command
            cmd = ["condor_history"] + missing_ids
            if pool:
                cmd += ["-pool", pool]
            if scheduler:
                cmd += ["-name", scheduler]
            cmd += ["-long"]
            # since v8.3.3 one can limit the number of jobs to query
            if self.htcondor_v833:
                cmd += ["-limit", str(len(missing_ids))]
            # since v8.5.6 one can define the attributes to fetch
            if self.htcondor_v856:
                cmd += ["-attributes", ads]
            cmd = quote_cmd(cmd)

            logger.debug(
                "query htcondor job history with command '{}'".format(cmd))
            code, out, err = interruptable_popen(cmd,
                                                 shell=True,
                                                 executable="/bin/bash",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)

            # handle errors
            if code != 0:
                if silent:
                    return None
                else:
                    raise Exception(
                        "history query of htcondor job(s) '{}' failed with code {}:"
                        "\n{}".format(job_id, code, err))

            # parse the output and update query data
            query_data.update(self.parse_long_output(out, job_ids=missing_ids))

        # compare to the requested job ids and perform some checks
        for _job_id in job_ids:
            if _job_id not in query_data:
                if not chunking:
                    if silent:
                        return None
                    else:
                        raise Exception(
                            "htcondor job(s) '{}' not found in query response".
                            format(job_id))
                else:
                    query_data[_job_id] = self.job_status_dict(
                        job_id=_job_id,
                        status=self.FAILED,
                        error="job not found in query response")

        return query_data if chunking else query_data[job_id]
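A brief, hedged sketch of a chunked status query with the method above; "manager" and the job ids are placeholders.

# query several jobs at once; a dict mapping each job id to a status dict is returned
query_data = manager.query(["12345.0", "12345.1"], silent=True)
# with silent=True, None is returned instead of raising when the condor_q/condor_history calls fail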
Example #6
File: job.py Project: bfis/law
    def submit(self,
               job_file,
               pool=None,
               scheduler=None,
               retries=0,
               retry_delay=3,
               silent=False):
        # default arguments
        if pool is None:
            pool = self.pool
        if scheduler is None:
            scheduler = self.scheduler

        # get the job file location as the submission command is run in the same directory
        job_file_dir, job_file_name = os.path.split(os.path.abspath(job_file))

        # build the command
        cmd = ["condor_submit"]
        if pool:
            cmd += ["-pool", pool]
        if scheduler:
            cmd += ["-name", scheduler]
        cmd += [job_file_name]
        cmd = quote_cmd(cmd)

        # define the actual submission in a loop to simplify retries
        while True:
            # run the command
            logger.debug("submit htcondor job with command '{}'".format(cmd))
            code, out, err = interruptable_popen(cmd,
                                                 shell=True,
                                                 executable="/bin/bash",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE,
                                                 cwd=job_file_dir)

            # get the job id(s)
            if code == 0:
                last_line = out.strip().split("\n")[-1].strip()
                m = self.submission_job_id_cre.match(last_line)
                if m:
                    job_ids = [
                        "{}.{}".format(m.group(2), i)
                        for i in range(int(m.group(1)))
                    ]
                else:
                    code = 1
                    err = "cannot parse htcondor job id(s) from output:\n{}".format(
                        out)

            # retry or done?
            if code == 0:
                return job_ids
            else:
                logger.debug(
                    "submission of htcondor job '{}' failed with code {}:\n{}".
                    format(job_file, code, err))
                if retries > 0:
                    retries -= 1
                    time.sleep(retry_delay)
                    continue
                elif silent:
                    return None
                else:
                    raise Exception(
                        "submission of htcondor job '{}' failed:\n{}".format(
                            job_file, err))
Example #7
File: job.py Project: riga/law
    def submit(self,
               job_file,
               partition=None,
               retries=0,
               retry_delay=3,
               silent=False):
        # default arguments
        if partition is None:
            partition = self.partition

        # get the job file location as the submission command is run in the same directory
        job_file_dir, job_file_name = os.path.split(os.path.abspath(job_file))

        # build the command
        cmd = ["sbatch"]
        if partition:
            cmd += ["--partition", partition]
        cmd += [job_file_name]
        cmd = quote_cmd(cmd)

        # define the actual submission in a loop to simplify retries
        while True:
            # run the command
            logger.debug("submit slurm job with command '{}'".format(cmd))
            code, out, err = interruptable_popen(cmd,
                                                 shell=True,
                                                 executable="/bin/bash",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE,
                                                 cwd=job_file_dir)

            # get the job id(s)
            if code == 0:
                # loop through all lines and try to match the expected pattern
                for line in out.strip().split("\n")[::-1]:
                    m = self.submission_cre.match(line.strip())
                    if m:
                        job_ids = [int(m.group(1))]
                        break
                else:
                    code = 1
                    err = "cannot parse slurm job id(s) from output:\n{}".format(
                        out)

            # retry or done?
            if code == 0:
                return job_ids
            else:
                logger.debug(
                    "submission of slurm job '{}' failed with code {}:\n{}".
                    format(job_file, code, err))
                if retries > 0:
                    retries -= 1
                    time.sleep(retry_delay)
                    continue
                elif silent:
                    return None
                else:
                    raise Exception(
                        "submission of slurm job '{}' failed:\n{}".format(
                            job_file, err))
Example #8
File: util.py Project: meliache/law
def hadd_task(task, inputs, output, cwd=None, local=False, force=True):
    """
    This method is intended to be used by tasks that are supposed to merge root files, e.g. when
    inheriting from :py:class:`law.contrib.tasks.MergeCascade`. *inputs* should be a sequence of
    local targets that represent the files to merge into *output*. *cwd* is the working directory
    in which hadd is invoked. When empty, a temporary directory is used. The *task* itself is
    used to print and publish messages via its :py:meth:`law.Task.publish_message` and
    :py:meth:`law.Task.publish_step` methods.

    When *local* is *True*, the input and output targets are assumed to be local and the merging is
    based on their local paths. Otherwise, the targets are fetched first and the output target is
    localized.

    When *force* is *True*, any existing output file is overwritten (by adding the ``-f`` flag to
    ``hadd``).
    """
    # ensure inputs are targets
    inputs = [
        LocalFileTarget(inp) if isinstance(inp, six.string_types) else inp
        for inp in inputs
    ]

    # ensure output is a target
    if isinstance(output, six.string_types):
        output = LocalFileTarget(output)

    # default cwd
    if not cwd:
        cwd = LocalDirectoryTarget(is_tmp=True)
    elif isinstance(cwd, six.string_types):
        cwd = LocalDirectoryTarget(cwd)
    cwd.touch()

    # helper to create the hadd cmd
    def hadd_cmd(input_paths, output_path):
        cmd = ["hadd", "-n", "0"]
        if force:
            cmd.append("-f")
        cmd.extend(["-d", cwd.path])
        cmd.append(output_path)
        cmd.extend(input_paths)
        return quote_cmd(cmd)

    if local:
        # when local, there is no need to download inputs
        input_paths = [inp.path for inp in inputs]

        with task.publish_step("merging ...", runtime=True):
            if len(inputs) == 1:
                output.copy_from_local(inputs[0])
            else:
                # merge using hadd
                cmd = hadd_cmd(input_paths, output.path)
                code = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
                if code != 0:
                    raise Exception("hadd failed")

        task.publish_message("merged file size: {}".format(human_bytes(
            output.stat.st_size, fmt=True)))

    else:
        # when not local, we need to fetch files first into the cwd
        with task.publish_step("fetching inputs ...", runtime=True):
            def fetch(inp):
                inp.copy_to_local(cwd.child(inp.unique_basename, type="f"), cache=False)
                return inp.unique_basename

            def callback(i):
                task.publish_message("fetch file {} / {}".format(i + 1, len(inputs)))

            bases = map_verbose(fetch, inputs, every=5, callback=callback)

        # start merging into the localized output
        with output.localize("w", cache=False) as tmp_out:
            with task.publish_step("merging ...", runtime=True):
                if len(bases) == 1:
                    tmp_out.path = cwd.child(bases[0]).path
                else:
                    # merge using hadd
                    cmd = hadd_cmd(bases, tmp_out.path)
                    code = interruptable_popen(cmd, shell=True, executable="/bin/bash",
                        cwd=cwd.path)[0]
                    if code != 0:
                        raise Exception("hadd failed")

                    task.publish_message("merged file size: {}".format(human_bytes(
                        tmp_out.stat.st_size, fmt=True)))
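A minimal usage sketch of hadd_task for the local case; the task object, file names, and output path are placeholders, and ROOT's hadd is assumed to be installed.

# hedged sketch: "task" stands for a law task providing publish_message/publish_step,
# and hadd_task/LocalFileTarget are the names used in the example above (imports from law assumed)
inputs = ["hists_0.root", "hists_1.root"]           # local ROOT files to merge
output = LocalFileTarget("/tmp/merged_hists.root")  # merge target
hadd_task(task, inputs, output, local=True)         # runs "hadd -n 0 -f -d <tmp dir> ..." on the local paths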
Example #9
    def run(self):
        # data
        _my_input_file_name = str(self.input_file_name)

        # ensure that the output directory exists
        output = self.output()
        output.parent.touch()

        # actual payload:
        print("=======================================================")
        print("Starting merge step to finish Herwig-cache and run file")
        print("=======================================================")

        # set environment variables
        my_env = self.set_environment_variables()

        # download the packed files from grid and unpack
        with self.input()['HerwigBuild'].localize('r') as _file:
            os.system('tar -xzf {}'.format(_file.path))

        for branch, target in self.input(
        )['HerwigIntegrate']["collection"].targets.items():
            if branch <= 10:
                print('Getting Herwig integration file: {}'.format(target))
            with target.localize('r') as _file:
                os.system('tar -xzf {}'.format(_file.path))

        # run Herwig mergegrids step
        _herwig_exec = ["Herwig", "mergegrids"]
        _herwig_args = [
            "{INPUT_FILE_NAME}.run".format(INPUT_FILE_NAME=_my_input_file_name)
        ]

        print('Executable: {}'.format(" ".join(_herwig_exec + _herwig_args)))

        code, out, error = interruptable_popen(_herwig_exec + _herwig_args,
                                               stdout=PIPE,
                                               stderr=PIPE,
                                               env=my_env)

        # if successful save final Herwig-cache and run-file as tar.gz
        if (code != 0):
            raise Exception(
                'Error: ' + error + '\nOutput: ' + out +
                '\nHerwig mergegrids returned non-zero exit status {}'.format(
                    code))
        else:
            print('Output: ' + out)

            output_file = "Herwig-cache.tar.gz"

            os.system(
                'tar -czf {OUTPUT_FILE} Herwig-cache {INPUT_FILE_NAME}.run'.
                format(OUTPUT_FILE=output_file,
                       INPUT_FILE_NAME=_my_input_file_name))

            if os.path.exists(output_file):
                output.copy_from_local(output_file)
                os.system(
                    'rm Herwig-cache.tar.gz {INPUT_FILE_NAME}.run'.format(
                        INPUT_FILE_NAME=_my_input_file_name))
            else:
                raise Exception(
                    "Output file '{}' doesn't exist! Abort!".format(
                        output_file))

        print("=======================================================")
Example #10
File: job.py Project: riga/law
    def query(self, job_id, partition=None, silent=False):
        # default arguments
        if partition is None:
            partition = self.partition

        chunking = isinstance(job_id, (list, tuple))
        job_ids = make_list(job_id)

        # build the squeue command
        cmd = ["squeue", "--Format", self.squeue_format, "--noheader"]
        if partition:
            cmd += ["--partition", partition]
        cmd += ["--jobs", ",".join(map(str, job_ids))]
        cmd = quote_cmd(cmd)

        logger.debug("query slurm job(s) with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd,
                                             shell=True,
                                             executable="/bin/bash",
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE)

        # handle errors
        if code != 0:
            if silent:
                return None
            else:
                raise Exception(
                    "queue query of slurm job(s) '{}' failed with code {}:"
                    "\n{}".format(job_id, code, err))

        # parse the output and extract the status per job
        query_data = self.parse_squeue_output(out)

        # some jobs might already be in the accounting history, so query for missing job ids
        missing_ids = [
            _job_id for _job_id in job_ids if _job_id not in query_data
        ]
        if missing_ids:
            # build the sacct command
            cmd = ["sacct", "--format", self.sacct_format, "--noheader"]
            if partition:
                cmd += ["--partition", partition]
            cmd += ["--jobs", ",".join(map(str, missing_ids))]
            cmd = quote_cmd(cmd)

            logger.debug(
                "query slurm accounting history with command '{}'".format(cmd))
            code, out, err = interruptable_popen(cmd,
                                                 shell=True,
                                                 executable="/bin/bash",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)

            # handle errors
            if code != 0:
                if silent:
                    return None
                else:
                    raise Exception(
                        "accounting query of slurm job(s) '{}' failed with code {}:"
                        "\n{}".format(job_id, code, err))

            # parse the output and update query data
            query_data.update(self.parse_sacct_output(out))

        # compare to the requested job ids and perform some checks
        for _job_id in job_ids:
            if _job_id not in query_data:
                if not chunking:
                    if silent:
                        return None
                    else:
                        raise Exception(
                            "slurm job(s) '{}' not found in query response".
                            format(job_id))
                else:
                    query_data[_job_id] = self.job_status_dict(
                        job_id=_job_id,
                        status=self.FAILED,
                        error="job not found in query response")

        return query_data if chunking else query_data[job_id]
Example #11
    def run(self):

        # branch data
        _job_num = str(self.branch)
        _my_config = str(self.input_file_name)
        _num_events = str(self.events_per_job)
        _seed = int(self.branch_data)

        # ensure that the output directory exists
        output = self.output()
        try:
            output.parent.touch()
        except IOError:
            print("Output target doesn't exist!")

        # actual payload:
        print("=======================================================")
        print("Producing events ")
        print("=======================================================")

        # set environment variables
        my_env = os.environ

        # get the prepared Herwig-cache and runfiles and unpack them
        with self.input()['HerwigMerge'].localize('r') as _file:
            os.system('tar -xzf {}'.format(_file.path))

        # run Herwig event generation
        _herwig_exec = ["Herwig", "run"]
        _herwig_args = [
            "-q", "--seed={SEED}".format(SEED=_seed),
            "--numevents={NEVENTS}".format(NEVENTS=_num_events),
            "{INPUT_FILE_NAME}.run".format(INPUT_FILE_NAME=_my_config)
        ]

        # identify the setupfile if specified and copy it to working directory
        work_dir = os.getcwd()
        print("Setupfile: {}".format(self.setupfile))
        _setupfile_suffix = ""
        if all(self.setupfile != defaultval for defaultval in [None, "None"]):
            setupfile_path = os.path.join(os.getenv("ANALYSIS_PATH"),
                                          "generation", "setupfiles",
                                          str(self.setupfile))
            if os.path.exists(setupfile_path):
                print(
                    "Copy setupfile for executable {} to working directory {}".
                    format(setupfile_path, work_dir))
                # for python3 the next two lines can be merged
                shutil.copy(setupfile_path, work_dir)
                setupfile_path = os.path.basename(setupfile_path)
                # end of merge
                if os.path.exists(setupfile_path):
                    _herwig_args.append("--setupfile={SETUPFILE}".format(
                        SETUPFILE=setupfile_path))
                    _setupfile_suffix = "-" + setupfile_path
                else:
                    raise Exception(
                        "Specified setupfile {} doesn't exist! Abort!".format(
                            setupfile_path))
            else:
                raise Exception(
                    "Specified setupfile {} doesn't exist! Abort!".format(
                        setupfile_path))

        print('Executable: {}'.format(" ".join(_herwig_exec + _herwig_args)))

        code, out, error = interruptable_popen(_herwig_exec + _herwig_args,
                                               stdout=PIPE,
                                               stderr=PIPE,
                                               env=my_env)

        # if successful save HEPMC
        if (code != 0):
            raise Exception(
                'Error: ' + error + '\nOutput: ' + out +
                '\nHerwig run returned non-zero exit status {}'.format(code))
        else:
            print('Output: ' + out)
            print("Seed: {}".format(_seed))

            output_file = "{INPUT_FILE_NAME}.tar.bz2".format(
                INPUT_FILE_NAME=_my_config)
            if int(_seed) != 0:
                output_file_hepmc = "{INPUT_FILE_NAME}-S{SEED}{SETUPFILE_SUFFIX}.hepmc".format(
                    INPUT_FILE_NAME=_my_config,
                    SEED=_seed,
                    SETUPFILE_SUFFIX=_setupfile_suffix)
                output_file_yoda = "{INPUT_FILE_NAME}-S{SEED}{SETUPFILE_SUFFIX}.yoda".format(
                    INPUT_FILE_NAME=_my_config,
                    SEED=_seed,
                    SETUPFILE_SUFFIX=_setupfile_suffix)
            else:
                output_file_hepmc = "{INPUT_FILE_NAME}{SETUPFILE_SUFFIX}.hepmc".format(
                    INPUT_FILE_NAME=_my_config,
                    SETUPFILE_SUFFIX=_setupfile_suffix)
                output_file_yoda = "{INPUT_FILE_NAME}{SETUPFILE_SUFFIX}.yoda".format(
                    INPUT_FILE_NAME=_my_config,
                    SETUPFILE_SUFFIX=_setupfile_suffix)

            if os.path.exists(output_file_hepmc):
                # tar and compress the output HepMC files to save disk space
                if os.path.exists(output_file_yoda):
                    # also add already existing YODA files if present
                    os.system(
                        'tar -cvjf {OUTPUT_FILE} {HEPMC_FILE} {YODA_FILE}'.
                        format(OUTPUT_FILE=output_file,
                               HEPMC_FILE=output_file_hepmc,
                               YODA_FILE=output_file_yoda))
                else:
                    os.system('tar -cvjf {OUTPUT_FILE} {HEPMC_FILE}'.format(
                        OUTPUT_FILE=output_file, HEPMC_FILE=output_file_hepmc))
            else:
                os.system("ls -l")
                raise Exception("HepMC file {} doesn't exist! Abort!".format(
                    output_file_hepmc))

            if (os.path.exists(output_file)):
                # copy the compressed outputs to save them
                output.copy_from_local(output_file)
            else:
                raise Exception(
                    "Output file '{}' doesn't exist! Abort!".format(
                        output_file))

        print("=======================================================")
Example #12
File: job.py Project: ucontent/law
    def query(self,
              job_id,
              pool=None,
              scheduler=None,
              user=None,
              silent=False):
        # default arguments
        pool = pool or self.pool
        scheduler = scheduler or self.scheduler

        multi = isinstance(job_id, (list, tuple))
        job_ids = make_list(job_id)

        # query the condor queue
        cmd = ["condor_q"]
        # since htcondor 8.5.6, batch mode is default, so use -nobatch
        if self.htcondor_version and self.htcondor_version >= (8, 5, 6):
            cmd += ["-nobatch"]
        if pool:
            cmd += ["-pool", pool]
        if scheduler:
            cmd += ["-name", scheduler]
        cmd += job_ids
        logger.debug("query htcondor job(s) with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE)

        # handle errors
        if code != 0:
            if silent:
                return None
            else:
                raise Exception(
                    "queue query of htcondor job(s) '{}' failed:\n{}".format(
                        job_id, err))

        # parse the output and extract the status per job
        query_data = self.parse_queue_output(out)

        # find missing jobs, and query the condor history for the exit code
        missing_ids = [
            _job_id for _job_id in job_ids if _job_id not in query_data
        ]
        if missing_ids:
            cmd = ["condor_history"]
            cmd += [user or getpass.getuser()] if multi else [job_id]
            cmd += ["-long"]
            # since htcondor 8.5.6, one can define the attributes to fetch
            if self.htcondor_version and self.htcondor_version >= (8, 5, 6):
                cmd += [
                    "-attributes",
                    "ClusterId,ProcId,ExitCode,RemoveReason,HoldReason"
                ]
            if pool:
                cmd += ["-pool", pool]
            if scheduler:
                cmd += ["-name", scheduler]
            logger.debug(
                "query htcondor job history with command '{}'".format(cmd))
            code, out, err = interruptable_popen(cmd,
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)

            # handle errors
            if code != 0:
                if silent:
                    return None
                else:
                    raise Exception(
                        "history query of htcondor job(s) '{}' failed:\n{}".
                        format(job_id, err))

            # parse the output and update query data
            query_data.update(
                self.parse_history_output(out, job_ids=missing_ids))

        # compare to the requested job ids and perform some checks
        for _job_id in job_ids:
            if _job_id not in query_data:
                if not multi:
                    if silent:
                        return None
                    else:
                        raise Exception(
                            "htcondor job(s) '{}' not found in query response".
                            format(job_id))
                else:
                    query_data[_job_id] = self.job_status_dict(
                        job_id=_job_id,
                        status=self.FAILED,
                        error="job not found in query response")

        return query_data if multi else query_data[job_id]
Example #13
    def run(self):
        # data
        _my_input_file_name = str(self.input_file_name)
        _max_integration_jobs = str(self.integration_maxjobs)
        _config_path = str(self.config_path)

        if (_config_path == "" or _config_path == "default"):
            _my_input_file = os.path.join(os.path.dirname(__file__), "..",
                                          "..", "..", "inputfiles",
                                          "{}.in".format(self.input_file_name))
        else:
            _my_input_file = os.path.join(_config_path,
                                          "{}.in".format(self.input_file_name))

        # ensure that the output directory exists
        output = self.output()
        output.parent.touch()

        # actual payload:
        print("=========================================================")
        print("Starting build step to generate Herwig-cache and run file")
        print("=========================================================")

        # set environment variables
        my_env = self.set_environment_variables()

        # run Herwig build step
        _herwig_exec = ["Herwig", "build"]
        _herwig_args = [
            "--maxjobs={MAXJOBS}".format(MAXJOBS=_max_integration_jobs),
            "{INPUT_FILE}".format(INPUT_FILE=_my_input_file)
        ]

        print('Executable: {}'.format(" ".join(_herwig_exec + _herwig_args)))

        code, out, error = interruptable_popen(_herwig_exec + _herwig_args,
                                               stdout=PIPE,
                                               stderr=PIPE,
                                               env=my_env)

        # if successful save Herwig-cache and run-file as tar.gz
        if (code != 0):
            raise Exception(
                'Error: ' + error + '\nOutput: ' + out +
                '\nHerwig build returned non-zero exit status {}'.format(code))
        else:
            if (os.path.exists("Herwig-cache")):
                print('Output: ' + out)
                os.system(
                    'tar -czf Herwig-build.tar.gz Herwig-cache {INPUT_FILE_NAME}.run'
                    .format(INPUT_FILE_NAME=_my_input_file_name))
            else:
                raise Exception(
                    "Something went wrong, Herwig-cache doesn't exist! Abort!")

            if os.path.exists("Herwig-build.tar.gz"):
                output.copy_from_local("Herwig-build.tar.gz")
                os.system(
                    'rm Herwig-build.tar.gz {INPUT_FILE_NAME}.run'.format(
                        INPUT_FILE_NAME=_my_input_file_name))

        print("=======================================================")
Example #14
File: job.py Project: riga/law
    def submit(self,
               job_file,
               pool=None,
               scheduler=None,
               retries=0,
               retry_delay=3,
               silent=False):
        # default arguments
        if pool is None:
            pool = self.pool
        if scheduler is None:
            scheduler = self.scheduler

        # when job_file is a sequence of files, merge them all into one and submit it
        # however, this only works for job files located in the same directory or if they have an
        # "initialdir" defined
        def has_initialdir(job_file):
            with open(job_file, "r") as f:
                for line in f.readlines():
                    if line.lower().strip().replace(
                            " ", "").startswith("initialdir="):
                        return True
            return False

        chunking = isinstance(job_file, (list, tuple))
        job_files = make_list(job_file)
        job_file_dir = None
        for i, job_file in enumerate(job_files):
            dirname, basename = os.path.split(job_file)
            if job_file_dir is None:
                if i == len(job_files) - 1 or not has_initialdir(job_file):
                    job_file_dir = dirname
            elif dirname != job_file_dir:
                if not has_initialdir(job_file):
                    raise Exception(
                        "cannot performed chunked submission as job file '{}' is not located in a "
                        "previously seen directory '{}' and has no initialdir".
                        format(
                            job_file,
                            job_file_dir,
                        ), )

        # define the single, merged job file if necessary
        _job_file = job_files[0]
        if len(job_files) > 1:
            _job_file = tempfile.mkstemp(prefix="merged_job_",
                                         suffix=".jdl",
                                         dir=job_file_dir)[1]
            with open(_job_file, "w") as f:
                for job_file in job_files:
                    with open(job_file, "r") as _f:
                        f.write(_f.read() + "\n")

        # build the command
        cmd = ["condor_submit"]
        if pool:
            cmd += ["-pool", pool]
        if scheduler:
            cmd += ["-name", scheduler]
        cmd += [os.path.basename(_job_file)]
        cmd = quote_cmd(cmd)

        # define the actual submission in a loop to simplify retries
        while True:
            # run the command
            logger.debug("submit htcondor job with command '{}'".format(cmd))
            code, out, err = interruptable_popen(
                cmd,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                cwd=os.path.dirname(_job_file))

            # get the job id(s)
            if code == 0:
                # loop through all lines and try to match the expected pattern
                job_ids = []
                for line in out.strip().split("\n")[::-1]:
                    m = self.submission_job_id_cre.match(line.strip())
                    if m:
                        job_ids.extend([
                            "{}.{}".format(m.group(2), i)
                            for i in range(int(m.group(1)))
                        ])
                if not job_ids:
                    code = 1
                    err = "cannot parse htcondor job id(s) from output:\n{}".format(
                        out)

            # retry or done?
            if code == 0:
                return job_ids if chunking else job_ids[0]
            else:
                logger.debug(
                    "submission of htcondor job '{}' failed with code {}:\n{}".
                    format(_job_file, code, err))
                if retries > 0:
                    retries -= 1
                    time.sleep(retry_delay)
                    continue
                elif silent:
                    return None
                else:
                    raise Exception(
                        "submission of htcondor job '{}' failed:\n{}".format(
                            _job_file, err))
Example #15
    def mergeSingleYodaChunk(self, inputfile_list, inputfile_chunk=None):

        print("-------------------------------------------------------")
        print("Starting merging of chunk {}".format(inputfile_chunk))
        print("-------------------------------------------------------")

        # set environment variables
        my_env = self.set_environment_variables()

        # data
        _my_input_file_name = str(self.input_file_name)

        # merge the YODA files
        if inputfile_chunk is None:
            output_file = "{OUTPUT_FILE_NAME}.yoda".format(
                OUTPUT_FILE_NAME=_my_input_file_name)
        else:
            output_file = "{OUTPUT_FILE_NAME}_Chunk{BUNCH}.yoda".format(
                OUTPUT_FILE_NAME=_my_input_file_name, BUNCH=inputfile_chunk)

        _rivet_exec = ["rivet-merge"]
        _rivet_args = [
            "--output={OUTPUT_FILE}".format(OUTPUT_FILE=output_file)
        ]
        _rivet_in = ["-e"] + [
            "{YODA_FILES}".format(YODA_FILES=_yoda_file)
            for _yoda_file in inputfile_list
        ]

        if len(inputfile_list) > 10:
            print("Input files: {},...,{}".format(inputfile_list[0],
                                                  inputfile_list[-1]))
            print('Executable: {} {}'.format(
                " ".join(_rivet_exec + _rivet_args),
                " ".join([_rivet_in[0], "[...]", _rivet_in[-1]])))
        else:
            print("Input files: {}".format(inputfile_list))
            print('Executable: {}'.format(" ".join(_rivet_exec + _rivet_args +
                                                   _rivet_in)))

        code, out, error = interruptable_popen(_rivet_exec + _rivet_args +
                                               _rivet_in,
                                               stdout=PIPE,
                                               stderr=PIPE,
                                               env=my_env)

        # if successful return merged YODA file
        if (code != 0):
            raise Exception(
                'Error: ' + error + '\nOutput: ' + out +
                '\nYodaMerge returned non-zero exit status {}'.format(code))
        else:
            print('Output: ' + out)

        if not os.path.exists(output_file):
            print("Could not find output file {}!".format(output_file))

        print("-------------------------------------------------------")

        return output_file