def submit(self, job_file, ce=None, delegation_id=None, retries=0, retry_delay=3, silent=False):
    # default arguments
    ce = ce or self.ce
    delegation_id = delegation_id or self.delegation_id

    # check arguments
    if not ce:
        raise ValueError("ce must not be empty")

    # prepare round robin for ces and delegations
    ce = make_list(ce)
    if delegation_id:
        delegation_id = make_list(delegation_id)
        if len(ce) != len(delegation_id):
            raise Exception("numbers of CEs ({}) and delegation ids ({}) do not match".format(
                len(ce), len(delegation_id)))

    # get the job file location as the submission command is run in the same directory
    job_file_dir, job_file_name = os.path.split(os.path.abspath(job_file))

    # define the actual submission in a loop to simplify retries
    while True:
        # build the command
        i = random.randint(0, len(ce) - 1)
        cmd = ["glite-ce-job-submit", "-r", ce[i]]
        if delegation_id:
            cmd += ["-D", delegation_id[i]]
        cmd += [job_file_name]

        # run the command
        # glite prints everything to stdout
        logger.debug("submit glite job with command '{}'".format(cmd))
        code, out, _ = interruptable_popen(cmd, stdout=subprocess.PIPE, stderr=sys.stderr,
            cwd=job_file_dir)

        # in some cases, the return code is 0 but the ce did not respond with a valid id
        if code == 0:
            job_id = out.strip().split("\n")[-1].strip()
            if not self.submission_job_id_cre.match(job_id):
                code = 1
                out = "bad job id '{}' from output:\n{}".format(job_id, out)

        # retry or done?
        if code == 0:
            return job_id
        else:
            logger.debug("submission of glite job '{}' failed:\n{}".format(job_file, out))
            if retries > 0:
                retries -= 1
                time.sleep(retry_delay)
                continue
            elif silent:
                return None
            else:
                raise Exception("submission of glite job '{}' failed:\n{}".format(
                    job_file, out))
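# --- usage sketch (not part of the source) ---
# A minimal illustration of driving the round-robin submission above with two CEs and
# matching delegation ids. The law.contrib.load pattern and GLiteJobManager name follow the
# law package conventions; CE endpoints and file names are placeholders.
import law
law.contrib.load("glite")  # makes law.glite available (assumption about the setup)

manager = law.glite.GLiteJobManager()
job_id = manager.submit(
    "job.jdl",
    ce=[
        "grid-ce1.example.org:8443/cream-pbs-short",
        "grid-ce2.example.org:8443/cream-pbs-short",
    ],
    delegation_id=["deleg-1", "deleg-2"],  # must match the number of CEs, see the check above
    retries=2,       # re-run the submission command up to two more times
    retry_delay=5,   # seconds to sleep between attempts
)
print("submitted glite job: {}".format(job_id))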
def submit(self, job_file, job_list=None, ce=None, retries=0, retry_delay=3, silent=False):
    # default arguments
    if job_list is None:
        job_list = self.job_list
    if ce is None:
        ce = self.ce

    # check arguments
    if not ce:
        raise ValueError("ce must not be empty")
    ce = make_list(ce)

    # arc supports multiple jobs to be submitted with a single arcsub call,
    # so job_file can be a sequence of files
    # when this is the case, we have to make the assumption that their input files are all
    # absolute, or they are relative but all in the same directory
    chunking = isinstance(job_file, (list, tuple))
    job_files = make_list(job_file)
    job_file_dir = os.path.dirname(os.path.abspath(job_files[0]))
    job_file_names = [os.path.basename(jf) for jf in job_files]

    # define the actual submission in a loop to simplify retries
    while True:
        # build the command
        cmd = ["arcsub", "-c", random.choice(ce)]
        if job_list:
            cmd += ["-j", job_list]
        cmd += job_file_names
        cmd = quote_cmd(cmd)

        # run the command
        logger.debug("submit arc job(s) with command '{}'".format(cmd))
        code, out, _ = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=sys.stderr, cwd=job_file_dir)

        # in some cases, the return code is 0 but the ce did not respond with valid job ids
        job_ids = []
        if code == 0:
            for line in out.strip().split("\n"):
                m = self.submission_job_id_cre.match(line.strip())
                if m:
                    job_ids.append(m.group(1))
            if not job_ids:
                code = 1
                out = "cannot find job id(s) in output:\n{}".format(out)
            elif len(job_ids) != len(job_files):
                raise Exception("number of job ids in output ({}) does not match number of "
                    "jobs to submit ({}) in output:\n{}".format(
                        len(job_ids), len(job_files), out))

        # retry or done?
        if code == 0:
            return job_ids if chunking else job_ids[0]
        else:
            logger.debug("submission of arc job(s) '{}' failed with code {}:\n{}".format(
                job_files, code, out))
            if retries > 0:
                retries -= 1
                time.sleep(retry_delay)
                continue
            elif silent:
                return None
            else:
                raise Exception("submission of arc job(s) '{}' failed:\n{}".format(
                    job_files, out))
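# --- usage sketch (not part of the source) ---
# Chunked submission as supported above: passing a list of job files triggers a single
# arcsub call and returns a list of job ids. The ARCJobManager name and the xrsl file names
# are assumptions for illustration.
import law
law.contrib.load("arc")

manager = law.arc.ARCJobManager()
job_ids = manager.submit(
    ["jobs/job_0.xrsl", "jobs/job_1.xrsl"],  # sequence -> chunked submission, list returned
    ce=["arc-ce1.example.org", "arc-ce2.example.org"],  # one CE is picked at random per attempt
    retries=1,
)
print("submitted arc jobs: {}".format(job_ids))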
def env(self):
    # strategy: unlike docker, singularity might not allow binding of paths that do not exist
    # in the container, so create a tmp directory on the host system and bind it as /tmp, let
    # python dump its full env into a file, and read the file again on the host system
    if self.image not in self._envs:
        tmp_dir = LocalDirectoryTarget(is_tmp=True)
        tmp_dir.touch()
        tmp = tmp_dir.child("env", type="f")
        tmp.touch()

        # determine whether volume binding is allowed
        allow_binds_cb = getattr(self.task, "singularity_allow_binds", None)
        if callable(allow_binds_cb):
            allow_binds = allow_binds_cb()
        else:
            cfg = Config.instance()
            allow_binds = cfg.get_expanded(self.get_config_section(), "allow_binds")

        # arguments to configure the environment
        args = ["-e"]
        if allow_binds:
            args.extend(["-B", "{}:/tmp".format(tmp_dir.path)])
            env_file = "/tmp/{}".format(tmp.basename)
        else:
            env_file = tmp.path

        # get the singularity exec command
        singularity_exec_cmd = self._singularity_exec_cmd() + args

        # build commands to setup the environment
        setup_cmds = self._build_setup_cmds(self._get_env())

        # build the python command that dumps the environment
        py_cmd = "import os,pickle;" \
            + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_file)

        # build the full command
        cmd = quote_cmd(singularity_exec_cmd + [
            self.image,
            "bash", "-l", "-c",
            "; ".join(flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd]))),
        ])

        # run it
        code, out, _ = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if code != 0:
            raise Exception("singularity sandbox env loading failed:\n{}".format(out))

        # load the environment from the tmp file
        env = tmp.load(formatter="pickle")

        # cache
        self._envs[self.image] = env

    return self._envs[self.image]
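# --- standalone sketch of the env-dump technique used above (not part of the source) ---
# Runnable on the host without singularity: a child python process pickles its environment
# to a temporary file which the parent reads back, mirroring what env() does through the
# container boundary. All names here are illustrative.
import os
import pickle
import subprocess
import tempfile

fd, env_file = tempfile.mkstemp(prefix="env_", suffix=".pkl")
os.close(fd)
py_cmd = "import os,pickle;pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(
    env_file)
subprocess.check_call(["python", "-c", py_cmd])
with open(env_file, "rb") as f:
    env = pickle.load(f)
print("{} variables captured".format(len(env)))
os.remove(env_file)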
def delegate_voms_proxy_glite(endpoint, proxy_file=None, stdout=None, stderr=None, cache=True):
    """
    Delegates the voms proxy via gLite to an *endpoint*, e.g.
    ``grid-ce.physik.rwth-aachen.de:8443``. When *proxy_file* is *None*, it defaults to the
    result of :py:func:`get_voms_proxy_file`. *stdout* and *stderr* are passed to the *Popen*
    constructor for executing the ``glite-ce-delegate-proxy`` command. When *cache* is *True*,
    a json file is created alongside the proxy file, which stores the delegation ids per
    endpoint. The next time the exact same proxy should be delegated to the same endpoint, the
    cached delegation id is returned.
    """
    # get the proxy file
    if not proxy_file:
        proxy_file = get_voms_proxy_file()
    proxy_file = os.path.expandvars(os.path.expanduser(proxy_file))
    if not os.path.exists(proxy_file):
        raise Exception("proxy file '{}' does not exist".format(proxy_file))

    if cache:
        if isinstance(cache, six.string_types):
            cache_file = cache
        else:
            cache_file = proxy_file + "_delegation_cache.json"

        def remove_cache():
            try:
                if os.path.exists(cache_file):
                    os.remove(cache_file)
            except OSError:
                pass

        # create the hash of the proxy file content
        with open(proxy_file, "r") as f:
            proxy_hash = create_hash(f.read())

        # already delegated?
        cache_data = {}
        if os.path.exists(cache_file):
            with open(cache_file, "r") as f:
                try:
                    cache_data = json.load(f)
                except ValueError:
                    remove_cache()

        # is the hash up-to-date?
        if cache_data.get("hash") != proxy_hash:
            remove_cache()
            cache_data = {}

        # proxy already delegated to that endpoint?
        elif endpoint in cache_data.get("ids", []):
            return str(cache_data["ids"][endpoint])

    # do the actual delegation
    delegation_id = uuid.uuid4().hex
    cmd = ["glite-ce-delegate-proxy", "-e", endpoint, delegation_id]
    code = interruptable_popen(cmd, stdout=stdout, stderr=stderr)[0]
    if code != 0:
        raise Exception("glite proxy delegation to endpoint {} failed".format(endpoint))

    if cache:
        # write the id back to the delegation file
        cache_data["hash"] = proxy_hash
        cache_data.setdefault("ids", {})[endpoint] = delegation_id
        with open(cache_file, "w") as f:
            json.dump(cache_data, f, indent=4)
        os.chmod(cache_file, 0o0600)

    return delegation_id
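# --- usage sketch (not part of the source) ---
# Delegating the current voms proxy to an endpoint; a second call with the same proxy and
# endpoint is answered from the json cache without re-running glite-ce-delegate-proxy. The
# endpoint is taken from the docstring example above.
endpoint = "grid-ce.physik.rwth-aachen.de:8443"
delegation_id = delegate_voms_proxy_glite(endpoint)
assert delegate_voms_proxy_glite(endpoint) == delegation_id  # served from the cache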
def query(self, job_id, pool=None, scheduler=None, user=None, silent=False):
    # default arguments
    if pool is None:
        pool = self.pool
    if scheduler is None:
        scheduler = self.scheduler
    if user is None:
        user = self.user

    chunking = isinstance(job_id, (list, tuple))
    job_ids = make_list(job_id)

    # default ClassAds to fetch
    ads = "ClusterId,ProcId,JobStatus,ExitCode,ExitStatus,HoldReason,RemoveReason"

    # build the condor_q command
    cmd = ["condor_q"] + job_ids
    if pool:
        cmd += ["-pool", pool]
    if scheduler:
        cmd += ["-name", scheduler]
    cmd += ["-long"]
    # since v8.3.3 one can limit the number of jobs to query
    if self.htcondor_v833:
        cmd += ["-limit", str(len(job_ids))]
    # since v8.5.6 one can define the attributes to fetch
    if self.htcondor_v856:
        cmd += ["-attributes", ads]
    cmd = quote_cmd(cmd)

    logger.debug("query htcondor job(s) with command '{}'".format(cmd))
    code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # handle errors
    if code != 0:
        if silent:
            return None
        else:
            raise Exception("queue query of htcondor job(s) '{}' failed with code {}:"
                "\n{}".format(job_id, code, err))

    # parse the output and extract the status per job
    query_data = self.parse_long_output(out)

    # some jobs might already be in the condor history, so query for missing job ids
    missing_ids = [_job_id for _job_id in job_ids if _job_id not in query_data]
    if missing_ids:
        # build the condor_history command, which is fairly similar to the condor_q command
        cmd = ["condor_history"] + missing_ids
        if pool:
            cmd += ["-pool", pool]
        if scheduler:
            cmd += ["-name", scheduler]
        cmd += ["-long"]
        # since v8.3.3 one can limit the number of jobs to query
        if self.htcondor_v833:
            cmd += ["-limit", str(len(missing_ids))]
        # since v8.5.6 one can define the attributes to fetch
        if self.htcondor_v856:
            cmd += ["-attributes", ads]
        cmd = quote_cmd(cmd)

        logger.debug("query htcondor job history with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # handle errors
        if code != 0:
            if silent:
                return None
            else:
                raise Exception("history query of htcondor job(s) '{}' failed with code {}:"
                    "\n{}".format(job_id, code, err))

        # parse the output and update query data
        query_data.update(self.parse_long_output(out, job_ids=missing_ids))

    # compare to the requested job ids and perform some checks
    for _job_id in job_ids:
        if _job_id not in query_data:
            if not chunking:
                if silent:
                    return None
                else:
                    raise Exception("htcondor job(s) '{}' not found in query response".format(
                        job_id))
            else:
                query_data[_job_id] = self.job_status_dict(job_id=_job_id, status=self.FAILED,
                    error="job not found in query response")

    return query_data if chunking else query_data[job_id]
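# --- usage sketch (not part of the source) ---
# Querying a chunk of htcondor jobs and reacting to the per-job status dicts built above.
# The HTCondorJobManager name and the exact keys of job_status_dict follow common law
# conventions and are assumptions here.
import law
law.contrib.load("htcondor")

manager = law.htcondor.HTCondorJobManager()
query_data = manager.query(["1234.0", "1234.1"])
for job_id, data in query_data.items():
    if data["status"] == manager.FAILED:
        print("job {} failed: {}".format(job_id, data.get("error")))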
def submit(self, job_file, pool=None, scheduler=None, retries=0, retry_delay=3, silent=False):
    # default arguments
    if pool is None:
        pool = self.pool
    if scheduler is None:
        scheduler = self.scheduler

    # get the job file location as the submission command is run in the same directory
    job_file_dir, job_file_name = os.path.split(os.path.abspath(job_file))

    # build the command
    cmd = ["condor_submit"]
    if pool:
        cmd += ["-pool", pool]
    if scheduler:
        cmd += ["-name", scheduler]
    cmd += [job_file_name]
    cmd = quote_cmd(cmd)

    # define the actual submission in a loop to simplify retries
    while True:
        # run the command
        logger.debug("submit htcondor job with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=job_file_dir)

        # get the job id(s)
        if code == 0:
            last_line = out.strip().split("\n")[-1].strip()
            m = self.submission_job_id_cre.match(last_line)
            if m:
                job_ids = ["{}.{}".format(m.group(2), i) for i in range(int(m.group(1)))]
            else:
                code = 1
                err = "cannot parse htcondor job id(s) from output:\n{}".format(out)

        # retry or done?
        if code == 0:
            return job_ids
        else:
            logger.debug("submission of htcondor job '{}' failed with code {}:\n{}".format(
                job_file, code, err))
            if retries > 0:
                retries -= 1
                time.sleep(retry_delay)
                continue
            elif silent:
                return None
            else:
                raise Exception("submission of htcondor job '{}' failed:\n{}".format(
                    job_file, err))
def submit(self, job_file, partition=None, retries=0, retry_delay=3, silent=False):
    # default arguments
    if partition is None:
        partition = self.partition

    # get the job file location as the submission command is run in the same directory
    job_file_dir, job_file_name = os.path.split(os.path.abspath(job_file))

    # build the command
    cmd = ["sbatch"]
    if partition:
        cmd += ["--partition", partition]
    cmd += [job_file_name]
    cmd = quote_cmd(cmd)

    # define the actual submission in a loop to simplify retries
    while True:
        # run the command
        logger.debug("submit slurm job with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=job_file_dir)

        # get the job id(s)
        if code == 0:
            # loop through all lines and try to match the expected pattern
            for line in out.strip().split("\n")[::-1]:
                m = self.submission_cre.match(line.strip())
                if m:
                    job_ids = [int(m.group(1))]
                    break
            else:
                code = 1
                err = "cannot parse slurm job id(s) from output:\n{}".format(out)

        # retry or done?
        if code == 0:
            return job_ids
        else:
            logger.debug("submission of slurm job '{}' failed with code {}:\n{}".format(
                job_file, code, err))
            if retries > 0:
                retries -= 1
                time.sleep(retry_delay)
                continue
            elif silent:
                return None
            else:
                raise Exception("submission of slurm job '{}' failed:\n{}".format(
                    job_file, err))
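# --- usage sketch (not part of the source) ---
# Submitting a single slurm job file to a specific partition; note that the method above
# returns a list with one integer job id. The SlurmJobManager name, partition, and file name
# are assumptions.
import law
law.contrib.load("slurm")

manager = law.slurm.SlurmJobManager()
job_ids = manager.submit("job.sh", partition="short", retries=2, retry_delay=10)
print("submitted slurm job(s): {}".format(job_ids))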
def hadd_task(task, inputs, output, cwd=None, local=False, force=True):
    """
    This method is intended to be used by tasks that are supposed to merge root files, e.g.
    when inheriting from :py:class:`law.contrib.tasks.MergeCascade`. *inputs* should be a
    sequence of local targets that represent the files to merge into *output*. *cwd* is the
    working directory in which hadd is invoked. When empty, a temporary directory is used. The
    *task* itself is used to print and publish messages via its
    :py:meth:`law.Task.publish_message` and :py:meth:`law.Task.publish_step` methods. When
    *local* is *True*, the input and output targets are assumed to be local and the merging is
    based on their local paths. Otherwise, the targets are fetched first and the output target
    is localized. When *force* is *True*, any existing output file is overwritten (by adding
    the ``-f`` flag to ``hadd``).
    """
    # ensure inputs are targets
    inputs = [
        LocalFileTarget(inp) if isinstance(inp, six.string_types) else inp
        for inp in inputs
    ]

    # ensure output is a target
    if isinstance(output, six.string_types):
        output = LocalFileTarget(output)

    # default cwd
    if not cwd:
        cwd = LocalDirectoryTarget(is_tmp=True)
    elif isinstance(cwd, six.string_types):
        cwd = LocalDirectoryTarget(cwd)
    cwd.touch()

    # helper to create the hadd cmd
    def hadd_cmd(input_paths, output_path):
        cmd = ["hadd", "-n", "0"]
        if force:
            cmd.append("-f")
        cmd.extend(["-d", cwd.path])
        cmd.append(output_path)
        cmd.extend(input_paths)
        return quote_cmd(cmd)

    if local:
        # when local, there is no need to download inputs
        input_paths = [inp.path for inp in inputs]

        with task.publish_step("merging ...", runtime=True):
            if len(inputs) == 1:
                output.copy_from_local(inputs[0])
            else:
                # merge using hadd
                cmd = hadd_cmd(input_paths, output.path)
                code = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
                if code != 0:
                    raise Exception("hadd failed")

        task.publish_message("merged file size: {}".format(
            human_bytes(output.stat.st_size, fmt=True)))
    else:
        # when not local, we need to fetch files first into the cwd
        with task.publish_step("fetching inputs ...", runtime=True):
            def fetch(inp):
                inp.copy_to_local(cwd.child(inp.unique_basename, type="f"), cache=False)
                return inp.unique_basename

            def callback(i):
                task.publish_message("fetch file {} / {}".format(i + 1, len(inputs)))

            bases = map_verbose(fetch, inputs, every=5, callback=callback)

        # start merging into the localized output
        with output.localize("w", cache=False) as tmp_out:
            with task.publish_step("merging ...", runtime=True):
                if len(bases) == 1:
                    tmp_out.path = cwd.child(bases[0]).path
                else:
                    # merge using hadd
                    cmd = hadd_cmd(bases, tmp_out.path)
                    code = interruptable_popen(cmd, shell=True, executable="/bin/bash",
                        cwd=cwd.path)[0]
                    if code != 0:
                        raise Exception("hadd failed")

            task.publish_message("merged file size: {}".format(
                human_bytes(tmp_out.stat.st_size, fmt=True)))
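# --- usage sketch (not part of the source) ---
# How hadd_task might be called from a merging task's run() method; the input/output target
# layout is a hypothetical example, not taken from the source.
def run(self):
    inputs = list(self.input()["collection"].targets.values())
    hadd_task(self, inputs, self.output(), local=False, force=True)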
def run(self):
    # data
    _my_input_file_name = str(self.input_file_name)

    # ensure that the output directory exists
    output = self.output()
    output.parent.touch()

    # actual payload:
    print("=======================================================")
    print("Starting merge step to finish Herwig-cache and run file")
    print("=======================================================")

    # set environment variables
    my_env = self.set_environment_variables()

    # download the packed files from grid and unpack
    with self.input()['HerwigBuild'].localize('r') as _file:
        os.system('tar -xzf {}'.format(_file.path))

    for branch, target in self.input()['HerwigIntegrate']["collection"].targets.items():
        if branch <= 10:
            print('Getting Herwig integration file: {}'.format(target))
            with target.localize('r') as _file:
                os.system('tar -xzf {}'.format(_file.path))

    # run Herwig merge step
    _herwig_exec = ["Herwig", "mergegrids"]
    _herwig_args = [
        "{INPUT_FILE_NAME}.run".format(INPUT_FILE_NAME=_my_input_file_name)
    ]

    print('Executable: {}'.format(" ".join(_herwig_exec + _herwig_args)))

    code, out, error = interruptable_popen(_herwig_exec + _herwig_args, stdout=PIPE,
        stderr=PIPE, env=my_env)

    # if successful, save final Herwig-cache and run-file as tar.gz
    if code != 0:
        raise Exception('Error: ' + error + '\nOutput: ' + out
            + '\nHerwig mergegrids returned non-zero exit status {}'.format(code))
    else:
        print('Output: ' + out)

        output_file = "Herwig-cache.tar.gz"
        os.system('tar -czf {OUTPUT_FILE} Herwig-cache {INPUT_FILE_NAME}.run'.format(
            OUTPUT_FILE=output_file, INPUT_FILE_NAME=_my_input_file_name))

        if os.path.exists(output_file):
            output.copy_from_local(output_file)
            os.system('rm Herwig-cache.tar.gz {INPUT_FILE_NAME}.run'.format(
                INPUT_FILE_NAME=_my_input_file_name))
        else:
            raise Exception("Output file '{}' doesn't exist! Abort!".format(output_file))

    print("=======================================================")
def query(self, job_id, partition=None, silent=False):
    # default arguments
    if partition is None:
        partition = self.partition

    chunking = isinstance(job_id, (list, tuple))
    job_ids = make_list(job_id)

    # build the squeue command
    cmd = ["squeue", "--Format", self.squeue_format, "--noheader"]
    if partition:
        cmd += ["--partition", partition]
    cmd += ["--jobs", ",".join(map(str, job_ids))]
    cmd = quote_cmd(cmd)

    logger.debug("query slurm job(s) with command '{}'".format(cmd))
    code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # handle errors
    if code != 0:
        if silent:
            return None
        else:
            raise Exception("queue query of slurm job(s) '{}' failed with code {}:"
                "\n{}".format(job_id, code, err))

    # parse the output and extract the status per job
    query_data = self.parse_squeue_output(out)

    # some jobs might already be in the accounting history, so query for missing job ids
    missing_ids = [_job_id for _job_id in job_ids if _job_id not in query_data]
    if missing_ids:
        # build the sacct command
        cmd = ["sacct", "--format", self.sacct_format, "--noheader"]
        if partition:
            cmd += ["--partition", partition]
        cmd += ["--jobs", ",".join(map(str, missing_ids))]
        cmd = quote_cmd(cmd)

        logger.debug("query slurm accounting history with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # handle errors
        if code != 0:
            if silent:
                return None
            else:
                raise Exception("accounting query of slurm job(s) '{}' failed with code {}:"
                    "\n{}".format(job_id, code, err))

        # parse the output and update query data
        query_data.update(self.parse_sacct_output(out))

    # compare to the requested job ids and perform some checks
    for _job_id in job_ids:
        if _job_id not in query_data:
            if not chunking:
                if silent:
                    return None
                else:
                    raise Exception("slurm job(s) '{}' not found in query response".format(
                        job_id))
            else:
                query_data[_job_id] = self.job_status_dict(job_id=_job_id, status=self.FAILED,
                    error="job not found in query response")

    return query_data if chunking else query_data[job_id]
def run(self):
    # branch data
    _job_num = str(self.branch)
    _my_config = str(self.input_file_name)
    _num_events = str(self.events_per_job)
    _seed = int(self.branch_data)

    # ensure that the output directory exists
    output = self.output()
    try:
        output.parent.touch()
    except IOError:
        print("Output target doesn't exist!")

    # actual payload:
    print("=======================================================")
    print("Producing events")
    print("=======================================================")

    # set environment variables
    my_env = os.environ

    # get the prepared Herwig-cache and runfiles and unpack them
    with self.input()['HerwigMerge'].localize('r') as _file:
        os.system('tar -xzf {}'.format(_file.path))

    # run Herwig event generation
    _herwig_exec = ["Herwig", "run"]
    _herwig_args = [
        "-q",
        "--seed={SEED}".format(SEED=_seed),
        "--numevents={NEVENTS}".format(NEVENTS=_num_events),
        "{INPUT_FILE_NAME}.run".format(INPUT_FILE_NAME=_my_config)
    ]

    # identify the setupfile if specified and copy it to the working directory
    work_dir = os.getcwd()
    print("Setupfile: {}".format(self.setupfile))
    _setupfile_suffix = ""
    if all(self.setupfile != defaultval for defaultval in [None, "None"]):
        setupfile_path = os.path.join(os.getenv("ANALYSIS_PATH"), "generation", "setupfiles",
            str(self.setupfile))
        if os.path.exists(setupfile_path):
            print("Copy setupfile for executable {} to working directory {}".format(
                setupfile_path, work_dir))
            # for python3 the next two lines can be merged
            shutil.copy(setupfile_path, work_dir)
            setupfile_path = os.path.basename(setupfile_path)
            # end of merge
            if os.path.exists(setupfile_path):
                _herwig_args.append("--setupfile={SETUPFILE}".format(
                    SETUPFILE=setupfile_path))
                _setupfile_suffix = "-" + setupfile_path
            else:
                raise Exception("Specified setupfile {} doesn't exist! Abort!".format(
                    setupfile_path))
        else:
            raise Exception("Specified setupfile {} doesn't exist! Abort!".format(
                setupfile_path))

    print('Executable: {}'.format(" ".join(_herwig_exec + _herwig_args)))

    code, out, error = interruptable_popen(_herwig_exec + _herwig_args, stdout=PIPE,
        stderr=PIPE, env=my_env)

    # if successful, save the HEPMC file
    if code != 0:
        raise Exception('Error: ' + error + '\nOutput: ' + out
            + '\nHerwig run returned non-zero exit status {}'.format(code))
    else:
        print('Output: ' + out)
        print("Seed: {}".format(_seed))

        output_file = "{INPUT_FILE_NAME}.tar.bz2".format(INPUT_FILE_NAME=_my_config)
        if int(_seed) != 0:
            output_file_hepmc = "{INPUT_FILE_NAME}-S{SEED}{SETUPFILE_SUFFIX}.hepmc".format(
                INPUT_FILE_NAME=_my_config, SEED=_seed, SETUPFILE_SUFFIX=_setupfile_suffix)
            output_file_yoda = "{INPUT_FILE_NAME}-S{SEED}{SETUPFILE_SUFFIX}.yoda".format(
                INPUT_FILE_NAME=_my_config, SEED=_seed, SETUPFILE_SUFFIX=_setupfile_suffix)
        else:
            output_file_hepmc = "{INPUT_FILE_NAME}{SETUPFILE_SUFFIX}.hepmc".format(
                INPUT_FILE_NAME=_my_config, SETUPFILE_SUFFIX=_setupfile_suffix)
            output_file_yoda = "{INPUT_FILE_NAME}{SETUPFILE_SUFFIX}.yoda".format(
                INPUT_FILE_NAME=_my_config, SETUPFILE_SUFFIX=_setupfile_suffix)

        if os.path.exists(output_file_hepmc):
            # tar and compress the output HepMC files to save disk space
            if os.path.exists(output_file_yoda):
                # also add already existing YODA files if existent
                os.system('tar -cvjf {OUTPUT_FILE} {HEPMC_FILE} {YODA_FILE}'.format(
                    OUTPUT_FILE=output_file, HEPMC_FILE=output_file_hepmc,
                    YODA_FILE=output_file_yoda))
            else:
                os.system('tar -cvjf {OUTPUT_FILE} {HEPMC_FILE}'.format(
                    OUTPUT_FILE=output_file, HEPMC_FILE=output_file_hepmc))
        else:
            os.system("ls -l")
            raise Exception("HepMC file {} doesn't exist! Abort!".format(output_file_hepmc))

        if os.path.exists(output_file):
            # copy the compressed outputs to save them
            output.copy_from_local(output_file)
        else:
            raise Exception("Output file '{}' doesn't exist! Abort!".format(output_file))

    print("=======================================================")
def query(self, job_id, pool=None, scheduler=None, user=None, silent=False):
    # default arguments
    pool = pool or self.pool
    scheduler = scheduler or self.scheduler

    multi = isinstance(job_id, (list, tuple))
    job_ids = make_list(job_id)

    # query the condor queue
    cmd = ["condor_q"]
    # since htcondor 8.5.6, batch mode is the default, so use -nobatch
    if self.htcondor_version and self.htcondor_version >= (8, 5, 6):
        cmd += ["-nobatch"]
    if pool:
        cmd += ["-pool", pool]
    if scheduler:
        cmd += ["-name", scheduler]
    cmd += job_ids
    logger.debug("query htcondor job(s) with command '{}'".format(cmd))
    code, out, err = interruptable_popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # handle errors
    if code != 0:
        if silent:
            return None
        else:
            raise Exception("queue query of htcondor job(s) '{}' failed:\n{}".format(
                job_id, err))

    # parse the output and extract the status per job
    query_data = self.parse_queue_output(out)

    # find missing jobs, and query the condor history for the exit code
    missing_ids = [_job_id for _job_id in job_ids if _job_id not in query_data]
    if missing_ids:
        cmd = ["condor_history"]
        cmd += [user or getpass.getuser()] if multi else [job_id]
        cmd += ["-long"]
        # since htcondor 8.5.6, one can define the attributes to fetch
        if self.htcondor_version and self.htcondor_version >= (8, 5, 6):
            cmd += ["-attributes", "ClusterId,ProcId,ExitCode,RemoveReason,HoldReason"]
        if pool:
            cmd += ["-pool", pool]
        if scheduler:
            cmd += ["-name", scheduler]
        logger.debug("query htcondor job history with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        # handle errors
        if code != 0:
            if silent:
                return None
            else:
                raise Exception("history query of htcondor job(s) '{}' failed:\n{}".format(
                    job_id, err))

        # parse the output and update query data
        query_data.update(self.parse_history_output(out, job_ids=missing_ids))

    # compare to the requested job ids and perform some checks
    for _job_id in job_ids:
        if _job_id not in query_data:
            if not multi:
                if silent:
                    return None
                else:
                    raise Exception("htcondor job(s) '{}' not found in query response".format(
                        job_id))
            else:
                query_data[_job_id] = self.job_status_dict(job_id=_job_id, status=self.FAILED,
                    error="job not found in query response")

    return query_data if multi else query_data[job_id]
def run(self):
    # data
    _my_input_file_name = str(self.input_file_name)
    _max_integration_jobs = str(self.integration_maxjobs)
    _config_path = str(self.config_path)

    if _config_path == "" or _config_path == "default":
        _my_input_file = os.path.join(os.path.dirname(__file__), "..", "..", "..",
            "inputfiles", "{}.in".format(self.input_file_name))
    else:
        _my_input_file = os.path.join(_config_path, "{}.in".format(self.input_file_name))

    # ensure that the output directory exists
    output = self.output()
    output.parent.touch()

    # actual payload:
    print("=========================================================")
    print("Starting build step to generate Herwig-cache and run file")
    print("=========================================================")

    # set environment variables
    my_env = self.set_environment_variables()

    # run Herwig build step
    _herwig_exec = ["Herwig", "build"]
    _herwig_args = [
        "--maxjobs={MAXJOBS}".format(MAXJOBS=_max_integration_jobs),
        "{INPUT_FILE}".format(INPUT_FILE=_my_input_file)
    ]

    print('Executable: {}'.format(" ".join(_herwig_exec + _herwig_args)))

    code, out, error = interruptable_popen(_herwig_exec + _herwig_args, stdout=PIPE,
        stderr=PIPE, env=my_env)

    # if successful, save Herwig-cache and run-file as tar.gz
    if code != 0:
        raise Exception('Error: ' + error + '\nOutput: ' + out
            + '\nHerwig build returned non-zero exit status {}'.format(code))
    else:
        if os.path.exists("Herwig-cache"):
            print('Output: ' + out)
            os.system('tar -czf Herwig-build.tar.gz Herwig-cache {INPUT_FILE_NAME}.run'.format(
                INPUT_FILE_NAME=_my_input_file_name))
        else:
            raise Exception("Something went wrong, Herwig-cache doesn't exist! Abort!")

    if os.path.exists("Herwig-build.tar.gz"):
        output.copy_from_local("Herwig-build.tar.gz")
        os.system('rm Herwig-build.tar.gz {INPUT_FILE_NAME}.run'.format(
            INPUT_FILE_NAME=_my_input_file_name))

    print("=======================================================")
def submit(self, job_file, pool=None, scheduler=None, retries=0, retry_delay=3, silent=False):
    # default arguments
    if pool is None:
        pool = self.pool
    if scheduler is None:
        scheduler = self.scheduler

    # when job_file is a sequence of files, merge them all into one and submit it
    # however, this only works for job files that are located in the same directory or that
    # have an "initialdir" defined
    def has_initialdir(job_file):
        with open(job_file, "r") as f:
            for line in f.readlines():
                if line.lower().strip().replace(" ", "").startswith("initialdir="):
                    return True
        return False

    chunking = isinstance(job_file, (list, tuple))
    job_files = make_list(job_file)
    job_file_dir = None
    for i, job_file in enumerate(job_files):
        dirname, basename = os.path.split(job_file)
        if job_file_dir is None:
            if i == len(job_files) - 1 or not has_initialdir(job_file):
                job_file_dir = dirname
        elif dirname != job_file_dir:
            if not has_initialdir(job_file):
                raise Exception(
                    "cannot perform chunked submission as job file '{}' is not located in a "
                    "previously seen directory '{}' and has no initialdir".format(
                        job_file, job_file_dir))

    # define the single, merged job file if necessary
    _job_file = job_files[0]
    if len(job_files) > 1:
        _job_file = tempfile.mkstemp(prefix="merged_job_", suffix=".jdl", dir=job_file_dir)[1]
        with open(_job_file, "w") as f:
            for job_file in job_files:
                with open(job_file, "r") as _f:
                    f.write(_f.read() + "\n")

    # build the command
    cmd = ["condor_submit"]
    if pool:
        cmd += ["-pool", pool]
    if scheduler:
        cmd += ["-name", scheduler]
    cmd += [os.path.basename(_job_file)]
    cmd = quote_cmd(cmd)

    # define the actual submission in a loop to simplify retries
    while True:
        # run the command
        logger.debug("submit htcondor job with command '{}'".format(cmd))
        code, out, err = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(_job_file))

        # get the job id(s)
        if code == 0:
            # loop through all lines and try to match the expected pattern
            job_ids = []
            for line in out.strip().split("\n")[::-1]:
                m = self.submission_job_id_cre.match(line.strip())
                if m:
                    job_ids.extend([
                        "{}.{}".format(m.group(2), i)
                        for i in range(int(m.group(1)))
                    ])
            if not job_ids:
                code = 1
                err = "cannot parse htcondor job id(s) from output:\n{}".format(out)

        # retry or done?
        if code == 0:
            return job_ids if chunking else job_ids[0]
        else:
            logger.debug("submission of htcondor job '{}' failed with code {}:\n{}".format(
                _job_file, code, err))
            if retries > 0:
                retries -= 1
                time.sleep(retry_delay)
                continue
            elif silent:
                return None
            else:
                raise Exception("submission of htcondor job '{}' failed:\n{}".format(
                    _job_file, err))
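# --- usage sketch (not part of the source) ---
# Chunked submission through the merging logic above: the jdl files either share a directory
# or define "initialdir", and a single merged file is submitted under the hood. The
# HTCondorJobManager name and the file names are assumptions.
import law
law.contrib.load("htcondor")

manager = law.htcondor.HTCondorJobManager()
job_ids = manager.submit(["jobs/job_0.jdl", "jobs/job_1.jdl"], retries=1)
print("submitted htcondor jobs: {}".format(job_ids))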
def mergeSingleYodaChunk(self, inputfile_list, inputfile_chunk=None):
    print("-------------------------------------------------------")
    print("Starting merging of chunk {}".format(inputfile_chunk))
    print("-------------------------------------------------------")

    # set environment variables
    my_env = self.set_environment_variables()

    # data
    _my_input_file_name = str(self.input_file_name)

    # merge the YODA files
    if inputfile_chunk is None:
        output_file = "{OUTPUT_FILE_NAME}.yoda".format(OUTPUT_FILE_NAME=_my_input_file_name)
    else:
        output_file = "{OUTPUT_FILE_NAME}_Chunk{BUNCH}.yoda".format(
            OUTPUT_FILE_NAME=_my_input_file_name, BUNCH=inputfile_chunk)

    _rivet_exec = ["rivet-merge"]
    _rivet_args = ["--output={OUTPUT_FILE}".format(OUTPUT_FILE=output_file)]
    _rivet_in = ["-e"] + [
        "{YODA_FILES}".format(YODA_FILES=_yoda_file) for _yoda_file in inputfile_list
    ]

    if len(inputfile_list) > 10:
        print("Input files: {},...,{}".format(inputfile_list[0], inputfile_list[-1]))
        print('Executable: {} {}'.format(
            " ".join(_rivet_exec + _rivet_args),
            " ".join([_rivet_in[0], "[...]", _rivet_in[-1]])))
    else:
        print("Input files: {}".format(inputfile_list))
        print('Executable: {}'.format(" ".join(_rivet_exec + _rivet_args + _rivet_in)))

    code, out, error = interruptable_popen(_rivet_exec + _rivet_args + _rivet_in,
        stdout=PIPE, stderr=PIPE, env=my_env)

    # if successful, return the merged YODA file
    if code != 0:
        raise Exception('Error: ' + error + '\nOutput: ' + out
            + '\nYodaMerge returned non-zero exit status {}'.format(code))
    else:
        print('Output: ' + out)
        # verify that the merged file was actually written
        if not os.path.exists(output_file):
            print("Could not find output file {}!".format(output_file))

    print("-------------------------------------------------------")
    return output_file
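# --- companion sketch (not part of the source) ---
# One way to drive mergeSingleYodaChunk over a long file list: merge fixed-size chunks first,
# then merge the chunk results in a final pass. The method name and chunk size are
# hypothetical.
def mergeChunkedYodaFiles(self, inputfile_list, chunk_size=25):
    # merge each chunk separately via the method above
    chunk_outputs = [
        self.mergeSingleYodaChunk(inputfile_list[i:i + chunk_size], inputfile_chunk=n)
        for n, i in enumerate(range(0, len(inputfile_list), chunk_size))
    ]
    # final pass: merge the per-chunk results into one file
    return self.mergeSingleYodaChunk(chunk_outputs)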