def cat_job(self, jobids, jobinfo, print_stderr=None, store=False):
    """ print standard output of a given job"""
    dir_name = self.get_stdout_dir_name(
        self.get_local_dir_name(jobinfo["runcard"], jobinfo["runfolder"]))
    # jobids = length 1 for SLURM jobs - just take the only element here
    jobid = jobids[0]
    if jobinfo["jobtype"] == "Production" or "Socket" in jobinfo["jobtype"]:
        # Array jobs write one stdout file per subjob
        stdoutfiles = [
            os.path.join(dir_name,
                         "slurm-{0}_{1}.out".format(jobid, subjobno))
            for subjobno in range(1, int(jobinfo["no_runs"]) + 1)]
    else:
        stdoutfiles = [os.path.join(dir_name, "slurm-{0}.out".format(jobid))]
    output = []
    for stdoutfile in stdoutfiles:
        if print_stderr:
            stdoutfile = stdoutfile.replace(".out", ".err")
        cmd = ["cat", stdoutfile]
        if not store:
            util.spCall(cmd)
        else:
            output.append(util.getOutputCall(cmd, suppress_errors=True,
                                             include_return_code=False))
    if store:
        return output
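
# Illustrative sketch (not part of the original module): cat_job above relies
# on SLURM's default naming for array-job output, slurm-<jobid>_<task>.out,
# with stderr going to a matching .err file. A minimal standalone helper
# reproducing that convention; the jobid and subjob values in the usage note
# are invented:
def _example_slurm_stdout_name(jobid, subjobno, stderr=False):
    """Build the slurm-{jobid}_{subjobno}.out name used in cat_job."""
    name = "slurm-{0}_{1}.out".format(jobid, subjobno)
    if stderr:
        name = name.replace(".out", ".err")
    return name

# _example_slurm_stdout_name(123456, 3, stderr=True) -> "slurm-123456_3.err"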
def cat_log_job(self, jobids, jobinfo):
    """Sometimes the std output doesn't get updated but we can
    choose to access the logs themselves"""
    output_folder = ["file:///tmp/"]
    cmd_base = ["arccp", "-i"]
    cmd_str = "cat /tmp/"
    for jobid in jobids:
        files = util.getOutputCall(["arcls", jobid]).split()
        logfiles = [i for i in files if i.endswith(".log")]
        for logfile in logfiles:
            cmd = cmd_base + [os.path.join(jobid, logfile)] + output_folder
            output = util.getOutputCall(cmd).split()
            for text in output:
                if ".log" in text:
                    util.spCall((cmd_str + text).split())
def _do_stats_job(self, jobid_raw):
    """ Multithread-ready version of the stats job """
    if isinstance(jobid_raw, tuple):
        if (jobid_raw[1] == self.cDONE or jobid_raw[1] == self.cFAIL
                or jobid_raw[1] == self.cMISS):
            return jobid_raw[1]
        else:
            jobid = jobid_raw[0]
    else:
        jobid = jobid_raw
    cmd = [self.cmd_stat, jobid.strip(), "-j", header.arcbase]
    strOut = util.getOutputCall(cmd, suppress_errors=True,
                                include_return_code=False)
    if "Done" in strOut or "Finished" in strOut:
        return self.cDONE
    elif "Waiting" in strOut or "Queuing" in strOut:
        return self.cWAIT
    elif "Running" in strOut:
        return self.cRUN
    elif "Failed" in strOut:
        # If we still have a return code of 0 something is odd:
        # treat the output as missing rather than failed
        if "Exit Code: 0" in strOut:
            return self.cMISS
        return self.cFAIL
    else:
        return self.cUNK
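
# Illustrative sketch (not part of the original class): _do_stats_job maps the
# free text printed by arcstat onto internal state constants, with the special
# case that a "Failed" job reporting "Exit Code: 0" is classed as missing
# output rather than a genuine failure. The same mapping with placeholder
# state strings instead of the class constants:
def _example_map_arcstat(stat_text):
    """Map a raw arcstat output string onto a coarse job state."""
    if "Done" in stat_text or "Finished" in stat_text:
        return "DONE"
    if "Waiting" in stat_text or "Queuing" in stat_text:
        return "WAIT"
    if "Running" in stat_text:
        return "RUN"
    if "Failed" in stat_text:
        # Failed state but zero exit code: output likely went missing
        return "MISS" if "Exit Code: 0" in stat_text else "FAIL"
    return "UNK"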
def _run_JDL(self, filename):
    """ Sends JDL file to the dirac management system """
    cmd = "dirac-wms-job-submit {}".format(filename)
    output = util.getOutputCall(cmd.split(), include_return_code=False)
    jobids = output.strip().split("]")[0].split("[")[-1]
    jobids = jobids.split(", ")
    return jobids
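
# Illustrative sketch (not part of the original class): the slicing in
# _run_JDL assumes dirac-wms-job-submit prints the new ids inside square
# brackets, e.g. "JobID = [1234, 1235]". The sample output below is invented
# for illustration:
def _example_parse_jdl_output(output):
    """Extract job ids from dirac-wms-job-submit style output."""
    jobids = output.strip().split("]")[0].split("[")[-1]
    return jobids.split(", ")

# _example_parse_jdl_output("JobID = [1234, 1235]\n") -> ["1234", "1235"]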
def get_status(self, jobid, status):
    stat = len([
        i for i in util.getOutputCall(
            ["squeue", "-j{0}".format(jobid), "-r", "-t", status],
            suppress_errors=True).split("\n")[1:]
        if "error" not in i
    ])
    # The header row is sliced off above; the decrement discards the
    # empty element left by the trailing newline
    if stat > 0:
        stat = stat - 1
    return stat
def get_status(self, jobid, status):
    stat = len([
        i for i in util.getOutputCall(
            ["squeue", f"-j{jobid}", "-r", "-t", status],
            suppress_errors=True,
            include_return_code=False).split("\n")[1:]
        if "error" not in i
    ])
    # The header row is sliced off above; the decrement discards the
    # empty element left by the trailing newline
    if stat > 0:
        stat = stat - 1
    return stat
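
# Illustrative sketch (not part of the original class): both get_status
# variants above count the sub-jobs of one (array) job in a given state from
# raw squeue output, where the first line is the column header and a trailing
# newline leaves an empty final element. Sample output invented for
# illustration:
def _example_count_squeue_rows(squeue_output):
    """Count job rows in raw squeue output, skipping header and blanks."""
    rows = [i for i in squeue_output.split("\n")[1:] if "error" not in i]
    count = len(rows)
    if count > 0:
        count -= 1  # discard the empty element from the trailing newline
    return count

# sample = "JOBID PARTITION NAME\n123_1 main run\n123_2 main run\n"
# _example_count_squeue_rows(sample) -> 2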
def get_status(self, status, date):
    output = set(
        util.getOutputCall([
            'dirac-wms-select-jobs',
            '--Status={0}'.format(status),
            '--Owner={0}'.format(header.dirac_name),
            '--Maximum=0',  # 0 lists ALL jobs, which is nice :)
            '--Date={0}'.format(date)
        ]).split("\n")[-2].split(","))
    header.logger.debug(output)
    return output
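
# Illustrative sketch (not part of the original class): the slicing in
# get_status assumes dirac-wms-select-jobs prints the matching ids as a single
# comma-separated line just before the trailing newline, so split("\n")[-2]
# picks that line out. Sample output invented for illustration:
def _example_parse_selected_jobs(output):
    """Extract the set of job ids from dirac-wms-select-jobs style output."""
    return set(output.split("\n")[-2].split(","))

# sample = "==> Selected jobs:\n1234,1235\n"
# _example_parse_selected_jobs(sample) -> {"1234", "1235"}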
def _run_XRSL(self, filename, ce):
    """ Sends XRSL to the queue defined in header """
    from pyHepGrid.src.header import arc_direct
    cmd = "arcsub -c {0} {1} -j {2}".format(ce, filename, self.arcbd)
    # Can only use direct in Durham. Otherwise fails!
    # Speeds up submission (according to Stephen)
    if arc_direct and ".dur.scotgrid.ac.uk" in ce:
        cmd += " -S org.nordugrid.gridftpjob --direct "
    output = util.getOutputCall(cmd.split(), include_return_code=True)
    jobid = output[0].split("jobid:")[-1].strip()
    return jobid, output[1]
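
# Illustrative sketch (not part of the original class): _run_XRSL above (and
# its variant further below) extracts the job id from arcsub's acknowledgement
# line, "Job submitted with jobid: <url>", by taking everything after the last
# "jobid:". The sample URL below is invented:
def _example_parse_arcsub_output(output):
    """Extract the job id URL from arcsub style output."""
    return output.split("jobid:")[-1].strip()

# sample = "Job submitted with jobid: " \
#          "gsiftp://ce1.dur.scotgrid.ac.uk:2811/jobs/abc123\n"
# _example_parse_arcsub_output(sample)
#     -> "gsiftp://ce1.dur.scotgrid.ac.uk:2811/jobs/abc123"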
def cat_job(self, jobids, jobinfo, print_stderr=None, store=False):
    """ print standard output of a given job"""
    out = []
    for jobid in jobids:
        cmd = [self.cmd_print, "-j", header.arcbase, jobid.strip()]
        if print_stderr:
            cmd += ["-e"]
        if not store:
            util.spCall(cmd)
        else:
            out.append(util.getOutputCall(cmd, include_return_code=False))
    if store:
        return out
def get_status(self, status, date):
    output = set(
        util.getOutputCall(
            [
                'dirac-wms-select-jobs',
                f'--Status={status}',
                f'--Owner={header.dirac_name}',
                '--Maximum=0',  # 0 lists ALL jobs, which is nice :)
                f'--Date={date}'
            ],
            include_return_code=False).split("\n")[-2].split(","))
    header.logger.debug(output)
    return output
def _run_XRSL(self, filename, test=False, include_retcode=False):
    """ Sends XRSL to the queue defined in header.
    If test = True, use the test queue """
    import random
    from pyHepGrid.src.header import arc_direct
    from pyHepGrid.src.header import split_dur_ce
    if test:
        from pyHepGrid.src.header import ce_test as ce
    else:
        from pyHepGrid.src.header import ce_base as ce
    # Randomise ce at submission time to reduce load
    if split_dur_ce and ".dur.scotgrid.ac.uk" in ce:
        ce = random.choice(
            ["ce1.dur.scotgrid.ac.uk", "ce2.dur.scotgrid.ac.uk"])
    # if split_dur_ce:
    #     ce = random.choice(
    #         ["ce01.tier2.hep.manchester.ac.uk",
    #          "ce02.tier2.hep.manchester.ac.uk"])
    # if split_dur_ce:
    #     ce = random.choice(
    #         ["arc-ce03.gridpp.rl.ac.uk", "arc-ce04.gridpp.rl.ac.uk"])
    cmd = "arcsub -c {0} {1} -j {2}".format(ce, filename, self.arcbd)
    print(cmd)
    # Can only use direct in Durham. Otherwise fails!
    # Speeds up submission (according to Stephen)
    if arc_direct and ".dur.scotgrid.ac.uk" in ce:
        cmd += " -S org.nordugrid.gridftpjob --direct "
    if include_retcode:
        output = util.getOutputCall(cmd.split(), include_return_code=True)
        jobid = output[0].split("jobid:")[-1].strip()
        return jobid, output[1]
    else:
        output = util.getOutputCall(cmd.split())
        jobid = output.split("jobid:")[-1].strip()
        return jobid
def _run_SLURM(self, filename, args, queue, test=False, socket=None,
               n_sockets=1):
    """ Takes a SLURM runfile and submits it to the SLURM batch system.
    Returns the jobid and queue used for submission """
    if queue is not None:
        queuetag = "-p {0}".format(queue)
    else:
        queuetag = ""
    # sbatch options must come before the script name; anything after
    # the script is passed to the script itself rather than to sbatch
    cmd = "sbatch {0} {1}".format(queuetag, filename)
    header.logger.debug(cmd)
    output = util.getOutputCall(cmd.split(), include_return_code=False)
    jobid = output.strip().split()[-1]
    return jobid, queue
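
# Illustrative sketch (not part of the original class): _run_SLURM reads the
# job id from sbatch's acknowledgement, "Submitted batch job <id>", as the
# last whitespace-separated token:
def _example_parse_sbatch_output(output):
    """Extract the job id from sbatch's acknowledgement line."""
    return output.strip().split()[-1]

# _example_parse_sbatch_output("Submitted batch job 987654\n") -> "987654"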
def _get_data_warmup(self, db_id):
    """ Given a database entry, retrieve its data from the warmup folder
    to the folder defined in said database entry.
    For arc jobs, stdout will be downloaded into said folder as well """
    # Retrieve data from database
    from pyHepGrid.src.header import arcbase, grid_warmup_dir
    fields = ["runcard", "runfolder", "jobid", "pathfolder"]
    data = self.dbase.list_data(self.table, fields, db_id)[0]
    runfolder = data["runfolder"]
    finfolder = data["pathfolder"] + "/" + runfolder
    runcard = data["runcard"]
    jobids = data["jobid"].split()
    util.spCall(["mkdir", "-p", finfolder])
    logger.info("Retrieving ARC output into " + finfolder)
    try:
        # Retrieve ARC standard output for every job of this run
        for jobid in jobids:
            logger.info(jobid)
            cmd = [self.cmd_get, "-j", arcbase, jobid.strip()]
            output = util.getOutputCall(cmd, include_return_code=False)
            outputfol = output.split("Results stored at: ")[1].rstrip()
            outputfolder = outputfol.split("\n")[0]
            if outputfolder == "" or (len(outputfolder.split(" ")) > 1):
                logger.info("Running mv and rm command is not safe here")
                logger.info("Found blank spaces in the output folder")
                logger.info(
                    "Nothing will be moved to the warmup global folder")
            else:
                destination = finfolder + "/" + "arc_out_" + runcard + \
                    outputfolder
                util.spCall(["mv", outputfolder, destination])
                # util.spCall(["rm", "-rf", outputfolder])
    except BaseException:
        logger.info("Couldn't find job output in the ARC server")
        logger.info("jobid: " + jobid)
        logger.info("Run arcstat to check the state of the job")
        logger.info("Trying to retrieve data from grid storage anyway")
    # Retrieve warmup from the grid storage warmup folder
    wname = self.warmup_name(runcard, runfolder)
    self.gridw.bring(wname, grid_warmup_dir, finfolder + "/" + wname)
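
# Illustrative sketch (not part of the original class): _get_data_warmup
# assumes arcget reports the download location on a line starting with
# "Results stored at: " and refuses to move anything when the extracted folder
# is blank or contains spaces. Sample output invented for illustration:
def _example_parse_arcget_output(output):
    """Extract the results folder from arcget style output, or None."""
    folder = output.split("Results stored at: ")[1].rstrip().split("\n")[0]
    if folder == "" or len(folder.split(" ")) > 1:
        return None  # unsafe to mv/rm: blank or space-containing path
    return folder

# sample = "Results stored at: /scratch/arc_out_run1\nJobs processed: 1\n"
# _example_parse_arcget_output(sample) -> "/scratch/arc_out_run1"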
def __check_grid_pdf(self, use_cvmfs=False, cvmfs_loc=""): import json infofile = os.path.join(os.path.dirname(os.path.realpath(__file__)), ".pdfinfo") pdf, member = self.parse_pdf_entry() if not use_cvmfs: try: with open(infofile, "r") as f: data = json.load(f) try: members = data[pdf] self.debug("PDF set found") except KeyError: self.critical( "PDF set {0} is not included in currently " "initialised version of LHAPDF".format(pdf)) try: assert int(member) in members self.debug("PDF member found") except AssertionError: self.critical( "PDF member {1} for PDF set {0} is not included in " "currently initialised version of LHAPDF".format( pdf, member)) except FileNotFoundError: self.warning("No PDF info file found. Skipping check.") else: sharedir = "{0}/share/LHAPDF/".format(cvmfs_loc) bindir = "{0}/bin/".format(cvmfs_loc) os.environ["LHA_DATA_PATH"] = sharedir os.environ["LHAPATH"] = sharedir cvmfs_pdfs = util.getOutputCall( [bindir + "lhapdf", "ls", "--installed"]) cvmfs_pdfs = [i.strip() for i in cvmfs_pdfs.split()] if pdf not in cvmfs_pdfs: self.critical("PDF set {0} is not included in cvmfs LHAPDF. " "Turn cvmfs PDF off and use your own one " "(or ask the admins nicely...".format(pdf)) else: self.debug("PDF set found in cvmfs LHAPDF setup")
def _do_stats_job(self, jobid_raw):
    """ Multithread-ready version of the stats job """
    if isinstance(jobid_raw, tuple):
        if jobid_raw[1] == self.cDONE or jobid_raw[1] == self.cFAIL:
            return jobid_raw[1]
        else:
            jobid = jobid_raw[0]
    else:
        jobid = jobid_raw
    cmd = [self.cmd_stat, jobid.strip(), "-j", header.arcbase]
    strOut = util.getOutputCall(cmd, suppress_errors=True)
    if "Done" in strOut or "Finished" in strOut:
        return self.cDONE
    elif "Waiting" in strOut or "Queuing" in strOut:
        return self.cWAIT
    elif "Running" in strOut:
        return self.cRUN
    elif "Failed" in strOut:
        return self.cFAIL
    else:
        return self.cUNK