def cat_job(self, jobids, jobinfo, print_stderr=None, store=False):
    """Print (or collect) the standard output of a given SLURM job.

    Parameters:
        jobids       -- list of SLURM job ids; length 1 for SLURM runs,
                        only the first element is used.
        jobinfo      -- database record with "runcard", "runfolder",
                        "jobtype" and "no_runs" keys.
        print_stderr -- if truthy, look at the .err files instead of .out.
        store        -- if True, return the captured output as a list of
                        strings instead of printing to the terminal.

    Returns the list of captured outputs when store is True, else None.
    """
    dir_name = self.get_stdout_dir_name(
        self.get_local_dir_name(jobinfo["runcard"], jobinfo["runfolder"]))
    # jobids = length 1 for SLURM jobs - just take the only element here
    jobid = jobids[0]

    # Production/socket runs write one stdout file per subjob
    # (slurm-<id>_<n>.out); plain runs write a single slurm-<id>.out.
    # The cat/store logic was previously duplicated in both branches;
    # build the file list first and handle it once.
    if jobinfo["jobtype"] == "Production" or "Socket" in jobinfo["jobtype"]:
        stdoutfiles = [
            os.path.join(dir_name,
                         "slurm-{0}_{1}.out".format(jobid, subjobno))
            for subjobno in range(1, int(jobinfo["no_runs"]) + 1)]
    else:
        stdoutfiles = [os.path.join(dir_name,
                                    "slurm-{0}.out".format(jobid))]

    output = []
    for stdoutfile in stdoutfiles:
        if print_stderr:
            stdoutfile = stdoutfile.replace(".out", ".err")
        cmd = ["cat", stdoutfile]
        if not store:
            util.spCall(cmd)
        else:
            output.append(
                util.getOutputCall(cmd, suppress_errors=True,
                                   include_return_code=False))
    if store:
        return output
def cat_log_job(self, jobids, jobinfo, *args, **kwargs):
    """Cat every .log file in the run directory whose seed tag matches
    one of the seeds expected (or present) for this run.

    Seeds are taken from the union of the seeds expected from the
    database entry (iseed .. iseed + no_runs - 1) and the seeds actually
    found on disk, so stray runs are printed as well.
    """
    import re
    import glob
    run_dir = self.get_local_dir_name(jobinfo["runcard"],
                                      jobinfo["runfolder"])
    log_files = [i for i in os.listdir(run_dir) if i.endswith(".log")]
    if jobinfo["iseed"] is None:
        jobinfo["iseed"] = 1
    first_seed = int(jobinfo["iseed"])
    expected_seeds = set(
        range(first_seed, first_seed + int(jobinfo["no_runs"])))
    # Matches e.g. "...s123.log" and captures the seed number.
    logseed_regex = re.compile(r".s([0-9]+)\.[^\.]+$")
    # BUG FIX: previously .search(i).group(1) was called unconditionally,
    # raising AttributeError for any .log file without a seed tag.
    # Skip files that do not match instead.
    seed_matches = (logseed_regex.search(i)
                    for i in glob.glob('{0}/*.log'.format(run_dir)))
    logseeds_in_dir = set(int(m.group(1))
                          for m in seed_matches if m is not None)
    seeds_to_print = logseeds_in_dir.union(expected_seeds)
    cat_logs = []
    for log_file in log_files:
        for seed in seeds_to_print:
            if ".s{0}.".format(seed) in log_file:
                cat_logs.append(log_file)
                # Each seed maps to at most one log: drop it and move on.
                seeds_to_print.remove(seed)
                break
    for log in cat_logs:
        util.spCall(["cat", os.path.join(run_dir, log)])
def run_test(args, runcard):
    """Run the test wrapper for the backend selected on the command line.

    Imports the appropriate testWrapper for the chosen backend, expands
    the runcard, runs setup() and executes the runfile locally for each
    runcard entry.
    """
    if args.runArc:
        from pyHepGrid.src.runArcjob import testWrapper
    elif args.runArcProduction:
        from pyHepGrid.src.runArcjob import \
            testWrapperProduction as testWrapper
    elif args.runDirac:
        from pyHepGrid.src.runDiracjob import testWrapper
    elif args.runSlurm:
        from pyHepGrid.src.runSlurmjob import testWrapper
    elif args.runSlurmProduction:
        from pyHepGrid.src.runSlurmjob import \
            testWrapperProduction as testWrapper
    else:
        raise Exception("Choose what you want to test -(A/B/D/E/F)")

    rncards, dCards = util.expandCard(runcard)
    setup()
    for runcard_name in rncards:
        # testWrapper returns a quoted argument string; strip the quotes
        # and split into an argv list for the local run.
        wrapper_args = testWrapper(runcard_name, dCards)
        nnlojob_args = wrapper_args.replace("\"", "").split()
        runfile = os.path.basename(header.runfile)
        util.spCall(["chmod", "+x", runfile])
        util.spCall(["./{0}".format(runfile)] + nnlojob_args)
def clean_job(self, jobids):
    """Remove the sandbox of the given jobs (including their stdout!)
    from the arc storage, after asking for confirmation."""
    self._press_yes_to_continue(
        " \033[93m WARNING:\033[0m You are about to clean the job!")
    for jobid in jobids:
        util.spCall([self.cmd_clean, "-j", header.arcbase, jobid.strip()])
def kill_job(self, jobids, jobinfo):
    """Kill the given SLURM jobs with scancel.

    Logs a critical message (but does not raise) when there are no
    jobids associated with this database entry.
    """
    # BUG FIX: logging methods take a format string followed by lazy
    # %-style args; the old call logger.debug(jobids, jobinfo) passed a
    # list as the message with an extra arg, which fails at
    # record-formatting time instead of producing output.
    header.logger.debug(
        "kill_job jobids: {0}, jobinfo: {1}".format(jobids, jobinfo))
    if len(jobids) == 0:
        header.logger.critical(
            "No jobids stored associated with this database entry, "
            "therefore nothing to kill.")
    for jobid in jobids:
        util.spCall(["scancel", str(jobid)])
def status_job(self, jobids, verbose=False):
    """Print the current status of the given jobs via the arc status
    command; pass verbose=True for long (-l) output."""
    stripped_ids = [jobid.strip() for jobid in jobids]
    if not stripped_ids:
        header.logger.critical("No jobs selected")
    cmd = [self.cmd_stat, "-j", header.arcbase] + stripped_ids
    if verbose:
        cmd.append("-l")
    util.spCall(cmd)
def cat_job(self, jobids, jobinfo, print_stderr=None, store=False):
    """Print the standard output of the given jobs; with print_stderr
    truthy the stderr stream (-e) is shown instead. With store=True the
    output is collected and returned as a list of strings."""
    collected = []
    for jobid in jobids:
        cmd = [self.cmd_print, "-j", header.arcbase, jobid.strip()]
        if print_stderr:
            cmd.append("-e")
        if store:
            collected.append(
                util.getOutputCall(cmd, include_return_code=False))
        else:
            util.spCall(cmd)
    if store:
        return collected
def kill_job(self, jobids, jobinfo):
    """Kill all jobs associated with this run (asks for confirmation
    first). Logs a critical message when no jobids are stored."""
    self._press_yes_to_continue(
        " \033[93m WARNING:\033[0m You are about to kill all jobs for "
        "this run!")
    if not jobids:
        header.logger.critical(
            "No jobids stored associated with this database entry, "
            "therefore nothing to kill.")
    util.spCall([self.cmd_kill] + jobids)
def status_job(self, jobids, verbose=False):
    """Print the current status of the given jobs in a single batched
    call; pass verbose=True for long (-l) output.

    Cleanup: removed the commented-out per-job loop that this batched
    call superseded, and a leftover debug print of header.arcbase.
    Added the empty-selection warning for consistency with the other
    status_job implementation in this file.
    """
    jobids = [jobid.strip() for jobid in jobids]
    if len(jobids) == 0:
        header.logger.critical("No jobs selected")
    cmd = [self.cmd_stat, "-j", header.arcbase] + jobids
    if verbose:
        cmd += ["-l"]
    util.spCall(cmd)
def cat_log_job(self, jobids, jobinfo):
    """Sometimes the std output doesn't get updated but we can choose
    to access the logs themselves: copy each remote .log to /tmp via
    arccp and cat it."""
    destination = ["file:///tmp/"]
    copy_base = ["arccp", "-i"]
    for jobid in jobids:
        remote_files = util.getOutputCall(["arcls", jobid]).split()
        logfiles = (name for name in remote_files if name.endswith(".log"))
        for logfile in logfiles:
            copy_cmd = copy_base + [os.path.join(jobid, logfile)] \
                + destination
            # arccp -i reports the transferred file names; cat any
            # token that looks like a log file.
            for token in util.getOutputCall(copy_cmd).split():
                if ".log" in token:
                    util.spCall(("cat /tmp/" + token).split())
def _do_extract_outputData(self, tarfile):
    """Multithread wrapper used in get_data_production for untaring
    files. Returns 0 on success, -1 when the tarfile is missing.
    Assumes the log/ and dat/ folders already exist."""
    if not os.path.isfile(tarfile):
        logger.info("{0} not found".format(tarfile))
        return -1
    extension_map = {".log": "log/", ".dat": "dat/"}
    self.tarw.extract_extension_to(tarfile, extension_map)
    util.spCall(["rm", tarfile])
    return 0
def kill_job(self, jobids, jobinfo):
    """Kill the given jobs after confirmation, batching the kill
    command in groups of 150 ids for speed."""
    self._press_yes_to_continue(
        " \033[93m WARNING:\033[0m You are about to kill the job!")
    if not jobids:
        header.logger.critical(
            "No jobids stored associated with this database entry, "
            "therefore nothing to kill.")
    for batch in util.batch_gen(jobids, 150):
        cleaned = [jobid.strip() for jobid in batch]
        header.logger.debug("job_kill batch length:{0}".format(
            len(cleaned)))
        util.spCall([self.cmd_kill, "-j", header.arcbase] + cleaned)
def bring_current_warmup(self, db_id):
    """Retrieve the warmup of a run before the job finishes.

    Copies all grid-state (*.y*) and log files of every job of the
    database entry to the finalisation folder via gfal-copy.
    """
    cmd_base = ["gfal-copy", "-v"]
    fields = ["pathfolder", "runfolder", "jobid"]
    data = self.dbase.list_data(self.table, fields, db_id)[0]
    runfolder = data["runfolder"]
    finfolder = data["pathfolder"] + "/" + runfolder + "/"
    if header.finalisation_script is not None:
        finfolder = header.default_runfolder
    jobids = data["jobid"].split()
    # Cleanup: removed a dead initial assignment of output_folder to
    # file:///tmp/ which was unconditionally overwritten here.
    output_folder = ["file://" + finfolder]
    for jobid in jobids:
        # Grid state files first, then the log files.
        util.spCall(cmd_base + [jobid + "/*.y*"] + output_folder)
        util.spCall(cmd_base + [jobid + "/*.log"] + output_folder)
    print("Warmup stored at {0}".format(finfolder))
def _get_data_warmup(self, db_id):
    """Given a database entry, retrieve its data from the warmup folder
    to the folder defined in said database entry.
    For arc jobs stdoutput will be downloaded in said folder as well.
    """
    # Retrieve data from database
    from pyHepGrid.src.header import arcbase, grid_warmup_dir
    fields = ["runcard", "runfolder", "jobid", "pathfolder"]
    data = self.dbase.list_data(self.table, fields, db_id)[0]
    runfolder = data["runfolder"]
    finfolder = data["pathfolder"] + "/" + runfolder
    runcard = data["runcard"]
    jobids = data["jobid"].split()
    util.spCall(["mkdir", "-p", finfolder])
    logger.info("Retrieving ARC output into " + finfolder)
    try:
        # Retrieve ARC standard output for every job of this run
        for jobid in jobids:
            logger.info(jobid)
            cmd = [self.cmd_get, "-j", arcbase, jobid.strip()]
            output = util.getOutputCall(cmd, include_return_code=False)
            # The download tool reports the local folder after this
            # marker; take only the first line after it.
            outputfol = output.split("Results stored at: ")[1].rstrip()
            outputfolder = outputfol.split("\n")[0]
            if outputfolder == "" or (len(outputfolder.split(" ")) > 1):
                # A blank or space-containing path would make the mv
                # below unsafe, so skip moving in that case.
                logger.info("Running mv and rm command is not safe here")
                logger.info("Found blank spaces in the output folder")
                logger.info(
                    "Nothing will be moved to the warmup global folder")
            else:
                destination = finfolder + "/" + "arc_out_" + runcard + \
                    outputfolder
                util.spCall(["mv", outputfolder, destination])
    except Exception:
        # BUG FIX: narrowed from BaseException, which also swallowed
        # KeyboardInterrupt/SystemExit. Any failure here (missing
        # marker, failed call) falls through to the grid-storage
        # retrieval below.
        logger.info("Couldn't find job output in the ARC server")
        logger.info("jobid: " + jobid)
        logger.info("Run arcstat to check the state of the job")
        logger.info("Trying to retrieve data from grid storage anyway")
    # Retrieve warmup from the grid storage warmup folder
    wname = self.warmup_name(runcard, runfolder)
    self.gridw.bring(wname, grid_warmup_dir, finfolder + "/" + wname)
def update_stdout(self):
    """Retrieve stdout of all running jobs and store the current state
    into each job's corresponding folder."""
    fields = ["rowid", "jobid", "pathfolder", "runfolder"]
    for job in self._db_list(fields):
        # Pull the relevant columns from the database record.
        jobid = str(job['jobid'])
        runfolder = str(job['runfolder'])
        pathfolder = str(job['pathfolder']) + "/" + runfolder
        target_file = pathfolder + "/stdout"
        # Create target folder if it doesn't exist
        if not os.path.exists(pathfolder):
            os.makedirs(pathfolder)
        print_cmd = self.cmd_print + ' ' + jobid.strip()
        # It seems 'script' is the only right way to save data with arc
        util.spCall(['script', '-c', print_cmd, '-a', 'tmpscript.txt'])
        util.spCall(['mv', 'tmpscript.txt', target_file])
def init_production(self, provided_warmup=None, continue_warmup=False,
                    local=False):
    """ Initialises a production run. If a warmup file is provided
    retrieval step is skipped
    Steps are:
        0 - Retrieve warmup from the grid/local
        1 - tar up executable, runcard and necessary files
        2 - sent it to the grid storage

    NOTE(review): continue_warmup is accepted but never read in this
    body — presumably kept for interface parity with init_warmup;
    confirm against callers.
    """
    from shutil import copy
    import tempfile
    from pyHepGrid.src.header import runcardDir as runFol
    from pyHepGrid.src.header import executable_exe, executable_src_dir, logger
    if local:
        # Local production delegated entirely; no grid upload happens.
        self.init_local_production(provided_warmup=provided_warmup)
        return
    rncards, dCards = util.expandCard()
    path_to_exe_full = self._exe_fullpath(executable_src_dir, executable_exe)
    origdir = os.path.abspath(os.getcwd())
    tmpdir = tempfile.mkdtemp()
    # if provided warmup is a relative path, ensure we have the full path
    # before we change to the tmp directory
    if provided_warmup:
        if provided_warmup[0] != "/":
            provided_warmup = "{0}/{1}".format(origdir, provided_warmup)
    # All tarring/copying below happens inside the temporary directory;
    # we chdir back to origdir at the end.
    os.chdir(tmpdir)
    logger.debug("Temporary directory: {0}".format(tmpdir))
    if not os.path.isfile(path_to_exe_full):
        # NOTE(review): execution continues past this critical log —
        # presumably logger.critical aborts the program; confirm.
        logger.critical("Could not find executable at {0}".format(path_to_exe_full))
    copy(path_to_exe_full, os.getcwd())
    files = [executable_exe]
    for idx, i in enumerate(rncards):
        logger.info("Initialising {0} [{1}/{2}]".format(i, idx+1, len(rncards)))
        # Reset per-runcard: may be set True by _get_local_warmup_name.
        local = False
        # Check whether warmup/production is active in the runcard
        runcard_file = os.path.join(runFol, i)
        runcard_obj = PROGRAMruncard(runcard_file, logger=logger,
                                     use_cvmfs=header.use_cvmfs_lhapdf,
                                     cvmfs_loc=header.cvmfs_lhapdf_location)
        multichannel = self.check_runcard_multichannel(runcard_obj)
        self._check_production(runcard_obj)
        rname = dCards[i]
        tarfile = i + rname + ".tar.gz"
        copy(os.path.join(runFol, i), os.getcwd())
        # Warmup source priority: explicit argument, then configured
        # warmup directory, then the grid storage.
        if provided_warmup:
            match, local = self._get_local_warmup_name(
                runcard_obj.warmup_filename(), provided_warmup)
            warmupFiles = [match]
        elif header.provided_warmup_dir:
            match, local = self._get_local_warmup_name(
                runcard_obj.warmup_filename(), header.provided_warmup_dir)
            warmupFiles = [match]
        else:
            logger.info("Retrieving warmup file from grid")
            warmupFiles = self._bring_warmup_files(i, rname, shell=True,
                                                   multichannel=multichannel)
        self.tarw.tarFiles(files + [i] + warmupFiles, tarfile)
        # Replace any stale tarball of the same name on the grid.
        if self.gridw.checkForThis(tarfile, header.grid_input_dir):
            logger.info("Removing old version of {0} from Grid Storage".format(tarfile))
            self.gridw.delete(tarfile, header.grid_input_dir)
        logger.info("Sending {0} to GFAL {1}/".format(tarfile, header.grid_input_dir))
        self.gridw.send(tarfile, header.grid_input_dir, shell=True)
        # Locally-sourced warmup files are kept; grid-fetched ones are
        # temporary copies and removed with the runcard and tarball.
        if local:
            util.spCall(["rm", i, tarfile])
        else:
            util.spCall(["rm", i, tarfile] + warmupFiles)
    os.remove(executable_exe)
    os.chdir(origdir)
def _get_data_production(self, db_id):
    """Given a database entry, retrieve its data from the output folder
    to the folder defined in said db entry.

    Interactively confirms the seed range, downloads the produced
    tarballs in parallel and extracts them into log/ and dat/
    subfolders, finally moving everything to the configured path.
    """
    logger.info("You are going to download all folders corresponding to this runcard from grid output")
    logger.info("Make sure all runs are finished using the -s or -S options!")
    fields = ["runfolder", "runcard", "jobid", "pathfolder", "iseed"]
    data = self.dbase.list_data(self.table, fields, db_id)[0]
    self.rcard = data["runcard"]
    self.rfolder = data["runfolder"]
    pathfolderTp = data["pathfolder"]
    initial_seed = data["iseed"]
    pathfolder = util.sanitiseGeneratedPath(pathfolderTp, self.rfolder)
    jobids = data["jobid"].split(" ")
    # BUG FIX: the "None" check must run before the int() conversion;
    # previously finalSeed = int(initial_seed) + len(jobids) raised
    # ValueError for entries whose iseed was stored as "None".
    if initial_seed == "None":
        initial_seed = self.bSeed
    else:
        initial_seed = int(initial_seed)
    finalSeed = initial_seed + len(jobids)
    while True:
        firstName = self.output_name(self.rcard, self.rfolder, initial_seed)
        finalName = self.output_name(self.rcard, self.rfolder, finalSeed)
        logger.info("The starting filename is {}".format(firstName))
        logger.info("The final filename is {}".format(finalName))
        yn = self._press_yes_to_continue("If you are ok with this, press y",
                                         fallback=-1)
        if yn == 0:
            break
        initial_seed = int(input("Please, introduce the starting seed (ex: 400): "))
        finalSeed = int(input("Please, introduce the final seed (ex: 460): "))
    try:
        os.makedirs(self.rfolder)
    except OSError as err:
        if err.errno == 17:  # EEXIST
            # BUG FIX: message previously used a %s placeholder with
            # str.format, printing a literal "%s".
            logger.info("Tried to create folder {0} in this directory".format(self.rfolder))
            logger.info("to no avail. We are going to assume the directory was already there")
            self._press_yes_to_continue("", "Folder {} already exists".format(self.rfolder))
        else:
            raise
    os.chdir(self.rfolder)
    try:
        os.makedirs("log")
        os.makedirs("dat")
    except OSError:
        # mkdir -p behaviour: the folders may already exist
        # (narrowed from a bare except).
        pass
    seeds = range(initial_seed, finalSeed)
    # If we only act on a subrange of jobids (ie, the ones which are
    # done) choose only those seeds
    if self.act_only_on_done:
        old_status = self._get_old_status(db_id)
        if old_status:
            seeds = [seed for seed, stat in zip(seeds, old_status)
                     if stat == self.cDONE]
    from pyHepGrid.src.header import finalise_no_cores as n_threads
    # Check which of the seeds actually produced some data
    all_remote = self.output_name_array(self.rcard, self.rfolder, seeds)
    all_output = self.gridw.get_dir_contents(header.grid_output_dir).split()
    remote_tarfiles = list(set(all_remote) & set(all_output))
    logger.info("Found data for {0} of the {1} seeds.".format(
        len(remote_tarfiles), len(seeds)))
    # Download said data
    tarfiles = self._multirun(self._do_get_data, remote_tarfiles,
                              n_threads, use_counter=True)
    tarfiles = list(filter(None, tarfiles))
    # BUG FIX: removed logger.info("Downloaded 0 files", end='\r') —
    # logging calls do not accept print()'s ``end`` keyword and would
    # raise TypeError.
    logger.info("Downloaded {0} files, extracting...".format(len(tarfiles)))
    # Extract some information from the first tarfile
    for tarfile in tarfiles:
        if self._extract_output_warmup_data(tarfile):
            break
    # Extract all
    dummy = self._multirun(self._do_extract_outputData, tarfiles, n_threads)
    os.chdir("..")
    logger.info("Everything saved at {0}".format(pathfolder))
    util.spCall(["mv", self.rfolder, pathfolder])
def renew_proxy(self, jobids):
    """Renew the proxy for each of the given jobs."""
    for jobid in jobids:
        util.spCall([self.cmd_renew, jobid.strip()])
def cat_job(self, jobids, jobinfo, print_stderr=None):
    """Print the tail of the most recent job's output via cmd_print.
    print_stderr is accepted for interface parity but unused here."""
    print("Printing the last 20 lines of the last job")
    last_jobid = jobids[-1]
    util.spCall([self.cmd_print, last_jobid.strip()])
def do_status_job(self, jobid):
    """Multiproc wrapper for status_job: query one job id, suppressing
    errors, and return 0."""
    util.spCall([self.cmd_stat, jobid], suppress_errors=True)
    return 0
def _bring_warmup_files(self, runcard, rname, shell=False, check_only=False,
                        multichannel=False):
    """ Download the warmup file for a run to local directory
    extracts Vegas grid and log file and returns a list with their names

    check_only flag doesn't error out if the warmup doesn't exist, instead
    just returns and empty list for later use [intended for checkwarmup
    mode so multiple warmups can be checked consecutively.
    """
    from pyHepGrid.src.header import grid_warmup_dir, logger
    gridFiles = []
    # In check-only mode a missing warmup is expected, so don't spam
    # errors from the copy tool.
    suppress_errors = bool(check_only)
    # First bring the warmup .tar.gz
    outnm = self.warmup_name(runcard, rname)
    logger.debug("Warmup GFAL name: {0}".format(outnm))
    tmpnm = "tmp.tar.gz"
    logger.debug("local tmp tar name: {0}".format(tmpnm))
    success = self.gridw.bring(outnm, grid_warmup_dir, tmpnm, shell=shell,
                               suppress_errors=suppress_errors)
    # BUG FIX: this was ``success == self.__check_pulled_warmup(...)``,
    # a no-op comparison; the validated result must be assigned back.
    success = self.__check_pulled_warmup(success, tmpnm, warmup_extensions)
    if not success and not check_only:
        if self._press_yes_to_continue(
                "Grid files failed to copy. Try backups from individual sockets?") == 0:
            backup_dir = os.path.join(grid_warmup_dir,
                                      outnm.replace(".tar.gz", ""))
            backups = self.gridw.get_dir_contents(backup_dir)
            if len(backups) == 0:
                logger.critical("No backups found. Did the warmup complete successfully?")
            else:
                backup_files = backups.split()
                for idx, backup in enumerate(backup_files):
                    logger.info("Attempting backup {1} [{0}]".format(
                        idx + 1, backup))
                    success = self.gridw.bring(backup, backup_dir, tmpnm,
                                               shell=shell, force=True)
                    # BUG FIX: same ``==`` vs ``=`` typo as above.
                    success = self.__check_pulled_warmup(
                        success, tmpnm, warmup_extensions)
                    if success:
                        break
    if not success:
        logger.critical("Grid files failed to copy. Did the warmup complete successfully?")
    else:
        logger.info("Grid files copied ok.")
    # Now extract only the Vegas grid files and log file
    # BUG FIX: ``gridp = warmup_extensions; gridp += [...]`` appended to
    # the shared module-level list in place, growing it on every call.
    # Build a fresh list instead.
    gridp = warmup_extensions + [i + "_channel" for i in warmup_extensions]
    extractFiles = self.tarw.extract_extensions(
        tmpnm, gridp + [".log", ".txt", "channels"])
    try:
        gridFiles = [i for i in extractFiles if ".log" not in i]
        logfile = [i for i in extractFiles if ".log" in i][0]
    except IndexError:
        if not check_only:
            # NOTE(review): execution would continue with ``logfile``
            # unbound here — presumably logger.critical aborts; confirm.
            logger.critical("Logfile not found. Did the warmup complete successfully?")
        else:
            return []
    if multichannel and len([i for i in gridFiles if "channels" in i]) == 0:
        logger.critical("No multichannel warmup found, but multichannel is set in the runcard.")
    elif multichannel:
        logger.info("Multichannel warmup files found.")
    if gridFiles == [] and not check_only:  # No grid files found
        logger.critical("Grid files not found in warmup tarfile. Did the warmup complete successfully?")
    elif gridFiles == []:
        return []
    # Tag log file as -warmup
    newlog = logfile + "-warmup"
    os.rename(logfile, newlog)
    # Remove temporary tar files
    os.remove(tmpnm)
    gridFiles.append(newlog)
    # Make sure access to the file is correct!
    for i in gridFiles:
        util.spCall(["chmod", "a+wrx", i])
    return gridFiles