def __get_job_options(self, runNumbers):
    if not self.engine().submit_hook():
        logging.warning("A job with the name {j} has already been submitted.".format(j=self.engine().job_name()))
        return
    CreateDirectory(self.engine().config_dir(), True)
    for r in runNumbers:
        jobFolder = os.path.join(self.__joboptions_dir, "{ddd}xxx".format(ddd=str(r)[:3]))
        if not os.path.isdir(jobFolder):
            logging.warning("Job option folder {f} for DSID {r} does not exist. Skipping {r}...".format(f=jobFolder, r=r))
            continue
        dir_to_copy = os.path.join(jobFolder, str(r))
        if not os.path.isdir(dir_to_copy):
            continue
        shutil.copytree(dir_to_copy, os.path.join(self.engine().config_dir(), str(r)))

        # assemble the config files for the job option
        seeds = []
        while len(seeds) < self.__nJobs:
            s = random.randint(100000, 500000)
            if s not in seeds:
                seeds += [s]
        jo = os.path.join(self.engine().config_dir(), str(r))
        out_dir = os.path.join(self.evgen_dir(), str(r))
        WriteList(
            (ReadListFromFile(self.seed_file()) if os.path.exists(self.seed_file()) else []) + ["%d" % (i) for i in seeds],
            self.seed_file(),
        )
        WriteList(
            (ReadListFromFile(self.run_file()) if os.path.exists(self.run_file()) else []) + ["%d" % (r) for i in range(self.__nJobs)],
            self.run_file(),
        )
        WriteList(
            (ReadListFromFile(self.job_file()) if os.path.exists(self.job_file()) else []) + [jo for i in range(self.__nJobs)],
            self.job_file(),
        )
        WriteList(
            (ReadListFromFile(self.out_file()) if os.path.exists(self.out_file()) else []) + [out_dir for i in range(self.__nJobs)],
            self.out_file(),
        )
        # book the jobs of this run for the array submission
        self.__n_scheduled += self.__nJobs
        logging.info("<__get_job_options> Found %s" % (jo))
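### The while loop in __get_job_options draws random numbers until self.__nJobs unique
### seeds are collected. A minimal standalone sketch of the same idea using random.sample,
### which guarantees uniqueness in a single call; n_jobs is a hypothetical stand-in for
### self.__nJobs and serves only as an illustration.
import random

n_jobs = 50
seeds = random.sample(range(100000, 500000), n_jobs)
print "Generated %d unique seeds" % (len(set(seeds)))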
def __extract_seeds(self, run):
    try:
        EVNT_DIR = [
            os.path.join(self.evgen_dir(), R) for R in os.listdir(self.evgen_dir()) if R.startswith(str(run))
        ][0]
    except IndexError:
        return
    logging.info(
        "<__extract_seeds> Searching {evntdir} for EVNT files not already processed in derivation format {d}.".format(
            evntdir=EVNT_DIR, d=self.__derivation))
    DERIVATION_DIR = os.path.join(self.aod_dir(), EVNT_DIR[EVNT_DIR.rfind("/") + 1:])
    CreateDirectory(DERIVATION_DIR, False)
    ### Seeds of the generated EVNT files
    Evnt_Seeds = [
        int(E[E.find("EVNT") + 5:E.find(".pool")]) for E in os.listdir(EVNT_DIR) if E.endswith(".root")
    ]
    ### Seeds which already went through the derivation step
    DAOD_Seeds = [
        int(A.split(".")[-2]) for A in os.listdir(DERIVATION_DIR)
        if A.find(self.__derivation) != -1 and A.endswith(".root")
    ]
    Non_ProcSeeds = [seed for seed in Evnt_Seeds if seed not in DAOD_Seeds]
    if len(Non_ProcSeeds) == 0:
        return
    logging.info("Extracted seeds for run {r}:".format(r=run))
    logging.info("   +-=- {s}".format(s=", ".join([str(seed) for seed in Non_ProcSeeds])))
    WriteList(
        (ReadListFromFile(self.seed_file()) if os.path.exists(self.seed_file()) else []) + [str(seed) for seed in Non_ProcSeeds],
        self.seed_file(),
    )
    WriteList(
        (ReadListFromFile(self.run_file()) if os.path.exists(self.run_file()) else []) + [str(run) for seed in Non_ProcSeeds],
        self.run_file(),
    )
    WriteList(
        (ReadListFromFile(self.in_file()) if os.path.exists(self.in_file()) else []) + [EVNT_DIR for seed in Non_ProcSeeds],
        self.in_file(),
    )
    self.__n_scheduled += len(Non_ProcSeeds)
def getPRWblackList():
    FileName = ResolvePath("XAMPPbase/BlackListedPRWdatasets.txt")
    if not FileName:
        print "ERROR: The file XAMPPbase/data/BlackListedPRWdatasets.txt could not be found in the repository"
        print "ERROR: Did you delete it by accident? Please check!!!!"
        sys.exit(1)
    return sorted(ReadListFromFile(FileName))
def getUsersSubmittedPRW():
    FileName = ResolvePath("XAMPPbase/UsersWhoSubmittedPRW.txt")
    if not FileName:
        print "ERROR: The file XAMPPbase/data/UsersWhoSubmittedPRW.txt could not be found in the repository"
        print "ERROR: Did you delete it by accident? Please check!!!!"
        sys.exit(1)
    return sorted(ReadListFromFile(FileName))
def __init__(self,
             campaign,
             stype,
             rtag,
             datasets,
             temp_dir,
             outdir,
             check_consistency=False,
             notDownloadAgain=True):
    threading.Thread.__init__(self)
    self.__campaign = campaign
    self.__stype = stype
    self.__rtag = rtag
    self.__datasets = datasets
    self.__dsids = ClearFromDuplicates([GetPRW_datasetID(ds) for ds in self.__datasets])
    self.__purged = []
    self.__tempdir = temp_dir
    self.__outdir = outdir
    self.__check_consistency = check_consistency
    self.__to_black_list = []
    self.__ds_to_submit = []
    self.__inconsistent_log = []
    ### Reload the list of already downloaded datasets unless a fresh download is requested
    self.__already_on_disk = [] if not notDownloadAgain or not os.path.exists(
        "%s/Finished.txt" % (self.download_dir())) else ReadListFromFile("%s/Finished.txt" % (self.download_dir()))
    if check_consistency:
        getAMIDataBase().getMCDataSets(channels=self.dsids(),
                                       campaign="%s" % (self.campaign()),
                                       derivations=[])
def OpenFiles(MyList):
    ROOTFiles = []
    for Entry in MyList:
        if IsROOTFile(Entry):
            ROOTFiles.append(ROOT.TFile.Open(Entry))
        elif IsTextFile(Entry):
            #### Adapt for the possibility that someone passes a XAMPPplotting config
            if Entry.endswith(".conf"):
                ROOTFiles += [ROOT.TFile.Open(File) for File in readXAMPPplottingInputConfig(Entry)]
            else:
                ROOTFiles += [ROOT.TFile.Open(Line) for Line in ReadListFromFile(Entry)]
    return ROOTFiles
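### A minimal usage sketch for OpenFiles. The inputs histos.root and filelist.txt are
### hypothetical; the text file is expected to contain one ROOT file path per line.
opened = OpenFiles(["histos.root", "filelist.txt"])
for tfile in opened:
    if tfile and not tfile.IsZombie():
        print "INFO: Opened %s" % (tfile.GetName())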
def print_log_file(self, last_lines=10):
    if not os.path.exists(self.log_file()):
        return
    log_content = ReadListFromFile(self.log_file())
    n_lines = len(log_content)
    for i in range(max(0, n_lines - last_lines), n_lines):
        if self.thread_number() == -1:
            logging.info("<%s> %s" % (self.name(), log_content[i]))
        else:
            logging.info("<%s - %d/%d> %s" % (self.name(), self.thread_number(),
                                              self.thread_engine().get_array_size(task_name=self.name()),
                                              log_content[i]))
def download_ci_files(options):
    ### Retrieve the EOS token first
    getEOS_token(options)
    ### Check whether the CI directory actually exists
    smp_dir = "%s/datasamples/" % (options.ciDir)
    if not os.path.isdir(smp_dir):
        print "ERROR: The path to look up for the data samples %s does not exist. Where is my data?" % (smp_dir)
        exit(1)
    ### Create the directory to store the temporary files in
    ### and clean up the old remnants
    CreateDirectory(options.TEMPdir, True)
    downloaded_smp = []
    for smp in os.listdir(smp_dir):
        smp_name = smp[:smp.rfind(".")]
        print "INFO: Download the files from sample %s" % (smp_name)
        download_to = "%s/%s" % (options.TEMPdir, smp_name)
        CreateDirectory(download_to, False)
        ### Download the files first
        for file_to_load in ReadListFromFile("%s/%s" % (smp_dir, smp)):
            destination_file = "%s/%s" % (download_to, file_to_load[file_to_load.rfind("/") + 1:])
            CopyCmd = "xrdcp %s/%s %s" % (options.EOSpath, file_to_load, destination_file)
            if os.path.exists(destination_file):
                print "INFO: Skip download of %s as it already exists" % (file_to_load)
            elif os.system(CopyCmd) != 0:
                print "ERROR: Failed to download %s" % (file_to_load)
                exit(1)
        ### Write the file list for the analysis
        file_list = "%s/FileList_%s.txt" % (options.TEMPdir, smp_name)
        WriteList([
            "%s/%s" % (download_to, f[f.rfind("/") + 1:]) for f in ReadListFromFile("%s/%s" % (smp_dir, smp))
        ], file_list)
        downloaded_smp += [smp_name]
    return downloaded_smp
def main():
    """Request datasets to RSE location."""
    CheckRucioSetup()
    CheckRemainingProxyTime()

    RunOptions = getArgumentParser().parse_args()
    List = ClearFromDuplicates(ReadListFromFile(RunOptions.list))

    ### Start replication of the datasets
    initiateReplication(ListOfDataSets=List,
                        Rucio=RunOptions.rucio,
                        RSE=RunOptions.RSE,
                        lifeTime=RunOptions.lifetime,
                        approve=RunOptions.askapproval,
                        comment=RunOptions.comment)
def __extract_root_files(self, file_list=""): content = ReadListFromFile(file_list) if len(content) == 0: print "ERROR: The file %s is empty" % (in_ds) return [] n_files_in_cont = len(content) - len( [c for c in content if IsROOTFile(c)]) ### The list contains a list of root_files if n_files_in_cont == 0: return content ### It's a mixture elif n_files_in_cont != len(content): print "ERROR: You've a mixture of ROOT files and other stuff in %s" % ( file_list) return [] root_files = [] for ds in content: root_files += self.__find_on_dcache(ds) return root_files
def submit_job(self):
    ### Derive the output locations from the input EVNT directories
    WriteList(
        [D.replace(self.evgen_dir(), self.aod_dir()) for D in ReadListFromFile(self.in_file())],
        self.out_file(),
    )
    extra_args = ""
    if len(self.__preExec) > 0:
        extra_args += ' --preExec "%s" ' % (self.__preExec)
    if len(self.__preInclude) > 0:
        extra_args += ' --preInclude "%s" ' % (self.__preInclude)
    if len(self.__postExec) > 0:
        extra_args += ' --postExec "%s" ' % (self.__postExec)
    if len(self.__postInclude) > 0:
        extra_args += ' --postInclude "%s" ' % (self.__postInclude)
    if not self.engine().submit_array(
            sub_job=self.__derivation,
            script="SubmitMC/batch_derivation.sh",
            mem=self.__mem,
            env_vars=[
                ("SeedFile", self.seed_file()),
                ("RunFile", self.run_file()),
                ("InFile", self.in_file()),
                ("OutFile", self.out_file()),
                ("DERIVATION_DIR", self.aod_dir()),
                ("DerivationRelease", self.__derivRelease),
                ("DerivationCache", self.__derivCache),
                ("ReductionConf", self.__derivation),
                ("ExtraArgs", extra_args),
            ],
            hold_jobs=self.hold_jobs(),
            run_time=self.__run_time,
            array_size=self.n_scheduled(),
    ):
        return False
    return True
def main():
    Options = setupScriptSubmitParser().parse_args()
    submit_engine = setup_engine(Options)
    list_of_cmds = submit_engine.link_to_copy_area(Options.ListOfCmds)
    if not list_of_cmds:
        logging.error("Please give a valid file with list of commands to execute")
        exit(1)
    if not submit_engine.submit_build_job():
        logging.error("Submission failed")
        exit(1)
    submit_engine.submit_array(script="ClusterSubmission/Run.sh",
                               mem=Options.vmem,
                               env_vars=[("ListOfCmds", list_of_cmds)],
                               hold_jobs=Options.HoldJob,
                               run_time=Options.RunTime,
                               array_size=len(ReadListFromFile(list_of_cmds)))
    submit_engine.submit_clean_all(hold_jobs=[submit_engine.job_name()])
    submit_engine.finish()
Sample_Dir = ResolvePath(RunOptions.ListDir)
No_AOD = []
TO_REQUEST = []
if not Sample_Dir:
    logging.error("ERROR: Please give a valid directory")
    exit(1)

for File in os.listdir(Sample_Dir):
    if os.path.isdir("%s/%s" % (Sample_Dir, File)):
        continue
    logging.info("Update file list %s" % (File))
    DataSets = sorted(
        ClearFromDuplicates([
            GetPRW_datasetID(DS) for DS in ReadListFromFile("%s/%s" % (Sample_Dir, File)) if DS.find("data") == -1
        ]))
    if len(DataSets) == 0:
        continue
    logging.info("Call the AMI database")
    DERIVATIONS = []
    NO_DERIVARTION = []
    AODs = []
    getAMIDataBase().getMCDataSets(channels=DataSets,
                                   derivations=["DAOD_%s" % (RunOptions.derivation)])
    #### Find the AODs for each DSID first
    for DSID in DataSets:
        Found_MC16a = False
        Found_MC16d = False
parser.add_argument("--log_file",
                    help="Define the location of the log-file from the consistency check",
                    default="%s/Merged_NTUP.log" % (os.getcwd()))
parser.add_argument("--mergeAllTags",
                    help="Merge everything which is available",
                    default=False,
                    action='store_true')
RunOptions = parser.parse_args()

Required_DS = []
if len(RunOptions.requestedDataSets) > 0:
    for requestedDS in RunOptions.requestedDataSets:
        Required_DS.extend([convertToAOD(DS) for DS in ReadListFromFile(requestedDS)])

Datasets = []
if RunOptions.readFromList:
    if len(RunOptions.inputFile) == 0:
        print 'ERROR: Please give a file containing a PRW file list when using the --readFromList option!'
        sys.exit(1)
    datasetsInList = []
    for inputFile in RunOptions.inputFile:
        datasetsInList.extend(ReadListFromFile(inputFile))
    Datasets.extend(datasetsInList)
else:
    print 'INFO: Looking for NTUP_PILEUP datasets in rucio...'
    for c in RunOptions.campaign:
        Datasets += GetDatasets(campaign=c,
                    default=[],
                    nargs="+")
parser.add_argument("--remainingSplit",
                    help="Specify a remaining split of the files",
                    type=int,
                    default=1)
parser.add_argument("--nFilesPerJob",
                    help="Specify the number of files per merge job",
                    type=int,
                    default=10)
parser.add_argument("--HoldJob",
                    help="Specify a list of jobs to hold on",
                    default=[])
RunOptions = parser.parse_args()
submit_engine = setup_engine(RunOptions)

merging = [
    submit_engine.create_merge_interface(out_name=L[L.rfind("/") + 1:L.rfind(".")],
                                         files_to_merge=ReadListFromFile(L),
                                         files_per_job=RunOptions.nFilesPerJob,
                                         hold_jobs=RunOptions.HoldJob,
                                         final_split=RunOptions.remainingSplit) for L in RunOptions.fileLists
]
for merge in merging:
    merge.submit_job()

clean_hold = [
    submit_engine.subjob_name("merge-%s" % (merge.outFileName())) for merge in merging
]
submit_engine.submit_clean_all(clean_hold)
def AssembleAthenaOptions(RunOptions, Parser=None, IsRemote=False):
    """
    @brief Assemble athena options from run options and argument parser.

           The athena arguments work like this (as documented here:
           https://gitlab.cern.ch/atlas/athena/blob/21.2/Control/AthenaCommon/python/AthArgumentParser.py#L2).
           The command line arguments in the athena call are first passed to athena.
           Every argument that should be passed to the user code needs to be preceded
           by a single additional `-`. Example:

              athena.py XAMPPbase/runXAMPPbase.py --evtMax 100 - --noSys
              ----------------------------------------------------------
              | job option                       | athena arg | user arg

    @param RunOptions  The run options
    @param Parser      The parser
    @param IsRemote    Flag to toggle option parsing for pathena instead of athena
                       for running on the grid
    @return            List with athena command line options
    """
    Options = []
    if not IsRemote and RunOptions.testJob:
        RunOptions.noSyst = True
        RunOptions.parseFilesForPRW = True
    athena_args = ["skipEvents", "evtMax", "filesInput"]
    local_only = ["outFile", "parseFilesForPRW"] + athena_args
    from XAMPPbase.SubmitToBatch import exclusiveBatchOpt
    from XAMPPbase.SubmitToGrid import exclusiveGridOpts
    black_listed = ["jobOptions", "valgrind"] + [x.dest for x in exclusiveBatchOpt()._actions
                                                 ] + [x.dest for x in exclusiveGridOpts()._actions]
    attributes = [att for att in dir(RunOptions) if not att.startswith("_") and att not in black_listed]
    ### Sort the attributes such that the athena arguments come first
    attributes.sort(key=lambda x: x not in athena_args)
    ath_delimiter = False
    l_delim = -1
    for att in attributes:
        if ath_delimiter and att in athena_args:
            ath_delimiter = False
        if not ath_delimiter and att not in athena_args:
            ath_delimiter = True
            Options += ["-"]
            l_delim = len(Options)
        ### Skip all arguments which are left at their parser defaults
        if IsArgumentDefault(getattr(RunOptions, att), att, Parser):
            continue
        if IsRemote and att in local_only:
            continue
        ### File lists and input directories are expanded into a comma-separated list of ROOT files
        if att == "filesInput" and (os.path.isfile(RunOptions.filesInput) and not IsROOTFile(RunOptions.filesInput)
                                    or os.path.isdir(RunOptions.filesInput)):
            Options += [
                "--%s '%s'" % (att, ",".join(
                    ReadListFromFile(RunOptions.filesInput) if not os.path.isdir(RunOptions.filesInput) else [
                        "%s/%s" % (RunOptions.filesInput, item)
                        for item in os.listdir(RunOptions.filesInput) if IsROOTFile(item)
                    ]))
            ]
        elif isinstance(getattr(RunOptions, att), bool):
            Options += ["--%s" % (att)]
        elif isinstance(getattr(RunOptions, att), list):
            Options += ["--%s %s" % (att, " ".join(getattr(RunOptions, att)))]
        else:
            Options += ["--%s %s" % (att, getattr(RunOptions, att))]
    ### No extra user options were parsed. Get rid of the trailing -
    if len(Options) == l_delim:
        Options.pop()
    return Options
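### For illustration, a hedged sketch of how the returned list maps onto an athena call.
### The concrete options (--evtMax, --noSys) and the job option path are assumptions for
### the example, not guaranteed attributes of the run options.
opts = ["--evtMax 100", "-", "--noSys"]
cmd = "athena.py XAMPPbase/runXAMPPbase.py %s" % (" ".join(opts))
print cmd  # athena.py XAMPPbase/runXAMPPbase.py --evtMax 100 - --noSys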
def __prepare_input(self, in_ds=""): print "INFO <_prepare_input>: Assemble configuration for %s" % (in_ds) ### Name to be piped to the job out_name = in_ds[in_ds.rfind("/") + 1:in_ds.rfind(".")] if IsTextFile( in_ds) or IsROOTFile(in_ds) else in_ds split_dir = "%s/Datasets/%s" % (self.split_cfg_dir(), out_name) root_files = [] ### Now we need to find the corresponding ROOT files ### 1) The dataset is a root file itself if IsROOTFile(in_ds): root_files += [in_ds] ### 2) The given dataset is a .txt file elif IsTextFile(in_ds): ### Find the root files from there root_files = self.__extract_root_files(in_ds) if len(root_files) == 0: return False ### 3) The given dataset is a directory elif os.path.isdir(in_ds): if in_ds.endswith("/"): in_ds = in_ds[:in_ds.rfind("/")] out_name = in_ds[in_ds.rfind("/") + 1:] split_dir = "%s/Directory/%s" % (self.split_cfg_dir(), out_name) root_files = [ "%s/%s" % (in_ds, F) for F in os.listdir(in_ds) if IsROOTFile(F) ] ### 4) It's a logical dataset stored on d-cache else: root_files = self.__find_on_dcache(in_ds) if len(root_files) == 0: print "ERROR: Could not associate anything to %s" % (in_ds) return False if len(out_name) == 0: print "ERROR: How should the output be called %s" % (in_ds) return False ### Assemble the splitting of the jobs main_list = "%s/AllROOTFiles.main" % (split_dir) files_in_main = ReadListFromFile(main_list) if os.path.exists( main_list) else [] ### The list is unkown or the content of ROOT files has changed ### Redo the splitting again ;-) if len(files_in_main) != len(root_files) or not IsListIn( files_in_main, root_files): print "INFO: Assemble new split for %s" % (in_ds) CreateDirectory(split_dir, True) WriteList(root_files, main_list) os.system("CreateBatchJobSplit -I %s -O %s -EpJ %i" % (main_list, split_dir, self.__events_per_job)) ### Each of the lists contains the ROOT files to process per each sub job split_lists = [ "%s/%s" % (split_dir, F) for F in os.listdir(split_dir) if IsTextFile(F) ] n_jobs = len(split_lists) subjob_outs = [ "%s/%s_%d.root" % (self.engine().tmp_dir(), out_name, d) for d in range(n_jobs) ] assembled_in = [] if not os.path.exists( self.job_input()) else ReadListFromFile(self.job_input()) assembled_out = [] if not os.path.exists( self.job_out_names()) else ReadListFromFile(self.job_out_names()) start_reg = len(assembled_in) ### Write what we've WriteList(assembled_in + split_lists, self.job_input()) WriteList(assembled_out + subjob_outs, self.job_out_names()) #### Submit the merge jobs self.__merge_interfaces += [ self.engine().create_merge_interface( out_name=out_name, files_to_merge=subjob_outs, hold_jobs=[(self.engine().job_name(), [start_reg + i + 1 for i in range(n_jobs)])], files_per_job=self.__files_per_merge_itr, final_split=self.__final_split) ] self.__nsheduled += n_jobs return True
def AssembleIO():
    #--------------------------------------------------------------
    # Reduce the event loop spam a bit
    #--------------------------------------------------------------
    from AthenaCommon.Logging import logging
    recoLog = logging.getLogger('MuonAnalysis I/O')
    recoLog.info('****************** STARTING the job *****************')
    if os.path.exists("%s/athfile-cache.ascii.gz" % (os.getcwd())):
        recoLog.info("Old athfile-cache found. Will delete it otherwise athena just freaks out. This little boy.")
        os.system("rm %s/athfile-cache.ascii.gz" % (os.getcwd()))
    from GaudiSvc.GaudiSvcConf import THistSvc
    from AthenaCommon.JobProperties import jobproperties
    import AthenaPoolCnvSvc.ReadAthenaPool
    from AthenaCommon.AthenaCommonFlags import athenaCommonFlags as acf
    from AthenaServices.AthenaServicesConf import AthenaEventLoopMgr
    from AthenaCommon.AppMgr import ServiceMgr
    from ClusterSubmission.Utils import ReadListFromFile, ResolvePath, IsROOTFile
    from MuonAnalysis.Utils import IsTextFile
    ServiceMgr += AthenaEventLoopMgr(EventPrintoutInterval=1000000)
    ServiceMgr += THistSvc()
    OutFileName = "AnalysisOutput.root" if "outFile" not in globals() else outFile
    ServiceMgr.THistSvc.Output += ["MuonAnalysis DATAFILE='{}' OPT='RECREATE'".format(OutFileName)]
    recoLog.info("Will save the job's output to " + OutFileName)
    ROOTFiles = []

    if "inputFile" in globals():
        recoLog.info("Use the following %s as input" % (inputFile))
        ROOTFiles = []
        ResolvedInFile = ResolvePath(inputFile)

        if inputFile.startswith('root://'):
            ROOTFiles.append(inputFile)
        elif ResolvedInFile and os.path.isfile(ResolvedInFile):
            if IsTextFile(ResolvedInFile):
                ROOTFiles = ReadListFromFile(ResolvedInFile)
            else:
                ROOTFiles.append(ResolvedInFile)
        elif ResolvedInFile and os.path.isdir(ResolvedInFile):
            for DirEnt in os.listdir(ResolvedInFile):
                if IsROOTFile(DirEnt):
                    if DirEnt.find(ResolvedInFile) != -1:
                        ROOTFiles.append(DirEnt)
                    else:
                        ROOTFiles.append("%s/%s" % (ResolvedInFile, DirEnt))
        else:
            raise RuntimeError("Invalid input " + inputFile)

        if len(ROOTFiles) == 0:
            raise RuntimeError("No ROOT files could be loaded as input")
        ServiceMgr.EventSelector.InputCollections = ROOTFiles
        acf.FilesInput = ROOTFiles

    if "nevents" in globals():
        recoLog.info("Only run on %i events" % (int(nevents)))
        theApp.EvtMax = int(nevents)
    if "nskip" in globals():
        recoLog.info("Skip the first %i events" % (int(nskip)))
        ServiceMgr.EventSelector.SkipEvents = int(nskip)
    """if isData(): recoLog.info("We're running over data today")
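### The outFile, inputFile, nevents and nskip globals checked above are usually injected
### on the athena command line via -c. A hedged example of such a call; the job option
### path is a hypothetical placeholder:
###   athena.py -c "inputFile='filelist.txt';nevents=1000;outFile='out.root'" MuonAnalysis/runMuonAnalysis.py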
parser.add_argument('-l',
                    '--list',
                    help='specify a list containing the datasets to be requested',
                    required=True)
parser.add_argument("--rucio",
                    help="With this option you can set the rucio_account",
                    default=RUCIO_ACCOUNT)
parser.add_argument("--lifetime",
                    help="Defines a lifetime after which the rules are automatically deleted",
                    type=int,
                    default=-1)
parser.add_argument("--askapproval",
                    help="Asks for approval of the request",
                    default=False,
                    action="store_true")
parser.add_argument("--comment", help="Comment", default="")
RunOptions = parser.parse_args()
List = ReadListFromFile(RunOptions.list)

### Start replication of the datasets
initiateReplication(ListOfDataSets=List,
                    Rucio=RunOptions.rucio,
                    RSE=RunOptions.RSE,
                    lifeTime=RunOptions.lifetime,
                    approve=RunOptions.askapproval,
                    comment=RunOptions.comment)
                        help='Changes the RunTime of the analysis Jobs',
                        default='07:59:59')
    parser.add_argument('--vmem',
                        help='Changes the virtual memory needed by each job',
                        type=int,
                        default=2000)
    return parser


if __name__ == '__main__':
    Options = setupScriptSubmitParser().parse_args()
    submit_engine = setup_engine(Options)
    list_of_cmds = submit_engine.link_to_copy_area(Options.ListOfCmds)
    if not list_of_cmds:
        print "ERROR: Please give a valid file with list of commands to execute"
        exit(1)
    if not submit_engine.submit_build_job():
        print "ERROR: Submission failed"
        exit(1)
    submit_engine.submit_array(script="ClusterSubmission/Run.sh",
                               mem=Options.vmem,
                               env_vars=[("ListOfCmds", list_of_cmds)],
                               hold_jobs=Options.HoldJob,
                               run_time=Options.RunTime,
                               array_size=len(ReadListFromFile(list_of_cmds)))
    submit_engine.submit_clean_all(hold_jobs=[submit_engine.job_name()])
    submit_engine.finish()
OutDir = os.getcwd()
parser = argparse.ArgumentParser(
    description='This script converts DAOD filelists to AOD filelists which then can be used for creating pileup reweighting files.',
    prog='CreateAODFromDAODList',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--datasets',
                    '-d',
                    '-D',
                    help='DAOD filelist to be converted into AOD',
                    required=True)
parser.add_argument('--outFile',
                    help="pipe the output of the script into a file",
                    default='')
RunOptions = parser.parse_args()

print 'The following DAODs are converted into AODs:\n'
DAODsToConvert = [convertToAOD(daod) for daod in ReadListFromFile(RunOptions.datasets)]
print '\nThe AODs are:\n'
for daod in DAODsToConvert:
    print " --- %s" % (daod)
if len(RunOptions.outFile) > 0:
    WriteList(DAODsToConvert, RunOptions.outFile)
def main():
    """Merge files from a list using the MergeClass in ClusterEngine."""
    RunOptions = getArgumentParser().parse_args()
    if RunOptions.fileListsFolder != "":
        if len(RunOptions.fileLists) > 0:
            logging.warning('You gave both a folder containing filelists and separate filelists, will merge both!')
        if not os.path.isdir(RunOptions.fileListsFolder):
            logging.error(' %s is not a directory, exiting...' % RunOptions.fileListsFolder)
            sys.exit(1)
        for l in os.listdir(RunOptions.fileListsFolder):
            if not os.path.isdir('%s/%s' % (RunOptions.fileListsFolder, l)):
                RunOptions.fileLists.append('%s/%s' % (RunOptions.fileListsFolder, l))

    submit_engine = setup_engine(RunOptions)
    merging = [
        submit_engine.create_merge_interface(out_name=L[L.rfind("/") + 1:L.rfind(".")],
                                             files_to_merge=ReadListFromFile(L),
                                             files_per_job=RunOptions.nFilesPerJob,
                                             hold_jobs=RunOptions.HoldJob,
                                             final_split=RunOptions.remainingSplit) for L in RunOptions.fileLists
    ]

    ### Rucio lists
    if len(RunOptions.RucioDSList) > 0:
        CheckRucioSetup()
        CheckRemainingProxyTime()
        #### Check that we can actually obtain the datasets
        if len(RunOptions.RucioRSE) == 0 and not RunOptions.download:
            logging.error(
                "Please specify either the RSE on which the datasets are stored via --RucioRSE or activate the download option"
            )
            exit(1)
        ds_to_merge = ReadListFromFile(RunOptions.RucioDSList)
        download_dir = submit_engine.tmp_dir() + "TMP_DOWNLOAD/"
        if RunOptions.download:
            downloadDataSets(InputDatasets=ds_to_merge,
                             Destination=download_dir,
                             RSE=RunOptions.RucioRSE,
                             use_singularity=False)
        to_wait = []
        hold_jobs = []
        for ds in ds_to_merge:
            ds_name = ds[ds.find(":") + 1:]
            if RunOptions.batch_size <= 0:
                merging += [
                    submit_engine.create_merge_interface(
                        out_name=ds_name,
                        files_to_merge=GetDataSetFiles(dsname=ds, RSE=RunOptions.RucioRSE, protocols="root")
                        if not RunOptions.download else
                        [download_dir + ds_name + "/" + x for x in os.listdir(download_dir + ds_name)],
                        files_per_job=RunOptions.nFilesPerJob,
                        hold_jobs=RunOptions.HoldJob + hold_jobs,
                        final_split=RunOptions.remainingSplit)
                ]
            else:
                merging += [
                    DataSetFileHandler(rucio_container=ds,
                                       dest_rse=RunOptions.RucioRSE,
                                       download=RunOptions.download,
                                       merge=True,
                                       download_dir=download_dir,
                                       destination_dir=submit_engine.out_dir(),
                                       cluster_engine=submit_engine,
                                       max_merged_size=RunOptions.batch_size * 1024 * 1024 * 1024,
                                       hold_jobs=RunOptions.HoldJob + hold_jobs,
                                       files_per_merge_job=2)
                ]
            to_wait += [submit_engine.subjob_name(merging[-1].job_name())]
            ### Throttle the submission: every five merge tasks become the hold list of the next batch
            if len(to_wait) % 5 == 0:
                hold_jobs = [w for w in to_wait]
                to_wait = []

    for merge in merging:
        merge.submit_job()
    clean_hold = [submit_engine.subjob_name(merge.job_name()) for merge in merging]
    submit_engine.submit_clean_all(clean_hold)
    submit_engine.finish()
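### The to_wait/hold_jobs bookkeeping in main() throttles the submission: every five merge
### tasks become the hold list of the following batch. A standalone sketch of that chunking
### pattern with hypothetical task names:
tasks = ["merge-%d" % (i) for i in range(12)]
to_wait, hold_jobs = [], []
for task in tasks:
    print "Submit %s holding on %s" % (task, hold_jobs)
    to_wait += [task]
    if len(to_wait) % 5 == 0:
        hold_jobs = [w for w in to_wait]
        to_wait = []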