def get_inglob(outdir):
    """Build a glob matching every root file sharing outdir's run number.

    The glob lives in outdir's parent directory and is sanity-checked with
    glob_and_check before being returned.
    """
    parent = path.dirname(outdir)
    run = get_run(outdir)
    pattern = path.join(parent, f"*{run}*.root")
    glob_and_check(pattern)
    return pattern
def check_telescope_files(rootdir=None, globstr1=None, globstr2=None,
                          replacer=("_M1_", "_M2_"), force=False):
    """Check that the files behind globstr1 and globstr2 match pairwise.

    Basenames are normalized by replacing replacer[0] with replacer[1] so the
    two telescopes' filenames become directly comparable (the replacement is a
    no-op on the side that already carries replacer[1]).

    Parameters
    ----------
    rootdir : str or None
        If given, globstr1/globstr2 are joined onto it first.
    globstr1, globstr2 : str or list
        Glob strings (resolved via glob_and_check) or ready-made file lists.
    replacer : tuple of (str, str)
        Substring pair used for basename normalization.
    force : bool
        False: abort (AssertionError) when the normalized sets differ.
        True: skip the check and return the union of all filenames.
        (Implements the behavior the original TODO comment described;
        the parameter was previously accepted but unused.)

    Returns
    -------
    list or None
        Sorted union of filenames when force is True, otherwise None.
    """
    if rootdir is not None:
        globstr1 = path.join(rootdir, globstr1)
        globstr2 = path.join(rootdir, globstr2)
    files1 = glob_and_check(globstr1) if isinstance(globstr1, str) else globstr1
    files2 = glob_and_check(globstr2) if isinstance(globstr2, str) else globstr2
    bnames1 = {path.basename(fname.replace(replacer[0], replacer[1]))
               for fname in files1}
    bnames2 = {path.basename(fname.replace(replacer[0], replacer[1]))
               for fname in files2}
    if force:
        # force=True: return the union of filenames instead of aborting.
        return sorted(set(files1) | set(files2))
    assert bnames1 == bnames2, \
        (f"Telescope files not equal, set difference: len files1 {len(files1)} len files2 {len(files2)}\n\n"
         f"dirs: {path.dirname(files1[0])}, {bnames1.symmetric_difference(bnames2)}")
def process_melibea(superstar, nnenergies, outdir, njobs, mode):
    """Run run_melibea over each (superstar, nnenergy) file pair in parallel.

    superstar : glob string for the superstar root files (must be .root).
    nnenergies : glob string or list of nn-energy files; only consumed when
        mode contains "nn" or "second" (must NOT be .root files), otherwise
        the superstar files themselves are paired with each other.
    outdir : output directory, prepared via setup_dirs.
    njobs : number of joblib workers.
    mode : processing-mode string, forwarded to run_melibea.
    """
    ssfiles = glob_and_check(superstar)
    if "nn" in mode or "second" in mode:
        if isinstance(nnenergies, str):
            nnfiles = glob_and_check(nnenergies)
        else:
            nnfiles = sorted(nnenergies)
        assert np.all(
            [".root" not in fname for fname in nnfiles]
        ), f"nnenergy files are not supposed to be rootfiles, aborting. nnglob: {nnenergies}"
        assert_filenames(ssfiles, nnfiles)
    else:
        nnfiles = ssfiles
    assert np.all(
        [".root" in fname for fname in ssfiles]
    ), f"superstar files are supposed to be rootfiles, aborting. superstarglob: {superstar}"
    # NOTE(review): the equalized lists below are only used in the length
    # assertion — the zip further down pairs the ORIGINAL ssfiles/nnfiles.
    # Looks like the zip was meant to use newssfiles/newnnfiles; confirm.
    newssfiles, newnnfiles = make_fnames_equal(ssfiles, nnfiles)
    assert len(newnnfiles) >= len(
        ssfiles), "Not all ssfiles get processed, something is wrong"
    setup_dirs(outdir)
    filezip = zip(ssfiles, nnfiles)
    Parallel(n_jobs=njobs)\
        (delayed(run_melibea)(ssfile, nnfile, outdir, mode)
         for ssfile, nnfile in filezip)
def check_globs(glob1, glob2):
    """Assert that two globs (or two file lists) cover the same set of runs.

    glob1, glob2 : str glob patterns (resolved via glob_and_check) or lists
        of filenames. Run numbers are extracted with get_run and compared
        order-insensitively.

    Raises
    ------
    TypeError
        If glob1 is neither a str nor a list (the original fell through to a
        NameError on undefined locals).
    AssertionError
        If the sorted run lists differ.
    """
    if isinstance(glob1, str):
        fnames1 = glob_and_check(glob1)
        fnames2 = glob_and_check(glob2)
    elif isinstance(glob1, list):
        fnames1 = glob1
        fnames2 = glob2
    else:
        raise TypeError(
            f"glob1 must be a str glob or a list of filenames, got {type(glob1).__name__}")
    runs1 = sorted(get_run(fname) for fname in fnames1)
    runs2 = sorted(get_run(fname) for fname in fnames2)
    assert np.array_equal(runs1, runs2), "runs are not equal, aborting."
def convert_multiple_dirs(rootdir, mergecolsonly, parallelprocessing):
    """Descends into multiple dirs and converts their root files to csv.
    Dirnames need to be formatted like: YYYY_MM_DD or YYYY_DD_MM"""
    # only works for data as of now, not for mc
    # BUG FIX: the old pattern used regex escapes (r"20\d\d_\d\d_\d\d/"),
    # which glob() does not interpret; use glob character ranges instead.
    subdirs = glob_and_check(
        path.join(rootdir, "20[0-9][0-9]_[0-9][0-9]_[0-9][0-9]/"))
    # NOTE(review): njobs is a free name here (module-level?) — confirm it is
    # defined; consider promoting it to a parameter.
    # NOTE(review): convert_to_csv also expects globstr1/globstr2/star — this
    # call looks incomplete (see the original TODO about None handling in
    # call_c_converter).
    Parallel(n_jobs=njobs)(delayed(convert_to_csv)(
        rootdirectory, mergecolsonly, parallelprocessing)
        for rootdirectory in subdirs)
    # BUG FIX: path.join(subdir, "/*" + ...) returned just "/*..." because
    # the second component was absolute, silently discarding subdir.
    subdirGlobs = sorted(
        [path.join(subdir, "*" + MERGECOLENDING) for subdir in subdirs])
    filenames = [glob_and_check(subdirGlob) for subdirGlob in subdirGlobs]
    outFilename = merge_csv_files(filenames)
    return outFilename
def merge_wrapper(processdir, basedir, starglob, superstarglob, calibrootglob, njobs=2, invert=False):
    """extracts the mergecols from the _S_ root (superstarglob) files and merges the energies

    processdir, basedir : base directories for the star/superstar and calib
        files respectively.
    starglob, superstarglob, calibrootglob : glob strings; each must contain
        a directory component.
    njobs : >1 splits the work by date and runs single_remove_events in
        parallel; otherwise a single EventRemover pass is used.
    invert : forwarded to single_remove_events.
    """
    for glob in [starglob, superstarglob, calibrootglob]:
        assert path.dirname(glob), \
            f"Glob : {glob} should be/contain a subdirectory"
    superstarGlobNew = get_glob_strings(superstarglob)
    calibrootGlob1, calibrootGlob2 = get_glob_strings(calibrootglob)
    superstardir = get_dir_from_glob(processdir, superstarglob)
    calibdir = get_dir_from_glob(basedir, calibrootglob)
    starglob = processdir + starglob
    # The converter() extraction steps were disabled upstream; the pipeline
    # is currently short-circuited to these hard-coded globs.
    tofiltercalibglob = "./csv/*.csv"
    ssmcolfnames = glob_and_check("./superstar/mergecols/*.csv")
    yecho("Removing events.")
    if njobs > 1:
        splitcalib = split_by_dates(tofiltercalibglob)
        splitstar = split_by_dates(starglob)
        splitss = split_by_dates(ssmcolfnames)
        # needs filename output
        assert len(splitcalib) == len(splitstar) == len(
            splitss
        ), "only works the first time when no calibfiles got moved, for everything else this needs a new function with more logic"
        Parallel(n_jobs=njobs)\
            (delayed(single_remove_events)(calibglob, starglob, ssglob, njobs, invert)
             for calibglob, starglob, ssglob in zip(splitcalib, splitstar, splitss))
        # NOTE(review): this branch never collects filteredFiles, so the final
        # yecho below raises NameError for njobs > 1 — needs filename output
        # from single_remove_events (see the assert comment above).
    else:
        # BUG FIX: the original passed the undefined name `calibmcolfnames`
        # here (guaranteed NameError); the calib mergecol glob is presumably
        # what was meant — TODO confirm against the disabled converter() calls.
        check_telescope_files(rootdir=None, globstr1=ssmcolfnames,
                              globstr2=tofiltercalibglob,
                              replacer=("_Y_", "_I_"))
        remover = EventRemover(tofiltercalibglob=tofiltercalibglob,
                               starglob=starglob,
                               superstarmcolglob=ssmcolfnames)
        remover.remove_events()
        filteredFiles = remover.outfilenames
    # BUG FIX: this message contained a raw line break inside a plain string
    # literal (a SyntaxError); joined onto one line.
    yecho("Removed events that get thrown out during image cleaning and superstar processing and wrote the merged runs to:")
    yecho(f"{path.basename(filteredFiles[0])}")
def split_by_dates(glob):
    """Split a glob (or list) of filenames into per-date glob strings.

    glob : str glob (resolved via glob_and_check) or a list of filenames.
    Non-MC files (not starting with "GA_") are grouped by the leading
    date token of the basename; MC files are grouped by the first five
    digits of the run number (field 4, or field 3 for "_S_"-less names).
    Returns the sorted list of reconstructed per-group glob strings.
    """
    if isinstance(glob, str):
        fullnames = glob_and_check(glob)
    else:
        fullnames = glob
    dirnames = [path.dirname(f) for f in fullnames]
    fnames = [path.basename(f) for f in fullnames]
    if not fnames[0].startswith("GA_"):
        # data files: date is the first underscore-separated token
        dates = set([fname.split("_")[0] for fname in fnames])
    else:
        # example GA_M2_za05to35_8_1740969_Y_wr.root
        # first 5 numbers of run for splitting
        if "_S_" not in fnames[0]:
            dates = set(["*" + f.split("_")[4][:5] for f in fnames])
        # example GA_za05to35_8_1740969_Y_wr.root
        else:
            dates = set(["*" + f.split("_")[3][:5] for f in fnames])
    dates = sorted(list(dates))
    identifiers = [get_identifier(fname) for fname in fnames]
    # NOTE(review): `dates` is deduplicated (one entry per group) while
    # `identifiers` has one entry per FILE, so zip silently truncates to the
    # shorter list. This only lines up if every file shares one identifier —
    # confirm whether zipping dates with per-file identifiers is intended.
    newglobs = [
        date + identifier for date, identifier in zip(dates, identifiers)
    ]
    newglobs = [
        path.join(dname, glob) for dname, glob in zip(dirnames, newglobs)
    ]
    return sorted(newglobs)
def main(glob):
    """Resolve `glob` and run print_linenumber on every match, warning first
    when the filenames do not look like HDF files."""
    fnames = glob_and_check(glob)
    # The files pass as "HDF-like" if any one of these markers appears in
    # every single filename.
    looks_like_hdf = any(
        np.all([marker in fname for fname in fnames])
        for marker in ("h5", "hdf5", "hdf")
    )
    if not looks_like_hdf:
        print(
            "files are not ending with .hdf or something like that, this should fail soon."
        )
    for fname in fnames:
        print_linenumber(fname)
def generate_imputed_luts(basedir, superstar, lutfnames):
    """Impute LUT energies for the superstar files.

    Runs melibea in "nn" mode with the energies taken from the lutfnames
    text files, then extracts the lutenergies from the freshly imputed root
    files in <basedir>/tmp and returns their filenames, sorted.
    """
    tmpdir = path.join(basedir, "tmp")
    os.makedirs(tmpdir, exist_ok=True)
    process_melibea(superstar=superstar,
                    nnenergies=lutfnames,
                    outdir=tmpdir,
                    njobs=10,
                    mode="nn")
    imputedluts = path.join(tmpdir, "*_Q_*.root")
    # sanity-check that melibea actually produced the imputed root files
    glob_and_check(imputedluts)
    basepattern = path.basename(imputedluts)
    lutenergies = converter(tmpdir,
                            globstr1=basepattern,
                            globstr2=basepattern,
                            multidir=False,
                            njobs=8,
                            mergecolsonly=True,
                            parallelprocessing=True)
    check_globs(lutenergies, lutfnames)
    return sorted(lutenergies)
def convert_to_csv(rootdir, globstr1, globstr2, mergecolsonly, parallelprocessing, star, njobs=1):
    """Converts the rootfiles that are found in the globstrings to csv and
    puts them in MERGECOLDIR. If there are multiple csvfiles it merges them
    with prefix MERGEDCSVPREFIX. Returns the name of the merged csv file.

    rootdir : directory containing the root files (dirs prepared via setup_dirs).
    globstr1, globstr2 : glob strings (or lists) for the two telescopes.
    mergecolsonly : extract only the merge columns when True.
    parallelprocessing, star, njobs : forwarded to call_c_converter.

    Returns a de-duplicated list of output file/dir names.
    """
    setup_dirs(rootdir)
    # maybe implement parallel processing here so the filenames in the
    # globstring get split into subarrays and each processed separately
    if (star is False) and (mergecolsonly is False):
        check_telescope_files(rootdir, globstr1, globstr2)
    elif (star is True) and (mergecolsonly is False):
        check_telescope_files(rootdir, globstr1, globstr2, replacer=("_Y_", "_I_"))
    call_c_converter(rootdir, globstr1, globstr2, mergecolsonly,
                     parallelprocessing, star, njobs)
    if mergecolsonly is False:
        # BUG FIX: the dirname was previously assigned as a bare string, so
        # list(set(...)) at the bottom exploded it into single characters;
        # wrap it in a list instead.
        outFilenames = [path.dirname(globstr1)]
    elif star is False:
        newdir = path.join(rootdir, MERGECOLDIR)
        newglobstr1 = get_new_globstr(globstr1)
        newglobstr2 = get_new_globstr(globstr2)
        check_telescope_files(newdir, newglobstr1, newglobstr2)
        mergecolFilenames1 = glob_and_check(path.join(newdir, newglobstr1))
        mergecolFilenames2 = glob_and_check(path.join(newdir, newglobstr2))
        mergecolFilenames1.extend(mergecolFilenames2)
        outFilenames = mergecolFilenames1
    else:
        # BUG FIX: star=True with mergecolsonly=True previously fell through
        # to a NameError on outFilenames; fail explicitly instead.
        raise NotImplementedError(
            "convert_to_csv: mergecolsonly with star=True is not supported")
    return list(set(outFilenames))
def call_c_converter(rootdir, globstr1, globstr2, mergecolsonly, parallelprocessing, star, njobs=1):
    """Function that encapsulates the call to the root macro that converts
    the root files to csv — either one call over the raw glob strings, or
    one call per matched file pair in parallel."""
    if isinstance(globstr1, str):
        filenames1 = glob_and_check(path.join(rootdir, globstr1))
        filenames2 = glob_and_check(path.join(rootdir, globstr2))
    elif isinstance(globstr1, list):
        filenames1, filenames2 = globstr1, globstr2
    if parallelprocessing is False:
        # serial: a single invocation over the glob strings themselves
        fcall(rootdir, globstr1, globstr2, mergecolsonly,
              parallelprocessing, star)
    else:
        # parallel: one invocation per (file1, file2) pair
        jobs = (delayed(fcall)(rootdir, first, second, mergecolsonly,
                               parallelprocessing, star)
                for first, second in zip(filenames1, filenames2))
        Parallel(n_jobs=njobs)(jobs)
def get_file_shape(subglob):
    """Return the shape of the "data" dataset of the first file matching subglob.

    NOTE(review): `rootdir` is a free name here (module global or enclosing
    scope) — confirm it is defined where this function is used.
    """
    fname = glob_and_check(path.join(rootdir, subglob))[0]
    # BUG FIX: open read-only via a context manager; the original never
    # closed the HDF5 handle (and relied on h5py's version-dependent
    # default mode).
    with h5.File(fname, "r") as h5file:
        return h5file["data"].shape
def split_into_telescopes(fnames):
    """Normalize `fnames` to a sorted list of filenames.

    fnames : glob string (resolved via glob_and_check and sorted) or an
        already-materialized list (passed through unchanged).

    NOTE(review): despite the name, no telescope splitting happens in the
    visible body — the function may be truncated upstream; confirm.
    """
    # BUG FIX: `basestring` is Python 2 only — this file uses f-strings, so
    # the isinstance check raised NameError on Python 3; use str.
    if isinstance(fnames, str):
        fnames = sorted(glob_and_check(fnames))
    # BUG FIX: the original reassigned fnames but never returned it, making
    # the function a no-op for callers.
    return fnames
def _rootdir_glob(self, subdir):
    """Glob `subdir` under self.rootdir and return the sorted matches,
    skipping the first self.entrypoint entries."""
    pattern = join(self.rootdir, subdir)
    matches = cth.glob_and_check(pattern)
    return sorted(matches)[self.entrypoint:]
def setup_currentfiles(self, currfilesglob):
    """Set the private current-files list from a glob, then run cleanup.

    currfilesglob : str or None
        None clears the list; otherwise the glob is resolved via
        cth.glob_and_check and the first self.entrypoint entries are skipped.
    """
    if currfilesglob is None:
        self.__currentfiles = None
    else:
        self.__currentfiles = cth.glob_and_check(currfilesglob)[self.entrypoint:]
    # NOTE(review): the collapsed source makes it ambiguous whether
    # _cleanup() ran only in the else branch; placed at method level here
    # (runs in both cases) — confirm against the original layout.
    self._cleanup()