import array
import logging
import os

import ROOT


def dump_rle(input_file, output_file, tree_name='Events', run_br='run', lumi_br='luminosityBlock', event_br='event'):
    '''Write the run:luminosityBlock:event numbers of all entries in input_file to output_file, one per line.'''
    with open(output_file, 'w') as f:
        ch_root = ROOT.TChain(tree_name)
        ch_root.AddFile(input_file)

        run_a  = array.array('I', [0])
        lumi_a = array.array('I', [0])
        evt_a  = array.array('L', [0])

        ch_root.SetBranchAddress(run_br,   run_a)
        ch_root.SetBranchAddress(lumi_br,  lumi_a)
        ch_root.SetBranchAddress(event_br, evt_a)

        nof_entries = ch_root.GetEntries()
        rle_i_arr = []
        for i in range(nof_entries):
            ch_root.GetEntry(i)
            rle_i_arr.append(':'.join(map(str, [run_a[0], lumi_a[0], evt_a[0]])))

        f.write("{rle_lines}\n".format(rle_lines='\n'.join(rle_i_arr)))

    logging.debug("Wrote {nof_bytes} kB to {filename}".format(
        nof_bytes=os.path.getsize(output_file) / 1000,
        filename=output_file,
    ))
    return
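# A minimal usage sketch for dump_rle() above; the file names below are
# hypothetical. The defaults assume an 'Events' tree containing 'run',
# 'luminosityBlock' and 'event' branches, as in the signature above.
def dump_rle_example():
    logging.basicConfig(level=logging.DEBUG)
    # Writes one "run:luminosityBlock:event" line per tree entry.
    dump_rle('tree_1.root', 'tree_1_rle.txt')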
def memJobList(self, inputFileList, rle_whitelist):
    '''
    Args:
        inputFileList: { int : array of strings }; i.e. fileset* ID and the list of files
            * if the script were to generate configuration files, this number would correspond to job ID

    Returns:
        { int : { str : int, str : [str, str, ...], str : [int, int] } }
            |       |             |                      |
         job id  "fileset_id"  "input_fileset"      "event_range"

    The function reads a given set of files and determines the event range
    '''
    memJobDict = {}
    jobId = 0
    apply_rle_filter = bool(self.rle_filter_file)
    for filesetId, inputFileSet in inputFileList.iteritems():
        memJobDict_common = {
            'fileset_id'    : filesetId,
            'input_fileset' : inputFileSet,
        }
        print("Processing fileset %d: %s" % (filesetId, ', '.join(inputFileSet)))
        ch = ROOT.TChain(self.treeName)
        for fn in inputFileSet:
            # chaining a file
            logging.debug("Processing file {fileName}".format(fileName=fn))
            ch.AddFile(fn)

        nof_entries = ch.GetEntries()
        memJobDict_common['nof_entries'] = nof_entries
        if nof_entries == 0:
            jobId += 1
            memJobDict[jobId] = dict({
                'event_range'     : [0, 0],
                'nof_int'         : 0,
                'nof_int_pass'    : 0,
                'nof_events_pass' : 0,
                'nof_zero'        : 0,
            }, **memJobDict_common)
            continue

        current_pos = 0
        evt_ranges = []

        counter, counter_arr = 0, []
        nof_events_pass_counter, nof_events_pass = 0, []
        nof_int_pass_counter, nof_int_pass = 0, []
        nof_zero_integrations, nof_events_zero = 0, []
        whitelist_all, whitelist_running = [], []

        run = array.array('I', [0])
        luminosityBlock = array.array('I', [0])
        event = array.array('L', [0])
        maxPermutations_addMEM = array.array('i', [0])

        ch.SetBranchAddress("run", run)
        ch.SetBranchAddress("luminosityBlock", luminosityBlock)
        ch.SetBranchAddress("event", event)
        if self.maxPermutations_branchName is not None and self.maxPermutations_branchName != "":
            ch.SetBranchAddress(self.maxPermutations_branchName, maxPermutations_addMEM)
        else:
            maxPermutations_addMEM[0] = 1

        for i in range(nof_entries):
            ch.GetEntry(i)
            if i > 0 and i % 10000 == 0:
                print(" Processing event %i/%i" % (i, nof_entries))
                logging.debug("Processing event %i/%i" % (i, nof_entries))

            rle = ':'.join(map(lambda nr: str(nr[0]), [run, luminosityBlock, event]))

            nof_integrations = maxPermutations_addMEM[0]
            if apply_rle_filter:
                if rle in rle_whitelist:
                    # a whitelisted event always gets exactly one integration
                    if not (nof_integrations > 0):
                        logging.error("Expected non-zero # integrations in event {}, but got {}".format(rle, nof_integrations))
                    nof_integrations = 1
                else:
                    nof_integrations = 0

            if nof_integrations < 0:
                nof_integrations = 0

            if nof_integrations >= 1:
                nof_events_pass_counter += 1
                nof_int_pass_counter += nof_integrations
            else:
                nof_zero_integrations += 1

            if nof_integrations > self.mem_integrations_per_job:
                raise ValueError("Too many nof_integrations = %d in file(s) %s at %d:%d:%d" %
                                 (nof_integrations, ', '.join(inputFileSet), run[0], luminosityBlock[0], event[0]))

            if (counter + nof_integrations) > self.mem_integrations_per_job:
                # close the current job at the current event boundary
                if evt_ranges:
                    evt_ranges.append([evt_ranges[-1][1], current_pos])
                else:
                    evt_ranges.append([0, current_pos])
                counter_arr.append(counter)
                counter = 0
                nof_events_pass.append(nof_events_pass_counter)
                nof_events_pass_counter = 0
                nof_int_pass.append(nof_int_pass_counter)
                nof_int_pass_counter = 0
                nof_events_zero.append(nof_zero_integrations)
                nof_zero_integrations = 0
                if apply_rle_filter:
                    whitelist_all.append(whitelist_running)
                    whitelist_running = []

            if rle in rle_whitelist:
                whitelist_running.append(rle)

            counter += nof_integrations
            current_pos += 1

        if 0 <= counter <= self.mem_integrations_per_job:
            # flush the remainder into one final job
            if evt_ranges:
                evt_ranges.append([evt_ranges[-1][1], int(nof_entries)])
            else:
                evt_ranges.append([0, int(nof_entries)])
            counter_arr.append(counter)
            nof_events_pass.append(nof_events_pass_counter)
            nof_int_pass.append(nof_int_pass_counter)
            nof_events_zero.append(nof_zero_integrations)
            if apply_rle_filter:
                whitelist_all.append(whitelist_running)

        # ensure that the event ranges won't overlap (i.e. there won't be any double-processing of any event)
        evt_ranges_cat = []
        for v in [range(x[0], x[1]) for x in evt_ranges]:
            evt_ranges_cat += v
        assert(evt_ranges_cat == range(nof_entries))
        assert(bool(evt_ranges))

        for i in range(len(evt_ranges)):
            if self.max_jobs_per_sample == -1 or jobId < self.max_jobs_per_sample:
                jobId += 1
                memJobDict[jobId] = dict({
                    'event_range'     : evt_ranges[i],
                    'nof_int'         : counter_arr[i],
                    'nof_int_pass'    : nof_int_pass[i],
                    'nof_events_pass' : nof_events_pass[i],
                    'nof_zero'        : nof_events_zero[i],
                    'whitelist'       : whitelist_all[i] if apply_rle_filter else [],
                }, **memJobDict_common)
        # we now have all event ranges per one file, let's add them to the dictionary

        del ch

    return memJobDict
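# A hedged usage sketch for memJobList() above: how the returned dictionary
# might be consumed when writing per-job configuration files. The function
# below is hypothetical ('splitter' standing in for whatever object defines
# the method); the key names are the ones filled in above.
def print_job_summary(splitter, inputFileList, rle_whitelist):
    jobs = splitter.memJobList(inputFileList, rle_whitelist)
    for jobId, jobSpec in sorted(jobs.items()):
        first, last = jobSpec['event_range']  # half-open range [first, last)
        print("job %d: fileset %d, events [%d, %d), %d integrations" % (
            jobId, jobSpec['fileset_id'], first, last, jobSpec['nof_int']
        ))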
                    sample_path, '000%d' % (file_idx / 1000), 'tree_{i}.root'.format(i=file_idx))
                rles[rle].append(grep_result)
        else:
            # instead of forming a list of files let's loop over the subfolders and the files therein instead
            logging.debug('Looping over the files in {sample_path}'.format(sample_path=sample_path))
            for subdir in hdfs.listdir(sample_path):
                logging.debug('Found subdirectory {subdir}'.format(subdir=subdir))
                for rootfile in hdfs.listdir(subdir):
                    logging.debug("Processing file '{rootfile}'".format(rootfile=rootfile))

                    # open the file
                    ch_root = ROOT.TChain("Events")
                    ch_root.AddFile(rootfile)

                    run_a  = array.array('I', [0])
                    lumi_a = array.array('I', [0])
                    evt_a  = array.array('L', [0])

                    ch_root.SetBranchAddress("run", run_a)
                    ch_root.SetBranchAddress("luminosityBlock", lumi_a)
                    ch_root.SetBranchAddress("event", evt_a)

                    nof_entries = ch_root.GetEntries()
                    for i in range(nof_entries):
                        ch_root.GetEntry(i)
                        rle_i = ':'.join(map(str, [run_a[0], lumi_a[0], evt_a[0]]))
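# The 'run:luminosityBlock:event' strings built above are what get matched
# against RLE whitelists elsewhere; a small sketch of the reverse direction.
# The helper name parse_rle is ours, not part of the original code.
def parse_rle(rle):
    '''Split a 'run:luminosityBlock:event' string back into three integers.'''
    run, lumi, evt = rle.split(':')
    return int(run), int(lumi), int(evt)

# e.g. parse_rle('1:2:3') returns (1, 2, 3)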
def memJobList(self, inputFileList):
    '''
    Args:
        inputFileList: { int : array of strings }; i.e. fileset* ID and the list of files
            * if the script were to generate configuration files, this number would correspond to job ID

    Returns:
        { int : { str : int, str : [str, str, ...], str : [int, int] } }
            |       |             |                      |
         job id  "fileset_id"  "input_fileset"      "event_range"

    The function reads a given set of files and determines the event range
    '''
    memJobDict = {}
    jobId = 0
    for filesetId, inputFileSet in inputFileList.iteritems():
        memJobDict_common = {
            'fileset_id'    : filesetId,
            'input_fileset' : inputFileSet,
        }
        ch = ROOT.TChain(self.treeName)
        for fn in inputFileSet:
            # chaining a file
            logging.debug("Processing file {fileName}".format(fileName=fn))
            ch.AddFile(fn)

        nof_entries = ch.GetEntries()
        memJobDict_common['nof_entries'] = nof_entries
        if nof_entries == 0:
            jobId += 1
            memJobDict[jobId] = dict({
                'event_range'     : [0, 0],
                'nof_int'         : 0,
                'nof_int_pass'    : 0,
                'nof_events_pass' : 0,
                'nof_zero'        : 0,
            }, **memJobDict_common)
            continue

        current_pos = 0
        evt_ranges = []

        counter, counter_arr = 0, []
        nof_events_pass_counter, nof_events_pass = 0, []
        nof_int_pass_counter, nof_int_pass = 0, []
        nof_zero_integrations, nof_events_zero = 0, []

        maxPermutations_addMEM = array.array('i', [0])
        ch.SetBranchAddress(self.maxPermutations_branchName, maxPermutations_addMEM)

        for i in range(nof_entries):
            ch.GetEntry(i)
            if i > 0 and i % 10000 == 0:
                logging.debug("Processing event %i/%i" % (i, nof_entries))

            nof_integrations = maxPermutations_addMEM[0]
            if nof_integrations < 0:
                nof_integrations = 0

            if nof_integrations >= 1:
                nof_events_pass_counter += 1
                nof_int_pass_counter += nof_integrations
            else:
                nof_zero_integrations += 1

            if nof_integrations > self.mem_integrations_per_job:
                raise ValueError("Too many nof_integrations = %d in file(s) %s at %d:%d:%d" %
                                 (nof_integrations, ', '.join(inputFileSet), ch.run, ch.lumi, ch.evt))

            if (counter + nof_integrations) > self.mem_integrations_per_job:
                if evt_ranges:
                    evt_ranges.append([evt_ranges[-1][1], current_pos])
                else:
                    evt_ranges.append([0, current_pos])
                counter_arr.append(counter)
                counter = 0
                nof_events_pass.append(nof_events_pass_counter)
                nof_events_pass_counter = 0
                nof_int_pass.append(nof_int_pass_counter)
                nof_int_pass_counter = 0
                nof_events_zero.append(nof_zero_integrations)
                nof_zero_integrations = 0

            counter += nof_integrations
            current_pos += 1

        if 0 <= counter <= self.mem_integrations_per_job:
            if evt_ranges:
                evt_ranges.append([evt_ranges[-1][1], int(nof_entries)])
            else:
                evt_ranges.append([0, int(nof_entries)])
            counter_arr.append(counter)
            nof_events_pass.append(nof_events_pass_counter)
            nof_int_pass.append(nof_int_pass_counter)
            nof_events_zero.append(nof_zero_integrations)

        # ensure that the event ranges won't overlap (i.e. there won't be any double-processing of any event)
        evt_ranges_cat = []
        for v in [range(x[0], x[1]) for x in evt_ranges]:
            evt_ranges_cat += v
        assert(evt_ranges_cat == range(nof_entries))
        assert(bool(evt_ranges))

        for i in range(len(evt_ranges)):
            jobId += 1
            memJobDict[jobId] = dict({
                'event_range'     : evt_ranges[i],
                'nof_int'         : counter_arr[i],
                'nof_int_pass'    : nof_int_pass[i],
                'nof_events_pass' : nof_events_pass[i],
                'nof_zero'        : nof_events_zero[i],
            }, **memJobDict_common)
        # we now have all event ranges per one file, let's add them to the dictionary

    return memJobDict
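# To see how mem_integrations_per_job drives the splitting in memJobList(),
# here is a self-contained sketch of the same accumulation loop run on
# synthetic per-event integration counts (all numbers are illustrative only).
def split_example():
    mem_integrations_per_job = 5
    nof_integrations_per_event = [2, 3, 1, 4, 0, 5]

    evt_ranges, counter, current_pos = [], 0, 0
    for n in nof_integrations_per_event:
        if counter + n > mem_integrations_per_job:
            # close the current job at the current event boundary
            evt_ranges.append([evt_ranges[-1][1] if evt_ranges else 0, current_pos])
            counter = 0
        counter += n
        current_pos += 1
    # flush the remainder into one final job
    evt_ranges.append([evt_ranges[-1][1] if evt_ranges else 0, len(nof_integrations_per_event)])

    print(evt_ranges)  # [[0, 2], [2, 5], [5, 6]] -> three jobs of 5 integrations each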