def find_hadd_stage_files(input_path, regions, find_hadd_stage1):
    path_split = [ subpath for subpath in input_path.split(os.path.sep) if subpath != '' ]
    nof_levels = len(path_split)
    if not (5 < nof_levels < 11):
        raise ValueError("Invalid path: %s" % input_path)
    current_paths = [ input_path ]
    if nof_levels == 6:
        # descend into the histograms/ subdirectory of the version directory
        assert len(current_paths) == 1
        current_path = os.path.join(current_paths.pop(), 'histograms')
        if not hdfs.isdir(current_path):
            return []
        current_paths = [ current_path ]
        nof_levels += 1
    if nof_levels == 7:
        # descend into the per-channel directories
        assert len(current_paths) == 1
        current_path = current_paths.pop()
        current_paths = hdfs.listdir(current_path)
        nof_levels += 1
    if nof_levels == 8:
        # keep only the region directories that match the requested regions
        next_paths = []
        for current_path in current_paths:
            region_paths = hdfs.listdir(current_path)
            for region_path in region_paths:
                if os.path.basename(region_path).startswith(
                      tuple(ANALYSIS_REGIONS[region] for region in regions)
                    ):
                    next_paths.append(region_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 9:
        # stage-1 files live in per-sample directories, stage-2 files in 'hadd'
        next_paths = []
        for current_path in current_paths:
            for next_path in hdfs.listdir(current_path):
                next_path_basename = os.path.basename(next_path)
                if find_hadd_stage1 == (next_path_basename != 'hadd'):
                    next_paths.append(next_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 10:
        # pick the unique hadd stage file from each remaining directory
        next_paths = []
        for current_path in current_paths:
            candidate_files = []
            metadata = extract_metadata(current_path)
            if metadata['region_key'] not in regions:
                continue
            for candidate_file in hdfs.listdir(current_path):
                if not hdfs.isfile(candidate_file):
                    continue
                if is_hadd_stage_file(candidate_file, find_hadd_stage1, metadata):
                    candidate_files.append(candidate_file)
            if candidate_files:
                assert len(candidate_files) == 1
                next_paths.append(candidate_files[0])
        current_paths = next_paths
    return current_paths
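# Usage sketch (hypothetical; the base path is borrowed from the 2l_2tau
# configuration further below, and 'SR' is assumed to be a valid key of
# ANALYSIS_REGIONS):
#
#   stage1_files = find_hadd_stage_files(
#     input_path       = '/hdfs/local/karl/ttHAnalysis/2017/2018Jun26',
#     regions          = [ 'SR' ],
#     find_hadd_stage1 = True,
#   )
#
# Starting from a 6-level base path, the function descends through
# histograms/<channel>/<region>/<sample or hadd> and returns the unique
# hadd stage file found in each matching leaf directory.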
def get_job_dir(self):
    if self.use_home:
        prefix = os.path.join('/home', getpass.getuser(), 'jobs')
    else:
        prefix = os.path.join('/scratch', getpass.getuser())
        if not hdfs.isdir(prefix):
            run_cmd('/scratch/mkscratch')
    job_dir = os.path.join(
        prefix, "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
    )
    return job_dir
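# Worked example (hypothetical values): for user 'karl' and an analysis named
# '2l_2tau' run on 2018-06-26, get_job_dir() would return
#   /home/karl/jobs/2l_2tau_2018-06-26   if use_home is True, or
#   /scratch/karl/2l_2tau_2018-06-26     otherwise (creating /scratch/karl via
#                                        /scratch/mkscratch if it is missing).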
def get_paths(input_paths, whitelist, blacklist):
    valid_paths = {}
    for input_path in input_paths:
        input_path_split = [ subpath for subpath in input_path.split(os.path.sep) if subpath != '' ]
        nof_levels = len(input_path_split)
        if nof_levels == 6:
            input_path_subdir = os.path.join(input_path, OUTPUT_RLE)
            if not hdfs.isdir(input_path_subdir):
                raise ValueError("No such directory: %s" % input_path_subdir)
            for channel_dir in sorted(hdfs.listdir(input_path_subdir)):
                channel_name = os.path.basename(channel_dir)
                if whitelist and channel_name not in whitelist:
                    logging.info("Excluding channel {} (not in whitelist)".format(channel_name))
                    continue
                if channel_name in blacklist:
                    logging.info("Excluding channel {} (blacklisted)".format(channel_name))
                    continue
                if channel_name in valid_paths:
                    raise ValueError(
                        "Found duplicate paths for the same channel: %s and %s" %
                        (valid_paths[channel_name], input_path)
                    )
                logging.debug('Found channel {} at path {}'.format(channel_name, channel_dir))
                valid_paths[channel_name] = channel_dir
        elif nof_levels == 8:
            if input_path_split[-2] != OUTPUT_RLE:
                raise ValueError("Invalid path: %s" % input_path)
            channel_name = input_path_split[-1]
            if whitelist and channel_name not in whitelist:
                raise ValueError("Path %s conflicting with whitelist: %s" % (input_path, ', '.join(whitelist)))
            if channel_name in blacklist:
                raise ValueError("Path %s conflicting with blacklist: %s" % (input_path, ', '.join(blacklist)))
            if channel_name in valid_paths:
                raise ValueError(
                    "Found duplicate paths for the same channel: %s and %s" %
                    (valid_paths[channel_name], input_path)
                )
            logging.debug('Found channel {} at path {}'.format(channel_name, input_path))
            valid_paths[channel_name] = input_path
        else:
            raise ValueError("Invalid path: %s" % input_path)
    assert len(set(valid_paths.values())) == len(valid_paths)
    return valid_paths
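# Usage sketch (hypothetical arguments; OUTPUT_RLE is assumed to expand to
# 'output_rle' as in the directory layout used elsewhere in this code, and
# hdfs.listdir is assumed to return absolute paths, as its use with
# os.path.basename suggests):
#
#   valid_paths = get_paths(
#     input_paths = [ '/hdfs/local/karl/ttHAnalysis/2017/2018Jun26' ],
#     whitelist   = [ '2l_2tau' ],
#     blacklist   = [],
#   )
#   # -> { '2l_2tau': '/hdfs/local/karl/ttHAnalysis/2017/2018Jun26/output_rle/2l_2tau' }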
)
parser.add_argument('-v', '--verbose',
    dest = 'verbose', action = 'store_true', default = False,
    help = 'R|Enable verbose output',
)
args = parser.parse_args()

logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)

pattern = args.input
if '{sample_name}' not in pattern:
    raise ValueError('No {sample_name} found in pattern %s' % pattern)
input_dir = os.path.dirname(pattern)
if not hdfs.isdir(input_dir):
    raise ValueError('No such input directory: %s' % input_dir)

if args.era == '2017':
    from tthAnalysis.HiggsToTauTau.samples.tthAnalyzeSamples_2017 import samples_2017 as samples
    from tthAnalysis.HiggsToTauTau.analysisSettings import lumi_2017 as lumi
    samples_to_sum = samples_to_sum_2017
    samples_lut = {}
    for sample_key, sample_entry in samples.items():
        if sample_key == 'sum_events':
            continue
        sample_name = sample_entry['process_name_specific']
        assert sample_name not in samples_lut
        samples_lut[sample_name] = sample_key
else:
    raise RuntimeError('Invalid era: %s' % args.era)
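# Worked example (hypothetical dataset key; only the process name is taken from
# elsewhere in this code): if samples contained an entry like
#   '/ttHJetToNonbb_M125_amcatnlo/...' : { 'process_name_specific' : 'ttHJetToNonbb_M125_amcatnlo', ... }
# then samples_lut would map 'ttHJetToNonbb_M125_amcatnlo' back to that dataset
# key, enabling lookups by human-readable sample name.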
def testIsNotDir(self):
    # The function returns False even if there is no such path on the file system
    self.assertFalse(hdfs.isdir(self.nonExistingHDFSdir))
    self.assertFalse(hdfs.isdir(self.nonExistingHomeDir))
def testIsDir(self):
    self.assertTrue(hdfs.isdir(self.userHDFSdir))
    self.assertTrue(hdfs.isdir(self.userHomeDir))
def testNegative(self):
    testArgs = self.testArguments['negative']
    self.manager.submitJob(
        inputFiles             = [],
        executable             = testArgs['cmd'],
        command_line_parameter = "",
        outputFilePath         = "",
        outputFiles            = [],
        scriptFile             = os.path.join(testDir, '{}.sh'.format(testArgs['name'])),
    )
    # if passes, true negative; otherwise true positive
    self.assertRaises(sbatchManagerRuntimeError, self.manager.waitForJobs)

def suite():
    testSuite = unittest.TestSuite()
    testSuite.addTest(unittest.makeSuite(SbatchTestCase))
    return testSuite

if not hdfs.isdir(testDir):
    hdfs.mkdirs(testDir)

suite_instance = suite()
runner = unittest.TextTestRunner()
runner.run(suite_instance)

if hdfs.isdir(testDir):
    shutil.rmtree(testDir)
        'SR'           : 'Tight_OS',
        'Fake AR'      : 'Fakeable_wFakeRateWeights_OS',
        'MC closure e' : 'Fakeable_mcClosure_e_wFakeRateWeights_OS',
        'MC closure m' : 'Fakeable_mcClosure_m_wFakeRateWeights_OS',
    },
}

cfg_options = {
    '2l_2tau' : '/hdfs/local/karl/ttHAnalysis/2017/2018Jun26',
}

rles = {}
for channel in cfg_options:
    logging.info('Inspecting channel {}'.format(channel))
    base_path = os.path.join(cfg_options[channel], 'output_rle', channel)
    if not hdfs.isdir(base_path):
        raise ValueError('No such directory: %s' % base_path)
    rles[channel] = {}
    for region_name, region in CHANNEL_OPTIONS[channel].items():
        region_path = os.path.join(base_path, region)
        if not hdfs.isdir(region_path):
            continue
        logging.info('Inspecting region {}'.format(region_name))
        rles[channel][region_name] = {}
        for sample_path in hdfs.listdir(region_path):
            sample_name = os.path.basename(sample_path)
            if sample_name != 'ttHJetToNonbb_M125_amcatnlo':
                continue
            logging.info('Inspecting sample {}'.format(sample_name))
            rles[channel][region_name][sample_name] = {}
            for rle_file_path in hdfs.listdir(sample_path):
                rle_file = os.path.basename(rle_file_path)
# files from the second chunk are added sequentially
offset = get_file_idx(file_list_1[-1])
new_idx = offset
for root_file in file_list_2:
    new_idx += 1
    root_file_basename = 'tree_%d.root' % new_idx
    root_file_idx = new_idx // 1000
    dst_subdir = os.path.join(destination, '%04d' % root_file_idx)
    if dst_subdir not in missing_subdirs:
        missing_subdirs.append(dst_subdir)
    copy_relations[root_file] = os.path.join(dst_subdir, root_file_basename)

if args.copy:
    for missing_subdir in missing_subdirs:
        if not hdfs.isdir(missing_subdir):
            logging.info('Creating subdirectory {}'.format(missing_subdir))
            if hdfs.mkdirs(missing_subdir) != 0:
                raise RuntimeError("Unable to create directory: %s" % missing_subdir)

for src_file, dst_file in copy_relations.items():
    logging.debug('Copying file {} to {}'.format(src_file, dst_file))
    if args.copy:
        if hdfs.copy(src_file, dst_file, overwrite = False) != 0:
            raise RuntimeError("Unable to copy file from %s to %s" % (src_file, dst_file))
logging.info('Copying done')

# completion of the merged directory = average of the two chunk completions,
# weighted by the number of files in each chunk
new_lines[os.path.dirname(destination)] = (
    chunks[chunk_1] * len(file_list_1) / 100. +
    chunks[chunk_2] * len(file_list_2) / 100.
) / (len(file_list_1) + len(file_list_2)) * 100.
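# Worked example (illustrative numbers): if the last file of the first chunk
# is tree_1528.root, the first file of the second chunk is renamed to
# tree_1529.root and lands in subdirectory '0001' (since 1529 // 1000 == 1)
# under the destination directory.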
    sample_name_re = re.compile(sample_name)
except re.error:
    logging.error("Argument {arg} not a valid regex".format(arg = sample_name))
    sys.exit(1)

if grep_individually and not grep_directory:
    logging.warning('Option -a/--all has no effect unless you specify -d/--directory')

if not hdfs.isfile(rle_file):
    logging.error("No such file: '{rle_filename}'".format(rle_filename = rle_file))
    sys.exit(1)

if output_file and not hdfs.isdir(os.path.dirname(output_file)):
    logging.error("Parent directory of '{output_file}' doesn't exist".format(output_file = output_file))
    sys.exit(1)

if grep_directory and not hdfs.isdir(grep_directory):
    logging.error("Grep directory '{grep_directory}' doesn't exist".format(grep_directory = grep_directory))
    sys.exit(1)

sample_keys = {}
for s_key, s_value in samples.iteritems():
    if sample_name_re.match(s_value['process_name_specific']):
        sample_keys[s_key] = s_value['process_name_specific']
logging.error("Unable to parse output directory from file: %s" % crab_logfile) continue if nof_jobs < 0: logging.error("Unable to parse total number of jobs from file: %s" % crab_logfile) continue version = os.path.basename(output_dir) version_date = version.split('_')[1] prefix = '{}_{}'.format(version, chunk_str) if chunk_str else version userName = os.path.basename(os.path.dirname(output_dir)) dataset_requestName = '%s__%s' % (dataset_match.group(1), dataset_match.group(2)) requestName = '%s_%s' % (prefix, dataset_requestName) max_requestname_len = 160 - len(userName) if len(requestName) > max_requestname_len: requestName = requestName[:max_requestname_len] crab_path = os.path.join('/hdfs', 'cms', output_dir[1:], dataset_match.group(1), requestName) if hdfs.isdir(crab_path): logging.debug("Found directory: {}".format(crab_path)) subdirs = hdfs.listdir(crab_path) if len(subdirs) != 1: logging.error("Expected exactly one subdir in {} but found {}: {}".format( crab_path, len(subdirs), ', '.join(subdirs) )) continue subdir = subdirs[0] root_files = [ root_file for subsubdir in hdfs.listdir(subdir) for root_file in hdfs.listdir(subsubdir) if root_file.endswith('.root') ] root_idxs = set(map(lambda fn: int(TREE_REGEX.match(os.path.basename(fn)).group('idx')), root_files)) assert(not (root_idxs & expected_fails)) root_idxs = root_idxs | expected_fails nof_completed = len(root_idxs) * 100. / nof_jobs
def get_hadd_stage2(input_paths):
    results = {}
    for input_path in input_paths:
        path_split = [ subpath for subpath in input_path.split(os.path.sep) if subpath != '' ]
        nof_levels = len(path_split)
        if not (5 < nof_levels < 11):
            raise ValueError("Invalid path: %s" % input_path)
        current_paths = [ input_path ]
        if nof_levels == 6:
            assert len(current_paths) == 1
            current_path = os.path.join(current_paths.pop(), 'histograms')
            if not hdfs.isdir(current_path):
                return []
            current_paths = [ current_path ]
            nof_levels += 1
        if nof_levels == 7:
            assert len(current_paths) == 1
            current_path = current_paths.pop()
            current_paths = hdfs.listdir(current_path)
            nof_levels += 1
        if nof_levels == 8:
            next_paths = []
            for current_path in current_paths:
                region_paths = hdfs.listdir(current_path)
                for region_path in region_paths:
                    next_paths.append(region_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 9:
            next_paths = []
            for current_path in current_paths:
                for next_path in hdfs.listdir(current_path):
                    next_path_basename = os.path.basename(next_path)
                    if next_path_basename == 'hadd':
                        next_paths.append(next_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 10:
            next_paths = []
            for current_path in current_paths:
                candidate_files = []
                for candidate_file in hdfs.listdir(current_path):
                    if not hdfs.isfile(candidate_file):
                        continue
                    candidate_file_basename = os.path.basename(candidate_file)
                    if candidate_file_basename.startswith('hadd_stage2') and \
                       not HADD_STAGE2_RE.match(candidate_file_basename.split('.')[0]):
                        candidate_files.append(candidate_file)
                if candidate_files:
                    assert len(candidate_files) == 1
                    next_paths.append(candidate_files[0])
            current_paths = next_paths
        for current_path in current_paths:
            # expected layout: /hdfs/local/<user>/<analysis>/<era>/<version>/histograms/<channel>/<region>/hadd/<file>
            current_path_split = [ subpath for subpath in current_path.split(os.path.sep) if subpath != '' ]
            channel = current_path_split[7]
            region = current_path_split[8]
            channel_region = '{}_{}'.format(channel, region)
            if channel_region in results:
                raise RuntimeError(
                    "Found two paths corresponding to the same channel (%s) and region (%s): %s and %s" % \
                    (channel, region, current_path, results[channel_region])
                )
            results[channel_region] = current_path
            logging.debug(
                'Found hadd stage2 file corresponding to channel {} and region {}: {}'.format(
                    channel, region, current_path
                )
            )
    return [ results[k] for k in sorted(results.keys()) ]
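# Usage sketch (hypothetical path, borrowed from the 2l_2tau configuration
# above): collect the per-channel, per-region hadd stage-2 files under one
# analysis version directory; the result is sorted by the '<channel>_<region>'
# key, with exactly one file per combination.
#
#   stage2_files = get_hadd_stage2([ '/hdfs/local/karl/ttHAnalysis/2017/2018Jun26' ])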
    type=str,
    dest='output',
    metavar='file',
    required=True,
    help='R|Output file',
)
parser.add_argument(
    '-v', '--verbose',
    dest='verbose',
    action='store_true',
    default=False,
    help='R|Enable verbose output',
)
args = parser.parse_args()

logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)

output_dir = os.path.dirname(os.path.abspath(args.output))
if not hdfs.isdir(output_dir):
    raise RuntimeError("No such directory: %s" % output_dir)

fns = get_hadd_stage2(args.input)
foptr = ROOT.TFile.Open(args.output, 'recreate')
logging.info('Creating file: {}'.format(foptr.GetName()))
for fn in fns:
    copy_dirs(fn, foptr)
logging.info('Writing file: {}'.format(foptr.GetName()))
foptr.Write()
foptr.Close()
rle_filename = args.input
out_filename = os.path.abspath(args.output)
grep_dir = args.directory
sample_name = args.sample_name
force = args.force
debug_output = args.debug
nof_files = args.nof_files

# check if input RLE file exists
if not hdfs.isfile(rle_filename):
    logging.error("File {rle_filename} does not exist or is not a file!".format(rle_filename = rle_filename))
    sys.exit(1)

# check if the directory into which we have to write the output ROOT file already exists
out_parent_dir = os.path.dirname(out_filename)
if not hdfs.isdir(out_parent_dir):
    if not force:
        logging.error("Parent directory of the output file {out_filename} does not exist".format(
            out_filename = out_filename,
        ))
        sys.exit(1)
    else:
        logging.debug("Output directory {out_parent_dir} does not exist, attempting to create it".format(
            out_parent_dir = out_parent_dir,
        ))
        try:
            hdfs.mkdirs(out_parent_dir)
        except IOError as err:
            logging.error("Could not create directory {out_parent_dir}: {err}".format(
                out_parent_dir = out_parent_dir, err = err,
            ))
            sys.exit(1)