Example #1
def find_hadd_stage_files(input_path, regions, find_hadd_stage1):
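    # Descend the analysis output tree level by level (the input may be 6-10
    # path components deep): a 6-level path gains the 'histograms' subdirectory,
    # level 7 expands into one directory per channel, level 8 keeps only the
    # requested regions, level 9 selects the 'hadd' subdirectory (or everything
    # but 'hadd' when find_hadd_stage1 is True), and level 10 picks the single
    # matching hadd stage file in each remaining directory.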
    path_split = [
        subpath for subpath in input_path.split(os.path.sep) if subpath != ''
    ]
    nof_levels = len(path_split)
    if not (5 < nof_levels < 11):
        raise ValueError("Invalid path: %s" % input_path)

    current_paths = [input_path]
    if nof_levels == 6:
        assert (len(current_paths) == 1)
        current_path = os.path.join(current_paths.pop(), 'histograms')
        if not hdfs.isdir(current_path):
            return []
        current_paths = [current_path]
        nof_levels += 1
    if nof_levels == 7:
        assert (len(current_paths) == 1)
        current_path = current_paths.pop()
        current_paths = hdfs.listdir(current_path)
        nof_levels += 1
    if nof_levels == 8:
        next_paths = []
        for current_path in current_paths:
            region_paths = hdfs.listdir(current_path)
            for region_path in region_paths:
                if os.path.basename(region_path).startswith(
                        tuple(ANALYSIS_REGIONS[region] for region in regions)):
                    next_paths.append(region_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 9:
        next_paths = []
        for current_path in current_paths:
            for next_path in hdfs.listdir(current_path):
                next_path_basename = os.path.basename(next_path)
                # keep only the 'hadd' subdirectory when collecting stage2 files,
                # and everything except it when collecting stage1 files
                if find_hadd_stage1 == (next_path_basename != 'hadd'):
                    next_paths.append(next_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 10:
        next_paths = []
        for current_path in current_paths:
            candidate_files = []
            metadata = extract_metadata(current_path)
            if metadata['region_key'] not in regions:
                continue
            for candidate_file in hdfs.listdir(current_path):
                if not hdfs.isfile(candidate_file):
                    continue
                if is_hadd_stage_file(candidate_file, find_hadd_stage1,
                                      metadata):
                    candidate_files.append(candidate_file)
            if candidate_files:
                assert (len(candidate_files) == 1)
                next_paths.append(candidate_files[0])
        current_paths = next_paths
    return current_paths
Example #2
    def get_job_dir(self):
        if self.use_home:
            prefix = os.path.join('/home', getpass.getuser(), 'jobs')
        else:
            prefix = os.path.join('/scratch', getpass.getuser())
            if not hdfs.isdir(prefix):
                run_cmd('/scratch/mkscratch')
        job_dir = os.path.join(
            prefix,
            "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
        )
        return job_dir
Example #3
def get_paths(input_paths, whitelist, blacklist):
    valid_paths = {}
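    # two path layouts are accepted: a 6-component path to an analysis directory
    # whose OUTPUT_RLE subdirectory contains one directory per channel, or an
    # 8-component path pointing directly at a single channel directory inside
    # OUTPUT_RLE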
    for input_path in input_paths:
        input_path_split = [
            subpath for subpath in input_path.split(os.path.sep)
            if subpath != ''
        ]
        nof_levels = len(input_path_split)
        if nof_levels == 6:
            input_path_subdir = os.path.join(input_path, OUTPUT_RLE)
            if not hdfs.isdir(input_path_subdir):
                raise ValueError("No such directory: %s" % input_path_subdir)
            for channel_dir in sorted(hdfs.listdir(input_path_subdir)):
                channel_name = os.path.basename(channel_dir)
                if whitelist and channel_name not in whitelist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in blacklist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in valid_paths:
                    raise ValueError(
                        "Found duplicate paths for the same channel: %s and %s"
                        % (valid_paths[channel_name], input_path))
                logging.debug('Found channel {} at path {}'.format(
                    channel_name, channel_dir))
                valid_paths[channel_name] = channel_dir
        elif nof_levels == 8:
            if input_path_split[-2] != OUTPUT_RLE:
                raise ValueError("Invalid path: %s" % input_path)
            channel_name = input_path_split[-1]
            if whitelist and channel_name not in whitelist:
                raise ValueError("Path %s conflicting with whitelist: %s" %
                                 (input_path, ', '.join(whitelist)))
            if channel_name in blacklist:
                raise ValueError("Path %s conflicting with blacklist: %s" %
                                 (input_path, ', '.join(blacklist)))
            if channel_name in valid_paths:
                raise ValueError(
                    "Found duplicate paths for the same channel: %s and %s" %
                    (valid_paths[channel_name], input_path))
            logging.debug('Found channel {} at path {}'.format(
                channel_name, input_path))
            valid_paths[channel_name] = input_path
        else:
            raise ValueError("Invalid path: %s" % input_path)
    assert (len(set(valid_paths.values())) == len(valid_paths))
    return valid_paths
Example #4
)
parser.add_argument('-v', '--verbose',
  dest = 'verbose', action = 'store_true', default = False,
  help = 'R|Enable verbose output',
)

args = parser.parse_args()

logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)

pattern = args.input
if '{sample_name}' not in pattern:
  raise ValueError('No {sample_name} found in pattern %s' % pattern)

input_dir = os.path.dirname(pattern)
if not hdfs.isdir(input_dir):
  raise ValueError('No such input directory: %s' % input_dir)

if args.era == '2017':
  from tthAnalysis.HiggsToTauTau.samples.tthAnalyzeSamples_2017 import samples_2017 as samples
  from tthAnalysis.HiggsToTauTau.analysisSettings import lumi_2017 as lumi
  samples_to_sum = samples_to_sum_2017

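  # build a reverse lookup from 'process_name_specific' back to the sample key
  # in the samples dictionary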
  samples_lut = {}
  for sample_key, sample_entry in samples.items():
    if sample_key == 'sum_events': continue
    sample_name = sample_entry['process_name_specific']
    assert(sample_name not in samples_lut)
    samples_lut[sample_name] = sample_key
else:
  raise RuntimeError('Invalid era: %s' % args.era)
Example #5
    def testIsNotDir(self):
        # The function returns False even if there is no such path on the file system
        self.assertFalse(hdfs.isdir(self.nonExistingHDFSdir))
        self.assertFalse(hdfs.isdir(self.nonExistingHomeDir))
Example #6
    def testIsDir(self):
        self.assertTrue(hdfs.isdir(self.userHDFSdir))
        self.assertTrue(hdfs.isdir(self.userHomeDir))
Example #7
    def testNegative(self):
        testArgs = self.testArguments['negative']
        self.manager.submitJob(
            inputFiles=[],
            executable=testArgs['cmd'],
            command_line_parameter="",
            outputFilePath="",
            outputFiles=[],
            scriptFile=os.path.join(testDir, '{}.sh'.format(testArgs['name'])),
        )
        # if passes, true negative; otherwise true positive
        self.assertRaises(sbatchManagerRuntimeError, self.manager.waitForJobs)


def suite():
    testSuite = unittest.TestSuite()
    testSuite.addTest(unittest.makeSuite(SbatchTestCase))
    return testSuite


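# make sure the test directory exists before running the suite; it is removed
# again once all tests have finished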
if not hdfs.isdir(testDir):
    hdfs.mkdirs(testDir)

suite_instance = suite()
runner = unittest.TextTestRunner()
runner.run(suite_instance)

if hdfs.isdir(testDir):
    shutil.rmtree(testDir)
Example #8
    'SR'           : 'Tight_OS',
    'Fake AR'      : 'Fakeable_wFakeRateWeights_OS',
    'MC closure e' : 'Fakeable_mcClosure_e_wFakeRateWeights_OS',
    'MC closure m' : 'Fakeable_mcClosure_m_wFakeRateWeights_OS',
  },
}

cfg_options = {
  '2l_2tau' : '/hdfs/local/karl/ttHAnalysis/2017/2018Jun26',
}

rles = {}
for channel in cfg_options:
  logging.info('Inspecting channel {}'.format(channel))
  base_path = os.path.join(cfg_options[channel], 'output_rle', channel)
  if not hdfs.isdir(base_path):
    raise ValueError('No such directory: %s' % base_path)
  rles[channel] = {}
  for region_name, region in CHANNEL_OPTIONS[channel].items():
    region_path = os.path.join(base_path, region)
    if not hdfs.isdir(region_path):
      continue
    logging.info('Inspecting region {}'.format(region_name))
    rles[channel][region_name] = {}
    for sample_path in hdfs.listdir(region_path):
      sample_name = os.path.basename(sample_path)
      if sample_name != 'ttHJetToNonbb_M125_amcatnlo': continue
      logging.info('Inspecting sample {}'.format(sample_name))
      rles[channel][region_name][sample_name] = {}
      for rle_file_path in hdfs.listdir(sample_path):
        rle_file = os.path.basename(rle_file_path)
Example #9
    # files from the second chunk are added sequentially
    offset = get_file_idx(file_list_1[-1])
    new_idx = offset
    for root_file in file_list_2:
      new_idx += 1
      root_file_basename = 'tree_%d.root' % new_idx
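      # output trees are grouped into zero-padded subdirectories of 1000 files each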
      root_file_idx = new_idx // 1000
      dst_subdir = os.path.join(destination, '%04d' % root_file_idx)
      if dst_subdir not in missing_subdirs:
        missing_subdirs.append(dst_subdir)
      copy_relations[root_file] = os.path.join(dst_subdir, root_file_basename)

    if args.copy:
      for missing_subdir in missing_subdirs:
        if not hdfs.isdir(missing_subdir):
          if hdfs.mkdirs(missing_subdir) != 0:
            raise RuntimeError("Unable to create directory: %s" % missing_subdir)
          logging.info('Created subdirectory {}'.format(missing_subdir))

    for src_file, dst_file in copy_relations.items():
      logging.debug('Copying file {} to {}'.format(src_file, dst_file))
      if args.copy:
        if hdfs.copy(src_file, dst_file, overwrite = False) != 0:
          raise RuntimeError("Unable to copy file from %s to %s" % (src_file, dst_file))

    logging.info('Copying done')
    new_lines[os.path.dirname(destination)] = (
        chunks[chunk_1] * len(file_list_1) / 100. + chunks[chunk_2] * len(file_list_2) / 100.
      ) / (len(file_list_1) + len(file_list_2)) * 100.
Example #10
        sample_name_re = re.compile(sample_name)
    except re.error:
        logging.error(
            "Argument {arg} not a valid regex".format(arg=sample_name))
        sys.exit(1)

    if grep_individually and not grep_directory:
        logging.warning(
            'Option -a/--all has no effect unless you specify -d/--directory')

    if not hdfs.isfile(rle_file):
        logging.error("No such file: '{rle_filename}'".format(
            rle_filename=rle_file, ))
        sys.exit(1)

    if output_file and not hdfs.isdir(os.path.dirname(output_file)):
        logging.error(
            "Parent directory of '{output_file}' doesn't exist".format(
                output_file=output_file, ))
        sys.exit(1)

    if grep_directory and not hdfs.isdir(grep_directory):
        logging.error("Grep directory '{grep_directory}' doesn't exist".format(
            grep_directory=grep_directory, ))
        sys.exit(1)

    sample_keys = {}
    for s_key, s_value in samples.items():
        if sample_name_re.match(s_value['process_name_specific']):
            sample_keys[s_key] = s_value['process_name_specific']
Example #11
   logging.error("Unable to parse output directory from file: %s" % crab_logfile)
   continue
 if nof_jobs < 0:
   logging.error("Unable to parse total number of jobs from file: %s" % crab_logfile)
   continue
 version = os.path.basename(output_dir)
 version_date = version.split('_')[1]
 prefix = '{}_{}'.format(version, chunk_str) if chunk_str else version
 userName = os.path.basename(os.path.dirname(output_dir))
 dataset_requestName = '%s__%s' % (dataset_match.group(1), dataset_match.group(2))
 requestName = '%s_%s' % (prefix, dataset_requestName)
 max_requestname_len = 160 - len(userName)
 if len(requestName) > max_requestname_len:
   requestName = requestName[:max_requestname_len]
 crab_path = os.path.join('/hdfs', 'cms', output_dir[1:], dataset_match.group(1), requestName)
 if hdfs.isdir(crab_path):
   logging.debug("Found directory: {}".format(crab_path))
   subdirs = hdfs.listdir(crab_path)
   if len(subdirs) != 1:
     logging.error("Expected exactly one subdir in {} but found {}: {}".format(
       crab_path, len(subdirs), ', '.join(subdirs)
     ))
     continue
   subdir = subdirs[0]
   root_files = [
     root_file for subsubdir in hdfs.listdir(subdir) for root_file in hdfs.listdir(subsubdir) if root_file.endswith('.root')
   ]
   root_idxs = set(map(lambda fn: int(TREE_REGEX.match(os.path.basename(fn)).group('idx')), root_files))
   assert(not (root_idxs & expected_fails))
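   # jobs that are known to fail are counted as done when estimating the completion percentage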
   root_idxs = root_idxs | expected_fails
   nof_completed = len(root_idxs) * 100. / nof_jobs
Example #12
def get_hadd_stage2(input_paths):
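    # walk each input path level by level (6-10 path components) down to the
    # 'hadd' subdirectories, keep the single hadd_stage2 ROOT file found in
    # each, and index the results by channel and region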
    results = {}
    for input_path in input_paths:
        path_split = [
            subpath for subpath in input_path.split(os.path.sep)
            if subpath != ''
        ]
        nof_levels = len(path_split)
        if not (5 < nof_levels < 11):
            raise ValueError("Invalid path: %s" % input_path)

        current_paths = [input_path]
        if nof_levels == 6:
            assert (len(current_paths) == 1)
            current_path = os.path.join(current_paths.pop(), 'histograms')
            if not hdfs.isdir(current_path):
                return []
            current_paths = [current_path]
            nof_levels += 1
        if nof_levels == 7:
            assert (len(current_paths) == 1)
            current_path = current_paths.pop()
            current_paths = hdfs.listdir(current_path)
            nof_levels += 1
        if nof_levels == 8:
            next_paths = []
            for current_path in current_paths:
                region_paths = hdfs.listdir(current_path)
                for region_path in region_paths:
                    next_paths.append(region_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 9:
            next_paths = []
            for current_path in current_paths:
                for next_path in hdfs.listdir(current_path):
                    next_path_basename = os.path.basename(next_path)
                    if next_path_basename == 'hadd':
                        next_paths.append(next_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 10:
            next_paths = []
            for current_path in current_paths:
                candidate_files = []
                for candidate_file in hdfs.listdir(current_path):
                    if not hdfs.isfile(candidate_file):
                        continue
                    candidate_file_basename = os.path.basename(candidate_file)
                    if candidate_file_basename.startswith('hadd_stage2') and \
                       not HADD_STAGE2_RE.match(candidate_file_basename.split('.')[0]):
                        candidate_files.append(candidate_file)
                if candidate_files:
                    assert (len(candidate_files) == 1)
                    next_paths.append(candidate_files[0])
            current_paths = next_paths
        for current_path in current_paths:
            current_path_split = [
                subpath for subpath in current_path.split(os.path.sep)
                if subpath != ''
            ]
            channel = current_path_split[7]
            region = current_path_split[8]
            channel_region = '{}_{}'.format(channel, region)
            if channel_region in results:
                raise RuntimeError(
                  "Found two paths corresponding to the same channel (%s) and region (%s): %s and %s" % \
                  (channel, region, current_path, results[channel_region])
                )
            results[channel_region] = current_path
            logging.debug(
                'Found hadd stage2 file corresponding to channel {} and region {}: {}'
                .format(channel, region, current_path))

    return [results[k] for k in sorted(results.keys())]
Example #13
        type=str,
        dest='output',
        metavar='file',
        required=True,
        help='R|Output file',
    )
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        help='R|Enable verbose output',
    )
    args = parser.parse_args()
    logging.getLogger().setLevel(
        logging.DEBUG if args.verbose else logging.INFO)

    output_dir = os.path.dirname(os.path.abspath(args.output))
    if not hdfs.isdir(output_dir):
        raise RuntimeError("No such directory: %s" % output_dir)

    fns = get_hadd_stage2(args.input)
    foptr = ROOT.TFile.Open(args.output, 'recreate')
    logging.info('Creating file: {}'.format(foptr.GetName()))
    for fn in fns:
        copy_dirs(fn, foptr)
    logging.info('Writing file: {}'.format(foptr.GetName()))
    foptr.Write()
    foptr.Close()
Example #14
  rle_filename = args.input
  out_filename = os.path.abspath(args.output)
  grep_dir     = args.directory
  sample_name  = args.sample_name
  force        = args.force
  debug_output = args.debug
  nof_files    = args.nof_files

  # check if input RLE file exists
  if not hdfs.isfile(rle_filename):
    logging.error("File {rle_filename} does not exist or is not a file!".format(rle_filename = rle_filename))
    sys.exit(1)

  # check if the directory into which we have to write the output ROOT file already exists
  out_parent_dir = os.path.dirname(out_filename)
  if not hdfs.isdir(out_parent_dir):
    if not force:
      logging.error("Parent directory of the output file {out_filename} does not exist".format(
        out_filename = out_filename),
      )
      sys.exit(1)
    else:
      logging.debug("Output directory {out_parent_dir} does not exist, attempting to create one".format(
        out_parent_dir = out_parent_dir,
      ))
      try:
        hdfs.mkdirs(out_parent_dir)
      except IOError as err:
        logging.error("Could not create directory {out_parent_dir}: {err}".format(
          out_parent_dir = out_parent_dir, err = err,
        ))
        sys.exit(1)