Example #1
def has_rles(input_filename, rles):
  if not hdfs.isfile(input_filename):
    raise RuntimeError("No such file: %s" % input_filename)
  input_file = ROOT.TFile.Open(input_filename, 'read')
  assert(input_file)
  events_tree = input_file.Get('Events')
  assert (events_tree)
  run_branch = array.array('I', [0])
  ls_branch = array.array('I', [0])
  events_branch = array.array('L', [0])
  events_tree.SetBranchAddress('run', run_branch)
  events_tree.SetBranchAddress('luminosityBlock', ls_branch)
  events_tree.SetBranchAddress('event', events_branch)

  rle_matches = []
  nof_events = events_tree.GetEntries()
  for idx in range(nof_events):
    events_tree.GetEntry(idx)
    rle = ':'.join(map(lambda x: str(x[0]), [ run_branch, ls_branch, events_branch ]))
    if rle in rles:
      rle_matches.append(rle)
    if len(rle_matches) == len(rles):
      break
  input_file.Close()
  return rle_matches
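
A minimal usage sketch; the Ntuple path and run:lumi:event strings below are hypothetical and only illustrate the expected argument formats:

# Hypothetical example: check which of the requested run:lumi:event numbers appear in the Ntuple
wanted_rles = ['273158:318:559637861', '273158:318:559637862']
matches = has_rles('/hdfs/local/example/tree_1.root', wanted_rles)
print('Found %d of %d requested events' % (len(matches), len(wanted_rles)))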
Example #2
def get_evt_subdir_names(fn):
    assert (hdfs.isfile(fn))
    fptr = ROOT.TFile.Open(fn, 'read')
    assert (fptr)
    logging.info('Opened file {}'.format(fptr.GetName()))

    results = {}
    root_keys = [k.GetName() for k in fptr.GetListOfKeys()]
    for root_key in root_keys:
        dir_cand = fptr.Get(root_key)
        if type(dir_cand) != ROOT.TDirectoryFile:
            continue
        evt_dir_name = os.path.join(root_key, 'sel', 'evt')
        evt_dir = fptr.Get(evt_dir_name)
        if not evt_dir:
            continue
        evt_dir_keys = sorted(
            os.path.join(evt_dir_name, k.GetName())
            for k in evt_dir.GetListOfKeys()
            if k.GetName().startswith('htxs_'))
        results[root_key] = evt_dir_keys

    logging.info('Closing file: {}'.format(fptr.GetName()))
    fptr.Close()
    return results
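
A short usage sketch illustrating the shape of the returned mapping (channel directory name to a sorted list of 'htxs_*' subdirectory paths); the file name is hypothetical:

# Hypothetical example
subdirs = get_evt_subdir_names('/hdfs/local/example/hadd_stage2.root')
for channel_dir, htxs_dirs in subdirs.items():
    print('{} -> {} htxs_* subdirectories'.format(channel_dir, len(htxs_dirs)))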
Example #3
def project(input_file, output_file, binnings):
    if not hdfs.isfile(input_file):
        raise RuntimeError('No such file: %s' % input_file)
    root_file = ROOT.TFile.Open(input_file, 'read')
    if not root_file:
        print('Unable to read file %s' % input_file)
        return False
    events = root_file.Get('Events')
    assert (events)

    histograms = []
    for branch_name, binning_array in binnings.items():
        binning = array.array('f', binning_array)
        histogram = ROOT.TH1F(branch_name, branch_name,
                              len(binning) - 1, binning)
        assert (histogram)
        events.Project(branch_name, branch_name)
        histograms.append(histogram)

    out_file = ROOT.TFile.Open(output_file, 'recreate')
    out_file.cd()
    for histogram in histograms:
        histogram.Write()

    out_file.Close()
    root_file.Close()
    return True
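
The binnings argument maps branch names to lists of bin edges. A minimal sketch with illustrative branch names and edges (not taken from the original code):

# Hypothetical binning specification: branch name -> bin edges
example_binnings = {
    'MET_pt' : [0., 20., 40., 60., 100., 200.],
    'nJet'   : [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5],
}
project('/hdfs/local/example/tree_1.root', 'projections.root', example_binnings)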
Example #4
def find_hadd_stage_files(input_path, regions, find_hadd_stage1):
    path_split = [
        subpath for subpath in input_path.split(os.path.sep) if subpath != ''
    ]
    nof_levels = len(path_split)
    if not (5 < nof_levels < 11):
        raise ValueError("Invalid path: %s" % input_path)

    current_paths = [input_path]
    if nof_levels == 6:
        assert (len(current_paths) == 1)
        current_path = os.path.join(current_paths.pop(), 'histograms')
        if not hdfs.isdir(current_path):
            return []
        current_paths = [current_path]
        nof_levels += 1
    if nof_levels == 7:
        assert (len(current_paths) == 1)
        current_path = current_paths.pop()
        current_paths = hdfs.listdir(current_path)
        nof_levels += 1
    if nof_levels == 8:
        next_paths = []
        for current_path in current_paths:
            region_paths = hdfs.listdir(current_path)
            for region_path in region_paths:
                if os.path.basename(region_path).startswith(
                        tuple(ANALYSIS_REGIONS[region] for region in regions)):
                    next_paths.append(region_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 9:
        next_paths = []
        for current_path in current_paths:
            for next_path in hdfs.listdir(current_path):
                next_path_basename = os.path.basename(next_path)
                # keep non-'hadd' subdirectories when looking for hadd stage-1 files, and the 'hadd' subdirectory otherwise
                if find_hadd_stage1 == (next_path_basename != 'hadd'):
                    next_paths.append(next_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 10:
        next_paths = []
        for current_path in current_paths:
            candidate_files = []
            metadata = extract_metadata(current_path)
            if metadata['region_key'] not in regions:
                continue
            for candidate_file in hdfs.listdir(current_path):
                if not hdfs.isfile(candidate_file):
                    continue
                if is_hadd_stage_file(candidate_file, find_hadd_stage1,
                                      metadata):
                    candidate_files.append(candidate_file)
            if candidate_files:
                assert (len(candidate_files) == 1)
                next_paths.append(candidate_files[0])
        current_paths = next_paths
    return current_paths
Example #5
def skim_debug(out_filename, rle_list, tree_name = "tree"):
  '''Checks if the skimming was successful by comparing the RLE numbers in the output file to the given list of RLE numbers
  Args:
    out_filename: string,       Path to the file whose RLE numbers are compared against the RLE list
    rle_list:     string array, List of RLE numbers as strings
    tree_name:    string,       TTree name (default: tree)

  Returns:
    True,  if the RLE numbers in the file exactly match the given input list of RLE numbers
    False, otherwise
  '''
  logging.debug("Checking if {out_filename} contains exactly the same events as provided by the RLE file".format(
    out_filename = out_filename,
  ))
  if not hdfs.isfile(out_filename):
    return False

  out_rle_list = get_rle(out_filename, tree_name)

  missing_from_file = list(set(rle_list) - set(out_rle_list))
  excess_in_file    = list(set(out_rle_list) - set(rle_list))

  ret_val = True
  if missing_from_file:
    logging.error("There are {nof_missing} events missing from {out_filename}: {list_of_missing_events}".format(
      nof_missing            = len(missing_from_file),
      out_filename           = out_filename,
      list_of_missing_events = ', '.join(missing_from_file),
    ))
    ret_val = False
  if excess_in_file:
    logging.error("There are {nof_excess} event in excess in the file {out_filename}: {list_of_excess_events}".format(
      nof_excess            = len(excess_in_file),
      out_filename          = out_filename,
      list_of_excess_events = ', '.join(excess_in_file),
    ))
    ret_val = False

  return ret_val
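
A usage sketch for the check described in the docstring; the file path, tree name and RLE strings are hypothetical:

# Hypothetical example: verify that the skimmed file contains exactly the requested events
expected_rles = ['273158:318:559637861', '273158:319:560112171']
if not skim_debug('/hdfs/local/example/skimmed.root', expected_rles, tree_name = 'Events'):
  logging.error("Skimming produced an unexpected set of events")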
Example #6
def exists(filename):
    if filename.startswith('/eos'):
        try:
            logging.debug('Trying eos on %s' % filename)
            result = cmd_execute('eos stat %s' % filename)
            return result.split()[1].replace('`', '').replace("'",
                                                              '') == filename
        except Exception as err:
            raise ValueError('Cannot access file %s on eos because: %s' %
                             (filename, err))
    elif filename.startswith('root://eoscms.cern.ch/'):
        try:
            logging.debug('Trying XRD on %s' % filename)
            filename_noprefix = filename.replace('root://eoscms.cern.ch/', '')
            result = cmd_execute('xrdfs root://eoscms.cern.ch stat %s' %
                                 filename_noprefix)
            return result.split()[1] == filename_noprefix
        except Exception as err:
            raise ValueError('Cannot access file %s on eos because: %s' %
                             (filename, err))
    else:
        logging.debug('Trying local file system on %s' % filename)
        return hdfs.isfile(filename)
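
A sketch showing how each of the three path schemes is dispatched; the paths are illustrative only:

# Hypothetical paths illustrating the three dispatch branches
exists('/eos/cms/store/user/someone/file.root')                # checked with `eos stat`
exists('root://eoscms.cern.ch//store/user/someone/file.root')  # checked with `xrdfs root://eoscms.cern.ch stat`
exists('/hdfs/local/someone/file.root')                        # falls through to hdfs.isfile()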
Example #7
def testIsNotFile(self):
    # The function returns False even when the path does not exist on the file system at all
    self.assertFalse(hdfs.isfile(self.nonExistingHDFSfile))
    self.assertFalse(hdfs.isfile(self.nonExistingHomeFile))
Example #8
def testIsFile(self):
    self.assertTrue(hdfs.isfile(self.userHDFSfile))
    self.assertTrue(hdfs.isfile(self.userHomeFile))
Example #9
def find_parents(input_file, input_rles):
  if input_file.startswith('/store'):
    return []
  if not hdfs.isfile(input_file):
    raise RuntimeError("No such file: %s" % input_file)
  if not all(RLE_REGEX.match(rle) for rle in input_rles):
    raise RuntimeError("Not all input run:lumi:event numbers conform to the expected format")

  input_file_basename = os.path.basename(input_file)
  tree_match = TREE_REGEX.match(input_file_basename)
  if not tree_match:
    raise RuntimeError("Not a valid Ntuple: %s" % input_file)

  tree_idx = int(tree_match.group('idx'))
  assert(tree_idx > 0)

  parent_candidates = []
  if input_file.startswith('/hdfs/local'):
    input_file_split = input_file.split(os.path.sep)
    assert(len(input_file_split) == 11)
    process_name = input_file_split[-3]
    version = input_file_split[-5]
    era = input_file_split[-6]

    modes = [ mode for mode in ALLOWED_MODES.keys() if version.endswith(mode) ]
    if len(modes) != 1:
      raise RuntimeError("Unable to deduce mode from input path: %s" % input_file)
    mode = modes[0]
    version_no_mode = version[:-len(mode) - 1]
    nom_signifier = version_no_mode.split('_')[-1]
    version_no_mode_nom = version_no_mode[:-len(nom_signifier) - 1]
    presel_signifier = version_no_mode_nom.split('_')[-1]
    sample_base = ALLOWED_MODES[mode]['base']
    sample_suffix = ALLOWED_MODES[mode]['suffix']
    if presel_signifier == 'wPresel':
      sample_suffix = 'preselected_{}'.format(sample_suffix) if mode == 'all' else '{}_preselected'.format(sample_suffix)
    samples = load_samples(era, True, base = sample_base, suffix = sample_suffix)
    dbs_key = ''
    for sample_key, sample_info in samples.items():
      if sample_key == 'sum_events':
        continue
      if sample_info['process_name_specific'] == process_name:
        dbs_key = sample_key
        break
    if not dbs_key:
      raise RuntimeError("Unable to find an entry from sample dictionary that corresponds to file: %s" % input_file)
    sample_nfiles = samples[dbs_key]['nof_files']
    if sample_nfiles < tree_idx:
      raise RuntimeError(
        "Tree index found from input path %s larger than expected number of Ntuples: %d" % (input_file, sample_nfiles)
      )
    if presel_signifier == 'wPresel':
      parent_samples = load_samples(era, True, base = sample_base,
        suffix = sample_suffix.replace('preselected_', '').replace('_preselected', '')
      )
      parent_sample = parent_samples[dbs_key]

    elif presel_signifier == 'woPresel':
      parent_samples = load_samples(era, False, base = sample_base)
      parent_sample = parent_samples[dbs_key]
    else:
      raise RuntimeError("Invalid preselection signifier found from input file %s: %s" % (input_file, presel_signifier))
    parent_sample_nfiles = parent_sample['nof_files']
    parent_sample_path = parent_sample['local_paths'][0]['path']
    parent_sample_blacklist = parent_sample['local_paths'][0]['blacklist']
    assert(parent_sample_nfiles >= sample_nfiles)
    whitelisted_indices = [ idx for idx in range(1, parent_sample_nfiles + 1) if idx not in parent_sample_blacklist ]
    len_whitelisted_indices = len(whitelisted_indices)
    if len_whitelisted_indices == sample_nfiles:
      # it's 1-1 correspondence
      parent_candidate = os.path.join(parent_sample_path, "%04d" % (tree_idx // 1000), 'tree_%d.root' % tree_idx)
      rle_matches = has_rles(parent_candidate, input_rles)
      if len(rle_matches) == len(input_rles):
        parent_candidates.append((parent_candidate, rle_matches))
      else:
        raise RuntimeError("Unable to find parent for: %s" % input_file)
    elif len_whitelisted_indices > sample_nfiles:
      # partition
      chunk_len = int(math.ceil(float(len_whitelisted_indices) / sample_nfiles))
      chunks = [ whitelisted_indices[idx:idx + chunk_len] for idx in range(0, len_whitelisted_indices, chunk_len) ]
      assert(len(chunks) == sample_nfiles)
      parent_chunk = chunks[tree_idx - 1]
      for parent_idx in parent_chunk:
        parent_candidate = os.path.join(parent_sample_path, "%04d" % (parent_idx // 1000), 'tree_%d.root' % parent_idx)
        rle_matches = has_rles(parent_candidate, input_rles)
        if rle_matches:
          parent_candidates.append((parent_candidate, rle_matches))
    else:
      raise RuntimeError("Fewer parent Ntuples than sibling Ntuples for the Ntuple: %s" % input_file)
  elif input_file.startswith('/hdfs/cms/store/user'):
    input_file_dirname = os.path.dirname(input_file)
    log_file = os.path.join(input_file_dirname, 'log', 'cmsRun_{}.log.tar.gz'.format(tree_idx))
    if hdfs.isfile(log_file):
      tar = tarfile.open(log_file, 'r:gz')
      tar_contents = tar.getnames()
      xml_filename = 'FrameworkJobReport-{}.xml'.format(tree_idx)
      if xml_filename in tar_contents:
        xml_tarmember = tar.getmember(xml_filename)
        xml_file = tar.extractfile(xml_tarmember)
        xml_contents = xml_file.read()
        xml_tree = ET.ElementTree(ET.fromstring(xml_contents))
        last_lfn = ''
        matched_ls = []
        expected_ls = { int(rle.split(':')[1]) : rle for rle in input_rles }
        for elem in xml_tree.iter():
          if elem.tag == 'Inputs' or len(expected_ls) == len(matched_ls):
            break
          if elem.tag == 'LFN':
            if last_lfn and matched_ls:
              parent_candidates.append((last_lfn, matched_ls))
            last_lfn = elem.text
            matched_ls = []
          elif elem.tag == 'LumiSection':
            ls = int(elem.attrib['ID'])
            if ls in expected_ls:
              matched_ls.append(expected_ls[ls])
        if last_lfn and matched_ls:
          parent_candidates.append((last_lfn, matched_ls))
      tar.close()
  else:
    raise RuntimeError("Invalid path: %s" % input_file)
  return parent_candidates
Example #10
create_output_dir(output_file)

if plot_files:
    for plot_file in plot_files:
        if not plot_file.endswith(ACCEPTED_PLOT_EXTS_TUPLE):
            raise RuntimeError(
                "Expected extensions %s, instead of whatever this is: %s" %
                (', '.join(ACCEPTED_PLOT_EXTS), plot_file))
        create_output_dir(plot_file)

input_files = []
with open(input_txt_file, 'r') as input_file_ptr:
    for line in input_file_ptr:
        input_file_cand = line.rstrip()
        if not hdfs.isfile(input_file_cand):
            raise RuntimeError("No such file: %s" % input_file_cand)
        input_files.append(input_file_cand)
logging.info("Found {} input files".format(len(input_files)))

weights_map = {}


def record_weights(file_name):
    fptr = ROOT.TFile.Open(file_name, 'read')
    tree = fptr.Get('Events')

    genWeight = array.array('f', [0.])
    tree.SetBranchAddress(GENWEIGHT_NAME, genWeight)

    tree.SetBranchStatus("*", 0)
Example #11
        '-v',
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        help='R|Verbose output',
    )
    args = parser.parse_args()
    input_file_names = args.input
    output_dir = os.path.abspath(args.output)
    rles = args.rle
    logging.getLogger().setLevel(
        logging.DEBUG if args.verbose else logging.INFO)

    for input_file_name in input_file_names:
        if not hdfs.isfile(input_file_name):
            raise ValueError("No such file: %s" % input_file_name)

    for rle in rles:
        assert (re.match(r'^\d+:\d+:\d+$', rle))

    mtable = MassTable()

    for input_file_name in input_file_names:
        graph_map = get_graph(input_file_name, rles, mtable)
        for rle in graph_map:
            output_file_filename = '{}-{}.png'.format(
                os.path.splitext(os.path.basename(input_file_name))[0],
                rle.replace(':', '-'))
            output_file_name = os.path.join(output_dir, output_file_filename)
            save_graph(graph_map[rle], output_file_name, args.keep)
Example #12
    def waitForJobs(self):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """
        text_line = '-' * 120

        # Set a delimiter, which distinguishes entries b/w different jobs
        delimiter = ','
        # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
        # 1) squeue -h -u {{user}} -o '%i %256k'
        #      Collects the list of running jobs
        #        a) -h omits header
        #        b) -u {{user}} looks only for jobs submitted by {{user}}
        #        c) -o '%i %256k' specifies the output format
        #           i)  %i -- job ID (1st column)
        #           ii) %256k -- comment with width of 256 characters (2nd column)
        #               If the job has no comments, the entry simply reads (null)
        # 2) grep {{comment}}
        #       Filter the jobs by the comment which must be unique per sbatchManager instance at all times
        # 3) awk '{print $1}'
        #       Filter only the jobIds out
        # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
        #       Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
        command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                           "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
        command = jinja2.Template(command_template).render(
            user=self.user,
            pool_id_length=self.max_pool_id_length,
            comment=self.pool_id,
            delimiter=delimiter)
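        # For illustration only (hypothetical values: user='jdoe', max_pool_id_length=256, pool_id='pool_abc', delimiter=','),
        # the rendered command would read roughly:
        #   squeue -h -u jdoe -o '%i %256k' | grep pool_abc | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'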

        # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
        # even if some of them have already finished
        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        while nofJobs_left > 0:
            # Get the list of jobs submitted to batch system and convert their jobIds to a set
            poll_result, poll_result_err = '', ''
            while True:
                poll_result, poll_result_err = run_cmd(command,
                                                       do_not_log=False,
                                                       return_stderr=True)
                if not poll_result and poll_result_err:
                    logging.warning(
                        'squeue caught an error: {squeue_error}'.format(
                            squeue_error=poll_result_err))
                else:
                    break
                # sleep a minute and then try again
                # in principle we could limit the number of retries, but hopefully that's not necessary
                logging.debug("sleeping for %i seconds." % 60)
                time.sleep(60)
            polled_ids = set()
            if poll_result != '':
                polled_ids = set(poll_result.split(delimiter))

            # Check if number of jobs submitted to batch system is below maxSubmittedJobs;
            # if it is, take jobs from queuedJobs list and submit them,
            # until a total of maxSubmittedJobs is submitted to batch system
            nofJobs_toSubmit = min(len(self.queuedJobs),
                                   self.maxSubmittedJobs - len(polled_ids))
            if nofJobs_toSubmit > 0:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs."
                    .format(len(polled_ids), len(self.queuedJobs),
                            nofJobs_toSubmit))
            else:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing."
                    .format(len(polled_ids), len(self.queuedJobs)))
            for i in range(0, nofJobs_toSubmit):
                # randomly submit a job from the queue
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_idx = len(self.queuedJobs) - 1
                random_idx = random.randint(0, max_idx)
                job = self.queuedJobs.pop(random_idx)
                job['status'] = Status.submitted
                job_id = self.submit(job['sbatch_command'])
                self.submittedJobs[job_id] = job

            # Now check status of jobs submitted to batch system:
            # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
            # jobs that have finished already
            finished_ids = list(jobIds_set - polled_ids)

            # Do not poll anything if currently there are no finished jobs
            if finished_ids:
                # Based on the job's exit code, decide whether the job has failed or completed successfully
                # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
                # Therefore, we want to restrict the output by grepping specific job IDs
                # There's another problem with that: the length of a bash command is limited by ARG_MAX kernel variable,
                # which is of order 2e6
                # This means that we have to split the job IDs into chunks each of which we have to check separately
                finished_ids_chunks = [
                    finished_ids[i:i + self.max_nof_greps]
                    for i in range(0, len(finished_ids), self.max_nof_greps)
                ]
                for finished_ids_chunk in finished_ids_chunks:
                    completion = self.check_job_completion(finished_ids_chunk)
                    completed_jobs, running_jobs, failed_jobs = [], [], []
                    for job_id, details in completion.items():
                        if details.status == Status.completed:
                            completed_jobs.append(job_id)
                        elif details.status == Status.running:
                            running_jobs.append(job_id)
                        else:
                            failed_jobs.append(job_id)
                    # If there are any failed jobs, throw
                    if failed_jobs:

                        failed_jobs_str = ','.join(failed_jobs)
                        errors = [
                            completion[job_id].status for job_id in failed_jobs
                        ]
                        logging.error(
                            "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}"
                            .format(
                                jobIds=failed_jobs_str,
                                reasons=', '.join(map(Status.toString,
                                                      errors)),
                            ))

                        # Let's print a table where the first column corresponds to the job ID
                        # and the second column lists the exit code, the derived exit code, the status
                        # and the classification of the failed job
                        logging.error("Error table:")
                        for job_id in failed_jobs:
                            sys.stderr.write(
                                "{jobId} {exitCode} {derivedExitCode} {state} {status}\n"
                                .format(
                                    jobId=job_id,
                                    exitCode=completion[job_id].exit_code,
                                    derivedExitCode=completion[job_id].
                                    derived_exit_code,
                                    state=completion[job_id].state,
                                    status=Status.toString(
                                        completion[job_id].status),
                                ))

                        sys.stderr.write('%s\n' % text_line)
                        for failed_job in failed_jobs:
                            for log in zip(['wrapper', 'executable'],
                                           ['log_wrap', 'log_exec']):
                                logfile = self.submittedJobs[failed_job][
                                    log[1]]
                                if hdfs.isfile(logfile):
                                    logfile_contents = open(logfile,
                                                            'r').read()
                                else:
                                    logfile_contents = '<file is missing>'
                                sys.stderr.write(
                                    'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'
                                    .format(
                                        id=failed_job,
                                        description=log[0],
                                        path=logfile,
                                        log=logfile_contents,
                                        line=text_line,
                                    ))

                            if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                               completion[failed_job].status == Status.io_error:
                                # The job is eligible for resubmission if the job hasn't been resubmitted more
                                # than a preset limit of resubmissions AND if the job failed due to I/O errors
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=failed_job,
                                        args=self.submittedJobs[failed_job]
                                        ['args'],
                                        reason=Status.toString(
                                            completion[failed_job].status),
                                        attempt=self.submittedJobs[failed_job]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[failed_job]['args'])
                                # The old ID must be deleted, b/c otherwise it would be used to compare against
                                # squeue output and we would resubmit the failed job ad infinitum
                                del self.submittedJobs[failed_job]
                            else:
                                # We've exceeded the maximum number of resubmissions -> fail the workflow
                                raise Status.raiseError(
                                    completion[failed_job].status)
                    else:
                        logging.debug(
                            "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}"
                            .format(
                                completedIds=','.join(completed_jobs),
                                runningInfo='(%s still running)' %
                                ','.join(running_jobs) if running_jobs else '',
                            ))
                    # Mark successfully finished jobs as completed so that we won't request their status code again
                    # Otherwise they would still be in the 'submitted' state
                    for job_id in completed_jobs:
                        if not all(
                                map(
                                    lambda outputFile: is_file_ok(
                                        outputFile,
                                        validate_outputs=True,
                                        min_file_size=self.min_file_size), self
                                    .submittedJobs[job_id]['outputFiles'])):
                            if self.submittedJobs[job_id][
                                    'nof_submissions'] < self.max_resubmissions:
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                    "-> resubmission attempt #{attempt}".
                                    format(
                                        id=job_id,
                                        args=self.submittedJobs[job_id]
                                        ['args'],
                                        attempt=self.submittedJobs[job_id]
                                        ['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[job_id]['args'])
                                del self.submittedJobs[job_id]
                            else:
                                raise ValueError(
                                    "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                    "file {output} yet the job still exits w/o any errors"
                                    .format(
                                        id=job_id,
                                        output=', '.join(
                                            self.submittedJobs[job_id]
                                            ['outputFiles']),
                                    ))
                        else:
                            # Job completed just fine
                            self.submittedJobs[job_id][
                                'status'] = Status.completed

            jobIds_set = set([
                job_id for job_id in self.submittedJobs
                if self.submittedJobs[job_id]['status'] == Status.submitted
            ])
            nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
            if nofJobs_left > 0:
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_delay = 300
                random_delay = random.randint(0, max_delay)
                logging.debug("sleeping for %i seconds." % random_delay)
                time.sleep(self.poll_interval + random_delay)
            else:
                break
            logging.info(
                "Waiting for sbatch to finish (%d job(s) still left) ..." %
                nofJobs_left)
Example #13
def get_hadd_stage2(input_paths):
    results = {}
    for input_path in input_paths:
        path_split = [
            subpath for subpath in input_path.split(os.path.sep)
            if subpath != ''
        ]
        nof_levels = len(path_split)
        if not (5 < nof_levels < 11):
            raise ValueError("Invalid path: %s" % input_path)

        current_paths = [input_path]
        if nof_levels == 6:
            assert (len(current_paths) == 1)
            current_path = os.path.join(current_paths.pop(), 'histograms')
            if not hdfs.isdir(current_path):
                return []
            current_paths = [current_path]
            nof_levels += 1
        if nof_levels == 7:
            assert (len(current_paths) == 1)
            current_path = current_paths.pop()
            current_paths = hdfs.listdir(current_path)
            nof_levels += 1
        if nof_levels == 8:
            next_paths = []
            for current_path in current_paths:
                region_paths = hdfs.listdir(current_path)
                for region_path in region_paths:
                    next_paths.append(region_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 9:
            next_paths = []
            for current_path in current_paths:
                for next_path in hdfs.listdir(current_path):
                    next_path_basename = os.path.basename(next_path)
                    if next_path_basename == 'hadd':
                        next_paths.append(next_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 10:
            next_paths = []
            for current_path in current_paths:
                candidate_files = []
                for candidate_file in hdfs.listdir(current_path):
                    if not hdfs.isfile(candidate_file):
                        continue
                    candidate_file_basename = os.path.basename(candidate_file)
                    if candidate_file_basename.startswith('hadd_stage2') and \
                       not HADD_STAGE2_RE.match(candidate_file_basename.split('.')[0]):
                        candidate_files.append(candidate_file)
                if candidate_files:
                    assert (len(candidate_files) == 1)
                    next_paths.append(candidate_files[0])
            current_paths = next_paths
        for current_path in current_paths:
            current_path_split = [
                subpath for subpath in current_path.split(os.path.sep)
                if subpath != ''
            ]
            channel = current_path_split[7]
            region = current_path_split[8]
            channel_region = '{}_{}'.format(channel, region)
            if channel_region in results:
                raise RuntimeError(
                  "Found two paths corresponding to the same channel (%s) and region (%s): %s and %s" % \
                  (channel, region, current_path, results[channel_region])
                )
            results[channel_region] = current_path
            logging.debug(
                'Found hadd stage2 file corresponding to channel {} and region {}: {}'
                .format(channel, region, current_path))

    return [results[k] for k in sorted(results.keys())]
Example #14
        input_files.append(infile)
    else:
        with open(infile, 'r') as f:
            for line in f:
                line_stripped = line.rstrip('\n')
                if not line_stripped:
                    # empty line
                    continue
                if not line_stripped.endswith('.root'):
                    logging.warning(
                        'File %s does not appear to be a ROOT file' %
                        line_stripped)
                    continue
                line_path = '/hdfs%s' % line_stripped if line_stripped.startswith(
                    ('/local', '/cms')) else line_stripped
                if not hdfs.isfile(line_path):
                    logging.error('File %s does not exist, skipping' %
                                  line_path)
                    continue
                if line_path not in input_files:
                    # require the input files to be unique
                    input_files.append(line_path)
                logging.debug('Preparing job for file: %s' % line_path)

    # check if the script directory exists, and if not, create it
    if not os.path.isdir(script_dir):
        logging.info('Directory %s does not exist, attempting to create it' %
                     script_dir)
        try:
            os.makedirs(script_dir)
        except IOError as err:
Example #15
    allowed_systematics = args.systematics
    searchable_regions = args.regions
    show_by_nodes = args.node
    show_by_sample = args.sample
    show_htxs = args.htxs

    if len(allowed_decay_modes) > 1 and '' in allowed_decay_modes:
        raise ValueError("Conflicting values to 'decay_modes' parameter")

    input_file_names_hadd_stage1 = []
    input_file_names_hadd_stage2 = []
    for input_file_path in input_file_paths:
        if not input_file_path.startswith('/hdfs/local'):
            raise ValueError("Invalid path: %s" % input_file_path)

        if hdfs.isfile(input_file_path):
            input_file_abs_path = os.path.abspath(input_file_path)
            if is_hadd_stage_file(input_file_abs_path, True):
                input_file_names_hadd_stage1.append(input_file_abs_path)
            elif is_hadd_stage_file(input_file_abs_path, False):
                input_file_names_hadd_stage2.append(input_file_abs_path)
            else:
                raise ValueError(
                    "Not a valid hadd stage 1 or stage 2 file: %s" %
                    input_file_path)
        else:
            input_file_names_hadd_stage1.extend(
                find_hadd_stage_files(input_file_path, searchable_regions,
                                      True))
            input_file_names_hadd_stage2.extend(
                find_hadd_stage_files(input_file_path, searchable_regions,
Example #16
def get_evt_yields(input_file_name, results=None):
    if not results:
        results = collections.OrderedDict()
    metadata = extract_metadata(input_file_name)
    if not input_file_name:
        return results, metadata
    assert ('sample' in metadata)
    sample = metadata['sample']

    assert (hdfs.isfile(input_file_name))
    input_file = ROOT.TFile.Open(input_file_name, 'read')

    subdirectories = get_keys(
      input_file,
      exclude = lambda key: key in [ 'analyzedEntries', 'selectedEntries' ] or \
                            key.endswith(('_fake', '_nonfake'))
    )
    for whitelist in ['1l_1tau_Fakeable_wFakeRateWeights', '1l_1tau_Tight']:
        if whitelist in subdirectories:
            subdirectories = [whitelist]
    is_single_subcategory = len(subdirectories) == 1

    for subdirectory in subdirectories:
        evt_directory_path = os.path.join(subdirectory, 'sel', 'evt')
        evt_directory_ptr = get_dir(input_file, evt_directory_path)
        subcategory_name = '' if is_single_subcategory else subdirectory
        processes = get_keys(evt_directory_ptr,
                             exclude=lambda key: key.startswith(
                                 ('tHq', 'tHW', 'HH')) and 'kt_' in key)
        for process in processes:
            process_path = os.path.join(evt_directory_path, process)
            process_dir_ptr = get_dir(input_file, process_path)
            process_name, decay_mode, gen_match, htxs = parse_process(process)
            histogram_names = get_keys(
              process_dir_ptr,
              include = lambda key: EVENTCOUNTER in key or \
                                    OUTPUT_NN_RE_CAT.match(key) or \
                                    OUTPUT_NN_RE.match(key)
              )
            for histogram_name in histogram_names:
                histogram_path = os.path.join(process_path, histogram_name)
                histogram = input_file.Get(histogram_path)
                if type(histogram) != ROOT.TH1D:
                    raise RuntimeError(
                        "Object at '%s' in file %s is not an instance of ROOT.TH1D"
                        % (histogram_path, input_file_name))
                systematics = get_systematics(histogram_name)
                node, category = get_node_subcategory(histogram_name)
                event_count = int(histogram.GetEntries())
                event_yield = histogram.Integral()
                if systematics.startswith('FR'):
                    continue
                if subcategory_name not in results:
                    results[subcategory_name] = collections.OrderedDict()
                if process_name not in results[subcategory_name]:
                    results[subcategory_name][
                        process_name] = collections.OrderedDict()
                if sample not in results[subcategory_name][process_name]:
                    results[subcategory_name][process_name][
                        sample] = collections.OrderedDict()
                if decay_mode not in results[subcategory_name][process_name][
                        sample]:
                    results[subcategory_name][process_name][sample][
                        decay_mode] = collections.OrderedDict()
                if gen_match not in results[subcategory_name][process_name][
                        sample][decay_mode]:
                    results[subcategory_name][process_name][sample][
                        decay_mode][gen_match] = collections.OrderedDict()
                if htxs not in results[subcategory_name][process_name][sample][
                        decay_mode][gen_match]:
                    results[subcategory_name][process_name][sample][
                        decay_mode][gen_match][htxs] = collections.OrderedDict(
                        )
                if systematics not in results[subcategory_name][process_name][
                        sample][decay_mode][gen_match][htxs]:
                    results[subcategory_name][process_name][sample][
                        decay_mode][gen_match][htxs][
                            systematics] = collections.OrderedDict()
                if node not in results[subcategory_name][process_name][sample][
                        decay_mode][gen_match][htxs][systematics]:
                    results[subcategory_name][process_name][sample][
                        decay_mode][gen_match][htxs][systematics][
                            node] = collections.OrderedDict()
                if category in results[subcategory_name][process_name][sample][
                        decay_mode][gen_match][htxs][systematics][node]:
                    raise RuntimeError(
                        "Possible double-counting the event counts and yields by reading histogram '%s' in file %s: "
                        "subcategory name = %s, process name = %s, sample = %s, decay mode = %s, gen matching = %s, "
                        "htxs = %s, systematics = %s, node = %s, category = %s"
                        % (
                            histogram_path,
                            input_file_name,
                            subcategory_name,
                            process_name,
                            sample,
                            decay_mode,
                            gen_match,
                            htxs,
                            systematics,
                            node,
                            category,
                        ))
                results[subcategory_name][process_name][sample][decay_mode][
                    gen_match][htxs][systematics][node][category] = {
                        'count': event_count,
                        'yield': event_yield,
                    }
                del histogram
            del process_dir_ptr
        del evt_directory_ptr
    input_file.Close()
    return results, metadata
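
A hedged read-out sketch; the file path is hypothetical, and the comment spells out the nesting order of the dictionary assembled above:

# Hypothetical example; the returned structure is nested as
#   results[subcategory][process_name][sample][decay_mode][gen_match][htxs][systematics][node][category]
# and each leaf is a dict of the form {'count': <histogram entries>, 'yield': <histogram integral>}
yields, metadata = get_evt_yields('/hdfs/local/example/hadd_stage2.root')
for subcategory in yields:
    for process_name in yields[subcategory]:
        print('{} / {}'.format(subcategory, process_name))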
Example #17
    sample_name = args.sample_name
    output_file = args.output
    grep_directory = args.directory
    grep_individually = args.all
    try:
        sample_name_re = re.compile(sample_name)
    except re.error:
        logging.error(
            "Argument {arg} not a valid regex".format(arg=sample_name))
        sys.exit(1)

    if grep_individually and not grep_directory:
        logging.warning(
            'Option -a/--all has no effect unless you specify -d/--directory')

    if not hdfs.isfile(rle_file):
        logging.error("No such file: '{rle_filename}'".format(
            rle_filename=rle_file, ))
        sys.exit(1)

    if output_file and not hdfs.isdir(os.path.dirname(output_file)):
        logging.error(
            "Parent directory of '{output_file}' doesn't exist".format(
                output_file=output_file, ))
        sys.exit(1)

    if grep_directory and not hdfs.isdir(grep_directory):
        logging.error("Grep directory '{grep_directory}' doesn't exist".format(
            grep_directory=grep_directory, ))
        sys.exit(1)
Example #18
def plot(input_files, output_files, title, expected_neff, mode):
  histogram_dict = {}
  for sample_name, sample_entry in input_files.items():
    if not hdfs.isfile(sample_entry['input']):
      logging.error('Could not find file {}'.format(sample_entry['input']))
      continue
    root_file = ROOT.TFile.Open(sample_entry['input'], 'read')
    logging.debug('Opened file {}'.format(sample_entry['input']))
    root_directories = list(filter(
      lambda root_dir: root_dir != None, [
        root_file.Get(os.path.join(key.GetName(), mode, 'genEvt')) \
        for key in root_file.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
      ]
    ))
    if len(root_directories) != 1:
      raise RuntimeError('Expected single directory in %s' % sample_entry['input'])
    root_dir = root_directories[0]
    histogram_dirs = [
      root_dir.Get(key.GetName()) \
      for key in root_dir.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
    ]
    if len(histogram_dirs) != 1:
      raise RuntimeError(
        'Expected single directory containing lumiScale histograms in %s' % sample_entry['input']
      )
    histogram_dir = histogram_dirs[0]
    histograms = [
      key.GetName() for key in histogram_dir.GetListOfKeys() \
      if key.GetClassName().startswith('TH1') and 'lumiScale' in key.GetName()
    ]
    for histogram_name_actual in histograms:
      histogram_name = histogram_name_actual.replace('_lumiScale', '').replace('CMS_ttHl_', '') \
                       if histogram_name_actual != 'lumiScale' else 'central'
      histogram = histogram_dir.Get(histogram_name_actual).Clone()
      histogram.SetDirectory(0)
      if histogram.GetEntries() != sample_entry['nentries'] and mode == 'unbiased':
        raise RuntimeError('Expected {} entries from {} in file {}, but got {} entries'.format(
          sample_entry['nentries'], histogram_name, sample_entry['input'], histogram.GetEntries(),
        ))
      if histogram_name not in histogram_dict:
        histogram_dict[histogram_name] = {
          'histogram' : histogram,
          'nentries'  : histogram.GetEntries(),
          'nfiles'    : 1,
        }
      else:
        histogram_dict[histogram_name]['histogram'].Add(histogram)
        histogram_dict[histogram_name]['nentries'] += histogram.GetEntries()
        histogram_dict[histogram_name]['nfiles'] += 1

    root_file.Close()

  if not histogram_dict:
    logging.error('Could not find histograms for samples {}'.format(', '.join(list(input_files.keys()))))
    return

  if len(set(histogram_dict[histogram_name]['nfiles'] for histogram_name in histogram_dict)) != 1:
    raise RuntimeError(
      'Inconsistent number of files found for samples %s' % ', '.join(list(input_files.keys()))
    )
  if len(set(histogram_dict[histogram_name]['nentries'] for histogram_name in histogram_dict)) != 1:
    raise RuntimeError(
      'Inconsistent number of entries found in samples %s' % ', '.join(list(input_files.keys()))
    )

  min_y = -1
  max_y = -1
  nentries = -1
  for histograms in histogram_dict.values():
    histogram = histograms['histogram']
    y_content = histogram.GetBinContent(1)
    y_error   = histogram.GetBinError(1)

    y_down = y_content - y_error
    y_up   = y_content + y_error

    if min_y < 0:
      min_y = y_down
    if max_y < 0:
      max_y = y_up
    if y_down < min_y:
      min_y = y_down
    if y_up > max_y:
      max_y = y_up

    if nentries < 0:
      nentries = histograms['nentries']
    else:
      assert(nentries == histograms['nentries'])

    if not (y_down < expected_neff < y_up) and mode == 'unbiased':
      logging.warning(
        "Effective event count {} not within {} +- {}".format(expected_neff, y_content, y_error)
      )

  if mode == 'unbiased':
    min_y = min(min_y, expected_neff)
    max_y = max(max_y, expected_neff)
  diff = 0.2 * (max_y - min_y)
  min_y -= diff
  max_y += diff

  canvas = ROOT.TCanvas('c', 'c', 1200, 900)
  canvas.SetGrid()
  ROOT.gStyle.SetOptStat(0)

  legend = ROOT.TLegend(0.1, 0.7, 0.48, 0.9)
  legend.SetHeader('N_{eff} (%d entries)' % nentries)

  expected_histogram = None

  line_width = 3
  marker_style = 20
  fill_style = 4000

  lines = []

  for idx, histogram_name in enumerate(sorted(histogram_dict.keys())):
    histogram = histogram_dict[histogram_name]['histogram']
    color = 2 + idx

    histogram.SetTitle(title)
    histogram.SetAxisRange(min_y, max_y, "Y")
    histogram.SetLineColor(color)
    histogram.SetMarkerColor(color)
    histogram.SetLineWidth(line_width)
    histogram.SetMarkerStyle(marker_style)
    histogram.SetFillStyle(fill_style)
    histogram.Draw("l e1%s" % (" same" if idx > 0 else ""))

    y_content = histogram.GetBinContent(1)
    y_error   = histogram.GetBinError(1)
    y_up      = y_content + y_error
    y_down    = y_content - y_error

    bin_width  = histogram.GetBinWidth(1)
    bin_center = histogram.GetBinCenter(1)
    line_min_x = bin_center - bin_width / 4
    line_max_x = bin_center + bin_width / 4

    line_down = ROOT.TLine(line_min_x, y_down, line_max_x, y_down)
    line_down.SetLineColor(color)
    line_down.SetLineWidth(line_width)
    line_down.Draw()
    lines.append(line_down)

    line_up = ROOT.TLine(line_min_x, y_up, line_max_x, y_up)
    line_up.SetLineColor(color)
    line_up.SetLineWidth(line_width)
    line_up.Draw()
    lines.append(line_up)

    sig_digits = max(8 - int(math.ceil(math.log10(y_content))), 1) if y_content > 0. else 1
    leg_pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits)
    leg_name = leg_pattern % (histogram_name, y_content, y_error)
    legend.AddEntry(histogram, leg_name)

    logging.debug(
      'Effective event count for the sys unc option {} is {} +- {}'.format(
        histogram_name, y_content, y_error
      )
    )

    if not expected_histogram and mode == 'unbiased':
      expected_histogram = histogram.Clone()
      expected_histogram.Reset()
      expected_histogram.SetBinContent(1, expected_neff)
      expected_histogram.SetBinError(1, 0)
      expected_histogram.SetLineColor(ROOT.kBlack)
      expected_histogram.SetMarkerColor(ROOT.kBlack)
      expected_histogram.SetLineWidth(line_width)
      expected_histogram.SetMarkerStyle(marker_style)
      expected_histogram.SetLineStyle(9)
      expected_histogram.SetFillStyle(fill_style)

  if expected_histogram:
    logging.debug('Expecting {} events'.format(expected_neff))
    expected_histogram.Draw("e2 same")
    legend.AddEntry(expected_histogram, 'expected (%.1f)' % expected_neff)

  legend.Draw()

  for output_file in output_files:
    canvas.SaveAs(output_file)

  canvas.Close()
  legend.Delete()
  if expected_histogram:
    expected_histogram.Delete()
  for histogram_name in histogram_dict:
    histogram_dict[histogram_name]['histogram'].Delete()
  for line in lines:
    line.Delete()
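
A hedged sketch of the expected input_files structure (one entry per sample, each carrying an 'input' path and an 'nentries' count, as read above); all names and numbers below are illustrative:

# Hypothetical input: 'nentries' should equal the actual number of entries in the lumiScale histograms
example_inputs = {
  'ttHJetToNonbb' : { 'input' : '/hdfs/local/example/ttH.root', 'nentries' : 100000 },
}
plot(example_inputs, ['neff.png', 'neff.pdf'], 'N_{eff} comparison', expected_neff = 95000., mode = 'unbiased')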
Example #19
                      help = 'R|Enable verbose printout')
  args = parser.parse_args()

  if args.verbose:
    logging.getLogger().setLevel(logging.DEBUG)

  rle_filename = args.input
  out_filename = os.path.abspath(args.output)
  grep_dir     = args.directory
  sample_name  = args.sample_name
  force        = args.force
  debug_output = args.debug
  nof_files    = args.nof_files

  # check if input RLE file exists
  if not hdfs.isfile(rle_filename):
    logging.error("File {rle_filename} does not exist or is not a file!".format(rle_filename = rle_filename))
    sys.exit(1)

  # check if the directory into which we have to write the output ROOT file already exists
  out_parent_dir = os.path.dirname(out_filename)
  if not hdfs.isdir(out_parent_dir):
    if not force:
      logging.error("Parent directory of the output file {out_filename} does not exist".format(
        out_filename = out_filename),
      )
      sys.exit(1)
    else:
      logging.debug("Output directory {out_parent_dir} does not exist, attempting to create one".format(
        out_parent_dir = out_parent_dir,
      ))