def submit_resolution_dag(pairs_file, max_l1_pt, log_dir, append,
                          pu_bins, eta_bins, common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG for one pairs file.

    This will run makeResolutionPlots over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str, optional
        Pairs file to process. Must be the full path.

    max_l1_pt : int, optional
        Maximum L1 pt to consider when making plots.

    log_dir : str, optional
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str, optional
        String to append to filenames to track various settings (e.g. PU bin).

    pu_bins : list[list[int, int]], optional
        List of PU bin edges.

    eta_bins : list[float], optional
        List of eta bin edges, including upper edge of last bin.

    common_input_files : list[str], optional
        Extra input files needed by every job (e.g. JEC files).

    force_submit : bool, optional
        If True, forces job submission even if the proposed output files
        already exist.
        Otherwise, the program quits before submission.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:

        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)

        # Hold all output filenames
        res_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--excl', #'--maxPt', max_l1_pt,
                        #'--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]

            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])

            res_jobs.add_job(res_job)

        # Add inclusive bins (central, forward, all)
        # remove the [0:1] to do all three - currently central only because HF is broken
        for incl in ['central', 'forward', 'all'][0:1]:
            out_file = out_stem + "_%s" % incl + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--incl'] #, '--maxPt', max_l1_pt,
                        # '--PUmin', pu_min, '--PUmax', pu_max]
            if incl != 'all':
                job_args.append('--%s' % incl)

            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])

            res_jobs.add_job(res_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + res_output_files

        hadder = ht.Job(name='haddRes',
                        args=hadd_args,
                        input_files=res_output_files,
                        output_files=hadd_output)

        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        for job in res_jobs:
            res_dag.add_job(job)

        res_dag.add_job(hadder, requires=[j for j in res_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + res_output_files:
                if os.path.isfile(f):
                    print 'ERROR: output file already exists - not submitting'
                    print 'FILE:', f
                    return 1

        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
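

# Example usage of submit_resolution_dag -- a minimal sketch, assuming it is
# called from a submission script. The paths, append pattern, PU bins and eta
# bins below are hypothetical placeholders, not values from this project:
#
#   submit_resolution_dag(
#       pairs_file='/hdfs/DATASET/pairs/pairs_Example.root',
#       max_l1_pt=500,
#       log_dir='/storage/user/jobs/resolution',
#       append='_maxPt{maxL1Pt}_PU{puMin}to{puMax}',
#       pu_bins=[[0, 10], [15, 25], [30, 40]],
#       eta_bins=[0.0, 0.348, 0.695, 1.044, 1.392],
#       common_input_files=[],
#       force_submit=False)
#
# Each PU bin gets its own DAG; the printed DAGstatus.py command can then be
# used to monitor all of them.
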
def submit_matcher_dag(exe, ntuple_dir, log_dir, l1_dir, ref_dir, deltaR, ref_min_pt, cleaning_cut,
                       append, force_submit):
    """Submit one matcher DAG for one directory of ntuples.

    This will run `exe` over all Ntuple files and then hadd the results together.

    Parameters
    ----------
    exe : str
        Name of executable.

    ntuple_dir : str
        Name of directory with L1Ntuples to run over.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str
        String to append to filenames to track various settings (e.g. deltaR cut).

    l1_dir : str
        Name of TDirectory in Ntuple that holds L1 jets.

    ref_dir : str
        Name of TDirectory in Ntuple that holds reference jets.

    deltaR : float
        Maximum deltaR(L1, Ref) for a match.

    ref_min_pt : float
        Minimum pT cut on reference jets to be considered for matching.

    cleaning_cut : str
        Jet cleaning cut to apply, passed via the --cleanJets option.
        If None or empty, no jet cleaning is applied.

    force_submit : bool
        If True, forces job submission even if the proposed output files
        already exist.
        Otherwise, the program quits before submission.
    """
    # DAG for jobs
    stem = 'matcher_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
    matcher_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                            status_file=os.path.join(log_dir, '%s.status' % stem))

    # JobSet for each matching job
    log_stem = 'matcher.$(cluster).$(process)'

    matcher_jobs = ht.JobSet(exe=find_executable(exe),
                             copy_exe=True,
                             filename='submit_matcher.condor',
                             setup_script=None,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             share_exe_setup=True,
                             hdfs_store=ntuple_dir)

    # For creating filenames later
    fmt_dict = dict()

    # Hold all output filenames
    match_output_files = []

    # Additional files to copy across - JEC, etc
    common_input_files = []

    # Add matcher job for each ntuple file
    for ind, ntuple in enumerate(os.listdir(ntuple_dir)):
        # if ind > 10:
        #     break

        # Skip non-ntuple files
        if not ntuple.endswith('.root') or ntuple.startswith('pairs'):
            continue

        ntuple_abspath = os.path.join(ntuple_dir, ntuple)

        # Construct output name
        ntuple_name = os.path.splitext(ntuple)[0]
        # handle anything up to first underscore (L1Tree, L1Ntuple, ...)
        result = re.match(r'^[a-zA-Z0-9]*_', ntuple_name)
        if result:
            pairs_file = '%s_%s.root' % (ntuple_name.replace(result.group(), 'pairs_'),
                                         append.format(**fmt_dict))
        else:
            pairs_file = 'pairs_%s_%s.root' % (ntuple_name, append.format(**fmt_dict))
        out_file = os.path.join(ntuple_dir, pairs_file)
        match_output_files.append(out_file)

        # Add matching job
        job_args = ['-I', ntuple_abspath, '-O', out_file,
                    '--refDir', ref_dir, '--l1Dir', l1_dir,
                    '--draw 0', '--deltaR', deltaR, '--refMinPt', ref_min_pt]
        if cleaning_cut:
            job_args.extend(['--cleanJets', cleaning_cut])

        input_files = common_input_files + [ntuple_abspath]

        match_job = ht.Job(name='match_%d' % ind,
                           args=job_args,
                           input_files=input_files,
                           output_files=[out_file])

        matcher_jobs.add_job(match_job)
        matcher_dag.add_job(match_job)

    # Construct final filename
    # ---------------------------------------------------------------------
    final_file = 'pairs_%s_%s.root' % (os.path.basename(ntuple_dir.rstrip('/')),
                                       append.format(**fmt_dict))
    final_dir = os.path.join(os.path.dirname(ntuple_dir.rstrip('/')), 'pairs')
    cc.check_create_dir(final_dir, info=True)
    final_file = os.path.join(final_dir, final_file)
    log.info("Final file: %s", final_file)

    # Check if any of the output files already exists - maybe we mucked up?
    # ---------------------------------------------------------------------
    if not force_submit:
        for f in [final_file] + match_output_files:
            if os.path.isfile(f):
                raise RuntimeError('ERROR: output file already exists - not submitting.'
                                   '\nTo bypass, use -f flag. \nFILE: %s' % f)

    # Add in hadding jobs
    # ---------------------------------------------------------------------
    hadd_jobs = add_hadd_jobs(matcher_dag, matcher_jobs.jobs.values(), final_file, log_dir)

    # Add in job to delete individual and intermediate hadd files
    # ---------------------------------------------------------------------
    log_stem = 'matcherRm.$(cluster).$(process)'

    rm_jobs = ht.JobSet(exe='hadoop',
                        copy_exe=False,
                        filename='submit_matcherRm.condor',
                        out_dir=log_dir, out_file=log_stem + '.out',
                        err_dir=log_dir, err_file=log_stem + '.err',
                        log_dir=log_dir, log_file=log_stem + '.log',
                        cpus=1, memory='100MB', disk='10MB',
                        transfer_hdfs_input=False,
                        share_exe_setup=False,
                        hdfs_store=ntuple_dir)

    for i, job in enumerate(chain(matcher_jobs, hadd_jobs[:-1])):
        pairs_file = job.output_files[0]
        rm_job = ht.Job(name='rm%d' % i,
                        args=' fs -rm -skipTrash %s' % pairs_file.replace('/hdfs', ''))
        rm_jobs.add_job(rm_job)
        matcher_dag.add_job(rm_job, requires=hadd_jobs[-1])

    # Submit
    # ---------------------------------------------------------------------
    # matcher_dag.write()
    matcher_dag.submit()
    return matcher_dag.status_file
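

# Example usage of submit_matcher_dag -- a minimal sketch, assuming a matcher
# executable is available on the PATH. The executable name, directories and
# cut values below are hypothetical placeholders:
#
#   status_file = submit_matcher_dag(
#       exe='RunMatcher',
#       ntuple_dir='/hdfs/DATASET/L1Ntuples',
#       log_dir='/storage/user/jobs/matcher',
#       l1_dir='l1UpgradeEmuTree',
#       ref_dir='genTree',
#       deltaR=0.4,
#       ref_min_pt=10,
#       cleaning_cut=None,
#       append='dr0p4_ref10',
#       force_submit=False)
#   print 'DAGstatus.py', status_file
#
# Note that `append` is formatted with an empty dict here, so it should not
# contain any {placeholder} fields.
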
def add_hadd_jobs(dagman, jobs, final_file, log_dir):
    """Add necessary hadd jobs to DAG. All jobs will be hadded together to make
    `final_file`.

    DAGs can only accept a maximum number of arguments, so we have to split
    the hadd-ing into groups. Therefore we need an intermediate layer of hadd
    jobs, and then finally hadd those intermediate output files together.

    Parameters
    ----------
    dagman : DAGMan
        DAGMan object to add jobs to.

    jobs : list[Job]
        Collection of Jobs to be hadd-ed together.

    final_file : str
        Final hadd-ed filename.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files.

    Returns
    -------
    JobSet
        JobSet for hadd jobs.
    """
    group_size = 200  # max files per hadding job
    # adjust to avoid hadding 1 file by itself
    if len(jobs) % group_size == 0:
        group_size = 199
    # calculate number of intermediate hadd jobs required
    n_inter_jobs = int(math.ceil(len(jobs) * 1. / group_size))

    log_stem = 'matcherHadd.$(cluster).$(process)'

    hadd_jobs = ht.JobSet(exe='hadd',
                          copy_exe=False,
                          filename='haddBig.condor',
                          setup_script=None,
                          out_dir=log_dir, out_file=log_stem + '.out',
                          err_dir=log_dir, err_file=log_stem + '.err',
                          log_dir=log_dir, log_file=log_stem + '.log',
                          cpus=1, memory='100MB', disk='1GB',
                          transfer_hdfs_input=False,
                          share_exe_setup=True,
                          hdfs_store=os.path.dirname(final_file))

    if n_inter_jobs == 1:
        hadd_input = [j.output_files[0] for j in jobs]
        hadd_args = [final_file] + hadd_input
        hadd_job = ht.Job(name='finalHadd',
                          args=hadd_args,
                          input_files=hadd_input,
                          output_files=[final_file])
        hadd_jobs.add_job(hadd_job)
        dagman.add_job(hadd_job, requires=jobs)
    else:
        # Go through groups of Jobs, make intermediate hadd files in same dir
        # as final file
        intermediate_jobs = []
        for i, job_group in enumerate(grouper(jobs, group_size)):
            # Note, job_group is guaranteed to be length group_size, and is
            # padded with None if there aren't that many entries, so we need to
            # filter out NoneType
            job_group = filter(None, job_group)
            hadd_input = [j.output_files[0] for j in job_group]
            inter_file = 'hadd_inter_%d_%s.root' % (i, cc.rand_str(5))
            inter_file = os.path.join(os.path.dirname(final_file), inter_file)
            hadd_args = [inter_file] + hadd_input
            hadd_job = ht.Job(name='interHadd%d' % i,
                              args=hadd_args,
                              input_files=hadd_input,
                              output_files=[inter_file])
            hadd_jobs.add_job(hadd_job)
            dagman.add_job(hadd_job, requires=job_group)
            intermediate_jobs.append(hadd_job)

        # Add final hadd job for intermediate files
        hadd_input = [j.output_files[0] for j in intermediate_jobs]
        hadd_args = [final_file] + hadd_input
        hadd_job = ht.Job(name='finalHadd',
                          args=hadd_args,
                          input_files=hadd_input,
                          output_files=[final_file])
        hadd_jobs.add_job(hadd_job)
        dagman.add_job(hadd_job, requires=intermediate_jobs)

    return hadd_jobs
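

# Note: `pairwise` and `grouper` are used above but not defined in this
# snippet. They are assumed to be the standard itertools recipes, roughly
# equivalent to the following Python 2 sketch:
#
#   from itertools import izip, izip_longest, tee
#
#   def pairwise(iterable):
#       # s -> (s0, s1), (s1, s2), (s2, s3), ...
#       a, b = tee(iterable)
#       next(b, None)
#       return izip(a, b)
#
#   def grouper(iterable, n, fillvalue=None):
#       # Collect data into fixed-length chunks, padding the last chunk
#       # with `fillvalue` (hence the filter(None, ...) above).
#       args = [iter(iterable)] * n
#       return izip_longest(fillvalue=fillvalue, *args)
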
def submit_resolution_dag(pairs_file,
                          max_l1_pt,
                          log_dir,
                          append,
                          pu_bins,
                          eta_bins,
                          common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG for one pairs file.

    This will run makeResolutionPlots over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str, optional
        Pairs file to process. Must be the full path.

    max_l1_pt : int, optional
        Maximum L1 pt to consider when making plots.

    log_dir : str, optional
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str, optional
        String to append to filenames to track various settings (e.g. PU bin).

    pu_bins : list[list[int, int]], optional
        List of PU bin edges.

    eta_bins : list[float], optional
        List of eta bin edges, including upper edge of last bin.

    common_input_files : list[str], optional
        Extra input files needed by every job (e.g. JEC files).

    force_submit : bool, optional
        If True, forces job submission even if the proposed output files
        already exist.
        Otherwise, the program quits before submission.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:

        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir,
                             out_file=log_stem + '.out',
                             err_dir=log_dir,
                             err_file=log_stem + '.err',
                             log_dir=log_dir,
                             log_file=log_stem + '.log',
                             cpus=1,
                             memory='100MB',
                             disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)

        # Hold all output filenames
        res_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(
                **fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = [
                'makeResolutionPlots.py',
                pairs_file,
                out_file,
                '--excl',  #'--maxPt', max_l1_pt,
                #'--PUmin', pu_min, '--PUmax', pu_max,
                '--etaInd',
                ind
            ]

            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])

            res_jobs.add_job(res_job)

        # Add inclusive bins (central, forward, all)
        # remove the [0:1] to do all three - currently central only because HF is broken
        for incl in ['central', 'forward', 'all'][0:1]:
            out_file = out_stem + "_%s" % incl + append.format(
                **fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = [
                'makeResolutionPlots.py', pairs_file, out_file, '--incl'
            ]  #, '--maxPt', max_l1_pt,
            # '--PUmin', pu_min, '--PUmax', pu_max]
            if incl != 'all':
                job_args.append('--%s' % incl)

            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])

            res_jobs.add_job(res_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir,
                              out_file=log_stem + '.out',
                              err_dir=log_dir,
                              err_file=log_stem + '.err',
                              log_dir=log_dir,
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(
            out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + res_output_files

        hadder = ht.Job(name='haddRes',
                        args=hadd_args,
                        input_files=res_output_files,
                        output_files=hadd_output)

        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        for job in res_jobs:
            res_dag.add_job(job)

        res_dag.add_job(hadder, requires=[j for j in res_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + res_output_files:
                if os.path.isfile(f):
                    print 'ERROR: output file already exists - not submitting'
                    print 'FILE:', f
                    return 1

        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
def submit_runCalib_dag(pairs_file, log_dir, append, pu_bins, eta_bins, common_input_files,
                        force_submit=False):
    """Submit one runCalibration DAG for one pairs file.

    This will run runCalibration over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str, optional
        Pairs file to process. Must be the full path.

    log_dir : str, optional
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str, optional
        String to append to filenames to track various settings (e.g. PU bin).

    pu_bins : list[list[int, int]], optional
        List of PU bin edges.

    eta_bins : list[float], optional
        List of eta bin edges, including upper edge of last bin.

    common_input_files : list[str], optional
        Extra input files needed by every job (e.g. JEC files).

    force_submit : bool, optional
        If True, forces job submission even if the proposed output files
        already exist.
        Otherwise, the program quits before submission.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename=os.path.join(log_dir, 'submit_runCalib.condor'),
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir, out_file=log_stem + '.out',
                                  err_dir=log_dir, err_file=log_stem + '.err',
                                  log_dir=log_dir, log_file=log_stem + '.log',
                                  cpus=1, memory='100MB', disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)

        # Hold all output filenames
        calib_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            calib_output_files.append(out_file)

            job_args = ['runCalibration.py', pairs_file, out_file,
                        "--no-genjet-plots", '--stage2',
                        '--no-correction-fit',
                        '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]

            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])

            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename=os.path.join(log_dir, 'haddSmall.condor'),
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + calib_output_files

        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_args,
                        input_files=calib_output_files,
                        output_files=hadd_output)

        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir, '%s.status' % stem))
        for job in runCalib_jobs:
            calib_dag.add_job(job)

        calib_dag.add_job(hadder, requires=[j for j in runCalib_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag. \nFILE: %s' % f)

        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
    return status_files
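

# Example usage of submit_runCalib_dag -- a minimal sketch with hypothetical
# paths, PU bins and eta bins (the JEC file name is also a placeholder):
#
#   status_files = submit_runCalib_dag(
#       pairs_file='/hdfs/DATASET/pairs/pairs_Example.root',
#       log_dir='/storage/user/jobs/calibration',
#       append='_PU{puMin}to{puMax}',
#       pu_bins=[[0, 10], [15, 25], [30, 40]],
#       eta_bins=[0.0, 0.348, 0.695, 1.044, 1.392],
#       common_input_files=['Stage2_JEC.root'],
#       force_submit=False)
#
# Here `append` may only reference {puMin} and {puMax}, since those are the
# only keys passed to append.format() in this function.
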
def add_hadd_jobs(dagman, jobs, final_file, log_dir):
    """Add necessary hadd jobs to DAG. All jobs will be hadded together to make
    `final_file`.

    DAGs can only accept a maximum number of arguments, so we have to split
    the hadd-ing into groups. Therefore we need an intermediate layer of hadd
    jobs, and then finally hadd those intermediate output files together.

    Parameters
    ----------
    dagman : DAGMan
        DAGMan object to add jobs to.

    jobs : list[Job]
        Collection of Jobs to be hadd-ed together.

    final_file : str
        Final hadd-ed filename.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files.

    Returns
    -------
    JobSet
        JobSet for hadd jobs.
    """
    group_size = 200  # max files per hadding job
    # adjust to avoid hadding 1 file by itself
    if len(jobs) % group_size == 0:
        group_size = 199
    # calculate number of intermediate hadd jobs required
    n_inter_jobs = int(math.ceil(len(jobs) * 1. / group_size))

    log_stem = 'matcherHadd.$(cluster).$(process)'

    hadd_jobs = ht.JobSet(exe='hadd',
                          copy_exe=False,
                          filename='haddBig.condor',
                          setup_script=None,
                          out_dir=log_dir,
                          out_file=log_stem + '.out',
                          err_dir=log_dir,
                          err_file=log_stem + '.err',
                          log_dir=log_dir,
                          log_file=log_stem + '.log',
                          cpus=1,
                          memory='100MB',
                          disk='1GB',
                          transfer_hdfs_input=False,
                          share_exe_setup=True,
                          hdfs_store=os.path.dirname(final_file))

    if n_inter_jobs == 1:
        hadd_input = [j.output_files[0] for j in jobs]
        hadd_args = [final_file] + hadd_input
        hadd_job = ht.Job(name='finalHadd',
                          args=hadd_args,
                          input_files=hadd_input,
                          output_files=[final_file])
        hadd_jobs.add_job(hadd_job)
        dagman.add_job(hadd_job, requires=jobs)
    else:
        # Go through groups of Jobs, make intermediate hadd files in same dir
        # as final file
        intermediate_jobs = []
        for i, job_group in enumerate(grouper(jobs, group_size)):
            # Note, job_group is guaranteed to be length group_size, and is
            # padded with None if there aren't that many entries, so we need to
            # filter out NoneType
            job_group = filter(None, job_group)
            hadd_input = [j.output_files[0] for j in job_group]
            inter_file = 'hadd_inter_%d_%s.root' % (i, cc.rand_str(5))
            inter_file = os.path.join(os.path.dirname(final_file), inter_file)
            hadd_args = [inter_file] + hadd_input
            hadd_job = ht.Job(name='interHadd%d' % i,
                              args=hadd_args,
                              input_files=hadd_input,
                              output_files=[inter_file])
            hadd_jobs.add_job(hadd_job)
            dagman.add_job(hadd_job, requires=job_group)
            intermediate_jobs.append(hadd_job)

        # Add final hadd job for intermediate files
        hadd_input = [j.output_files[0] for j in intermediate_jobs]
        hadd_args = [final_file] + hadd_input
        hadd_job = ht.Job(name='finalHadd',
                          args=hadd_args,
                          input_files=hadd_input,
                          output_files=[final_file])
        hadd_jobs.add_job(hadd_job)
        dagman.add_job(hadd_job, requires=intermediate_jobs)

    return hadd_jobs
def submit_matcher_dag(exe, ntuple_dir, log_dir, l1_dir, ref_dir, deltaR,
                       ref_min_pt, cleaning_cut, append, force_submit):
    """Submit one matcher DAG for one directory of ntuples.

    This will run `exe` over all Ntuple files and then hadd the results together.

    Parameters
    ----------
    exe : str
        Name of executable.

    ntuple_dir : str
        Name of directory with L1Ntuples to run over.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str
        String to append to filenames to track various settings (e.g. deltaR cut).

    l1_dir : str
        Name of TDirectory in Ntuple that holds L1 jets.

    ref_dir : str
        Name of TDirectory in Ntuple that holds reference jets.

    deltaR : float
        Maximum deltaR(L1, Ref) for a match.

    ref_min_pt : float
        Minimum pT cut on reference jets to be considered for matching.

    cleaning_cut : str
        Jet cleaning cut to apply, passed via the --cleanJets option.
        If None or empty, no jet cleaning is applied.

    force_submit : bool
        If True, forces job submission even if the proposed output files
        already exist.
        Otherwise, the program quits before submission.
    """
    # DAG for jobs
    stem = 'matcher_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
    matcher_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                            status_file=os.path.join(log_dir,
                                                     '%s.status' % stem))

    # JobSet for each matching job
    log_stem = 'matcher.$(cluster).$(process)'

    matcher_jobs = ht.JobSet(exe=find_executable(exe),
                             copy_exe=True,
                             filename='submit_matcher.condor',
                             setup_script=None,
                             out_dir=log_dir,
                             out_file=log_stem + '.out',
                             err_dir=log_dir,
                             err_file=log_stem + '.err',
                             log_dir=log_dir,
                             log_file=log_stem + '.log',
                             cpus=1,
                             memory='100MB',
                             disk='100MB',
                             transfer_hdfs_input=False,
                             share_exe_setup=True,
                             hdfs_store=ntuple_dir)

    # For creating filenames later
    fmt_dict = dict()

    # Hold all output filenames
    match_output_files = []

    # Additional files to copy across - JEC, etc
    common_input_files = []

    # Add matcher job for each ntuple file
    for ind, ntuple in enumerate(os.listdir(ntuple_dir)):
        # if ind > 10:
        #     break

        # Skip non-ntuple files
        if not ntuple.endswith('.root') or ntuple.startswith('pairs'):
            continue

        ntuple_abspath = os.path.join(ntuple_dir, ntuple)

        # Construct output name
        ntuple_name = os.path.splitext(ntuple)[0]
        # handle anything up to first underscore (L1Tree, L1Ntuple, ...)
        result = re.match(r'^[a-zA-Z0-9]*_', ntuple_name)
        if result:
            pairs_file = '%s_%s.root' % (ntuple_name.replace(
                result.group(), 'pairs_'), append.format(**fmt_dict))
        else:
            pairs_file = 'pairs_%s_%s.root' % (ntuple_name,
                                               append.format(**fmt_dict))
        out_file = os.path.join(ntuple_dir, pairs_file)
        match_output_files.append(out_file)

        # Add matching job
        job_args = [
            '-I', ntuple_abspath, '-O', out_file, '--refDir', ref_dir,
            '--l1Dir', l1_dir, '--draw 0', '--deltaR', deltaR, '--refMinPt',
            ref_min_pt
        ]
        if cleaning_cut:
            job_args.extend(['--cleanJets', cleaning_cut])

        input_files = common_input_files + [ntuple_abspath]

        match_job = ht.Job(name='match_%d' % ind,
                           args=job_args,
                           input_files=input_files,
                           output_files=[out_file])

        matcher_jobs.add_job(match_job)
        matcher_dag.add_job(match_job)

    # Construct final filename
    # ---------------------------------------------------------------------
    final_file = 'pairs_%s_%s.root' % (os.path.basename(
        ntuple_dir.rstrip('/')), append.format(**fmt_dict))
    final_dir = os.path.join(os.path.dirname(ntuple_dir.rstrip('/')), 'pairs')
    cc.check_create_dir(final_dir, info=True)
    final_file = os.path.join(final_dir, final_file)
    log.info("Final file: %s", final_file)

    # Check if any of the output files already exists - maybe we mucked up?
    # ---------------------------------------------------------------------
    if not force_submit:
        for f in [final_file] + match_output_files:
            if os.path.isfile(f):
                raise RuntimeError(
                    'ERROR: output file already exists - not submitting.'
                    '\nTo bypass, use -f flag. \nFILE: %s' % f)

    # Add in hadding jobs
    # ---------------------------------------------------------------------
    hadd_jobs = add_hadd_jobs(matcher_dag, matcher_jobs.jobs.values(),
                              final_file, log_dir)

    # Add in job to delete individual and intermediate hadd files
    # ---------------------------------------------------------------------
    log_stem = 'matcherRm.$(cluster).$(process)'

    rm_jobs = ht.JobSet(exe='hadoop',
                        copy_exe=False,
                        filename='submit_matcherRm.condor',
                        out_dir=log_dir,
                        out_file=log_stem + '.out',
                        err_dir=log_dir,
                        err_file=log_stem + '.err',
                        log_dir=log_dir,
                        log_file=log_stem + '.log',
                        cpus=1,
                        memory='100MB',
                        disk='10MB',
                        transfer_hdfs_input=False,
                        share_exe_setup=False,
                        hdfs_store=ntuple_dir)

    for i, job in enumerate(chain(matcher_jobs, hadd_jobs[:-1])):
        pairs_file = job.output_files[0]
        rm_job = ht.Job(name='rm%d' % i,
                        args=' fs -rm -skipTrash %s' %
                        pairs_file.replace('/hdfs', ''))
        rm_jobs.add_job(rm_job)
        matcher_dag.add_job(rm_job, requires=hadd_jobs[-1])

    # Submit
    # ---------------------------------------------------------------------
    # matcher_dag.write()
    matcher_dag.submit()
    return matcher_dag.status_file
def submit_runCalib_dag(pairs_file,
                        log_dir,
                        append,
                        pu_bins,
                        eta_bins,
                        common_input_files,
                        force_submit=False):
    """Submit one runCalibration DAG for one pairs file.

    This will run runCalibration over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str, optional
        Pairs file to process. Must be the full path.

    log_dir : str, optional
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str, optional
        String to append to filenames to track various settings (e.g. PU bin).

    pu_bins : list[list[int, int]], optional
        List of PU bin edges.

    eta_bins : list[float], optional
        List of eta bin edges, including upper edge of last bin.

    common_input_files : list[str], optional
        Extra input files needed by every job (e.g. JEC files).

    force_submit : bool, optional
        If True, forces job submission even if the proposed output files
        already exist.
        Otherwise, the program quits before submission.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename='submit_runCalib.condor',
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir,
                                  out_file=log_stem + '.out',
                                  err_dir=log_dir,
                                  err_file=log_stem + '.err',
                                  log_dir=log_dir,
                                  log_file=log_stem + '.log',
                                  cpus=1,
                                  memory='100MB',
                                  disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)

        # Hold all output filenames
        calib_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(
                **fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            calib_output_files.append(out_file)

            job_args = [
                'runCalibration.py', pairs_file, out_file, "--no-genjet-plots",
                '--stage2', '--no-correction-fit', '--PUmin', pu_min,
                '--PUmax', pu_max, '--etaInd', ind
            ]

            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])

            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir,
                              out_file=log_stem + '.out',
                              err_dir=log_dir,
                              err_file=log_stem + '.err',
                              log_dir=log_dir,
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(
            out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + calib_output_files

        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_args,
                        input_files=calib_output_files,
                        output_files=hadd_output)

        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir,
                                                       '%s.status' % stem))
        for job in runCalib_jobs:
            calib_dag.add_job(job)

        calib_dag.add_job(hadder, requires=[j for j in runCalib_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    print 'ERROR: output file already exists - not submitting'
                    print 'FILE:', f
                    return 1

        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
    return status_files