def __create_analysis_layer(self, ntuple_jobs, mode):
    """Create one HTCondor analysis job per group of ntuple output files.

    Builds a single htc.JobSet (shared exe/setup/log configuration) and adds
    one htc.Job per group of input .root files taken from the output mirrors
    of ``ntuple_jobs``.

    Parameters
    ----------
    ntuple_jobs : list[htc.Job]
        Jobs from the ntuple layer; their ``output_file_mirrors`` provide the
        HDFS paths of the .root files to analyse.
    mode : str
        Analysis mode; embedded in output file names, job args and job names.

    Returns
    -------
    list[htc.Job]
        The analysis jobs that were added to the JobSet.
    """
    jobs = []
    run_config = self.__config
    # Analysis output lives next to the ntuple output, under .../atOutput/tmp
    hdfs_store = run_config['outLFNDirBase'].replace('ntuple', 'atOutput')
    hdfs_store += '/tmp'
    job_set = htc.JobSet(exe=self.__analysis_script,
                         copy_exe=True,
                         setup_script=self.__analysis_setup_script,
                         filename=os.path.join(self.__job_dir,
                                               'ntuple_analysis.condor'),
                         out_dir=self.__job_log_dir,
                         out_file=LOG_STEM + '.out',
                         err_dir=self.__job_log_dir,
                         err_file=LOG_STEM + '.err',
                         log_dir=self.__job_log_dir,
                         log_file=LOG_STEM + '.log',
                         share_exe_setup=True,
                         common_input_files=self.__input_files,
                         transfer_hdfs_input=False,
                         hdfs_store=hdfs_store,
                         certificate=self.REQUIRE_GRID_CERT,
                         cpus=1,
                         memory='1500MB')
    # Template for the worker-node argument string
    parameters = 'files={files} output_file={output_file} mode={mode}'
    # Collect all .root outputs (by HDFS path) produced by the ntuple layer
    input_files = [
        f.hdfs for job in ntuple_jobs for f in job.output_file_mirrors
        if f.hdfs.endswith('.root')
    ]
    n_files_per_group = N_FILES_PER_ANALYSIS_JOB
    grouped_files = self.__group_files(input_files, n_files_per_group)
    for i, f in enumerate(grouped_files):
        # f is one group (list) of input file paths
        output_file = '{dataset}_atOutput_{mode}_{job_number}.root'.format(
            dataset=run_config['outputDatasetTag'], mode=mode, job_number=i)
        args = parameters.format(
            files=','.join(f),
            output_file=output_file,
            mode=mode,
        )
        # Output/log paths are declared relative to NTPROOT for transfer
        rel_out_dir = os.path.relpath(RESULTDIR, NTPROOT)
        rel_log_dir = os.path.relpath(LOGDIR, NTPROOT)
        rel_out_file = os.path.join(rel_out_dir, output_file)
        rel_log_file = os.path.join(rel_log_dir, 'ntp.log')
        job = htc.Job(name='analysis_{0}_job_{1}'.format(mode, i),
                      args=args,
                      output_files=[rel_out_file, rel_log_file])
        job_set.add_job(job)
        jobs.append(job)
    return jobs
def __create_ntuple_layer(self):
    """Create the ntuple-production HTCondor jobs.

    Splits the configured input files into even chunks (chunk size taken from
    SPLITTING_BY_FILE, keyed by substring match on the input dataset name) and
    creates one htc.Job per chunk, all sharing a single htc.JobSet.

    Returns
    -------
    list[htc.Job]
        The ntuple-production jobs that were added to the JobSet.
    """
    jobs = []
    run_config = self.__config
    input_files = run_config['files']
    # In test mode only process the first input file
    if self.__variables['test']:
        input_files = [input_files[0]]
    job_set = htc.JobSet(exe=self.__run_script,
                         copy_exe=True,
                         setup_script=self.__setup_script,
                         filename=os.path.join(self.__job_dir,
                                               'ntuple_production.condor'),
                         out_dir=self.__job_log_dir,
                         out_file=LOG_STEM + '.out',
                         err_dir=self.__job_log_dir,
                         err_file=LOG_STEM + '.err',
                         log_dir=self.__job_log_dir,
                         log_file=LOG_STEM + '.log',
                         share_exe_setup=True,
                         common_input_files=self.__input_files,
                         transfer_hdfs_input=False,
                         hdfs_store=run_config['outLFNDirBase'] + '/tmp',
                         certificate=self.REQUIRE_GRID_CERT,
                         cpus=1,
                         memory='1500MB')
    # Template for the worker-node argument string
    parameters = 'files={files} output_file={output_file} {params}'
    if run_config['lumiMask']:
        parameters += ' json_url={0}'.format(run_config['lumiMask'])
    # Pick files-per-job: default, unless the dataset name matches an
    # entry in SPLITTING_BY_FILE (substring match)
    n_files_per_group = SPLITTING_BY_FILE['DEFAULT']
    for name, value in SPLITTING_BY_FILE.items():
        if name in run_config['inputDataset']:
            n_files_per_group = value
    grouped_files = make_even_chunks(input_files, n_files_per_group)
    for i, f in enumerate(grouped_files):
        # f is one chunk (list) of input file paths
        output_file = '{dataset}_ntuple_{job_number}.root'.format(
            dataset=run_config['outputDatasetTag'], job_number=i)
        args = parameters.format(files=','.join(f),
                                 output_file=output_file,
                                 params=run_config['pyCfgParams'])
        # Output/log paths are declared relative to NTPROOT for transfer
        rel_out_dir = os.path.relpath(RESULTDIR, NTPROOT)
        rel_log_dir = os.path.relpath(LOGDIR, NTPROOT)
        rel_out_file = os.path.join(rel_out_dir, output_file)
        rel_log_file = os.path.join(rel_log_dir, 'ntp.log')
        job = htc.Job(name='ntuple_job_{0}'.format(i),
                      args=args,
                      output_files=[rel_out_file, rel_log_file])
        job_set.add_job(job)
        jobs.append(job)
    return jobs
def create_job_layer(self):
    """Create the single HTCondor job for this workflow.

    Builds one htc.JobSet with the shared exe/setup/log configuration and
    adds exactly one htc.Job to it, whose arguments are taken from the
    ``steps``/``variable``/``visiblePS`` entries of the configuration.

    Returns
    -------
    list[htc.Job]
        A one-element list containing the created job.
    """
    self.__root_output_files = []
    cfg = self.__config

    condor_jobs = htc.JobSet(
        exe=self.__run_script,
        copy_exe=True,
        setup_script=self.__setup_script,
        filename=os.path.join(self.__job_dir, '{0}.condor'.format(PREFIX)),
        out_dir=self.__job_log_dir,
        out_file=OUT_FILE,
        err_dir=self.__job_log_dir,
        err_file=ERR_FILE,
        log_dir=self.__job_log_dir,
        log_file=LOG_FILE,
        share_exe_setup=True,
        common_input_files=[],
        transfer_hdfs_input=False,
        hdfs_store=cfg['outputDir'],
        certificate=self.REQUIRE_GRID_CERT,
        cpus=1,
        memory='1500MB',
    )

    # Worker-node argument string, filled from the configuration
    job_args = 'steps={steps} variable={variable} visiblePS={visiblePS}'.format(
        steps=cfg['steps'],
        variable=cfg['variable'],
        visiblePS=cfg['visiblePS'],
    )

    # Output/log paths are declared relative to NTPROOT for transfer
    out_base = os.path.relpath(RESULTDIR, NTPROOT)
    log_base = os.path.relpath(LOGDIR, NTPROOT)
    job_number = 1
    job = htc.Job(
        name='{0}_{1}_job_{2}'.format(PREFIX, cfg['variable'], job_number),
        args=job_args,
        output_files=[os.path.join(out_base, OUTPUT_DIR),
                      os.path.join(log_base, 'ntp.log')],
    )
    condor_jobs.add_job(job)
    return [job]
def __create_merge_layer(self, analysis_jobs, mode):
    """Create the single merge job that hadds all analysis outputs.

    Gathers every .root file (by HDFS path) from the output mirrors of
    ``analysis_jobs`` and creates one htc.Job that merges them into a single
    ``<outputDatasetTag>.root`` file.

    Parameters
    ----------
    analysis_jobs : list[htc.Job]
        Jobs from the analysis layer supplying the files to merge.
    mode : str
        Analysis mode; used only in the merge job's name.

    Returns
    -------
    list[htc.Job]
        A one-element list containing the merge job.
    """
    cfg = self.__config
    # Merge output lives next to the ntuple output, under .../atOutput
    merge_store = cfg['outLFNDirBase'].replace('ntuple', 'atOutput')
    merge_set = htc.JobSet(
        exe=self.__merge_script,
        copy_exe=True,
        setup_script=self.__merge_setup_script,
        filename=os.path.join(self.__job_dir, 'analysis_merge.condor'),
        out_dir=self.__job_log_dir,
        out_file=LOG_STEM + '.out',
        err_dir=self.__job_log_dir,
        err_file=LOG_STEM + '.err',
        log_dir=self.__job_log_dir,
        log_file=LOG_STEM + '.log',
        share_exe_setup=True,
        common_input_files=self.__input_files,
        transfer_hdfs_input=False,
        hdfs_store=merge_store,
        certificate=self.REQUIRE_GRID_CERT,
        cpus=1,
        memory='1500MB',
    )

    merged_file = '{0}.root'.format(cfg['outputDatasetTag'])
    # All .root outputs (by HDFS path) produced by the analysis layer
    merge_inputs = [
        mirror.hdfs
        for job in analysis_jobs
        for mirror in job.output_file_mirrors
        if mirror.hdfs.endswith('.root')
    ]
    job_args = '{files} output_file={output_file}'.format(
        files=' '.join(merge_inputs), output_file=merged_file)

    # The ntp log path is declared relative to NTPROOT for transfer
    ntp_log = os.path.join(os.path.relpath(LOGDIR, NTPROOT), 'ntp.log')
    merge_job = htc.Job(name='{0}_merge_job'.format(mode),
                        args=job_args,
                        output_files=[merged_file, ntp_log])
    merge_set.add_job(merge_job)
    return [merge_job]
def submit_showoff_job(arg_str, out_dir, log_dir, common_input_files, output_files):
    """Create and immediately submit one showoff plotting job.

    Parameters
    ----------
    arg_str : str
        Argument string for the showoff script; its ``--oDir`` option is
        rewritten to a unique temporary directory on the worker node.
    out_dir : str
        HDFS directory to mirror the job's output into.
    log_dir : str
        Directory for STDOUT/STDERR/LOG and the condor submit file.
    common_input_files : list[str]
        Input files shared by all jobs in the JobSet.
    output_files : list[str]
        Files the job is expected to produce.
    """
    log.debug(arg_str)
    log_stem = 'showoff.$(cluster).$(process)'
    timestamp = strftime("%H%M%S")
    showoff_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename=os.path.join(
                                 log_dir,
                                 'submit_showoff_%s.condor' % timestamp),
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir,
                             out_file=log_stem + '.out',
                             err_dir=log_dir,
                             err_file=log_stem + '.err',
                             log_dir=log_dir,
                             log_file=log_stem + '.log',
                             cpus=1,
                             memory='100MB',
                             disk='200MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)
    # We don't want to stream-write plots to HDFS - easier to make them all on
    # the worker node, zip it up, then transfer to HDFS
    # timestamp + random int makes the worker-side directory (and job name)
    # unique across submissions
    rand_int = randint(0, 1000)
    tmp_oDir = 'showoff_%s_%d' % (timestamp, rand_int)
    if '--oDir' not in arg_str:
        arg_str += ' --oDir %s' % tmp_oDir
    else:
        # Redirect an existing --oDir <out_dir> to the temporary directory
        arg_str = arg_str.replace('--oDir %s' % out_dir,
                                  '--oDir %s' % tmp_oDir)
    sj = ht.Job(name=tmp_oDir,
                args=arg_str.split(),
                input_files=None,
                output_files=output_files,
                hdfs_mirror_dir=out_dir)
    showoff_jobs.add_job(sj)
    showoff_jobs.submit()
def create_job_layer(self, input_files, mode):
    """Create one HTCondor analysis job per group of input files.

    Splits ``input_files`` into even chunks (chunk size from
    SPLITTING_BY_FILE, keyed by substring match on the dataset name, falling
    back to N_FILES_PER_ANALYSIS_JOB) and creates one htc.Job per chunk.
    Also records the (possibly truncated) input file list back into
    ``self.__config['input_files']``.

    Parameters
    ----------
    input_files : list[str]
        Input files to analyse.
    mode : str
        Analysis mode; embedded in output file names, job args and job names.

    Returns
    -------
    list[htc.Job]
        The analysis jobs that were added to the JobSet.
    """
    jobs = []
    self.__root_output_files = []
    config = self.__config
    # In test mode only process the first input file
    if self.__variables['test']:
        input_files = [input_files[0]]
    self.__config['input_files'] = input_files
    hdfs_store = config['outputDir']
    job_set = htc.JobSet(
        exe=self.__run_script,
        copy_exe=True,
        setup_script=self.__setup_script,
        filename=os.path.join(self.__job_dir, '{0}.condor'.format(PREFIX)),
        out_dir=self.__job_log_dir,
        out_file=OUT_FILE,
        err_dir=self.__job_log_dir,
        err_file=ERR_FILE,
        log_dir=self.__job_log_dir,
        log_file=LOG_FILE,
        share_exe_setup=True,
        common_input_files=self.__input_files,
        transfer_hdfs_input=True,
        hdfs_store=hdfs_store,
        certificate=self.REQUIRE_GRID_CERT,
        cpus=1,
        memory='900MB',
        # keep jobs off the sm*/bs* machines
        other_args={
            'Requirements':
            '( !stringListMember(substr(Target.Machine,0,2),"sm,bs") )'
        },
    )
    # Template for the worker-node argument string
    parameters = 'files={files} output_file_suffix={suffix} mode={mode}'
    parameters += ' dataset={dataset}'
    dataset = config['parameters']['dataset']
    # Pick files-per-job: default, unless the dataset name matches an
    # entry in SPLITTING_BY_FILE (substring match)
    n_files_per_group = N_FILES_PER_ANALYSIS_JOB
    for name, value in SPLITTING_BY_FILE.items():
        if name in dataset:
            n_files_per_group = value
    grouped_files = make_even_chunks(
        input_files, size_of_chunk=n_files_per_group)
    for i, f in enumerate(grouped_files):
        # Only {job_number} appears in the template; the previous version
        # also passed unused dataset/mode keyword arguments to format().
        suffix = 'atOutput_{job_number}.root'.format(job_number=i)
        args = parameters.format(
            files=','.join(f),
            suffix=suffix,
            mode=mode,
            dataset=dataset,
        )
        output_file = '_'.join([dataset, mode, suffix])
        # Output/log paths are declared relative to NTPROOT for transfer
        rel_out_dir = os.path.relpath(RESULTDIR, NTPROOT)
        rel_log_dir = os.path.relpath(LOGDIR, NTPROOT)
        rel_out_file = os.path.join(rel_out_dir, output_file)
        rel_log_file = os.path.join(rel_log_dir, 'ntp.log')
        job = htc.Job(
            name='{0}_{1}_job_{2}'.format(PREFIX, mode, i),
            args=args,
            output_files=[rel_out_file, rel_log_file])
        job_set.add_job(job)
        jobs.append(job)
    return jobs
def haddaway(in_args=sys.argv[1:]):
    """Parse arguments, build and submit a DAG that hadds files on HDFS.

    Creates a layer of intermediate hadd jobs, a final hadd job, plus
    optional cleanup jobs that remove intermediate files and HDFS copies of
    inputs that did not originally live on HDFS.

    Parameters
    ----------
    in_args : list[str], optional
        Command-line arguments (defaults to sys.argv[1:]).

    Returns
    -------
    int
        0 on successful submission.

    Raises
    ------
    RuntimeError
        If no inputs are given, the output is not on HDFS, or fewer than
        2 input files are supplied.
    IOError
        If an input list file or an input file does not exist.
    """
    parser = ArgParser(description=__doc__, formatter_class=CustomFormatter)
    args = parser.parse_args(args=in_args)
    if args.verbose:
        log.setLevel(logging.DEBUG)
    log.debug(args)

    # Check hadd exists
    check_hadd_exists()

    if not args.input and not args.inputList:
        raise RuntimeError("Need to specify --input or --inputFiles")

    final_filename = args.output
    if not final_filename.startswith("/hdfs"):
        raise RuntimeError("Output file MUST be on HDFS")

    # Get list of input files, do checks
    input_files = []
    if args.inputList:
        if not os.path.isfile(args.inputList):
            raise IOError("%s does not exist" % args.inputList)
        with open(args.inputList) as f:
            input_files = f.readlines()
    else:
        input_files = args.input[:]

    if len(input_files) < 2:
        raise RuntimeError("Fewer than 2 input files - hadd not needed")

    # sanitise paths, check existance
    for i, f in enumerate(input_files):
        # Strip whitespace/newlines BEFORE abspath: the old order
        # (abspath first, then strip) relied on abspath preserving the
        # trailing newline and could mangle relative paths.
        input_files[i] = os.path.abspath(f.strip())
        if not os.path.isfile(input_files[i]):
            raise IOError("Input %s does not exist" % input_files[i])

    # Fixed: old call was log.debug('Input:', input_files) which passed the
    # list as a %-format argument with no placeholder, causing a formatting
    # error when the record was rendered.
    log.debug('Input: %s', input_files)

    # Arrange into jobs
    inter_hadd_jobs, final_hadd_job = create_hadd_jobs(
        input_files, args.size, final_filename, hadd_args=args.haddArgs)
    log.info("Creating %d intermediate jobs", len(inter_hadd_jobs))

    # Add to JobSet and DAG
    user_dict = {
        "username": os.environ['LOGNAME'],
        'datestamp': strftime("%d_%b_%y"),
        'timestamp': strftime("%H%M%S")
    }
    log_dir = "/storage/{username}/haddaway/{datestamp}/".format(**user_dict)
    dag_file = os.path.join(log_dir,
                            "haddaway_{timestamp}.dag".format(**user_dict))
    status_file = os.path.join(
        log_dir, "haddaway_{timestamp}.status".format(**user_dict))
    hadd_dag = ht.DAGMan(filename=dag_file, status_file=status_file)

    condor_file = os.path.join(
        log_dir, "haddaway_{timestamp}.condor".format(**user_dict))
    log_stem = "hadd.$(cluster).$(process)"
    # TODO: clever estimate of RAM/disk size required
    hadd_jobset = ht.JobSet(exe='hadd',
                            copy_exe=False,
                            filename=condor_file,
                            out_dir=os.path.join(log_dir, 'logs'),
                            out_file=log_stem + '.out',
                            err_dir=os.path.join(log_dir, 'logs'),
                            err_file=log_stem + '.err',
                            log_dir=os.path.join(log_dir, 'logs'),
                            log_file=log_stem + '.log',
                            cpus=1,
                            memory='1GB',
                            disk='1.5GB',
                            transfer_hdfs_input=True,
                            share_exe_setup=True,
                            hdfs_store=os.path.dirname(final_filename))
    for job in inter_hadd_jobs:
        hadd_jobset.add_job(job)
        hadd_dag.add_job(job)
    hadd_jobset.add_job(final_hadd_job)
    hadd_dag.add_job(final_hadd_job,
                     requires=inter_hadd_jobs if inter_hadd_jobs else None)

    # Add removal jobs if necessary
    rm_jobs = create_intermediate_cleanup_jobs(inter_hadd_jobs)
    if len(rm_jobs) > 0:
        condor_file = os.path.join(log_dir,
                                   "rm_{timestamp}.condor".format(**user_dict))
        log_stem = "rm.$(cluster).$(process)"
        rm_jobset = ht.JobSet(exe="hadoop",
                              copy_exe=False,
                              filename=condor_file,
                              out_dir=os.path.join(log_dir, 'logs'),
                              out_file=log_stem + '.out',
                              err_dir=os.path.join(log_dir, 'logs'),
                              err_file=log_stem + '.err',
                              log_dir=os.path.join(log_dir, 'logs'),
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='10MB',
                              transfer_hdfs_input=False,
                              share_exe_setup=False,
                              hdfs_store=os.path.dirname(final_filename))
        for job in rm_jobs:
            rm_jobset.add_job(job)
            hadd_dag.add_job(job, requires=final_hadd_job)

        # add jobs to remove copies from HDFS if they weren't there originally
        # NOTE(review): these jobs need rm_jobset, so they can only be created
        # when cleanup jobs exist - confirm this matches the intended layout.
        for job_ind, job in enumerate(inter_hadd_jobs):
            for m_ind, mirror in enumerate(job.input_file_mirrors):
                if not mirror.original.startswith('/hdfs'):
                    condor_file = os.path.join(
                        log_dir,
                        "rmCopy_{timestamp}.condor".format(**user_dict))
                    log_stem = "rmCopy.$(cluster).$(process)"
                    rm_job = ht.Job(
                        name="rmCopy_%d_%d" % (job_ind, m_ind),
                        args=" fs -rm -skipTrash %s" %
                        mirror.hdfs.replace("/hdfs", ""))
                    rm_jobset.add_job(rm_job)
                    hadd_dag.add_job(rm_job, requires=job)

    # Submit jobs
    hadd_dag.submit()

    return 0
def submit_runCalib_dag(pairs_file, log_dir, append, pu_bins, eta_bins,
                        common_input_files, force_submit=False):
    """Submit one runCalibration DAG for one pairs file.

    This will run runCalibration over exclusive and inclusive eta bins, and
    then finally hadd the results together. One DAG is created and submitted
    per PU bin.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU
        bin); may contain {puMin}/{puMax} format placeholders.
    pu_bins : list[list[int, int]]
        List of PU bin edges; falsy means no PU cut.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Input files shared by all jobs in the JobSet.
    force_submit : bool, optional
        If True, forces job submission even if proposed output files already
        exist. Otherwise, program quits before submission.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename='submit_runCalib.condor',
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir,
                                  out_file=log_stem + '.out',
                                  err_dir=log_dir,
                                  err_file=log_stem + '.err',
                                  log_dir=log_dir,
                                  log_file=log_stem + '.log',
                                  cpus=1,
                                  memory='100MB',
                                  disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)

        # Hold all output filenames
        calib_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(
                **fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            calib_output_files.append(out_file)

            job_args = [
                'runCalibration.py', pairs_file, out_file,
                "--no-genjet-plots", '--stage2', '--no-correction-fit',
                '--PUmin', pu_min, '--PUmax', pu_max, '--etaInd', ind
            ]
            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])
            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'
        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir,
                              out_file=log_stem + '.out',
                              err_dir=log_dir,
                              err_file=log_stem + '.err',
                              log_dir=log_dir,
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(
            out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + calib_output_files
        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_args,
                        input_files=calib_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir,
                                                       '%s.status' % stem))
        for job in runCalib_jobs:
            calib_dag.add_job(job)
        # hadd only runs once every calibration job has finished
        calib_dag.add_job(hadder, requires=[j for j in runCalib_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    print 'ERROR: output file already exists - not submitting'
                    print 'FILE:', f
                    return 1

        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
    return status_files
def submit_resolution_dag(pairs_file, max_l1_pt, log_dir, append, pu_bins,
                          eta_bins, common_input_files, force_submit=False):
    """Submit one makeResolutionPlots DAG for one pairs file.

    This will run makeResolutionPlots over exclusive and inclusive eta bins,
    and then finally hadd the results together. One DAG is created and
    submitted per PU bin.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    max_l1_pt : int
        Maximum L1 pt to consider when making plots. (Currently only used to
        build filenames - the --maxPt job arguments are commented out below.)
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU
        bin); may contain {puMin}/{puMax}/{maxL1Pt} format placeholders.
    pu_bins : list[list[int, int]]
        List of PU bin edges; falsy means no PU cut.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Input files shared by all jobs in the JobSet.
    force_submit : bool, optional
        If True, forces job submission even if proposed output files already
        exist. Otherwise, program quits before submission.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir,
                             out_file=log_stem + '.out',
                             err_dir=log_dir,
                             err_file=log_stem + '.err',
                             log_dir=log_dir,
                             log_file=log_stem + '.log',
                             cpus=1,
                             memory='100MB',
                             disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)

        # Hold all output filenames
        res_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(
                **fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = [
                'makeResolutionPlots.py', pairs_file, out_file, '--excl',
                #'--maxPt', max_l1_pt,
                #'--PUmin', pu_min, '--PUmax', pu_max,
                '--etaInd', ind
            ]
            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add inclusive bins (central, forward, all)
        # remove the [0:1] to do all - currently central only 'cos HF broke
        for incl in ['central', 'forward', 'all'][0:1]:
            out_file = out_stem + "_%s" % incl + append.format(
                **fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = [
                'makeResolutionPlots.py', pairs_file, out_file, '--incl'
            ]  #, '--maxPt', max_l1_pt,
            # '--PUmin', pu_min, '--PUmax', pu_max]
            if incl != 'all':
                job_args.append('--%s' % incl)
            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'
        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir,
                              out_file=log_stem + '.out',
                              err_dir=log_dir,
                              err_file=log_stem + '.err',
                              log_dir=log_dir,
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(
            out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + res_output_files
        hadder = ht.Job(name='haddRes',
                        args=hadd_args,
                        input_files=res_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        # NOTE(review): unlike submit_runCalib_dag, the .dag/.status files are
        # not placed under log_dir here - confirm this is intentional.
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        for job in res_jobs:
            res_dag.add_job(job)
        # hadd only runs once every resolution job has finished
        res_dag.add_job(hadder, requires=[j for j in res_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + res_output_files:
                if os.path.isfile(f):
                    print 'ERROR: output file already exists - not submitting'
                    print 'FILE:', f
                    return 1

        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
    # NOTE(review): submit_runCalib_dag returns status_files here; this
    # function does not - possibly truncated or an oversight. Confirm.
HDFS_STORE = "/hdfs/user/%s/simple_cmssw_job" % os.environ['LOGNAME'] # Set location for logs LOG_STORE = "/storage/%s/simple_cmssw_job/logs" % os.environ['LOGNAME'] log_stem = 'simple.$(cluster).$(process)' # Define a JobSet object for all jobs running the same exe # with same configuration for logs, etc job_set = ht.JobSet( exe='edmDumpEventContent', copy_exe=False, setup_script='setup_cmssw.sh', filename=os.path.join(LOG_STORE, 'simple_cmssw_job.condor'), out_dir=LOG_STORE, out_file=log_stem + '.out', err_dir=LOG_STORE, err_file=log_stem + '.err', log_dir=LOG_STORE, log_file=log_stem + '.log', cpus=1, memory='50MB', disk='1', certificate=True, # !!! important for passing Grid certificate to jobs hdfs_store=HDFS_STORE) # Now add a Job # Note that in this scenario, we are accessing the file over XRootD, # and thus we don't need to add it to the input_files argument. job = ht.Job( name='cmssw_job', args=[ 'root://xrootd.unl.edu//store/mc/RunIISpring15Digi74/QCD_Pt_30to50_TuneCUETP8M1_13TeV_pythia8/GEN-SIM-RAW/AVE_20_BX_25ns_tsg_MCRUN2_74_V7-v1/00000/00228B32-44F0-E411-9FC7-0025905C3DCE.root'
def cmsRunCondor(in_args=sys.argv[1:]):
    """Creates a condor job description file with the correct arguments,
    and optionally submit it.

    Returns the DAGMan (or None if --dag not used) and the JobSet holding
    all cmsRun jobs.
    """
    parser = ArgParser(description=__doc__, formatter_class=CustomFormatter)
    args = parser.parse_args(args=in_args)
    if args.verbose:
        log.setLevel(logging.DEBUG)
    log.debug(args)

    check_args(args)

    # Why not just use args.lumiMask to hold result?
    run_list = parse_run_range(args.runRange) if args.runRange else None
    lumi_mask = setup_lumi_mask(args.lumiMask) if args.lumiMask else None
    log.debug("Run range: %s", run_list)
    log.debug("Lumi mask: %s", lumi_mask)

    ###########################################################################
    # Lookup dataset with das_client to determine number of files/jobs
    # but only if we're not profiling
    ###########################################################################
    # placehold vars
    total_num_jobs = 1
    filelist_filename, lumilist_filename = None, None
    # This could probably be done better!
    if not args.valgrind and not args.callgrind and not args.asIs:
        list_of_files, list_of_secondary_files = None, None
        list_of_lumis = None
        if args.unitsPerJob is None:
            raise RuntimeError(
                'You must specify an integer number of --unitsPerJob')

        if args.filelist:
            # Get files from user's file
            with open(args.filelist) as flist:
                list_of_files = [
                    DatasetFile(name=line.strip(), lumi_list=None)
                    for line in flist if line.strip()
                ]
            # totalUnits semantics: <0 means all files, (0,1) means a
            # fraction of the files, >=1 means an absolute count
            n_files = args.totalUnits
            if n_files < 0:
                n_files = None
            elif n_files < 1:
                n_files = int(round(n_files * len(list_of_files)))
            else:
                n_files = int(n_files)
                if n_files >= len(list_of_files):
                    raise IndexError(
                        "You cannot have more files than in the files:"
                        " use -1 (the default) if you want them all")
            # slicing by None keeps everything
            list_of_files = list_of_files[:n_files]
            filelist_filename = "filelist_user_%s.py" % (
                strftime("%H%M%S"))  # add time to ensure unique
        else:
            filelist_filename = generate_filelist_filename(args.dataset)
            lumilist_filename = generate_lumilist_filename(args.dataset)
            # Get list of files from DAS, also store corresponding lumis
            n_files = args.totalUnits if args.splitByFiles else -1
            list_of_files = get_list_of_files_from_das(args.dataset, n_files)
            log.debug("Pre lumi filter")
            log.debug(list_of_files)
            if run_list:
                list_of_files = filter_by_run_num(list_of_files, run_list)
            if lumi_mask:
                list_of_files = filter_by_lumi_list(list_of_files, lumi_mask)
            log.debug("After lumi filter")
            log.debug(list_of_files)
            if args.secondaryDataset:
                list_of_secondary_files = get_list_of_files_from_das(
                    args.secondaryDataset, -1)
                # do lumisection matching between primary and secondary
                # datasets
                for f in list_of_files:
                    f.parents = find_matching_files(list_of_secondary_files,
                                                    f.lumi_list)

        # figure out job grouping
        if args.splitByFiles:
            job_files = group_files_by_files_per_job(list_of_files,
                                                     args.unitsPerJob)
            total_num_jobs = len(job_files)
            create_filelist(job_files, filelist_filename)

            if lumilist_filename:
                # make an overall lumilist for all files in each job
                job_lumis = []
                for f in job_files:
                    tmp = f[0].lumi_list
                    for x in f[1:]:
                        tmp += x.lumi_list
                    job_lumis.append(tmp)
                create_lumilists(job_lumis, lumilist_filename)
        elif args.splitByLumis:
            # need to keep track of which files correspond with which lumi
            # this holds a map of {(run:LS) : DatasetFile}
            list_of_lumis = {}
            for f in list_of_files:
                for x in f.lumi_list.getLumis():
                    list_of_lumis[x] = f

            # choose the required number of lumis
            if 0 < args.totalUnits < 1:
                end = int(math.ceil(len(list_of_lumis) * args.totalUnits))
                list_of_lumis = {
                    k: list_of_lumis[k]
                    for k in list_of_lumis.keys()[0:end + 1]
                }
            elif args.totalUnits >= 1:
                list_of_lumis = {
                    k: list_of_lumis[k]
                    for k in list_of_lumis.keys()[0:int(args.totalUnits)]
                }

            # do job grouping
            job_files, job_lumis = group_files_by_lumis_per_job(
                list_of_lumis, args.unitsPerJob)
            total_num_jobs = len(job_files)
            create_filelist(job_files, filelist_filename)
            create_lumilists(job_lumis, lumilist_filename)

    log.info("Will be submitting %d jobs", total_num_jobs)

    ###########################################################################
    # Create sandbox of user's files
    ###########################################################################
    sandbox_local = "sandbox.tgz"
    additional_input_files = args.inputFile or []
    if lumilist_filename and os.path.isfile(lumilist_filename):
        additional_input_files.append(lumilist_filename)
    setup_sandbox(sandbox_local, args.config, filelist_filename,
                  additional_input_files)

    ###########################################################################
    # Setup DAG if needed
    ###########################################################################
    cmsrun_dag = None
    if args.dag:
        # NOTE(review): job_name is computed but never used afterwards in this
        # chunk - confirm whether it is dead code or used elsewhere.
        if args.filelist:
            job_name = os.path.splitext(os.path.basename(
                args.filelist))[0][:20]
        elif args.callgrind:
            job_name = "callgrind"
        elif args.valgrind:
            job_name = "valgrind"
        elif args.asIs:
            job_name = "cmsRun_%s" % strftime("%H%M%S")
        else:
            job_name = args.dataset[1:].replace("/", "_").replace("-", "_")
        status_filename = args.dag.replace(
            ".dag", "")  # TODO: handle if it doesn't end with .dag
        status_filename += ".status"
        cmsrun_dag = ht.DAGMan(filename=args.dag,
                               status_file=status_filename)

    ###########################################################################
    # Create Jobs
    ###########################################################################
    script_dir = os.path.dirname(__file__)
    cmsrun_jobs = ht.JobSet(
        exe=os.path.join(script_dir, 'cmsRun_worker.sh'),
        copy_exe=True,
        filename=args.condorScript,
        out_dir=args.logDir,
        out_file='cmsRun.$(cluster).$(process).out',
        err_dir=args.logDir,
        err_file='cmsRun.$(cluster).$(process).err',
        log_dir=args.logDir,
        log_file='cmsRun.$(cluster).$(process).log',
        cpus=1,
        memory='2GB',
        disk='3GB',
        # cpus=1, memory='1GB', disk='500MB',
        certificate=True,
        transfer_hdfs_input=True,
        share_exe_setup=True,
        common_input_files=[sandbox_local
                            ],  # EVERYTHING should be in the sandbox
        hdfs_store=args.outputDir)

    output_files = get_output_files_from_config(args.config)

    for job_ind in xrange(total_num_jobs):
        # Construct args to pass to cmsRun_worker.sh on the worker node
        args_dict = dict(output=args.outputDir, ind=job_ind)
        report_filename = "report{ind}.xml".format(**args_dict)
        args_dict['report'] = report_filename
        args_str = "-o {output} -i {ind} -a $ENV(SCRAM_ARCH) " \
                   "-c $ENV(CMSSW_VERSION) -r {report}".format(**args_dict)
        if args.lumiMask or args.runRange:
            if lumilist_filename:
                args_str += ' -l ' + os.path.basename(lumilist_filename)
            elif is_url(args.lumiMask):
                args_str += ' -l ' + args.lumiMask
        if args.asIs:
            args_str += ' -u'
        if args.valgrind:
            args_str += ' -m'
        if args.callgrind:
            args_str += ' -p'

        # warning: this must be aligned with whatever cmsRun_worker.sh does...
        job_output_files = [
            o.replace('.root', '_%d.root' % job_ind) for o in output_files
        ]
        job_output_files.append(report_filename)
        if args.callgrind or args.valgrind:
            job_output_files.append('callgrind.out.*')

        job = ht.Job(
            name='cmsRun_%d' % job_ind,
            args=args_str,
            input_files=None,
            # need the CMSSW_*/src since the output is produced there
            output_files=[
                os.path.join(os.environ['CMSSW_VERSION'], 'src', j)
                for j in job_output_files
            ],
            hdfs_mirror_dir=args.outputDir)
        cmsrun_jobs.add_job(job)
        if args.dag:
            cmsrun_dag.add_job(job, retry=5)

    ###########################################################################
    # Submit unless dry run
    ###########################################################################
    if not args.dry:
        if args.dag:
            cmsrun_dag.submit()
        else:
            cmsrun_jobs.submit()
        # Cleanup local files
        remove_file(sandbox_local)
        if filelist_filename:
            remove_file(filelist_filename)
        if lumilist_filename:
            remove_file(lumilist_filename)

    ###########################################################################
    # Return job properties
    ###########################################################################
    return cmsrun_dag, cmsrun_jobs
import htcondenser as ht

# Where job inputs/outputs are mirrored on HDFS.
HDFS_STORE = "/hdfs/user/%s/dag_example_common" % os.environ['LOGNAME']

# Where the STDOUT/STDERR/condor log files end up.
LOG_STORE = "/storage/%s/dag_example_common/logs" % os.environ['LOGNAME']

log_stem = 'simple.$(cluster).$(process)'

# One JobSet holds the configuration (executable, setup script, log
# locations, shared input files) common to every job added to it.
job_set = ht.JobSet(exe='runScript.sh',
                    copy_exe=True,
                    setup_script='setupScript.sh',
                    filename=os.path.join(LOG_STORE, 'simple_job.condor'),
                    out_dir=LOG_STORE,
                    out_file=log_stem + '.out',
                    err_dir=LOG_STORE,
                    err_file=log_stem + '.err',
                    log_dir=LOG_STORE,
                    log_file=log_stem + '.log',
                    share_exe_setup=True,
                    common_input_files=['example.txt'],
                    transfer_hdfs_input=False,
                    hdfs_store=HDFS_STORE)

# Four jobs, identical apart from the single letter passed as argument.
jobA, jobB, jobC, jobD = [ht.Job(name='job' + tag, args=tag)
                          for tag in 'ABCD']
for _job in (jobA, jobB, jobC, jobD):
    job_set.add_job(_job)
def add_hadd_jobs(dagman, jobs, final_file, log_dir):
    """Add necessary hadd jobs to DAG.

    All jobs will be hadded together to make `final_file`. DAGs can only
    accept a maximum number of arguments, so we have to split up hadd-ing
    into groups. Therefore we need an intermediate layer of hadd jobs, and
    then finally hadd those intermediate output files.

    Parameters
    ----------
    dagman : DAGMan
        DAGMan object to add jobs to.
    jobs : list[Job]
        Collection of Jobs to be hadd-ed together.
    final_file : str
        Final hadd-ed filename.
    log_dir : str
        Directory for the hadd jobs' STDOUT/STDERR/LOG files.

    Returns
    -------
    JobSet
        JobSet for hadd jobs.
    """
    group_size = 200  # max files per hadding job
    # adjust to avoid hadding 1 file by itself
    if len(jobs) % group_size == 0:
        group_size = 199
    # calculate number of intermediate hadd jobs required
    n_inter_jobs = int(math.ceil(len(jobs) * 1. / group_size))
    log_stem = 'matcherHadd.$(cluster).$(process)'
    hadd_jobs = ht.JobSet(exe='hadd', copy_exe=False,
                          filename='haddBig.condor',
                          setup_script=None,
                          out_dir=log_dir, out_file=log_stem + '.out',
                          err_dir=log_dir, err_file=log_stem + '.err',
                          log_dir=log_dir, log_file=log_stem + '.log',
                          cpus=1, memory='100MB', disk='1GB',
                          transfer_hdfs_input=False,
                          share_exe_setup=True,
                          hdfs_store=os.path.dirname(final_file))
    if n_inter_jobs == 1:
        # Everything fits in one hadd command: hadd straight to final_file.
        hadd_input = [j.output_files[0] for j in jobs]
        hadd_args = [final_file] + hadd_input
        hadd_job = ht.Job(name='finalHadd',
                          args=hadd_args,
                          input_files=hadd_input,
                          output_files=[final_file])
        hadd_jobs.add_job(hadd_job)
        dagman.add_job(hadd_job, requires=jobs)
    else:
        # Go through groups of Jobs, make intermediate hadd files in same dir
        # as final file
        intermediate_jobs = []
        for i, job_group in enumerate(grouper(jobs, group_size)):
            # Note, job_group is guaranteed to be length group_size, and is
            # padded with None if there aren't that many entries, so strip
            # the padding. BUGFIX: use a list comprehension, NOT filter() -
            # under Python 3 the lazy filter iterator would be exhausted by
            # the comprehension below, leaving `requires=job_group` empty
            # and silently dropping the DAG dependencies.
            job_group = [j for j in job_group if j is not None]
            hadd_input = [j.output_files[0] for j in job_group]
            inter_file = 'hadd_inter_%d_%s.root' % (i, cc.rand_str(5))
            inter_file = os.path.join(os.path.dirname(final_file), inter_file)
            hadd_args = [inter_file] + hadd_input
            hadd_job = ht.Job(name='interHadd%d' % i,
                              args=hadd_args,
                              input_files=hadd_input,
                              output_files=[inter_file])
            hadd_jobs.add_job(hadd_job)
            dagman.add_job(hadd_job, requires=job_group)
            intermediate_jobs.append(hadd_job)

        # Add final hadd job for intermediate files
        hadd_input = [j.output_files[0] for j in intermediate_jobs]
        hadd_args = [final_file] + hadd_input
        hadd_job = ht.Job(name='finalHadd',
                          args=hadd_args,
                          input_files=hadd_input,
                          output_files=[final_file])
        hadd_jobs.add_job(hadd_job)
        dagman.add_job(hadd_job, requires=intermediate_jobs)
    return hadd_jobs
def submit_matcher_dag(exe, ntuple_dir, log_dir, l1_dir, ref_dir, deltaR,
                       ref_min_pt, cleaning_cut, append, force_submit):
    """Submit one matcher DAG for one directory of ntuples.

    This will run `exe` over all Ntuple files and then hadd the results
    together.

    Parameters
    ----------
    exe : str
        Name of executable.
    ntuple_dir : str
        Name of directory with L1Ntuples to run over.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings
        (e.g. deltaR cut).
    l1_dir : str
        Name of TDirectory in Ntuple that holds L1 jets.
    ref_dir : str
        Name of TDirectory in Ntuple that holds reference jets.
    deltaR : float
        Maximum deltaR(L1, Ref) for a match.
    ref_min_pt : float
        Minimum pT cut on reference jets to be considered for matching.
    cleaning_cut : str
        Jet-cleaning cut passed via --cleanJets; falsy to disable.
    force_submit : bool
        If True, forces job submission even if proposed output files
        already exist. Otherwise, program quits before submission.

    Returns
    -------
    str
        Path of the DAG status file for this submission.

    Raises
    ------
    RuntimeError
        If any proposed output file already exists and `force_submit`
        is False.
    """
    # DAG for jobs; timestamp + random string keeps filenames unique
    stem = 'matcher_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
    matcher_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                            status_file=os.path.join(log_dir,
                                                     '%s.status' % stem))
    # JobSet for each matching job
    log_stem = 'matcher.$(cluster).$(process)'
    matcher_jobs = ht.JobSet(exe=find_executable(exe), copy_exe=True,
                             filename='submit_matcher.condor',
                             setup_script=None,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             share_exe_setup=True,
                             hdfs_store=ntuple_dir)

    # For creating filenames later
    fmt_dict = dict()

    # Hold all output filenames
    match_output_files = []

    # Additional files to copy across - JEC, etc
    common_input_files = []

    # Add matcher job for each ntuple file
    for ind, ntuple in enumerate(os.listdir(ntuple_dir)):
        # if ind > 10:
        #     break
        # Skip non-ntuple files (and previous pairs_* outputs in this dir)
        if not ntuple.endswith('.root') or ntuple.startswith('pairs'):
            continue
        ntuple_abspath = os.path.join(ntuple_dir, ntuple)

        # Construct output name
        ntuple_name = os.path.splitext(ntuple)[0]
        # handle anything up to first underscore (L1Tree, L1Ntuple, ...)
        result = re.match(r'^[a-zA-Z0-9]*_', ntuple_name)
        if result:
            # Swap the leading "<Prefix>_" for "pairs_"
            pairs_file = '%s_%s.root' % (ntuple_name.replace(
                result.group(), 'pairs_'), append.format(**fmt_dict))
        else:
            pairs_file = 'pairs_%s_%s.root' % (ntuple_name,
                                               append.format(**fmt_dict))
        out_file = os.path.join(ntuple_dir, pairs_file)
        match_output_files.append(out_file)

        # Add matching job
        job_args = ['-I', ntuple_abspath, '-O', out_file,
                    '--refDir', ref_dir, '--l1Dir', l1_dir,
                    '--draw 0', '--deltaR', deltaR,
                    '--refMinPt', ref_min_pt]
        if cleaning_cut:
            job_args.extend(['--cleanJets', cleaning_cut])
        input_files = common_input_files + [ntuple_abspath]
        match_job = ht.Job(name='match_%d' % ind,
                           args=job_args,
                           input_files=input_files,
                           output_files=[out_file])
        matcher_jobs.add_job(match_job)
        matcher_dag.add_job(match_job)

    # Construct final filename from the ntuple directory's basename
    # ---------------------------------------------------------------------
    final_file = 'pairs_%s_%s.root' % (os.path.basename(
        ntuple_dir.rstrip('/')), append.format(**fmt_dict))
    final_dir = os.path.join(os.path.dirname(ntuple_dir.rstrip('/')),
                             'pairs')
    cc.check_create_dir(final_dir, info=True)
    final_file = os.path.join(final_dir, final_file)
    log.info("Final file: %s", final_file)

    # Check if any of the output files already exists - maybe we mucked up?
    # ---------------------------------------------------------------------
    if not force_submit:
        for f in [final_file] + match_output_files:
            if os.path.isfile(f):
                raise RuntimeError(
                    'ERROR: output file already exists - not submitting.'
                    '\nTo bypass, use -f flag. \nFILE: %s' % f)

    # Add in hadding jobs
    # ---------------------------------------------------------------------
    hadd_jobs = add_hadd_jobs(matcher_dag, matcher_jobs.jobs.values(),
                              final_file, log_dir)

    # Add in job to delete individual and intermediate hadd files
    # (hadoop fs -rm), once the final hadd has finished.
    # ---------------------------------------------------------------------
    log_stem = 'matcherRm.$(cluster).$(process)'
    rm_jobs = ht.JobSet(exe='hadoop', copy_exe=False,
                        filename='submit_matcherRm.condor',
                        out_dir=log_dir, out_file=log_stem + '.out',
                        err_dir=log_dir, err_file=log_stem + '.err',
                        log_dir=log_dir, log_file=log_stem + '.log',
                        cpus=1, memory='100MB', disk='10MB',
                        transfer_hdfs_input=False,
                        share_exe_setup=False,
                        hdfs_store=ntuple_dir)
    # NOTE(review): this iterates a JobSet directly and slices another
    # (hadd_jobs[:-1] / hadd_jobs[-1]) - assumes htcondenser JobSet
    # supports __iter__ and __getitem__; confirm against the installed
    # htcondenser version.
    for i, job in enumerate(chain(matcher_jobs, hadd_jobs[:-1])):
        pairs_file = job.output_files[0]
        # Strip the local /hdfs mount prefix to get the HDFS-side path
        rm_job = ht.Job(name='rm%d' % i,
                        args=' fs -rm -skipTrash %s' %
                        pairs_file.replace('/hdfs', ''))
        rm_jobs.add_job(rm_job)
        matcher_dag.add_job(rm_job, requires=hadd_jobs[-1])

    # Submit
    # ---------------------------------------------------------------------
    # matcher_dag.write()
    matcher_dag.submit()
    return matcher_dag.status_file
# Output files for this job end up here on HDFS.
HDFS_STORE = "/hdfs/user/%s/simple_root6_job" % os.environ['LOGNAME']
# Condor stdout/stderr/log files end up here on /storage.
LOG_STORE = "/storage/%s/simple_root6_job/logs" % os.environ['LOGNAME']

# Common stem for the condor log filenames; $(cluster)/$(process) are
# expanded by HTCondor per job.
log_stem = 'simple.$(cluster).$(process)'

# All three condor log streams share the same directory and stem.
condor_logs = dict(out_dir=LOG_STORE, out_file=log_stem + '.out',
                   err_dir=LOG_STORE, err_file=log_stem + '.err',
                   log_dir=LOG_STORE, log_file=log_stem + '.log')

# One JobSet for every job that runs the same executable with the same
# log/resource configuration.
job_set = ht.JobSet(exe='root',
                    copy_exe=False,
                    setup_script=None,  # e.g. 'setup_root6.sh' if required
                    filename='simple_root6_job.condor',
                    cpus=1, memory='50MB', disk='1',
                    hdfs_store=HDFS_STORE,
                    **condor_logs)

# Single job: run ROOT in batch mode over hist.C, keeping the histogram
# PDF and the tree file it writes.
job = ht.Job(name='root6_job',
             args=['-l', '-q', '-b', 'hist.C'],
             input_files=['hist.C'],
             output_files=['hist.pdf', 'simple_tree.root'],
             quantity=1)
job_set.add_job(job)
# Output files for this job end up here on HDFS.
HDFS_STORE = "/hdfs/user/%s/simple_exe_job" % os.environ['LOGNAME']
# Condor stdout/stderr/log files end up here on /storage.
LOG_STORE = "/storage/%s/simple_exe_job/logs" % os.environ['LOGNAME']

# Common stem for the condor log filenames; $(cluster)/$(process) are
# expanded by HTCondor per job.
log_stem = 'simple.$(cluster).$(process)'

# All three condor log streams share the same directory and stem.
condor_logs = dict(out_dir=LOG_STORE, out_file=log_stem + '.out',
                   err_dir=LOG_STORE, err_file=log_stem + '.err',
                   log_dir=LOG_STORE, log_file=log_stem + '.log')

# One JobSet for every job that runs the same executable with the same
# log/resource configuration; the exe itself gets shipped with the job.
job_set = ht.JobSet(exe='showsize',
                    copy_exe=True,
                    setup_script=None,
                    filename=os.path.join(LOG_STORE, 'simple_exe_job.condor'),
                    cpus=1, memory='50MB', disk='1',
                    hdfs_store=HDFS_STORE,
                    **condor_logs)

# A single argument-less job relying entirely on the JobSet defaults.
job = ht.Job(name='job_exe')
job_set.add_job(job)

# Submit everything to condor.
job_set.submit()
# Per-day log directory on /storage; output goes to the user's HDFS area.
LOG_DIR = '/storage/%s/CMSSW/%s' % (os.environ['LOGNAME'],
                                    strftime("%d_%b_%y"))
OUT_DIR = '/hdfs/user/%s' % (os.environ['LOGNAME'])

if __name__ == "__main__":
    # Get output ntuple file from config file: import the CMSSW python
    # config as a module and read its TFileService filename.
    config = importlib.import_module(os.path.splitext(CONFIG)[0])
    output_file = config.process.TFileService.fileName.value()
    # BUGFIX: the original used a Python-2-only print statement
    # ("print 'Output file:', output_file"), which is a SyntaxError under
    # Python 3. This form prints the same text on both 2 and 3.
    print('Output file: %s' % output_file)

    # Create job: a single cmsRun invocation with grid certificate access.
    job_set = ht.JobSet(exe='cmsRun', copy_exe=False,
                        certificate=True,
                        out_dir=LOG_DIR, err_dir=LOG_DIR, log_dir=LOG_DIR,
                        cpus=1, memory='200MB', disk='200MB',
                        hdfs_store=OUT_DIR,
                        filename='cmsRun.condor')
    job = ht.Job(name='cmsRunJob',
                 args=[CONFIG],
                 input_files=[CONFIG],
                 output_files=[output_file],
                 hdfs_mirror_dir=OUT_DIR)
    job_set.add_job(job)
    job_set.submit()