def _load_raw(branches):
    """Load this job's slice of events from every input file and return
    them concatenated as a single numpy record array.

    For each input file the job reads the half-open event range
    ``[step * jobid, step * (jobid + 1))`` where ``step = ceil(frac * n)``,
    retrying a failed read up to 5 times with a 10 s pause between attempts.

    NOTE(review): this function reads ``md``, ``frac``, ``jobid`` and
    ``batch_mode`` from an enclosing scope not visible here — presumably it
    is (or was) nested inside the converter entry point; confirm before
    moving it.

    Parameters
    ----------
    branches : list of str
        Branch names to read from the ROOT tree.

    Raises
    ------
    RuntimeError
        If a file still cannot be read after 5 attempts.
    """
    pieces = []
    for fn, n in zip(md.inputfiles, md.num_events):
        step = int(math.ceil(frac * n))
        start = step * jobid
        stop = start + step
        if start >= n:
            # This job's slice lies entirely past the end of this file.
            continue
        filepath = xrd(fn) if batch_mode else fn
        for _ in range(5):
            try:
                a = root2array(filepath, treename=md.treename, selection=md.selection,
                               branches=branches, start=start, stop=stop)
                break
            # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit;
            # transient xrootd/IO errors are what we actually want to retry on
            except Exception:
                logging.error('Error reading %s:\n%s' % (filepath, traceback.format_exc()))
                time.sleep(10)
        else:
            # all 5 attempts failed
            raise RuntimeError('Cannot read file %s' % filepath)
        pieces.append(a)
    rec = np.concatenate(pieces)
    return rec
def submit(args):
    """Prepare an HTCondor submission for the conversion jobs.

    Writes three files into ``args.jobdir``:
      * ``runjob.sh``   -- the per-job wrapper script (fresh submission only),
      * ``submit.txt`` / ``resubmit.txt`` -- the list of job ids to queue,
      * ``submit.cmd``  -- the condor description file.

    It does NOT call ``condor_submit`` itself; the command to run is printed
    at the end.
    """
    scriptfile = os.path.join(args.jobdir, 'runjob.sh')
    metadatafile = os.path.join(args.jobdir, args.metadata)
    if not args.resubmit:
        from helper import xrd
        md, njobs = update_metadata(args)
        # Shell wrapper executed on the worker node: activate the conda env,
        # run the converter, and (for /eos output) copy the .h5 files back
        # via xrdcp on success.
        script = \
            '''#!/bin/bash
jobid=$1
workdir=`pwd`
echo `hostname`
echo "workdir: $workdir"
echo "args: $@"
ls -l
export PATH={conda_path}:$PATH
source activate {conda_env_name}
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
python {script} {outputdir} $jobid -n {events} {test_sample}
status=$?
echo "Status = $status"
ls -l
if [ $status -ne 0 ]; then
    exit $status
else
    echo {xrdcp}
fi
exit $status
'''.format(conda_path=args.conda_path,
           conda_env_name=args.conda_env_name,
           script=os.path.abspath('converter.py'),
           outputdir=args.outputdir,
           events=args.events_per_file,
           test_sample='--test-sample' if args.test_sample else '',
           # only stage out via xrdcp when writing to EOS
           xrdcp='' if not args.outputdir.startswith('/eos') else 'xrdcp -np *.h5 %s ; rm *.h5' % (xrd(args.outputdir) + '/')
           )
        with open(scriptfile, 'w') as f:
            f.write(script)
        os.system('chmod +x %s' % scriptfile)
        jobids = [str(jobid) for jobid in range(njobs)]
        jobids_file = os.path.join(args.jobdir, 'submit.txt')
    else:
        # resubmit: scan the condor .log files and requeue only the jobs
        # that were removed/aborted or exited with a non-zero return value
        jobids = []
        jobids_file = os.path.join(args.jobdir, 'resubmit.txt')
        log_files = [f for f in os.listdir(args.jobdir) if f.endswith('.log')]
        for fn in log_files:
            with open(os.path.join(args.jobdir, fn)) as logfile:
                errormsg = None
                # scan newest-to-oldest so only the latest attempt counts
                for line in reversed(logfile.readlines()):
                    if 'Job removed' in line or 'aborted' in line:
                        errormsg = line
                    if 'Job submitted from host' in line:
                        # if seeing this first: the job has been resubmitted
                        break
                    if 'return value' in line:
                        if 'return value 0' not in line:
                            errormsg = line
                        break
                if errormsg:
                    logging.debug(fn + '\n ' + errormsg)
                    # log files are named <jobid>.log
                    jobids.append(fn.split('.')[0])
                    # NOTE(review): assert is stripped under `python -O`;
                    # an explicit check would be safer for input validation
                    assert jobids[-1].isdigit()
    with open(jobids_file, 'w') as f:
        f.write('\n'.join(jobids))

    # HTCondor job description; one job per id listed in jobids_file.
    # NOTE(review): `outputdir` is passed to .format() but not referenced
    # by this template.
    condordesc = '''\
universe = vanilla
requirements = (Arch == "X86_64") && (OpSys == "LINUX")
request_disk = 10000000
executable = {scriptfile}
arguments = $(jobid)
transfer_input_files = {metadatafile}
output = {jobdir}/$(jobid).out
error = {jobdir}/$(jobid).err
log = {jobdir}/$(jobid).log
use_x509userproxy = true
+MaxRuntime = 172800
Should_Transfer_Files = YES
queue jobid from {jobids_file}
'''.format(scriptfile=os.path.abspath(scriptfile),
           metadatafile=os.path.abspath(metadatafile),
           jobdir=os.path.abspath(args.jobdir),
           outputdir=args.outputdir,
           jobids_file=os.path.abspath(jobids_file)
           )
    condorfile = os.path.join(args.jobdir, 'submit.cmd')
    with open(condorfile, 'w') as f:
        f.write(condordesc)
    print('Run the following command to submit the jobs:\ncondor_submit {condorfile}'.format(condorfile=condorfile))
def writeData(md, outputdir, jobid, batch_mode=False, test_sample=False, events=200000, dryrun=False):
    """Convert this job's slice of the input ROOT files to one HDF5 file.

    Parameters
    ----------
    md : metadata object
        Provides input file list, event counts, tree/selection, and the
        branch/label/weight configuration (``var_branches``, ``num_events``, ...).
    outputdir : str
        Directory for the final ``train_file_<jobid>.h5`` / ``test_file_<jobid>.h5``.
    jobid : int
        Index of this job; selects which slice of each input file to read.
    batch_mode : bool
        If True, read inputs via xrootd and write the output into the
        current working directory (condor transfers/stages it out).
    test_sample : bool
        If True, name the output ``test_*`` and skip shuffling.
    events : int
        Approximate total number of events per output file.
    dryrun : bool
        If True, do everything except actually writing the HDF5 file.
    """
    from root_numpy import root2array

    def _write(rec, output):
        # Write labels, weights, (transformed) variables and optional images
        # into a fresh HDF5 file.
        logging.debug(log_prefix + 'Start making output file')
        with tables.open_file(output, mode='w') as h5file:
            _make_labels(md, rec, h5file)
            logging.debug(log_prefix + 'Start producing weights')
            _make_weight(md, rec, h5file)
            _make_class_weight(md, rec, h5file)
            logging.debug(log_prefix + 'Start transforming variables')
            _transform_var(md, rec, h5file, md.var_no_transform_branches, no_transform=True)
            _transform_var(md, rec, h5file, md.var_branches)
            if md.var_img:
                logging.debug(log_prefix + 'Start making images')
                _make_image(md, rec, h5file, output='img')

    log_prefix = '[%d] ' % jobid
    outname = '{type}_file_{jobid}.h5'.format(
        type='test' if test_sample else 'train', jobid=jobid)
    output = os.path.join(outputdir, outname)
    if os.path.exists(output) and os.path.getsize(output) > 100 * 1024 * 1024:
        # ignore if > 100M: treat an existing large file as a completed job
        logging.info(log_prefix + 'File %s already exist! Skipping.' % output)
        return

    # Fraction of each input file this job reads, chosen so all jobs
    # together cover ~`events` events per output file.
    frac = float(events) / sum(md.num_events)

    use_branches = set(md.var_branches + md.var_no_transform_branches
                       + md.label_branches + md.reweight_classes + md.reweight_var)
    if md.var_img:
        use_branches |= set([md.var_img] + md.var_pos)

    logging.debug(log_prefix + 'Start loading from root files')
    pieces = []
    for fn, n in zip(md.inputfiles, md.num_events):
        step = int(math.ceil(frac * n))
        start = step * jobid
        stop = start + step
        if start >= n:
            # this job's slice lies past the end of this file
            continue
        filepath = xrd(fn) if batch_mode else fn
        # root2array expects a list of branch names, not a set
        a = root2array(filepath, treename=md.treename, selection=md.selection,
                       branches=list(use_branches), start=start, stop=stop)
        pieces.append(a)
    if not pieces:
        # no file contributed events for this jobid; np.concatenate([])
        # would raise ValueError, so bail out cleanly instead
        return
    rec = np.concatenate(pieces)
    if rec.shape[0] == 0:
        return

    if not test_sample:
        # important: shuffle the array if not for testing
        np.random.shuffle(rec)

    if batch_mode:
        # write into the worker's cwd; condor stages the file out afterwards
        if not dryrun:
            _write(rec, outname)
        logging.info(log_prefix + 'Writing output to: \n' + outname)
    else:
        # write to a temp name and rename, so readers never see a partial file
        output_tmp = output + '.tmp'
        if not dryrun:
            _write(rec, output_tmp)
            os.rename(output_tmp, output)
        logging.info(log_prefix + 'Writing output to: \n' + output)
    logging.info(log_prefix + 'Done!')
def submit(args):
    """Prepare an HTCondor submission for the conversion jobs (tarball variant).

    Compared with the plain variant, this one packs the whole preprocessing
    directory into a gzipped tarball that is shipped to the worker node, and
    supports an LCG software environment as an alternative to conda.

    Writes into ``args.jobdir``: the tarball, ``runjob.sh`` (fresh submission
    only), ``submit.txt``/``resubmit.txt`` with the job ids, and ``submit.cmd``.
    Does NOT call ``condor_submit``; the command to run is printed at the end.
    """
    scriptfile = os.path.join(args.jobdir, 'runjob.sh')
    metadatafile = os.path.join(args.jobdir, args.metadata)
    tarball = os.path.join(args.jobdir, args.tarball)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    import tarfile
    # Ship the whole preprocessing package so the worker can run it in place.
    with tarfile.open(tarball, mode='w:gz') as archive:
        archive.add(dir_path, arcname='preprocessing', recursive=True)
    if not args.resubmit:
        from helper import xrd
        md, njobs = update_metadata(args)
        if 'LCG_VERSION' in os.environ:
            # Running from an LCG environment: source the same LCG setup on
            # the worker and use packages bundled in the tarball.
            # NOTE(review): the python3.6 site-packages path is hard-coded.
            env_setup = 'source %s\n' % args.lcg_env
            env_setup += 'tar xvzf preprocessing.tar.gz\n'
            env_setup += 'export PYTHONPATH=`pwd`/preprocessing/.local/lib/python3.6/site-packages:$PYTHONPATH\n'
        else:
            env_setup = '''export PATH={conda_path}:$PATH
source activate {conda_env_name}'''.format(conda_path=args.conda_path,
                                           conda_env_name=args.conda_env_name)
        # Shell wrapper executed on the worker node: set up the environment,
        # run the converter, and (for /eos output) copy the .h5 files back
        # via xrdcp on success.
        script = \
            '''#!/bin/bash
jobid=$1
workdir=`pwd`
echo `hostname`
echo "workdir: $workdir"
echo "args: $@"
ls -l
{env_setup}
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
python {script} {outputdir} $jobid -n {events} {test_sample}
status=$?
echo "Status = $status"
ls -l
if [ $status -ne 0 ]; then
    exit $status
else
    echo {xrdcp}
fi
exit $status
'''.format(env_setup=env_setup,
           # relative path: the converter comes from the unpacked tarball
           script='preprocessing/converter.py',
           outputdir=args.outputdir,
           events=args.events_per_file,
           test_sample='--test-sample' if args.test_sample else '',
           # only stage out via xrdcp when writing to EOS
           xrdcp='' if not args.outputdir.startswith('/eos') else 'xrdcp -np *.h5 %s ; rm *.h5' % (xrd(args.outputdir) + '/')
           )
        with open(scriptfile, 'w') as f:
            f.write(script)
        os.system('chmod +x %s' % scriptfile)
        jobids = [str(jobid) for jobid in range(njobs)]
        jobids_file = os.path.join(args.jobdir, 'submit.txt')
    else:
        # resubmit: scan the condor .log files and requeue only the jobs
        # that were removed/aborted or exited with a non-zero return value
        jobids = []
        jobids_file = os.path.join(args.jobdir, 'resubmit.txt')
        log_files = [f for f in os.listdir(args.jobdir) if f.endswith('.log')]
        for fn in log_files:
            with open(os.path.join(args.jobdir, fn)) as logfile:
                errormsg = None
                # scan newest-to-oldest so only the latest attempt counts
                for line in reversed(logfile.readlines()):
                    if 'Job removed' in line or 'aborted' in line:
                        errormsg = line
                    if 'Job submitted from host' in line:
                        # if seeing this first: the job has been resubmitted
                        break
                    if 'return value' in line:
                        if 'return value 0' not in line:
                            errormsg = line
                        break
                if errormsg:
                    logging.debug(fn + '\n ' + errormsg)
                    # log files are named <jobid>.log
                    jobids.append(fn.split('.')[0])
                    # NOTE(review): assert is stripped under `python -O`;
                    # an explicit check would be safer for input validation
                    assert jobids[-1].isdigit()
    with open(jobids_file, 'w') as f:
        f.write('\n'.join(jobids))

    # HTCondor job description; one job per id listed in jobids_file.
    # NOTE(review): `outputdir` is passed to .format() but not referenced
    # by this template.
    condordesc = '''\
universe = vanilla
request_disk = 10000000
request_memory = 8192
executable = {scriptfile}
arguments = $(jobid)
transfer_input_files = {metadatafile},{tarball}
output = {jobdir}/$(jobid).out
error = {jobdir}/$(jobid).err
log = {jobdir}/$(jobid).log
use_x509userproxy = true
+MaxRuntime = 24000
Should_Transfer_Files = YES
queue jobid from {jobids_file}
'''.format(scriptfile=os.path.abspath(scriptfile),
           metadatafile=os.path.abspath(metadatafile),
           tarball=os.path.abspath(tarball),
           jobdir=os.path.abspath(args.jobdir),
           outputdir=args.outputdir,
           jobids_file=os.path.abspath(jobids_file)
           )
    condorfile = os.path.join(args.jobdir, 'submit.cmd')
    with open(condorfile, 'w') as f:
        f.write(condordesc)
    print('Run the following command to submit the jobs:\n condor_submit {condorfile}'.format(condorfile=condorfile))