def compare(fn1, fn2): pair_name = '%s->%s' % (fn1.replace('.gz',''), fn2.replace('.gz','')) process1, names1, ver1 = parse(fn1) process2, names2, ver2 = parse(fn2) if ver1 != ver2: print colors.yellow('%s changed versions %s to %s' % (pair_name, ver1, ver2)) commn = [n for n in names1 if n in names2] commn_diff = [] for x in commn: o1, o2 = [getattr(p, x).dumpPython() for p in process1, process2] if o1 != o2: if not commn_diff: print colors.yellow(pair_name + ' changed these:\n') print colors.yellow(x) print 'process1.%s =' % x, o1 print 'process2.%s =' % x, o2 commn_diff.append(x) added = [n for n in names2 if n not in names1] deled = [n for n in names1 if n not in names2] if added: print colors.yellow('%s added these: %s\n' % (pair_name, ' '.join(added))) for x in added: print 'process2.%s =' % x, getattr(process2, x).dumpPython() if deled: print colors.yellow('%s deled these: %s\n' % (pair_name, ' '.join(deled))) for x in deled: print 'process1.%s =' % x, getattr(process1, x).dumpPython() return commn_diff or added or deled
def crab_get_njobs_from_log(working_dir, jobs_re=re.compile(r'\([\d ]+/([\d ]+)\)')): # find njobs using a line printed as result of crab status that looks like ( 76/788) njobs = [] for line in crab_log_open(working_dir): mo = jobs_re.search(line) if mo: njobs.append(int(mo.group(1))) if not njobs: raise ValueError('problem parsing crab.log in wd=%s for njobs' % working_dir) if crab_global_options.support_automatic_splitting: # njobs may only increase at later parts of the log # this should handle how crab automatic splitting resubmission jobs work for a, b in zip(njobs, njobs[1:]): if a > b: print colors.red('crab.log wd=%s has decreasing njobs: %r' % (working_dir, njobs)) if len(set(njobs)) != 1: print colors.yellow( 'crab_get_njobs_from_log for %s found more than one value: %r\n\tThis may have happened because of Automatic splitting. Support is still experimental, scrutinize the output well.' % (working_dir, sorted(set(njobs)))) elif len(set(njobs)) != 1: raise ValueError('problem parsing crab.log in wd=%s for njobs: %r' % (working_dir, njobs)) return njobs[-1]
def cs_report(wd, partial=False): njobs = cs_njobs(wd) lls = [] for i in xrange(njobs): fjr_fn = os.path.join(wd, 'fjr_%i.xml' % i) if os.path.isfile(fjr_fn): lls.append((i, fjr2ll(fjr_fn))) elif partial: print colors.yellow('missing fjr %s but partial allowed' % fjr_fn) else: raise IOError('missing fjr %s' % fjr_fn) for (ia, lla), (ib, llb) in combinations(lls, 2): if lla & llb: problem = 'problem with fjrs for %s: overlap found in pair %i + %i\n' % ( wd, ia, ib) problem += repr((ia, lla)) + '\n' problem += repr((ib, llb)) + '\n' problem += 'and ' + repr(lla & llb) + '\n' raise ValueError(problem) ll_all = lls.pop()[1] for _, ll in lls: ll_all |= ll ll_all.writeJSON(os.path.join(wd, 'processedLumis.json')) return ll_all
def runem(cb): for dataset in datasets: for sample in samples: if not sample.has_dataset(dataset): print colors.yellow('no dataset %s for %s' % (dataset, sample.name)) continue sample.set_curr_dataset(dataset) cb(dataset, sample)
def crab_fjr_json_to_ll(fn): print colors.yellow('this is not fully tested') j = crab_fjr_json(fn) ll = LumiList() for x in j['steps']['cmsRun']['input']['source']: x2 = defaultdict(list) for k,v in x['runs'].iteritems(): for l in v.keys(): x2[int(k)].append(int(l)) ll += LumiList(runsAndLumis=x2) return ll
def crab_fjr_json_to_ll(fn): print colors.yellow('this is not fully tested') j = crab_fjr_json(fn) ll = LumiList() for x in j['steps']['cmsRun']['input']['source']: x2 = defaultdict(list) for k, v in x['runs'].iteritems(): for l in v.keys(): x2[int(k)].append(int(l)) ll += LumiList(runsAndLumis=x2) return ll
def crab_get_njobs_from_log(working_dir, jobs_re=re.compile(r'\([\d ]+/([\d ]+)\)')): # find njobs using a line printed as result of crab status that looks like ( 76/788) njobs = [] for line in crab_log_open(working_dir): mo = jobs_re.search(line) if mo: njobs.append(int(mo.group(1))) if not njobs: raise ValueError('problem parsing crab.log in wd=%s for njobs' % working_dir) if crab_global_options.support_automatic_splitting: # njobs may only increase at later parts of the log # this should handle how crab automatic splitting resubmission jobs work for a, b in zip(njobs, njobs[1:]): if a > b: print colors.red('crab.log wd=%s has decreasing njobs: %r' % (working_dir, njobs)) if len(set(njobs)) != 1: print colors.yellow('crab_get_njobs_from_log for %s found more than one value: %r\n\tThis may have happened because of Automatic splitting. Support is still experimental, scrutinize the output well.' % (working_dir, sorted(set(njobs)))) elif len(set(njobs)) != 1: raise ValueError('problem parsing crab.log in wd=%s for njobs: %r' % (working_dir, njobs)) return njobs[-1]
def compare(fn1, fn2): pair_name = '%s->%s' % (fn1.replace('.gz', ''), fn2.replace('.gz', '')) process1, names1, ver1 = parse(fn1) process2, names2, ver2 = parse(fn2) if ver1 != ver2: print colors.yellow('%s changed versions %s to %s' % (pair_name, ver1, ver2)) commn = [n for n in names1 if n in names2] commn_diff = [] for x in commn: o1, o2 = [getattr(p, x).dumpPython() for p in process1, process2] if o1 != o2: if not commn_diff: print colors.yellow(pair_name + ' changed these:\n') print colors.yellow(x) print 'process1.%s =' % x, o1 print 'process2.%s =' % x, o2 commn_diff.append(x) added = [n for n in names2 if n not in names1] deled = [n for n in names1 if n not in names2] if added: print colors.yellow('%s added these: %s\n' % (pair_name, ' '.join(added))) for x in added: print 'process2.%s =' % x, getattr(process2, x).dumpPython() if deled: print colors.yellow('%s deled these: %s\n' % (pair_name, ' '.join(deled))) for x in deled: print 'process1.%s =' % x, getattr(process1, x).dumpPython() return commn_diff or added or deled
def crab_hadd(working_dir, new_name=None, new_dir=None, raise_on_empty=False, chunk_size=900, pattern=None, lpc_shortcut=False, range_filter=None): working_dir, new_name, new_dir = crab_hadd_args(working_dir, new_name, new_dir) expected, files = crab_hadd_files(working_dir, lpc_shortcut, range_filter=range_filter) result = HaddBatchResult('crab', working_dir, new_name, new_dir, expected, files) print '%s: expecting %i files if all jobs succeeded' % (working_dir, expected) if pattern: if '/' not in pattern: pattern = '*/' + pattern files = fnmatch.filter(files, pattern) automatic_splitting = False pprinted = False jobs = [] for f in files: jobnum = f.split('_')[-1].split('.root')[0] if crab_global_options.support_automatic_splitting and '-' in jobnum: automatic_splitting = True if not pprinted: pprint(files) pprinted = True it, jobnum = jobnum.split('-') it, jobnum = int(it), int(jobnum) assert it >= 1 # probe jobs "0-*" should not show up jobnum = it * 10000 + jobnum else: jobnum = int(jobnum) jobs.append(jobnum) jobs.sort() expected = range(1, expected + 1) if jobs != expected: print '\033[36;7m %i files found %s not what expected \033[m' % ( len(jobs), crabify_list(jobs)) missing = sorted(set(expected) - set(jobs)) print '\033[36;7m %i missing: %r \033[m' % (len(missing), ' '.join( str(j) for j in missing)) l = len(files) if l == 0: result.success = False msg = 'crab_hadd: no files found in %s' % working_dir if raise_on_empty: raise CRABToolsException(msg) else: print '\033[36;7m', msg, '\033[m' elif l == 1: print working_dir, ': just one file found, copying' cmd = 'xrdcp -s %s %s' % (files[0], new_name) result.success = os.system(cmd) == 0 if result.success and not new_name.startswith('root://'): os.chmod(new_name, 0644) else: result.success = hadd(new_name, files) if automatic_splitting: n = norm_from_file(new_name) sn, s = fn_to_sample(Samples, new_name) if not s: print colors.yellow( "\tnorm_from_file returns %r, couldn't get sample %s" % (n, sn)) else: no1, 
no2 = s.datasets['main'].nevents_orig, s.datasets[ 'miniaod'].nevents_orig if n == no1 or n == no2: print '\tnorm_from_file returns nevents_orig = %i' % n else: print colors.yellow( '\tnorm_from_file returns %r while %s.nevents_orig is %i (main) %i (miniaod' % (n, sn, no1, no2)) return result
# Top-level script: for each dataset key in dses, go through all registered
# samples and inspect which sites host the dataset, ordered by how complete
# each site's copy is (per DBS).
dses = ['miniaod']
for ds in dses:
    print colors.bold(ds)
    for sample in Samples.registry.all():
        if not sample.has_dataset(ds):
            continue
        sample.set_curr_dataset(ds)
        # skip placeholder ('/None/') datasets and privately produced samples
        if '/None/' in sample.dataset or getattr(sample, 'is_private', False):
            continue
        try:
            sites = DBS.sites_for_dataset(sample.dataset, instance=sample.dbs_inst, json=True)
        except (RuntimeError, ValueError):
            print colors.yellow('%s %s DBS problem' % (sample.name, sample.dataset))
            continue
        if not sites:
            continue
        # trailing comma: keep the site report on the same output line as the name
        print sample.name,
        # sort sites by completion so the most complete copy is last
        sites.sort(key=lambda site: DBS.site_completions(site, True))
        max_site_completion = DBS.site_completions(sites[-1], True)
        found = False
        for site in sites:
            if DBS.site_is_tape(site):
                continue
            is_complete = DBS.complete_at_site(site)
            # NOTE(review): this chunk appears truncated here -- 'found',
            # 'is_complete' and 'is_good_as_possible' are computed but never
            # used in the visible code; confirm against the full file.
            is_good_as_possible = DBS.site_completions(site) >= max_site_completion
def main(samples_registry):
    """Command-line driver for sample utilities.

    Dispatches on keywords found in argv: merge, printmissing, ds, file,
    nevents, site, samplefiles. Operates on the samples selected from
    samples_registry via the remaining command-line arguments.
    """
    from glob import glob
    from sys import argv
    from pprint import pprint
    from JMTucker.Tools import colors
    if 'merge' in argv:
        # hadd-style merge of the selected samples' root files
        samples = samples_registry.from_argv(from_root_fns=True, raise_if_none=True)
        # output filename: first .root argument that is not an existing file
        out_fn = [x for x in argv if x.endswith('.root') and not os.path.isfile(x)]
        out_fn = out_fn[0] if out_fn else 'merge.root'
        norm_to = typed_from_argv(float, default_value=1.)
        norm_path = typed_from_argv(str, default_value='', name='norm_path')
        merge(samples, output=out_fn, norm_to=norm_to, norm_path=norm_path)
    elif 'printmissing' in argv:
        # report samples lacking a batch directory and/or a root file
        samples = [s.name for s in samples_registry.from_argv(raise_if_none=True)]
        samples.sort()
        look_for_root_files = 'no_root' not in sys.argv
        no_batch_dir, no_root_file = [], []
        for s in samples:
            if not os.path.isdir('condor_' + s) and not glob('crab_*_' + s):
                no_batch_dir.append(s)
            if not os.path.isfile('%s.root' % s):
                no_root_file.append(s)
        if no_batch_dir:
            print colors.yellow('no batch dir for these:')
            for s in no_batch_dir:
                print s
        if look_for_root_files and no_root_file:
            print colors.yellow('no root file for these:')
            for s in no_root_file:
                print s
    elif 'ds' in argv:
        # print the dataset path for exactly one sample
        samples = samples_registry.from_argv(raise_if_none=True)
        if len(samples) != 1:
            raise ValueError('must have exactly one sample in argv')
        sample = samples[0]
        # the dataset key is the argv element right after the sample name
        dataset = argv[argv.index(sample.name)+1]
        if not sample.has_dataset(dataset):
            raise KeyError('no dataset %s in %s' % (dataset, sample))
        print sample.datasets[dataset].dataset
    elif 'file' in argv:
        # print the first few (default 5) filenames for exactly one sample
        samples = samples_registry.from_argv(raise_if_none=True)
        if len(samples) != 1:
            raise ValueError('must have exactly one sample in argv')
        sample = samples[0]
        dataset = argv[argv.index(sample.name)+1]
        if not sample.has_dataset(dataset):
            raise KeyError('no dataset %s in %s' % (dataset, sample))
        sample.set_curr_dataset(dataset)
        for x in sample.filenames[:typed_from_argv(int, 5)]:
            print x
    elif 'nevents' in argv:
        # query DBS for the event count of exactly one sample's dataset
        samples = samples_registry.from_argv(raise_if_none=True)
        if len(samples) != 1:
            raise ValueError('must have exactly one sample in argv')
        sample = samples[0]
        dataset = argv[argv.index(sample.name)+1]
        if not sample.has_dataset(dataset):
            raise KeyError('no dataset %s in %s' % (dataset, sample))
        sample.set_curr_dataset(dataset)
        print DBS.numevents_in_dataset(sample.dataset)
    elif 'site' in argv:
        # per-sample listing of hosting sites with completion information
        samples = samples_registry.from_argv(raise_if_none=True)
        dataset = samples_registry.datasets_from_argv()
        if len(dataset) > 1:
            raise ValueError('only zero/one dataset allowed')
        dataset = dataset[0] if len(dataset) == 1 else 'main'
        mlen = max(len(s.name) for s in samples)
        for sample in samples:
            sample.set_curr_dataset(dataset)
            try:
                sites = DBS.sites_for_dataset(sample.dataset, json=True)
            except RuntimeError:
                print sample.name, 'PROBLEM'
                continue
            # trailing comma keeps the site info on the same output line
            print sample.name.ljust(mlen+5),
            sites.sort(key=lambda x: x['name'])
            for site in sites:
                if DBS.site_is_tape(site):
                    continue
                is_complete = DBS.complete_at_site(site)
                print (colors.green if is_complete else colors.yellow)(DBS.site_completions_string(site)),
    elif 'samplefiles' in argv:
        # Dump SampleFiles-format entries for the selected samples. Shell usage:
        # rm a; touch a; for ds in '' miniaod; do for x in qcd ttbar leptonic; do ( samples samplefiles ${x}_samples_2017 $ds >> a ) ; done; done
        # rm a; touch a; for ds in '' miniaod; do for year in 2017 2018; do for x in data auxiliary_data ; do ( samples samplefiles ${x}_samples_${year} $ds >> a ) ; done; done; done
        samples = samples_registry.from_argv(raise_if_none=True)
        dataset = 'main'
        for arg in argv[1:]:
            if arg == 'miniaod' or arg.startswith('ntuple'):
                dataset = arg
                break
        print 'getting files for dataset %s:' % dataset, ', '.join(s.name for s in samples)
        import SampleFiles as sf
        for s in samples:
            d = {}
            if not s.has_dataset(dataset):
                print colors.yellow('no dataset %s for %s' % (dataset, s.name))
                continue
            s.set_curr_dataset(dataset)
            if sf.has(s.name, dataset):
                raise KeyError('SampleFiles already has an entry for %s' % s.name)
            else:
                fns = s.filenames
                print 'DBS has %i files for %s' % (len(fns), s.name)
                d[(s.name, dataset)] = (len(fns), fns)
                print "('%s:%s', '%s')," % (s.name, dataset, sf._enc(d))
def main(samples_registry):
    """Command-line driver for sample utilities (runem-based version).

    Dispatches on keywords found in argv: merge, printmissing, name, ds,
    file, nevents, files_for_events, site, samplefiles, sfhas. Most
    subcommands iterate all (dataset, sample) combinations via runem.
    """
    from glob import glob
    from sys import argv
    from pprint import pprint
    from JMTucker.Tools import colors
    from JMTucker.Tools.general import chunks, typed_from_argv
    samples = samples_registry.from_argv()
    datasets = samples_registry.datasets_from_argv()
    def prnt(*x):
        # print all arguments space-separated on one line
        print ' '.join(str(y) for y in x)
    def runem(cb):
        # call cb(dataset, sample) for each existing combination,
        # warning about samples missing a dataset
        for dataset in datasets:
            for sample in samples:
                if not sample.has_dataset(dataset):
                    print colors.yellow('no dataset %s for %s' % (dataset, sample.name))
                    continue
                sample.set_curr_dataset(dataset)
                cb(dataset, sample)
    if 'merge' in argv:
        # hadd-style merge of the selected samples' root files
        samples = samples_registry.from_argv(from_root_fns=True, raise_if_none=True)
        # output filename: first .root argument that is not an existing file
        out_fn = [x for x in argv if x.endswith('.root') and not os.path.isfile(x)]
        out_fn = out_fn[0] if out_fn else 'merge.root'
        norm_to = typed_from_argv(float, default_value=1.)
        norm_path = typed_from_argv(str, default_value='', name='norm_path')
        merge(samples, output=out_fn, norm_to=norm_to, norm_path=norm_path)
    elif 'printmissing' in argv:
        # report samples lacking a batch directory and/or a root file
        samples = [s.name for s in samples_registry.from_argv(raise_if_none=True)]
        samples.sort()
        look_for_root_files = 'no_root' not in sys.argv
        no_batch_dir, no_root_file = [], []
        for s in samples:
            if not os.path.isdir('condor_' + s) and not glob('crab_*_' + s):
                no_batch_dir.append(s)
            if not os.path.isfile('%s.root' % s):
                no_root_file.append(s)
        if no_batch_dir:
            print colors.yellow('no batch dir for these:')
            for s in no_batch_dir:
                print s
        if look_for_root_files and no_root_file:
            print colors.yellow('no root file for these:')
            for s in no_root_file:
                print s
    elif 'name' in argv:
        runem(lambda dataset, sample: prnt(sample.name, dataset))
    elif 'ds' in argv:
        runem(lambda dataset, sample: prnt(sample.name, dataset, sample.dataset))
    elif 'file' in argv:
        # first few (default 5) filenames per sample
        runem(lambda dataset, sample: [prnt(sample.name, dataset, x) for x in sample.filenames[:typed_from_argv(int, 5)]])
    elif 'nevents' in argv:
        runem(lambda dataset, sample: prnt(sample.name, dataset, DBS.numevents_in_dataset(sample.dataset)))
    elif 'files_for_events' in argv:
        # argv carries flat (run, lumi, event) triplets
        rles = typed_from_argv(int, return_multiple=True)
        if len(rles) % 3 != 0:
            raise ValueError('expect list of ints in argv with length divisible by 3 [run1 lumi1 event1 ...]')
        rles = list(chunks(rles,3))
        runem(lambda dataset, sample: prnt(sample.name, dataset, ' '.join(DBS.files_for_events(rles, sample.dataset))))
    elif 'site' in argv:
        # per-sample listing of hosting sites with completion information
        mlen = max(len(s.name) for s in samples)
        def cb(dataset, sample):
            ljname = sample.name.ljust(mlen+3)
            try:
                sites = DBS.sites_for_dataset(sample.dataset, json=True)
            except RuntimeError:
                print colors.boldred(ljname + ' DBS problem')
            else:
                # trailing commas keep everything on one output line
                print ljname,
                sites.sort(key=lambda x: x['name'])
                for site in sites:
                    if DBS.site_is_tape(site):
                        continue
                    is_complete = DBS.complete_at_site(site)
                    print (colors.green if is_complete else colors.yellow)(DBS.site_completions_string(site)), ' ',
                print
        runem(cb)
    elif 'samplefiles' in argv:
        # dump SampleFiles-format entries for the selected samples
        import SampleFiles as sf
        def cb(dataset, sample):
            if sf.has(sample.name, dataset):
                raise KeyError('SampleFiles already has an entry for %s' % sample.name)
            fns = sample.filenames
            print 'DBS has %i files for %s' % (len(fns), sample.name)
            d = {(sample.name, dataset): (len(fns), fns)}
            print "('%s:%s', '%s')," % (sample.name, dataset, sf._enc(d))
        runem(cb)
    elif 'sfhas' in argv:
        # list samples (not) already present in SampleFiles; 'neg' inverts
        neg = 'neg' in argv
        import SampleFiles as sf
        for dataset in datasets:
            for sample in samples:
                if sf.has(sample.name, dataset) != neg:
                    print sample.name
def crab_hadd(working_dir, new_name=None, new_dir=None, raise_on_empty=False, chunk_size=900, pattern=None, lpc_shortcut=False, range_filter=None): working_dir, new_name, new_dir = crab_hadd_args(working_dir, new_name, new_dir) expected, files = crab_hadd_files(working_dir, lpc_shortcut, range_filter=range_filter) print '%s: expecting %i files if all jobs succeeded' % (working_dir, expected) if pattern: if '/' not in pattern: pattern = '*/' + pattern files = fnmatch.filter(files, pattern) automatic_splitting = False pprinted = False jobs = [] for f in files: jobnum = f.split('_')[-1].split('.root')[0] if crab_global_options.support_automatic_splitting and '-' in jobnum: automatic_splitting = True if not pprinted: pprint(files) pprinted = True it, jobnum = jobnum.split('-') it, jobnum = int(it), int(jobnum) assert it >= 1 # probe jobs "0-*" should not show up jobnum = it*10000 + jobnum else: jobnum = int(jobnum) jobs.append(jobnum) jobs.sort() expected = range(1, expected+1) if jobs != expected: print '\033[36;7m %i files found %s not what expected \033[m' % (len(jobs), crabify_list(jobs)) missing = sorted(set(expected) - set(jobs)) print '\033[36;7m %i missing: %r \033[m' % (len(missing), ' '.join(str(j) for j in missing)) l = len(files) if l == 0: msg = 'crab_hadd: no files found in %s' % working_dir if raise_on_empty: raise CRABToolsException(msg) else: print '\033[36;7m', msg, '\033[m' elif l == 1: print working_dir, ': just one file found, copying' cmd = 'xrdcp -s %s %s' % (files[0], new_name) os.system(cmd) os.chmod(new_name, 0644) else: hadd(new_name, files) if automatic_splitting: n = norm_from_file(new_name) sn, s = fn_to_sample(Samples, new_name) if not s: print colors.yellow("\tnorm_from_file returns %r, couldn't get sample %s" % (n, sn)) else: no1, no2 = s.datasets['main'].nevents_orig, s.datasets['miniaod'].nevents_orig if n == no1 or n == no2: print '\tnorm_from_file returns nevents_orig = %i' % n else: print colors.yellow('\tnorm_from_file returns %r 
while %s.nevents_orig is %i (main) %i (miniaod' % (n, sn, no1, no2)) return new_name