def get_prepId_from_das(dataset, das_dir):
    """Return the McM prep id for *dataset*, looked up in the DAS store.

    Queries the 'dataset' DAS record first (key ``prep_id``) and falls
    back to the 'mcm' DAS record (key ``prepid``).  Returns ``None`` when
    neither record carries the information.
    """
    # get prepid from das/dataset
    prepid = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir),
                                'prep_id')
    if prepid is None:  # PEP 8: identity test, not '== None'
        # try to get it from das/mcm instead
        prepid = get_from_deep_json(get_das_store_json(dataset, 'mcm', das_dir),
                                    'prepid')
        # TODO also try different queries from the json. prep_id?
    return prepid
def get_conffile_ids(dataset, das_dir):
    """Return location of the configuration files for the dataset."""
    # A dict is used as an insertion-ordered set: ids found via the
    # 'byoutputdataset' query come first, then any new ones from
    # 'byinputdataset'.
    ids = {}
    for query_key in ('byoutputdataset', 'byinputdataset'):
        found = get_from_deep_json(
            get_das_store_json(dataset, 'config', das_dir), query_key)
        if found:
            ids.update(dict.fromkeys(found, 1))
    return list(ids.keys())
def get_size(dataset, das_dir):
    """Return size of the dataset."""
    dataset_record = get_das_store_json(dataset, 'dataset', das_dir)
    size = get_from_deep_json(dataset_record, 'size')
    # fall back to 0 when DAS has no (or a falsy) size entry
    return size if size else 0
def get_number_files(dataset, das_dir):
    """Return number of files for the dataset."""
    dataset_record = get_das_store_json(dataset, 'dataset', das_dir)
    nfiles = get_from_deep_json(dataset_record, 'nfiles')
    # fall back to 0 when DAS has no (or a falsy) nfiles entry
    return nfiles if nfiles else 0
def get_number_events(dataset, das_dir):
    """Return number of events for the dataset."""
    dataset_record = get_das_store_json(dataset, 'dataset', das_dir)
    nevents = get_from_deep_json(dataset_record, 'nevents')
    # fall back to 0 when DAS has no (or a falsy) nevents entry
    return nevents if nevents else 0
def get_generator_name(dataset, das_dir, mcm_dir):
    """Return the list of generator names used for *dataset*.

    Combines the generators recorded in the dataset's own McM dictionary
    with those of its input (parent) dataset, strips JSON artefacts
    (quotes, backslashes, brackets) from each entry and de-duplicates
    while preserving order.  Returns an empty list when nothing is found.
    """
    generator_names = []
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    generators = get_from_deep_json(mcm_dict, 'generators') or []

    # also collect the generators of the input (e.g. GEN-SIM) dataset
    input_generators = []
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        input_mcm_dict = get_mcm_dict(input_dataset, mcm_dir)
        input_generators = get_from_deep_json(input_mcm_dict, 'generators') or []

    # BUGFIX: the original only merged the two lists when BOTH were
    # non-empty, dropping input_generators whenever the dataset itself
    # had none recorded; now both sources always contribute.
    for item in generators + input_generators:
        for char in ['"', '\\', '[', ']']:  # remove ", \, [, ]
            item = item.replace(char, '')
        if item not in generator_names:
            generator_names.append(item)
    return generator_names
def get_conffile_ids_from_das(dataset, das_dir, mcm_dir):
    """Return location of the configuration files for the dataset from DAS."""
    # dict used as an insertion-ordered set of config ids
    config_ids = {}
    das_config = get_das_store_json(dataset, 'config', das_dir)
    byoutput = get_from_deep_json(das_config, 'byoutputdataset')
    if not byoutput:
        print("Error: No config id found from DAS config for " + dataset,
              file=sys.stderr)
    else:
        for conf_id in byoutput:
            config_ids[conf_id] = 1
    return list(config_ids.keys())
def get_genfragment_url(dataset, mcm_dir, das_dir):
    """Return list of URLs of the gen fragments used to produce *dataset*.

    For AODSIM datasets the GEN-SIM input dataset is resolved first, since
    the generator fragment is fetched (via curl) in that step's cmsDriver
    script.  Returns ``None`` when no cmsDriver script is available,
    otherwise a (possibly empty) list of URLs found on 'curl' lines.
    """
    url = []
    # resolve the dataset whose cmsDriver script contains the curl command
    if get_dataset_format(dataset) == 'AODSIM':
        dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
        input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    else:
        input_dataset = dataset

    script_path = get_cmsDriver_script(input_dataset, mcm_dir)
    if script_path is None:  # PEP 8: identity test, not '== None'
        return None

    # raw string so '\s' reaches the regex engine verbatim (a plain
    # string with '\s' raises an invalid-escape warning on modern Python);
    # compiled once instead of per line
    curl_re = re.compile(r'(?P<url>https?://[^\s]+)')
    with open(script_path, 'r') as script:
        for line in script:
            if 'curl' in line:
                match = curl_re.search(line)
                if match:
                    url.append(match.group('url'))
    return url
def mcm_downloader(prepid, dataset, mcm_dir, das_dir):
    """Query dictionary and setup script from the McM database via curl.

    Fetches the McM 'get' dictionary and 'get_setup' script for *dataset*
    and writes them under ``mcm_dir/dict`` and ``mcm_dir/scripts``
    (dataset name with '/' replaced by '@'); then repeats the process for
    the dataset's ``input_dataset`` (hopefully the GEN-SIM step) as found
    in das_store/mcm.  Errors are reported on stderr, nothing is returned.
    """
    # this function is so ugly... but finally works! You're welcome to refactor it though
    cmd = "curl -s -k https://cms-pdmv.cern.ch/mcm/public/restapi/requests/{query}/{prepId}"
    # As prep_id in DAS for some datasets can be found with underscores and MCM
    # takes without underscores, we need to process prep_id removing all of them
    if "_" in prepid:
        print("Found some underscores in prep_id: " + prepid + ", removing...")
        prepid = prepid.replace("_", "")
    # fetch the McM dictionary and the setup script for the dataset itself
    mcm_dict = subprocess.run(cmd.format(query="get", prepId=prepid),
                              shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    mcm_script = subprocess.run(cmd.format(query="get_setup", prepId=prepid),
                                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    mcm_dict_out = str(mcm_dict.stdout.decode("utf-8"))
    mcm_script_out = str(mcm_script.stdout.decode("utf-8"))
    # check if results are not empty (McM answers '{"results": {}}' for unknown ids)
    if mcm_dict_out == '{"results": {}}\n':
        print("[ERROR] Empty McM dict (get) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_dict_out)
    # a valid setup script is shell text; a leading '{' means a JSON error reply
    if mcm_script_out == '' or mcm_script_out[0] == '{':
        print("[ERROR] Empty McM script (get_setup) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/scripts/" + dataset.replace('/', '@') + ".sh"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_script_out)
    # same thing for "input_dataset": hopefully the GEN-SIM step
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')  # /bla/ble/GEN-SIM
    if input_dataset:
        # [1:] strips the leading '/' from the dataset path for the URL
        mcm_dict = subprocess.run(cmd.format(query="produces", prepId=input_dataset[1:]),
                                  shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        mcm_out = str(mcm_dict.stdout.decode("utf-8"))
        # check if results are not empty
        if mcm_out == '{"results": {}}' or mcm_out == '{"results": {}}\n':
            print("[ERROR] Empty McM dict (get) for {ds}".format(ds=input_dataset),
                  file=sys.stderr)
        else:
            outfile = mcm_dir + "/dict/" + input_dataset.replace('/', '@') + ".json"
            with open(outfile, 'w') as dict_file:
                dict_file.write(mcm_out)
        # look up the input dataset's own prep id to fetch its setup script
        prepid = get_prepid_from_mcm(input_dataset, mcm_dir)
        if prepid != None:
            mcm_script = subprocess.run(cmd.format(query="get_setup", prepId=prepid),
                                        shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # NOTE(review): unlike the branch above, there is no '' guard here,
            # so an empty curl response would raise IndexError on [0] — confirm
            # get_setup always returns at least one byte
            if mcm_script.stdout.decode("utf-8")[0] == '{':
                print("[ERROR] Empty McM script (get_setup) for {ds}".format(ds=input_dataset),
                      file=sys.stderr)
            else:
                outfile = mcm_dir + "/scripts/" + input_dataset.replace('/', '@') + ".sh"
                with open(outfile, 'w') as dict_file:
                    dict_file.write(mcm_script.stdout.decode("utf-8"))
        else:
            print("[ERROR] No prep_id in McM Store for record {ds}".format(ds=input_dataset),
                  file=sys.stderr)
    else:
        print("[ERROR] No input_dataset in das_store/mcm for record {ds}".format(ds=dataset),
              file=sys.stderr)
def print_ancestor_information(dataset, das_dir, mcm_dir, recid_file, doi_info):
    """Print (as markdown list items) all the information we have so far
    about *dataset*: record ID, DOI, prep id, global tag, CMSSW version,
    energy, generators, input dataset, gen parameters, cmsDriver scripts,
    config files, pile-up and notes.  Also bumps the module-level
    provenance-statistics counters as a side effect.
    """
    # everything should be a sublist item (4 spaces of indentation):
    # - dataset_name
    #     - info
    # TODO add to this function:
    # - config files present
    # - step GEN
    # - step RECO
    # - step HLT
    # - gen_parameters:
    # - cross section from XSECDB.
    #   see github issue opendata.cern.ch#1137
    #   ideally we should make a local cache of that.
    # - LHE stuff?
    # - Data popularity from github.com/katilp/cms-data-popularity
    #   ideally we should make a local cache of that.
    # it would be very nice if this printer script needed not external (non cached) information
    # record ID as in OpenData portal
    # TODO move this code to other place, no need to open a file everytime
    # NOTE(review): exec() of an arbitrary file path; recid_file is presumably a
    # trusted local mapping script — confirm it is never user-supplied
    RECID_INFO = {}
    _locals = locals()
    exec(open(recid_file, 'r').read(), globals(), _locals)
    RECID_INFO = _locals['RECID_INFO']
    try:
        recid = RECID_INFO[dataset]
        print(" - Record ID: [{recid}]({url})".format(
            recid=recid, url='http://opendata.cern.ch/record/' + str(recid)))
    except:  # NOTE(review): bare except — presumably only KeyError is expected here
        pass
    # DOI
    doi = get_doi(dataset, doi_info)
    if doi:
        print(" - DOI: [{doi}]({url})".format(doi=doi, url='https://doi.org/' + str(doi)))
    # PrepId: DAS store first, McM store as fallback
    prepid = get_prepId_from_das(dataset, das_dir)
    if not prepid:
        prepid = get_prepid_from_mcm(dataset, mcm_dir)
    if prepid:
        print(" - PrepId: [{prepid}]({url})".format(
            prepid=prepid, url='https://cms-pdmv.cern.ch/mcm/requests?prepid=' + str(prepid)))
    # global tag & cmssw version
    global_tag = get_global_tag(dataset, mcm_dir)
    cmssw_ver = get_cmssw_version(dataset, mcm_dir)
    if global_tag:
        print(" - Global Tag:", global_tag)
    if cmssw_ver:
        print(" - CMSSW version:", cmssw_ver)
    # Energy
    print(" - Collision Energy: ", get_dataset_energy(dataset, mcm_dir), "TeV")
    # Generators
    generators = get_generator_name(dataset, das_dir, mcm_dir)
    if generators:
        print(" - Generators: ", generators)
    # GEN-SIM dataset used to produce the AODSIM
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        print(" - Input Dataset:", input_dataset)
        input_global_tag = get_global_tag(input_dataset, mcm_dir)
        input_cmssw_ver = get_cmssw_version(input_dataset, mcm_dir)
        if input_global_tag:
            print(" - Global Tag:", input_global_tag)
        if input_cmssw_ver:
            print(" - CMSSW version:", input_cmssw_ver)
    # gen fragment URLs found in the cmsDriver script (may be None)
    gen_fragment = get_genfragment_url(dataset, mcm_dir, das_dir)
    if gen_fragment:
        for url in gen_fragment:
            print(" - Gen Fragment: [{url}]({url})".format(url=url))
    # gen parameters of input dataset
    generator_parameters = get_generator_parameters(dataset, das_dir)
    if generator_parameters:
        print(' - Generator parameters:')
        print(' - Cross section:', generator_parameters.get('cross_section', None))
        print(' - Filter efficiency:', generator_parameters.get('filter_efficiency', None))
        print(' - Filter efficiency error:', generator_parameters.get('filter_efficiency_error', None))
        print(' - Match efficiency:', generator_parameters.get('match_efficiency', None))
        print(' - Match efficiency error:', generator_parameters.get('match_efficiency_error', None))
    # mcm scripts with cmsDriver instructions
    cmsDriver1 = get_cmsDriver_script(input_dataset, mcm_dir)
    cmsDriver2 = get_cmsDriver_script(dataset, mcm_dir)
    # module-level statistics counters, updated as a side effect
    global DATASETS_WITH_BOTH_CMSDRIVER
    global DATASETS_WITH_CMSDRIVER1
    global DATASETS_WITH_CMSDRIVER2
    if cmsDriver1 or cmsDriver2:
        print(" - cmsDriver scripts:")
    if cmsDriver1:
        print(' - GEN-SIM:', cmsDriver1)
        DATASETS_WITH_CMSDRIVER1 += 1
    if cmsDriver2:
        print(' - RECO-HLT:', cmsDriver2)
        DATASETS_WITH_CMSDRIVER2 += 1
    if cmsDriver1 and cmsDriver2:
        DATASETS_WITH_BOTH_CMSDRIVER += 1
    # python config files: the dataset's own plus those of every ancestor
    conffile_ids = get_conffile_ids(dataset, das_dir)
    parent = get_parent_dataset(dataset, das_dir)
    while parent != '' and parent:
        conffile_ids += get_conffile_ids(parent, das_dir)
        parent = get_parent_dataset(parent, das_dir)
    global DATASETS_WITH_3CONFFILES
    if conffile_ids:
        print(" - python config scripts: ", conffile_ids)
        if len(conffile_ids) > 2:
            DATASETS_WITH_3CONFFILES += 1
    # "full provenance" = both cmsDriver scripts or at least 3 config files
    global DATASETS_WITH_FULL_PROVENANCE
    if (cmsDriver1 and cmsDriver2) or len(conffile_ids) > 2:
        DATASETS_WITH_FULL_PROVENANCE += 1
    # pile up information
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if mcm_dict:
        pileup = get_from_deep_json(mcm_dict, 'pileup')
        pileup_dataset = get_from_deep_json(mcm_dict, 'pileup_dataset_name')
        if pileup or pileup_dataset:
            print(' - pile-up:')
            if pileup:
                print(' -', pileup)
            if pileup_dataset:
                print(' -', pileup_dataset)
        notes = get_from_deep_json(mcm_dict, 'notes')
        if notes != None:
            print(' - notes:', notes.replace('\n', '\n '))  # some notes have several lines, this makes the markdown use them in the same item list