def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Assemble the metadata record for one simulated dataset.

    The record is a plain dict filled section by section, mostly from helper
    look-ups defined elsewhere in this module. Several values (publication
    year, run period, system details, usage links) are hard-coded; see the
    FIXME markers.

    NOTE(review): this file defines ``create_record`` a second time further
    down; in Python the later definition replaces this one when the module
    is loaded -- confirm which version is meant to be live.

    Args:
        dataset_full_name: full dataset name; used as the record title and as
            the key for every helper look-up and for ``recid_info``.
        doi_info: forwarded to ``get_doi``.
        recid_info: container indexed by ``dataset_full_name`` that yields
            the record id.
        eos_dir: forwarded to ``get_dataset_index_files``.
        das_dir: forwarded to helpers such as ``get_number_events``,
            ``get_number_files``, ``get_size`` and ``get_parent_dataset``.
        mcm_dir: forwarded to helpers such as ``get_global_tag``,
            ``get_dataset_energy`` and ``get_pileup_from_mcm``.
        conffiles_dir: forwarded to ``get_all_generator_text``.

    Returns:
        dict: the assembled record.

    Raises:
        Whatever ``recid_info[dataset_full_name]`` raises for an unknown
        dataset (``KeyError`` when ``recid_info`` is a plain dict).
    """
    rec = {}

    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2021'  # FIXME get from somewhere, do not hardcode it!
    run_period = ['Run2015C', 'Run2015D']  # FIXME Hardcoded!!

    # NOTE(review): both values are looked up but never used below
    # (system_details is hard-coded); the calls are kept unchanged.
    global_tag = get_global_tag(dataset_full_name, mcm_dir)
    release = get_cmssw_version(dataset_full_name, das_dir, mcm_dir)

    additional_title = ''.join([
        'Simulated dataset ', dataset,
        ' in ', dataset_format,
        ' format for ', year_created, ' collision data',
    ])

    rec['abstract'] = {
        'description': ''.join([
            '<p>', additional_title, '.</p>',
            '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>',
            '<p>These simulated datasets correspond to the collision data collected by the CMS experiment in ',
            year_created,
            '.</p>',
        ]),
    }

    rec['accelerator'] = "CERN-LHC"

    rec['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }

    rec['collections'] = ['CMS-Simulated-Datasets']

    rec['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the parent-parent, and so on...
    generator_parameters = get_generator_parameters_from_mcm(dataset_full_name, mcm_dir)
    if generator_parameters:
        # (record key, generator-parameter key). The record keys are emitted
        # exactly as before, including the trailing colons and the space in
        # the last one.
        # NOTE(review): those keys look accidental -- confirm against the
        # record schema before relying on them.
        cross_section_fields = (
            ('value', 'cross_section'),
            ('filter_efficiency:', 'filter_efficiency'),
            ('filter_efficiency_error:', 'filter_efficiency_error'),
            ('match_efficiency:', 'match_efficiency'),
            ('match_efficiency error:', 'match_efficiency_error'),
        )
        rec['cross_section'] = {
            record_key: generator_parameters.get(source_key, None)
            for record_key, source_key in cross_section_fields
        }

    rec['date_created'] = [year_created]
    rec['date_published'] = year_published
    rec['date_reprocessed'] = year_created

    rec['distribution'] = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }
    # if not dataset_full_name in doi_info: FIXME
    #     rec['distribution']['availability'] = 'ondemand'

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        rec['doi'] = doi

    rec['experiment'] = 'CMS'

    rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
    if rec_files:
        # TODO if no files: "Dataset available under request"
        file_entries = []
        # All '.json' index files are listed first, then all '.txt' ones;
        # numbering restarts for each type.
        for index_type in ('.json', '.txt'):
            matching = [entry for entry in rec_files if entry[0].endswith(index_type)]
            total = str(len(matching))
            for position, (file_uri, file_size, file_checksum) in enumerate(matching, start=1):
                file_entries.append({
                    'checksum': 'adler32:' + file_checksum,
                    'description': dataset + dataset_format + ' dataset file index (' + str(position) + ' of ' + total + ') for access to data via CMS virtual machine',
                    'size': file_size,
                    'type': 'index' + index_type,
                    'uri': file_uri,
                })
        rec['files'] = file_entries

    rec['license'] = {'attribution': 'CC0'}

    rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir, mcm_dir, conffiles_dir)

    # Walk from the dataset up through its parents until one of them reports
    # a pile-up dataset name or the chain of parents ends.
    pileup_dataset_name = ''
    parent = dataset_full_name
    while parent and not pileup_dataset_name:
        pileup_dataset_name = get_pileup_from_mcm(parent, mcm_dir)
        parent = (get_parent_dataset(parent, das_dir)
                  or get_parent_dataset_from_mcm(parent, das_dir, mcm_dir))

    known_pileup_recids = {
        '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM': 36,  # 2011
        '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM': 37,  # 2012
        '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM': 22314,  # 2015
        # '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-magnetOffBS0T_MCRUN2_71_V1-v1/GEN-SIM': {recid},  # 2015 TODO
        '/MinBias_TuneCP5_13TeV-pythia8/RunIIFall18GS-IdealGeometry_102X_upgrade2018_design_v9-v1/GEN-SIM': 12302,  # 2018
    }
    pileup_dataset_recid = known_pileup_recids.get(pileup_dataset_name, 0)

    if pileup_dataset_name:
        if pileup_dataset_recid:
            # The pile-up dataset has its own record: link to it.
            rec['pileup'] = {
                'description': '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>',
                'links': [{
                    'recid': str(pileup_dataset_recid),
                    'title': pileup_dataset_name,
                }],
            }
        else:
            # No record id in the table: mention the dataset name inline.
            rec['pileup'] = {
                'description': ''.join([
                    '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> from the dataset <code>',
                    pileup_dataset_name,
                    '</code> are added to the simulated event in this step.</p>',
                ]),
            }

    rec['publisher'] = 'CERN Open Data Portal'

    rec['recid'] = str(recid_info[dataset_full_name])

    # rec['relations'] = []
    # rec['relations']['title'] = ''  # FIXME
    # rec['relations']['type'] = 'isChildOf'

    rec['run_period'] = run_period

    # recommended global tag and cmssw release for analysis
    # NOTE(review): looked up but not used; the values written below are
    # hard-coded (see FIXME).
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    rec['system_details'] = {
        'global_tag': "76X_mcRun2_asymptotic_RunIIFall15DR76_v1",  # FIXME
        'release': "CMSSW_7_6_7",  # FIXME
    }

    rec['title'] = dataset_full_name

    rec['title_additional'] = additional_title

    # The topic is "category" or "category/subcategory"; a subcategory is
    # only taken when there are exactly two parts.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    category = topic_parts[0]
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    categories = {'primary': category}
    if subcategory:
        categories['secondary'] = [subcategory]
    categories['source'] = 'CMS Collaboration'
    rec['categories'] = categories

    rec['type'] = {
        'primary': 'Dataset',
        'secondary': ['Simulated'],
    }

    # NOTE(review): computed but unused in this version (usage links are
    # hard-coded below).
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011,
    }.get(year_created, 2011)

    rec['usage'] = {
        'description': "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in",  # FIXME
        'links': [  # FIXME
            {
                "description": "Running CMS analysis code using Docker",
                "url": "/docs/cms-guide-docker",
            },
            {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-2015",
            },
            {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-2015",
            },
        ],
    }

    rec['validation'] = {
        'description': "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    # rec['validation']['links'] = 'FIXME'

    return rec
def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Assemble the metadata record for one simulated dataset.

    The record is a plain dict filled section by section from helper
    look-ups defined elsewhere in this module; year-dependent parts (run
    period, pile-up links, usage links) are derived from the dataset's
    creation year.

    NOTE(review): an earlier definition of ``create_record`` exists above in
    this file; being the later one, this definition is the one that stays
    bound to the name -- confirm that is intended.

    Args:
        dataset_full_name: full dataset name; used as the record title and as
            the key for every helper look-up and for ``recid_info``.
        doi_info: forwarded to ``get_doi``; a dataset that is not contained
            in it is marked with availability ``'ondemand'``.
        recid_info: container indexed by ``dataset_full_name`` that yields
            the record id.
        eos_dir: forwarded to ``get_dataset_index_files``.
        das_dir: forwarded to helpers such as ``get_number_events``,
            ``get_number_files``, ``get_size`` and
            ``get_generator_parameters``.
        mcm_dir: forwarded to helpers such as ``get_global_tag``,
            ``get_cmssw_version`` and ``get_dataset_energy``.
        conffiles_dir: forwarded to ``get_all_generator_text``.

    Returns:
        dict: the assembled record.

    Raises:
        Whatever ``recid_info[dataset_full_name]`` raises for an unknown
        dataset (``KeyError`` when ``recid_info`` is a plain dict).
    """
    rec = {}

    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2018'  # FIXME get from somewhere, do not hardcode it!
    # FIXME remove the 'A'!!
    run_period = ['Run{}{}'.format(year_created, suffix) for suffix in ('A', 'B')]

    # NOTE(review): both values are looked up but never used below; the
    # calls are kept unchanged.
    global_tag = get_global_tag(dataset_full_name, mcm_dir)
    release = get_cmssw_version(dataset_full_name, mcm_dir)

    additional_title = ''.join([
        'Simulated dataset ', dataset,
        ' in ', dataset_format,
        ' format for ', year_created, ' collision data',
    ])

    rec['abstract'] = {
        'description': ''.join([
            '<p>', additional_title, '.</p>',
            '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>',
        ]),
    }

    rec['accelerator'] = "CERN-LHC"

    rec['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }

    rec['collections'] = ['CMS-Simulated-Datasets']

    rec['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the parent-parent, and so on...
    generator_parameters = get_generator_parameters(dataset_full_name, das_dir)
    if generator_parameters:
        # (record key, generator-parameter key). The record keys are emitted
        # exactly as before, including the trailing colons and the space in
        # the last one.
        # NOTE(review): those keys look accidental -- confirm against the
        # record schema before relying on them.
        cross_section_fields = (
            ('value', 'cross_section'),
            ('filter_efficiency:', 'filter_efficiency'),
            ('filter_efficiency_error:', 'filter_efficiency_error'),
            ('match_efficiency:', 'match_efficiency'),
            ('match_efficiency error:', 'match_efficiency_error'),
        )
        rec['cross_section'] = {
            record_key: generator_parameters.get(source_key, None)
            for record_key, source_key in cross_section_fields
        }

    rec['date_created'] = [year_created]
    rec['date_published'] = year_published
    rec['date_reprocessed'] = year_created

    distribution = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }
    if dataset_full_name not in doi_info:
        distribution['availability'] = 'ondemand'
    rec['distribution'] = distribution

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        rec['doi'] = doi

    rec['experiment'] = 'CMS'

    rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
    if rec_files:
        # TODO if no files: "Dataset available under request"
        file_entries = []
        # All '.json' index files are listed first, then all '.txt' ones;
        # numbering restarts for each type.
        for index_type in ('.json', '.txt'):
            matching = [entry for entry in rec_files if entry[0].endswith(index_type)]
            total = str(len(matching))
            for position, (file_uri, file_size, file_checksum) in enumerate(matching, start=1):
                file_entries.append({
                    'checksum': 'adler32:' + file_checksum,
                    'description': dataset + dataset_format + ' dataset file index (' + str(position) + ' of ' + total + ') for access to data via CMS virtual machine',
                    'size': file_size,
                    'type': 'index' + index_type,
                    'uri': file_uri,
                })
        rec['files'] = file_entries

    rec['license'] = {'attribution': 'CC0'}

    rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir, mcm_dir, conffiles_dir)

    rec['note'] = {
        'description': 'These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.',
    }

    # Pile-up dataset (title, record id) per creation year. Years without an
    # entry, like 2010, get no 'pileup' section at all.
    pileup_by_year = {
        '2011': ('/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM', 36),
        '2012': ('/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM', 37),
    }
    pileup_dataset_title, pileup_dataset_recid = pileup_by_year.get(year_created, (None, None))
    if pileup_dataset_recid:
        rec['pileup'] = {
            'description': '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>',
            'links': [
                {
                    'recid': str(pileup_dataset_recid),
                    'title': pileup_dataset_title,
                },
            ],
        }

    rec['publisher'] = 'CERN Open Data Portal'

    rec['recid'] = str(recid_info[dataset_full_name])

    # rec['relations'] = []
    # rec['relations']['title'] = ''  # FIXME
    # rec['relations']['type'] = 'isChildOf'

    rec['run_period'] = run_period

    # recommended global tag and cmssw release for analysis
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    rec['system_details'] = {
        'global_tag': recommended_gt,
        'release': recommended_cmssw,
    }

    rec['title'] = dataset_full_name

    rec['title_additional'] = additional_title

    # The topic is "category" or "category/subcategory"; a subcategory is
    # only taken when there are exactly two parts.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    category = topic_parts[0]
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    categories = {'primary': category}
    if subcategory:
        categories['secondary'] = [subcategory]
    categories['source'] = 'CMS Collaboration'
    rec['categories'] = categories

    rec['type'] = {
        'primary': 'Dataset',
        'secondary': ['Simulated'],
    }

    # Year used in the "getting started" link; 2012 and unknown years map
    # to 2011.
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011,
    }.get(year_created, 2011)

    rec['usage'] = {
        'description': 'You can access these data through the CMS Virtual Machine. See the instructions for setting up the Virtual Machine and getting started in',
        'links': [
            {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-{}".format(year_created),
            },
            {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-{}".format(year_getting_started),
            },
        ],
    }

    rec['validation'] = {
        'description': "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    # rec['validation']['links'] = 'FIXME'

    return rec
def print_ancestor_information(dataset, das_dir, mcm_dir, recid_file, doi_info):
    """Print all the information we have so far about ``dataset``.

    Output goes to stdout as markdown list items. As a side effect the
    module-level counters ``DATASETS_WITH_CMSDRIVER1``,
    ``DATASETS_WITH_CMSDRIVER2``, ``DATASETS_WITH_BOTH_CMSDRIVER``,
    ``DATASETS_WITH_3CONFFILES`` and ``DATASETS_WITH_FULL_PROVENANCE`` are
    incremented when the corresponding provenance information is found.

    Args:
        dataset: dataset name; key for every look-up below.
        das_dir: forwarded to helpers such as ``get_prepId_from_das``,
            ``get_conffile_ids`` and ``get_parent_dataset``.
        mcm_dir: forwarded to helpers such as ``get_global_tag``,
            ``get_cmssw_version`` and ``get_mcm_dict``.
        recid_file: path of a Python file that is executed to obtain
            ``RECID_INFO`` (dataset name -> record id).
        doi_info: forwarded to ``get_doi``.

    Returns:
        None.
    """
    global DATASETS_WITH_BOTH_CMSDRIVER
    global DATASETS_WITH_CMSDRIVER1
    global DATASETS_WITH_CMSDRIVER2
    global DATASETS_WITH_3CONFFILES
    global DATASETS_WITH_FULL_PROVENANCE

    # everything should be a sublist item (4 spaces of indentation):
    # - dataset_name
    #     - info
    # NOTE(review): the literals below start with a single space, which does
    # not match the comment above -- confirm no whitespace was lost.

    # TODO add to this function:
    #   - config files present
    #   - step GEN
    #   - step RECO
    #   - step HLT
    #   - gen_parameters:
    #       - cross section from XSECDB.
    #         see github issue opendata.cern.ch#1137
    #         ideally we should make a local cache of that.
    #   - LHE stuff?
    #   - Data popularity from github.com/katilp/cms-data-popularity
    #     ideally we should make a local cache of that.
    # it would be very nice if this printer script needed not external (non cached) information

    # record ID as in OpenData portal
    # TODO move this code to other place, no need to open a file everytime
    # NOTE(security): the file content is executed as Python code; only
    # point ``recid_file`` at trusted files.
    namespace = {'RECID_INFO': {}}  # default when the file does not set it
    with open(recid_file, 'r') as recid_handle:  # closed even if exec fails
        exec(recid_handle.read(), globals(), namespace)
    recid_info = namespace['RECID_INFO']

    # Only a missing dataset is expected here; any other error should
    # surface instead of being silently swallowed.
    try:
        recid = recid_info[dataset]
    except KeyError:
        pass  # no record id known for this dataset: nothing to print
    else:
        print(" - Record ID: [{recid}]({url})".format(
            recid=recid,
            url='http://opendata.cern.ch/record/' + str(recid)))

    # DOI
    doi = get_doi(dataset, doi_info)
    if doi:
        print(" - DOI: [{doi}]({url})".format(
            doi=doi, url='https://doi.org/' + str(doi)))

    # PrepId: try the first source, fall back to the second one
    prepid = get_prepId_from_das(dataset, das_dir)
    if not prepid:
        prepid = get_prepid_from_mcm(dataset, mcm_dir)
    if prepid:
        print(" - PrepId: [{prepid}]({url})".format(
            prepid=prepid,
            url='https://cms-pdmv.cern.ch/mcm/requests?prepid=' + str(prepid)))

    # global tag & cmssw version
    global_tag = get_global_tag(dataset, mcm_dir)
    cmssw_ver = get_cmssw_version(dataset, mcm_dir)
    if global_tag:
        print(" - Global Tag:", global_tag)
    if cmssw_ver:
        print(" - CMSSW version:", cmssw_ver)

    # Energy
    print(" - Collision Energy: ", get_dataset_energy(dataset, mcm_dir), "TeV")

    # Generators
    generators = get_generator_name(dataset, das_dir, mcm_dir)
    if generators:
        print(" - Generators: ", generators)

    # GEN-SIM dataset used to produce the AODSIM
    # NOTE(review): the source layout was flattened; the statements nested
    # under ``if input_dataset:`` were reconstructed from data flow --
    # confirm against the intended layout.
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        print(" - Input Dataset:", input_dataset)

        input_global_tag = get_global_tag(input_dataset, mcm_dir)
        input_cmssw_ver = get_cmssw_version(input_dataset, mcm_dir)
        if input_global_tag:
            print(" - Global Tag:", input_global_tag)
        if input_cmssw_ver:
            print(" - CMSSW version:", input_cmssw_ver)

        gen_fragment = get_genfragment_url(dataset, mcm_dir, das_dir)
        if gen_fragment:
            for url in gen_fragment:
                print(" - Gen Fragment: [{url}]({url})".format(url=url))

    # gen parameters of input dataset
    generator_parameters = get_generator_parameters(dataset, das_dir)
    if generator_parameters:
        print(' - Generator parameters:')
        print(' - Cross section:',
              generator_parameters.get('cross_section', None))
        print(' - Filter efficiency:',
              generator_parameters.get('filter_efficiency', None))
        print(' - Filter efficiency error:',
              generator_parameters.get('filter_efficiency_error', None))
        print(' - Match efficiency:',
              generator_parameters.get('match_efficiency', None))
        print(' - Match efficiency error:',
              generator_parameters.get('match_efficiency_error', None))

    # mcm scripts with cmsDriver instructions
    # (the first look-up runs even when no input dataset was found)
    cmsDriver1 = get_cmsDriver_script(input_dataset, mcm_dir)
    cmsDriver2 = get_cmsDriver_script(dataset, mcm_dir)
    if cmsDriver1 or cmsDriver2:
        print(" - cmsDriver scripts:")
        if cmsDriver1:
            print(' - GEN-SIM:', cmsDriver1)
            DATASETS_WITH_CMSDRIVER1 += 1
        if cmsDriver2:
            print(' - RECO-HLT:', cmsDriver2)
            DATASETS_WITH_CMSDRIVER2 += 1
        if cmsDriver1 and cmsDriver2:
            DATASETS_WITH_BOTH_CMSDRIVER += 1

    # python config files: collect ids for the dataset and all its ancestors
    conffile_ids = get_conffile_ids(dataset, das_dir)
    parent = get_parent_dataset(dataset, das_dir)
    while parent:
        conffile_ids += get_conffile_ids(parent, das_dir)
        parent = get_parent_dataset(parent, das_dir)
    if conffile_ids:
        print(" - python config scripts: ", conffile_ids)
        if len(conffile_ids) > 2:
            DATASETS_WITH_3CONFFILES += 1

    # Full provenance: both cmsDriver scripts, or more than two config files.
    if (cmsDriver1 and cmsDriver2) or len(conffile_ids) > 2:
        DATASETS_WITH_FULL_PROVENANCE += 1

    # pile up information
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if mcm_dict:
        pileup = get_from_deep_json(mcm_dict, 'pileup')
        pileup_dataset = get_from_deep_json(mcm_dict, 'pileup_dataset_name')
        if pileup or pileup_dataset:
            print(' - pile-up:')
            if pileup:
                print(' -', pileup)
            if pileup_dataset:
                print(' -', pileup_dataset)

        notes = get_from_deep_json(mcm_dict, 'notes')
        if notes is not None:
            # some notes have several lines, this makes the markdown use
            # them in the same item list
            print(' - notes:', notes.replace('\n', '\n '))