def get_dataset_location(dataset):
    """Return EOS location of the dataset.

    The result is ``XROOTD_DIR_BASE`` directly followed by ``EXPERIMENT``,
    then these slash-separated components: ``MCDIR_BASE`` + dataset year,
    run period, dataset name, dataset format and dataset version.
    """
    path_parts = [
        # The base and the experiment are glued together without a slash.
        XROOTD_DIR_BASE + EXPERIMENT,
        MCDIR_BASE + str(get_dataset_year(dataset)),
        get_dataset_runperiod(dataset),
        get_dataset_name(dataset),
        get_dataset_format(dataset),
        get_dataset_version(dataset),
    ]
    return '/'.join(path_parts)
def get_dataset_index_file_base(dataset):
    """Return index file base for given dataset.

    The base name is the upper-cased ``EXPERIMENT`` followed by these
    underscore-separated components: ``MCDIR_BASE`` + dataset year, run
    period, dataset name, dataset format and dataset version.
    """
    name_parts = (
        EXPERIMENT.upper(),
        MCDIR_BASE + str(get_dataset_year(dataset)),
        get_dataset_runperiod(dataset),
        get_dataset_name(dataset),
        get_dataset_format(dataset),
        get_dataset_version(dataset),
    )
    return '_'.join(name_parts)
def get_dataset_energy(dataset, mcm_dir):
    """Return energy of that dataset in TeV, as a string such as '13TeV'.

    When ``get_mcm_dict(dataset, mcm_dir)`` yields a dictionary, the value
    found by ``get_from_deep_json(mcm_dict, 'energy')`` is used:

    * a string is returned unchanged;
    * any other non-``None`` value is rendered with ``str()``, a trailing
      ``.0`` is dropped and ``'TeV'`` is appended
      (``13.0`` -> ``'13TeV'``, ``5.02`` -> ``'5.02TeV'``).

    Otherwise the energy is looked up from the dataset year.  Years that are
    not in that table give the integer ``0``; this non-string default is
    kept unchanged so existing callers see the same value.
    """
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if mcm_dict:
        energy = get_from_deep_json(mcm_dict, 'energy')
        if isinstance(energy, str):
            return energy
        # NOTE(review): assumes get_from_deep_json returns None when the key
        # is absent -- confirm.  Such a value used to be rendered as
        # 'NoneTeV'; it now falls through to the year-based table below.
        if energy is not None:
            energy_text = str(energy)
            # Drop only a *trailing* '.0'.  A blanket replace('.0', '')
            # also removes the '.0' inside values such as 5.02, which
            # turned them into '52TeV'.
            if energy_text.endswith('.0'):
                energy_text = energy_text[:-2]
            return energy_text + 'TeV'

    year = get_dataset_year(dataset)
    return {
        2010: '7TeV',
        2011: '7TeV',
        2012: '8TeV',
        2015: '13TeV',
        2016: '13TeV',
    }.get(year, 0)
def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Create record for the given dataset.

    Assembles and returns a dict describing the simulated dataset
    ``dataset_full_name``.

    Arguments, as used by this function:

    * ``dataset_full_name`` -- passed to every lookup helper; also the
      record title.
    * ``doi_info`` -- tested for membership of ``dataset_full_name`` and
      handed to ``get_doi``.
    * ``recid_info`` -- indexed with ``dataset_full_name`` to obtain the
      record id (a missing key raises ``KeyError``).
    * ``eos_dir``, ``das_dir``, ``mcm_dir``, ``conffiles_dir`` -- forwarded
      unchanged to the lookup helpers.
    """
    # NOTE(review): another ``create_record`` with the same signature appears
    # later in this source; if both live in one module, the later definition
    # is the one the name ends up bound to -- confirm which is intended.
    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2018'  # FIXME get from somewhere, do not hardcode it!
    # FIXME remove the 'A'!!
    run_period = ['Run' + year_created + suffix for suffix in ('A', 'B')]
    # These two values are looked up but not stored in the record below.
    global_tag = get_global_tag(dataset_full_name, mcm_dir)
    release = get_cmssw_version(dataset_full_name, mcm_dir)
    additional_title = (
        'Simulated dataset ' + dataset
        + ' in ' + dataset_format
        + ' format for ' + year_created + ' collision data'
    )

    record = {}
    record['abstract'] = {
        'description': (
            '<p>' + additional_title + '.</p>'
            + '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>'
        ),
    }
    record['accelerator'] = "CERN-LHC"
    record['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }
    record['collections'] = ['CMS-Simulated-Datasets']
    record['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the
    # parent-parent, and so on...
    generator_parameters = get_generator_parameters(dataset_full_name, das_dir)
    if generator_parameters:
        # The trailing colons (and the space in the last key) are part of
        # the emitted keys and are reproduced as-is.
        record['cross_section'] = {
            'value': generator_parameters.get('cross_section', None),
            'filter_efficiency:': generator_parameters.get('filter_efficiency', None),
            'filter_efficiency_error:': generator_parameters.get('filter_efficiency_error', None),
            'match_efficiency:': generator_parameters.get('match_efficiency', None),
            'match_efficiency error:': generator_parameters.get('match_efficiency_error', None),
        }

    record['date_created'] = [year_created]
    record['date_published'] = year_published
    record['date_reprocessed'] = year_created

    record['distribution'] = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }
    if dataset_full_name not in doi_info:
        record['distribution']['availability'] = 'ondemand'

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        record['doi'] = doi

    record['experiment'] = 'CMS'

    index_entries = get_dataset_index_files(dataset_full_name, eos_dir)
    if index_entries:
        record['files'] = []  # TODO if no files: "Dataset available under request"
        # All '.json' index files are listed first, then all '.txt' ones;
        # numbering restarts for each type.
        for index_type in ('.json', '.txt'):
            matching = [entry for entry in index_entries if entry[0].endswith(index_type)]
            total = str(len(matching))
            for position, (file_uri, file_size, file_checksum) in enumerate(matching, 1):
                record['files'].append({
                    'checksum': 'adler32:' + file_checksum,
                    'description': dataset + dataset_format + ' dataset file index (' + str(position) + ' of ' + total + ') for access to data via CMS virtual machine',
                    'size': file_size,
                    'type': 'index' + index_type,
                    'uri': file_uri,
                })

    record['license'] = {'attribution': 'CC0'}
    record['methodology'] = get_all_generator_text(dataset_full_name, das_dir, mcm_dir, conffiles_dir)
    record['note'] = {
        'description': 'These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.',
    }

    # Pile-up dataset (record id, title) per creation year; years without an
    # entry get no 'pileup' section.
    pileup_dataset_recid, pileup_dataset_title = {
        '2011': (36, '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM'),
        '2012': (37, '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM'),
    }.get(year_created, (None, None))
    if pileup_dataset_recid:
        record['pileup'] = {
            'description': '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>',
            'links': [
                {
                    'recid': str(pileup_dataset_recid),
                    'title': pileup_dataset_title,
                },
            ],
        }

    record['publisher'] = 'CERN Open Data Portal'
    record['recid'] = str(recid_info[dataset_full_name])

    # FIXME 'relations' (title / type 'isChildOf') is not filled in yet.

    record['run_period'] = run_period

    # Global tag and CMSSW release recommended for analysis.
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    record['system_details'] = {
        'global_tag': recommended_gt,
        'release': recommended_cmssw,
    }

    record['title'] = dataset_full_name
    record['title_additional'] = additional_title

    # The guessed topic is 'category' or 'category/subcategory'.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    record['categories'] = {'primary': topic_parts[0]}
    if subcategory:
        record['categories']['secondary'] = [subcategory]
    record['categories']['source'] = 'CMS Collaboration'

    record['type'] = {
        'primary': 'Dataset',
        'secondary': ['Simulated'],
    }

    # Which "getting started" guide to link; 2012 shares the 2011 guide and
    # any other year defaults to it as well.
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011,
    }.get(year_created, 2011)
    record['usage'] = {
        'description': 'You can access these data through the CMS Virtual Machine. See the instructions for setting up the Virtual Machine and getting started in',
        'links': [
            {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-{}".format(year_created),
            },
            {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-{}".format(year_getting_started),
            },
        ],
    }

    record['validation'] = {
        'description': "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    # FIXME validation links are not filled in yet.

    return record
def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Create record for the given dataset.

    Assembles and returns a dict describing the simulated dataset
    ``dataset_full_name``.  Several values (publication year, run periods,
    system details, usage links) are hardcoded here and marked FIXME.

    Arguments, as used by this function:

    * ``dataset_full_name`` -- passed to every lookup helper; also the
      record title.
    * ``doi_info`` -- handed to ``get_doi``.
    * ``recid_info`` -- indexed with ``dataset_full_name`` to obtain the
      record id (a missing key raises ``KeyError``).
    * ``eos_dir``, ``das_dir``, ``mcm_dir``, ``conffiles_dir`` -- forwarded
      unchanged to the lookup helpers.
    """
    # NOTE(review): an earlier ``create_record`` with the same signature
    # appears in this source; if both live in one module, this later
    # definition is the one the name ends up bound to -- confirm.
    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2021'  # FIXME get from somewhere, do not hardcode it!
    run_period = ['Run2015C', 'Run2015D']  # FIXME Hardcoded!!
    # These two values are looked up but not stored in the record below.
    global_tag = get_global_tag(dataset_full_name, mcm_dir)
    release = get_cmssw_version(dataset_full_name, das_dir, mcm_dir)
    additional_title = (
        'Simulated dataset ' + dataset
        + ' in ' + dataset_format
        + ' format for ' + year_created + ' collision data'
    )

    record = {}
    record['abstract'] = {
        'description': (
            '<p>' + additional_title + '.</p>'
            + '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>'
            + '<p>These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.</p>'
        ),
    }
    record['accelerator'] = "CERN-LHC"
    record['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }
    record['collections'] = ['CMS-Simulated-Datasets']
    record['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the
    # parent-parent, and so on...
    generator_parameters = get_generator_parameters_from_mcm(dataset_full_name, mcm_dir)
    if generator_parameters:
        # The trailing colons (and the space in the last key) are part of
        # the emitted keys and are reproduced as-is.
        record['cross_section'] = {
            'value': generator_parameters.get('cross_section', None),
            'filter_efficiency:': generator_parameters.get('filter_efficiency', None),
            'filter_efficiency_error:': generator_parameters.get('filter_efficiency_error', None),
            'match_efficiency:': generator_parameters.get('match_efficiency', None),
            'match_efficiency error:': generator_parameters.get('match_efficiency_error', None),
        }

    record['date_created'] = [year_created]
    record['date_published'] = year_published
    record['date_reprocessed'] = year_created

    record['distribution'] = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }
    # FIXME 'availability' = 'ondemand' for datasets missing from doi_info
    # is currently disabled.

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        record['doi'] = doi

    record['experiment'] = 'CMS'

    index_entries = get_dataset_index_files(dataset_full_name, eos_dir)
    if index_entries:
        record['files'] = []  # TODO if no files: "Dataset available under request"
        # All '.json' index files are listed first, then all '.txt' ones;
        # numbering restarts for each type.
        for index_type in ('.json', '.txt'):
            matching = [entry for entry in index_entries if entry[0].endswith(index_type)]
            total = str(len(matching))
            for position, (file_uri, file_size, file_checksum) in enumerate(matching, 1):
                record['files'].append({
                    'checksum': 'adler32:' + file_checksum,
                    'description': dataset + dataset_format + ' dataset file index (' + str(position) + ' of ' + total + ') for access to data via CMS virtual machine',
                    'size': file_size,
                    'type': 'index' + index_type,
                    'uri': file_uri,
                })

    record['license'] = {'attribution': 'CC0'}
    record['methodology'] = get_all_generator_text(dataset_full_name, das_dir, mcm_dir, conffiles_dir)

    # Walk up the parent chain, starting at the dataset itself, until a
    # pile-up dataset name is found or there is no further parent.
    pileup_dataset_name = ''
    parent = dataset_full_name
    while parent and not pileup_dataset_name:
        pileup_dataset_name = get_pileup_from_mcm(parent, mcm_dir)
        parent = (get_parent_dataset(parent, das_dir)
                  or get_parent_dataset_from_mcm(parent, das_dir, mcm_dir))

    # Record ids of the pile-up datasets that can be linked to.
    known_pileup_recids = {
        '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM': 36,  # 2011
        '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM': 37,  # 2012
        '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM': 22314,  # 2015
        # TODO 2015: no recid yet for
        # '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-magnetOffBS0T_MCRUN2_71_V1-v1/GEN-SIM'
        '/MinBias_TuneCP5_13TeV-pythia8/RunIIFall18GS-IdealGeometry_102X_upgrade2018_design_v9-v1/GEN-SIM': 12302,  # 2018
    }
    pileup_dataset_recid = known_pileup_recids.get(pileup_dataset_name, 0)

    if pileup_dataset_name:
        if pileup_dataset_recid:
            # Known pile-up dataset: link to its record.
            record['pileup'] = {
                'description': '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>',
                'links': [
                    {
                        "recid": str(pileup_dataset_recid),
                        "title": pileup_dataset_name,
                    },
                ],
            }
        else:
            # No record to link to: mention the dataset name in the text.
            record['pileup'] = {
                'description': (
                    '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> from the dataset <code>'
                    + pileup_dataset_name
                    + '</code> are added to the simulated event in this step.</p>'
                ),
            }

    record['publisher'] = 'CERN Open Data Portal'
    record['recid'] = str(recid_info[dataset_full_name])

    # FIXME 'relations' (title / type 'isChildOf') is not filled in yet.

    record['run_period'] = run_period

    # Global tag and CMSSW release recommended for analysis.  They are
    # looked up, but the record currently carries the hardcoded values below.
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    record['system_details'] = {
        'global_tag': "76X_mcRun2_asymptotic_RunIIFall15DR76_v1",  # FIXME
        'release': "CMSSW_7_6_7",  # FIXME
    }

    record['title'] = dataset_full_name
    record['title_additional'] = additional_title

    # The guessed topic is 'category' or 'category/subcategory'.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    record['categories'] = {'primary': topic_parts[0]}
    if subcategory:
        record['categories']['secondary'] = [subcategory]
    record['categories']['source'] = 'CMS Collaboration'

    record['type'] = {
        'primary': 'Dataset',
        'secondary': ['Simulated'],
    }

    # Computed but not used: the usage links below are hardcoded to 2015.
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011,
    }.get(year_created, 2011)
    record['usage'] = {
        # FIXME
        'description': "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in",
        # FIXME
        'links': [
            {
                "description": "Running CMS analysis code using Docker",
                "url": "/docs/cms-guide-docker",
            },
            {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-2015",
            },
            {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-2015",
            },
        ],
    }

    record['validation'] = {
        'description': "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    # FIXME validation links are not filled in yet.

    return record