예제 #1
0
def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir,
                  mcm_dir, conffiles_dir):
    """Create record for the given dataset.

    Assemble the metadata dictionary of one simulated dataset from the
    lookup helpers of this module plus a number of values that are still
    hardcoded (see the FIXME markers below).

    Args:
        dataset_full_name: Full dataset name; used as the record title and
            passed to every lookup helper.
        doi_info: DOI lookup data, passed through to ``get_doi``.
        recid_info: Container indexed by ``dataset_full_name`` that yields
            the record ID.
        eos_dir: Passed to ``get_dataset_index_files``.
        das_dir: Passed to the DAS-based lookup helpers.
        mcm_dir: Passed to the McM-based lookup helpers.
        conffiles_dir: Passed to ``get_all_generator_text``.

    Returns:
        dict: The record. The keys ``cross_section``, ``doi``, ``files`` and
        ``pileup`` are present only when the corresponding lookup returned
        something truthy.

    Raises:
        KeyError: If ``recid_info`` is a dict that has no entry for
            ``dataset_full_name``.
    """
    rec = {}

    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2021'  # FIXME get from somewhere, do not hardcode it!
    run_period = ['Run2015C', 'Run2015D']  # FIXME Hardcoded!!
    # The results of these two lookups are not used yet (system_details is
    # hardcoded further down). The calls are kept in place in case the
    # helpers have side effects -- TODO confirm and drop them if they do not.
    get_global_tag(dataset_full_name, mcm_dir)
    get_cmssw_version(dataset_full_name, das_dir, mcm_dir)

    additional_title = ('Simulated dataset ' + dataset + ' in ' +
                        dataset_format + ' format for ' + year_created +
                        ' collision data')

    rec['abstract'] = {
        'description':
        '<p>' + additional_title + '.</p>' +
        '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>' +
        '<p>These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.</p>',
    }

    rec['accelerator'] = "CERN-LHC"

    rec['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }

    rec['collections'] = [
        'CMS-Simulated-Datasets',
    ]

    rec['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the
    # parent-parent, and so on...
    generator_parameters = get_generator_parameters_from_mcm(
        dataset_full_name, mcm_dir)
    if generator_parameters:
        # NOTE(review): the trailing ':' in four of these keys and the space
        # in 'match_efficiency error:' look like typos. They are kept as they
        # are because renaming them changes the shape of the returned record;
        # confirm against the record schema before fixing.
        rec['cross_section'] = {
            'value':
            generator_parameters.get('cross_section', None),
            'filter_efficiency:':
            generator_parameters.get('filter_efficiency', None),
            'filter_efficiency_error:':
            generator_parameters.get('filter_efficiency_error', None),
            'match_efficiency:':
            generator_parameters.get('match_efficiency', None),
            'match_efficiency error:':
            generator_parameters.get('match_efficiency_error', None),
        }

    rec['date_created'] = [year_created]
    rec['date_published'] = year_published
    rec['date_reprocessed'] = year_created

    rec['distribution'] = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }

    # FIXME: datasets missing from doi_info used to be flagged with
    # rec['distribution']['availability'] = 'ondemand'; currently disabled.

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        rec['doi'] = doi

    rec['experiment'] = 'CMS'

    rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
    if rec_files:
        rec['files'] = []  #TODO if no files: "Dataset available under request"
        # All '.json' index files are listed first, then all '.txt' ones;
        # the "(i of n)" numbering restarts for each index type.
        for index_type in ['.json', '.txt']:
            index_files = [f for f in rec_files if f[0].endswith(index_type)]
            total = str(len(index_files))
            for position, (file_uri, file_size,
                           file_checksum) in enumerate(index_files, 1):
                rec['files'].append({
                    'checksum':
                    'adler32:' + file_checksum,
                    'description':
                    dataset + dataset_format + ' dataset file index (' +
                    str(position) + ' of ' + total +
                    ') for access to data via CMS virtual machine',
                    'size':
                    file_size,
                    'type':
                    'index' + index_type,
                    'uri':
                    file_uri
                })

    rec['license'] = {'attribution': 'CC0'}

    rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir,
                                                mcm_dir, conffiles_dir)

    # Walk up the ancestor chain (DAS first, McM as fallback) until a
    # dataset that declares a pile-up dataset is found or the chain ends.
    pileup_dataset_name = ''
    parent = dataset_full_name
    while parent and not pileup_dataset_name:
        pileup_dataset_name = get_pileup_from_mcm(parent, mcm_dir)
        parent = get_parent_dataset(parent,
                                    das_dir) or get_parent_dataset_from_mcm(
                                        parent, das_dir, mcm_dir)

    # Record IDs of the known pile-up datasets; 0 means "no known record".
    pileup_dataset_recid = {
        '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM':
        36,  # 2011
        '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM':
        37,  # 2012
        '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM':
        22314,  # 2015
        # TODO 2015: '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-magnetOffBS0T_MCRUN2_71_V1-v1/GEN-SIM' has no recid yet
        '/MinBias_TuneCP5_13TeV-pythia8/RunIIFall18GS-IdealGeometry_102X_upgrade2018_design_v9-v1/GEN-SIM':
        12302  # 2018
    }.get(pileup_dataset_name, 0)

    if pileup_dataset_name:
        rec['pileup'] = {}
        if pileup_dataset_recid:
            # Known record: link to it instead of naming the dataset inline.
            rec['pileup']['description'] = (
                '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>'
            )
            rec['pileup']['links'] = [{
                "recid": str(pileup_dataset_recid),
                "title": pileup_dataset_name
            }]
        else:
            rec['pileup']['description'] = (
                '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> from the dataset <code>'
                + pileup_dataset_name +
                '</code> are added to the simulated event in this step.</p>')

    rec['publisher'] = 'CERN Open Data Portal'

    rec['recid'] = str(recid_info[dataset_full_name])

    # TODO: rec['relations'] (title, type 'isChildOf') is not populated yet.

    rec['run_period'] = run_period

    # recommended global tag and cmssw release recommended for analysis
    # The looked-up values are not used yet: the record carries the hardcoded
    # values below (FIXME). The calls are kept in case the helpers have side
    # effects -- TODO confirm.
    get_recommended_global_tag_for_analysis(dataset_full_name)
    get_recommended_cmssw_for_analysis(dataset_full_name)
    rec['system_details'] = {
        'global_tag': "76X_mcRun2_asymptotic_RunIIFall15DR76_v1",  # FIXME
        'release': "CMSSW_7_6_7",  # FIXME
    }

    rec['title'] = dataset_full_name

    rec['title_additional'] = additional_title

    # The topic is either 'category' or 'category/subcategory'.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    category = topic_parts[0]
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    rec['categories'] = {}
    rec['categories']['primary'] = category
    if subcategory:
        rec['categories']['secondary'] = [subcategory]
    rec['categories']['source'] = 'CMS Collaboration'

    rec['type'] = {
        'primary': 'Dataset',
        'secondary': [
            'Simulated',
        ],
    }

    rec['usage'] = {
        'description':
        "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in",  # FIXME
        'links': [  # FIXME
            {
                "description": "Running CMS analysis code using Docker",
                "url": "/docs/cms-guide-docker"
            }, {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-2015"
            }, {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-2015"
            }
        ],
    }

    rec['validation'] = {
        'description':
        "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    # FIXME: rec['validation']['links'] is not populated yet.

    return rec
예제 #2
0
def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir,
                  mcm_dir, conffiles_dir):
    """Create record for the given dataset.

    Assemble the metadata dictionary of one simulated dataset from the
    lookup helpers of this module plus a few values that are still hardcoded
    (see the FIXME markers below).

    Args:
        dataset_full_name: Full dataset name; used as the record title and
            passed to every lookup helper.
        doi_info: DOI lookup data. Tested for membership of
            ``dataset_full_name`` and passed through to ``get_doi``.
        recid_info: Container indexed by ``dataset_full_name`` that yields
            the record ID.
        eos_dir: Passed to ``get_dataset_index_files``.
        das_dir: Passed to the DAS-based lookup helpers.
        mcm_dir: Passed to the McM-based lookup helpers.
        conffiles_dir: Passed to ``get_all_generator_text``.

    Returns:
        dict: The record. The keys ``cross_section``, ``doi``, ``files`` and
        ``pileup`` are present only when the corresponding lookup returned
        something truthy.

    Raises:
        KeyError: If ``recid_info`` is a dict that has no entry for
            ``dataset_full_name``.
    """
    rec = {}

    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2018'  # FIXME get from somewhere, do not hardcode it!
    run_period = ['Run' + year_created + 'A',
                  'Run' + year_created + 'B']  # FIXME remove the 'A'!!
    # The results of these two lookups are not used in the record. The calls
    # are kept in place in case the helpers have side effects -- TODO confirm
    # and drop them if they do not.
    get_global_tag(dataset_full_name, mcm_dir)
    get_cmssw_version(dataset_full_name, mcm_dir)

    additional_title = ('Simulated dataset ' + dataset + ' in ' +
                        dataset_format + ' format for ' + year_created +
                        ' collision data')

    rec['abstract'] = {
        'description':
        '<p>' + additional_title + '.</p>' +
        '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>',
    }

    rec['accelerator'] = "CERN-LHC"

    rec['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }

    rec['collections'] = [
        'CMS-Simulated-Datasets',
    ]

    rec['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the
    # parent-parent, and so on...
    generator_parameters = get_generator_parameters(dataset_full_name, das_dir)
    if generator_parameters:
        # NOTE(review): the trailing ':' in four of these keys and the space
        # in 'match_efficiency error:' look like typos. They are kept as they
        # are because renaming them changes the shape of the returned record;
        # confirm against the record schema before fixing.
        rec['cross_section'] = {
            'value':
            generator_parameters.get('cross_section', None),
            'filter_efficiency:':
            generator_parameters.get('filter_efficiency', None),
            'filter_efficiency_error:':
            generator_parameters.get('filter_efficiency_error', None),
            'match_efficiency:':
            generator_parameters.get('match_efficiency', None),
            'match_efficiency error:':
            generator_parameters.get('match_efficiency_error', None),
        }

    rec['date_created'] = [year_created]
    rec['date_published'] = year_published
    rec['date_reprocessed'] = year_created

    rec['distribution'] = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }

    # Datasets that are not listed in doi_info are flagged as on-demand.
    if dataset_full_name not in doi_info:
        rec['distribution']['availability'] = 'ondemand'

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        rec['doi'] = doi

    rec['experiment'] = 'CMS'

    rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
    if rec_files:
        rec['files'] = []  #TODO if no files: "Dataset available under request"
        # All '.json' index files are listed first, then all '.txt' ones;
        # the "(i of n)" numbering restarts for each index type.
        for index_type in ['.json', '.txt']:
            index_files = [f for f in rec_files if f[0].endswith(index_type)]
            total = str(len(index_files))
            for position, (file_uri, file_size,
                           file_checksum) in enumerate(index_files, 1):
                rec['files'].append({
                    'checksum':
                    'adler32:' + file_checksum,
                    'description':
                    dataset + dataset_format + ' dataset file index (' +
                    str(position) + ' of ' + total +
                    ') for access to data via CMS virtual machine',
                    'size':
                    file_size,
                    'type':
                    'index' + index_type,
                    'uri':
                    file_uri
                })

    rec['license'] = {'attribution': 'CC0'}

    rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir,
                                                mcm_dir, conffiles_dir)

    rec['note'] = {
        'description':
        'These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.',
    }

    # Pile-up dataset per creation year. A falsy recid (None for 2010, 0 for
    # any year not listed) means that no pile-up section is written.
    pileup_dataset_title = {
        '2010': None,
        '2011':
        '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM',
        '2012':
        '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM'
    }.get(year_created, 0)
    pileup_dataset_recid = {
        '2010': None,
        '2011': 36,
        '2012': 37
    }.get(year_created, 0)
    if pileup_dataset_recid:
        rec['pileup'] = {
            'description':
            '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>',
            'links': [
                {
                    'recid': str(pileup_dataset_recid),
                    'title': pileup_dataset_title
                },
            ],
        }

    rec['publisher'] = 'CERN Open Data Portal'

    rec['recid'] = str(recid_info[dataset_full_name])

    # TODO: rec['relations'] (title, type 'isChildOf') is not populated yet.

    rec['run_period'] = run_period

    # recommended global tag and cmssw release recommended for analysis
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    rec['system_details'] = {
        'global_tag': recommended_gt,
        'release': recommended_cmssw,
    }

    rec['title'] = dataset_full_name

    rec['title_additional'] = additional_title

    # The topic is either 'category' or 'category/subcategory'.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    category = topic_parts[0]
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    rec['categories'] = {}
    rec['categories']['primary'] = category
    if subcategory:
        rec['categories']['secondary'] = [subcategory]
    rec['categories']['source'] = 'CMS Collaboration'

    rec['type'] = {
        'primary': 'Dataset',
        'secondary': [
            'Simulated',
        ],
    }

    # Year of the "getting started" guide to link to: 2012 maps to the 2011
    # guide, and 2011 is the fallback for any year not listed.
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011
    }.get(year_created, 2011)
    rec['usage'] = {
        'description':
        'You can access these data through the CMS Virtual Machine. See the instructions for setting up the Virtual Machine and getting started in',
        'links': [{
            "description":
            "How to install the CMS Virtual Machine",
            "url":
            "/docs/cms-virtual-machine-{}".format(year_created)
        }, {
            "description":
            "Getting started with CMS open data",
            "url":
            "/docs/cms-getting-started-{}".format(year_getting_started)
        }],
    }

    rec['validation'] = {
        'description':
        "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    # FIXME: rec['validation']['links'] is not populated yet.

    return rec
예제 #3
0
def print_ancestor_information(dataset, das_dir, mcm_dir, recid_file,
                               doi_info):
    """Print all the information we have so far about a dataset.

    Everything is printed as a markdown sub-list item (4 spaces of
    indentation per level):

        - dataset_name
            - info

    Besides printing, the function increments the module-level counters
    DATASETS_WITH_CMSDRIVER1, DATASETS_WITH_CMSDRIVER2,
    DATASETS_WITH_BOTH_CMSDRIVER, DATASETS_WITH_3CONFFILES and
    DATASETS_WITH_FULL_PROVENANCE.

    Args:
        dataset: Dataset name to describe.
        das_dir: Passed to the DAS-based lookup helpers.
        mcm_dir: Passed to the McM-based lookup helpers.
        recid_file: Path of a Python source file that is executed and may
            define ``RECID_INFO`` (indexed by dataset name, yields the
            record ID). An empty dict is used when the file does not define
            it.
        doi_info: DOI lookup data, passed through to ``get_doi``.
    """
    global DATASETS_WITH_BOTH_CMSDRIVER
    global DATASETS_WITH_CMSDRIVER1
    global DATASETS_WITH_CMSDRIVER2
    global DATASETS_WITH_3CONFFILES
    global DATASETS_WITH_FULL_PROVENANCE

    # TODO add to this function:
    # - config files present
    #   - step GEN
    #   - step RECO
    #   - step HLT
    # - gen_parameters:
    #   - cross section from XSECDB.
    #     see github issue opendata.cern.ch#1137
    #     ideally we should make a local cache of that.
    # - LHE stuff?
    # - Data popularity from github.com/katilp/cms-data-popularity
    #   ideally we should make a local cache of that.
    # it would be very nice if this printer script needed no external
    # (non cached) information

    # record ID as in OpenData portal
    # TODO move this code to other place, no need to open a file everytime
    RECID_INFO = {}
    _locals = locals()
    # Read the file inside a context manager so the handle is always closed.
    with open(recid_file, 'r') as recid_fh:
        recid_source = recid_fh.read()
    # SECURITY: this executes recid_file as Python code. Only point it at
    # trusted files.
    exec(recid_source, globals(), _locals)
    RECID_INFO = _locals['RECID_INFO']

    # Best effort: datasets without a record ID are silently skipped.
    try:
        recid = RECID_INFO[dataset]
    except (KeyError, TypeError):
        pass
    else:
        print("    - Record ID: [{recid}]({url})".format(
            recid=recid, url='http://opendata.cern.ch/record/' + str(recid)))

    # DOI
    doi = get_doi(dataset, doi_info)
    if doi:
        print("    - DOI: [{doi}]({url})".format(doi=doi,
                                                 url='https://doi.org/' +
                                                 str(doi)))

    # PrepId (DAS first, McM as fallback)
    prepid = get_prepId_from_das(dataset, das_dir)
    if not prepid:
        prepid = get_prepid_from_mcm(dataset, mcm_dir)
    if prepid:
        print("    - PrepId: [{prepid}]({url})".format(
            prepid=prepid,
            url='https://cms-pdmv.cern.ch/mcm/requests?prepid=' + str(prepid)))

    # global tag & cmssw version
    global_tag = get_global_tag(dataset, mcm_dir)
    cmssw_ver = get_cmssw_version(dataset, mcm_dir)
    if global_tag:
        print("    - Global Tag:", global_tag)
    if cmssw_ver:
        print("    - CMSSW version:", cmssw_ver)

    # Energy
    print("    - Collision Energy: ", get_dataset_energy(dataset, mcm_dir),
          "TeV")

    # Generators
    generators = get_generator_name(dataset, das_dir, mcm_dir)
    if generators:
        print("    - Generators: ", generators)

    # GEN-SIM dataset used to produce the AODSIM
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        print("    - Input Dataset:", input_dataset)

        input_global_tag = get_global_tag(input_dataset, mcm_dir)
        input_cmssw_ver = get_cmssw_version(input_dataset, mcm_dir)
        if input_global_tag:
            print("        - Global Tag:", input_global_tag)
        if input_cmssw_ver:
            print("        - CMSSW version:", input_cmssw_ver)

        gen_fragment = get_genfragment_url(dataset, mcm_dir, das_dir)
        if gen_fragment:
            for url in gen_fragment:
                print("        - Gen Fragment: [{url}]({url})".format(url=url))

    # gen parameters of input dataset
    # NOTE(review): this is printed at the nested (8-space) level even when
    # no "Input Dataset" item was printed above -- confirm this is intended.
    generator_parameters = get_generator_parameters(dataset, das_dir)
    if generator_parameters:
        print('        - Generator parameters:')
        print('            - Cross section:',
              generator_parameters.get('cross_section', None))
        print('            - Filter efficiency:',
              generator_parameters.get('filter_efficiency', None))
        print('            - Filter efficiency error:',
              generator_parameters.get('filter_efficiency_error', None))
        print('            - Match efficiency:',
              generator_parameters.get('match_efficiency', None))
        print('            - Match efficiency error:',
              generator_parameters.get('match_efficiency_error', None))

    # mcm scripts with cmsDriver instructions
    # NOTE(review): input_dataset may be falsy here; presumably
    # get_cmsDriver_script copes with that -- confirm.
    cmsDriver1 = get_cmsDriver_script(input_dataset, mcm_dir)
    cmsDriver2 = get_cmsDriver_script(dataset, mcm_dir)

    if cmsDriver1 or cmsDriver2:
        print("    - cmsDriver scripts:")
        if cmsDriver1:
            print('        - GEN-SIM:', cmsDriver1)
            DATASETS_WITH_CMSDRIVER1 += 1
        if cmsDriver2:
            print('        - RECO-HLT:', cmsDriver2)
            DATASETS_WITH_CMSDRIVER2 += 1

        if cmsDriver1 and cmsDriver2:
            DATASETS_WITH_BOTH_CMSDRIVER += 1

    # python config files of the dataset and of all its DAS ancestors
    conffile_ids = get_conffile_ids(dataset, das_dir)
    parent = get_parent_dataset(dataset, das_dir)
    while parent != '' and parent:
        conffile_ids += get_conffile_ids(parent, das_dir)
        parent = get_parent_dataset(parent, das_dir)
    if conffile_ids:
        print("    - python config scripts: ", conffile_ids)
        if len(conffile_ids) > 2:
            DATASETS_WITH_3CONFFILES += 1

    # Full provenance: both cmsDriver scripts, or more than two config files.
    if (cmsDriver1 and cmsDriver2) or len(conffile_ids) > 2:
        DATASETS_WITH_FULL_PROVENANCE += 1

    # pile up information
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if mcm_dict:
        pileup = get_from_deep_json(mcm_dict, 'pileup')
        pileup_dataset = get_from_deep_json(mcm_dict, 'pileup_dataset_name')
        if pileup or pileup_dataset:
            print('    - pile-up:')
            if pileup:
                print('        -', pileup)
            if pileup_dataset:
                print('        -', pileup_dataset)

        notes = get_from_deep_json(mcm_dict, 'notes')
        if notes is not None:
            print(
                '    - notes:', notes.replace('\n', '\n        ')
            )  # some notes have several lines, this makes the markdown use them in the same item list