Python get_dataset_format примеры, utils.get_dataset_format Python примеры использования

Пример #1

0

Показать файл

Файл: eos_store.py Проект: cernopendata/data-curation

def get_dataset_location(dataset):
    """Return EOS location of the dataset.

    The result is ``XROOTD_DIR_BASE`` directly followed by the experiment,
    the MC directory base and the dataset's run period, name, format and
    version, separated by ``/``.
    """
    path_parts = [
        EXPERIMENT,
        MCDIR_BASE,
        get_dataset_runperiod(dataset),
        get_dataset_name(dataset),
        get_dataset_format(dataset),
        get_dataset_version(dataset),
    ]
    # No separator is inserted between the base and the first component,
    # so XROOTD_DIR_BASE supplies its own trailing delimiter, if any.
    return XROOTD_DIR_BASE + '/'.join(path_parts)

Пример #2

0

Показать файл

Файл: eos_store.py Проект: cernopendata/data-curation

def get_dataset_index_file_base(dataset):
    """Return index file base for given dataset.

    The base name is the upper-cased experiment, the MC directory base and
    the dataset's run period, name, format and version, joined with ``_``.
    """
    components = (
        EXPERIMENT.upper(),
        MCDIR_BASE,
        get_dataset_runperiod(dataset),
        get_dataset_name(dataset),
        get_dataset_format(dataset),
        get_dataset_version(dataset),
    )
    return '_'.join(components)

Пример #3

0

Показать файл

Файл: mcm_store.py Проект: mantasavas/data-curation

def get_genfragment_url(dataset, mcm_dir, das_dir):
    """Return the list of URLs of the genfragments used for a dataset.

    For an ``AODSIM`` dataset the input dataset is first looked up in the
    stored 'mcm' JSON (``input_dataset`` field); for any other format the
    dataset itself is used. The cmsDriver script of that dataset is then
    scanned line by line, and for every line containing ``curl`` the first
    http(s) URL on the line is collected.

    Parameters:
        dataset: dataset name.
        mcm_dir: directory passed to ``get_cmsDriver_script``.
        das_dir: directory passed to ``get_das_store_json``.

    Returns:
        A list of URL strings (possibly empty), or ``None`` when
        ``get_cmsDriver_script`` returns ``None``.
    """
    # get GEN-SIM dataset
    if get_dataset_format(dataset) == 'AODSIM':
        dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
        input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    else:
        input_dataset = dataset

    script_path = get_cmsDriver_script(input_dataset, mcm_dir)
    if script_path is None:
        return None

    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence. Compiled once so the pattern is not looked up per line.
    url_pattern = re.compile(r'(?P<url>https?://[^\s]+)')

    urls = []
    with open(script_path, 'r') as script:
        for line in script:
            if 'curl' not in line:
                continue
            match = url_pattern.search(line)
            if match:
                urls.append(match.group('url'))
    return urls

Пример #4

0

Показать файл

Файл: dataset_records.py Проект: mantasavas/data-curation

def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir,
                  mcm_dir, conffiles_dir):
    """Build the record dictionary for the given simulated dataset.

    Parameters:
        dataset_full_name: full dataset name; used as the record title and
            as the lookup key into ``doi_info`` and ``recid_info``.
        doi_info: DOI information; membership-tested with the dataset name
            and passed to ``get_doi``.
        recid_info: indexed by the full dataset name to obtain the record
            id (a plain dict lacking the name raises ``KeyError``).
        eos_dir: passed to ``get_dataset_index_files``.
        das_dir: passed to the generator-parameter, event/file count, size
            and generator-text helpers.
        mcm_dir: passed to the global tag, CMSSW version, energy and
            generator-text helpers.
        conffiles_dir: passed to ``get_all_generator_text``.

    Returns:
        dict: the populated record.
    """
    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2018'  # FIXME get from somewhere, do not hardcode it!
    run_period = ['Run' + year_created + suffix
                  for suffix in ('A', 'B')]  # FIXME remove the 'A'!!
    # The two results below are not used further in this function; the
    # calls themselves are kept.
    global_tag = get_global_tag(dataset_full_name, mcm_dir)
    release = get_cmssw_version(dataset_full_name, mcm_dir)

    additional_title = 'Simulated dataset ' + dataset + ' in ' + dataset_format + ' format for ' + year_created + ' collision data'

    rec = {}

    rec['abstract'] = {
        'description':
        '<p>' + additional_title + '.</p>' +
        '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>',
    }

    rec['accelerator'] = "CERN-LHC"

    rec['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }

    rec['collections'] = ['CMS-Simulated-Datasets']

    rec['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the parent-parent, and so on...
    generator_parameters = get_generator_parameters(dataset_full_name, das_dir)
    if generator_parameters:
        # (record key, generator parameter name) pairs, copied in order.
        # NOTE(review): several record keys end with ':' and one contains a
        # space; they look like typos but are kept as-is since consumers of
        # the record may depend on them -- confirm before changing.
        cross_section_fields = (
            ('value', 'cross_section'),
            ('filter_efficiency:', 'filter_efficiency'),
            ('filter_efficiency_error:', 'filter_efficiency_error'),
            ('match_efficiency:', 'match_efficiency'),
            ('match_efficiency error:', 'match_efficiency_error'),
        )
        rec['cross_section'] = {}
        for record_key, parameter_name in cross_section_fields:
            rec['cross_section'][record_key] = generator_parameters.get(
                parameter_name, None)

    rec['date_created'] = [year_created]
    rec['date_published'] = year_published
    rec['date_reprocessed'] = year_created

    distribution = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }
    if dataset_full_name not in doi_info:
        distribution['availability'] = 'ondemand'
    rec['distribution'] = distribution

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        rec['doi'] = doi

    rec['experiment'] = 'CMS'

    rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
    if rec_files:
        # TODO if no files: "Dataset available under request"
        file_entries = []
        # All '.json' index files are listed first, then all '.txt' ones;
        # numbering restarts for each index type.
        for index_type in ('.json', '.txt'):
            matching = [
                entry for entry in rec_files
                if entry[0].endswith(index_type)
            ]
            total = str(len(matching))
            for position, (file_uri, file_size,
                           file_checksum) in enumerate(matching, 1):
                file_entries.append({
                    'checksum': 'adler32:' + file_checksum,
                    'description': dataset + dataset_format + ' dataset file index (' + str(position) + ' of ' + total + ') for access to data via CMS virtual machine',
                    'size': file_size,
                    'type': 'index' + index_type,
                    'uri': file_uri,
                })
        rec['files'] = file_entries

    rec['license'] = {'attribution': 'CC0'}

    rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir,
                                                mcm_dir, conffiles_dir)

    rec['note'] = {
        'description':
        'These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.',
    }

    # Year -> (record id, title) of the pile-up dataset. Years without an
    # entry fall back to a falsy record id, so no 'pileup' section is added.
    pileup_by_year = {
        '2010': (None, None),
        '2011':
        (36, '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM'),
        '2012':
        (37, '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM'),
    }
    pileup_dataset_recid, pileup_dataset_title = pileup_by_year.get(
        year_created, (0, 0))
    if pileup_dataset_recid:
        rec['pileup'] = {
            'description':
            '<p>To make these simulated data comparable with the collision data, <a href="/docs/cms-guide-pileup-simulation">pile-up events</a> are added to the simulated event in this step.</p>',
            'links': [
                {
                    'recid': str(pileup_dataset_recid),
                    'title': pileup_dataset_title,
                },
            ],
        }

    rec['publisher'] = 'CERN Open Data Portal'

    rec['recid'] = str(recid_info[dataset_full_name])

    # rec['relations'] = []
    # rec['relations']['title'] = ''  # FIXME
    # rec['relations']['type'] = 'isChildOf'

    rec['run_period'] = run_period

    # recommended global tag and cmssw release for analysis
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    rec['system_details'] = {
        'global_tag': recommended_gt,
        'release': recommended_cmssw,
    }

    rec['title'] = dataset_full_name

    rec['title_additional'] = additional_title

    # The topic is "category" or "category/subcategory"; any other number
    # of '/'-separated parts yields no subcategory.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    category = topic_parts[0]
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    rec['categories'] = {'primary': category}
    if subcategory:
        rec['categories']['secondary'] = [subcategory]
    rec['categories']['source'] = 'CMS Collaboration'

    rec['type'] = {
        'primary': 'Dataset',
        'secondary': ['Simulated'],
    }

    # The 2012 data (and any unlisted year) point to the 2011 guide.
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011
    }.get(year_created, 2011)
    rec['usage'] = {
        'description':
        'You can access these data through the CMS Virtual Machine. See the instructions for setting up the Virtual Machine and getting started in',
        'links': [
            {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-{}".format(year_created),
            },
            {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-{}".format(
                    year_getting_started),
            },
        ],
    }

    rec['validation'] = {
        'description':
        "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    #rec['validation']['links'] = 'FIXME'

    return rec

Пример #5

0

Показать файл

Файл: dataset_records.py Проект: cernopendata/data-curation

def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir,
                  mcm_dir, conffiles_dir):
    """Build the record dictionary for the given simulated dataset.

    Several values (publication year, run periods, recommended global tag
    and release, usage links) are hard-coded here and marked FIXME.

    Parameters:
        dataset_full_name: full dataset name; used as the record title and
            as the lookup key into ``recid_info``.
        doi_info: DOI information, passed to ``get_doi``.
        recid_info: indexed by the full dataset name to obtain the record
            id (a plain dict lacking the name raises ``KeyError``).
        eos_dir: passed to ``get_dataset_index_files``.
        das_dir: passed to the CMSSW version, event/file count, size,
            parent-lookup and generator-text helpers.
        mcm_dir: passed to the global tag, CMSSW version, energy,
            generator-parameter, pile-up, parent-lookup and generator-text
            helpers.
        conffiles_dir: passed to ``get_all_generator_text``.

    Returns:
        dict: the populated record.
    """
    dataset = get_dataset(dataset_full_name)
    dataset_format = get_dataset_format(dataset_full_name)
    year_created = str(get_dataset_year(dataset_full_name))
    year_published = '2021'  # FIXME get from somewhere, do not hardcode it!
    run_period = ['Run2015C', 'Run2015D']  # FIXME Hardcoded!!
    # The two results below are not used further in this function; the
    # calls themselves are kept.
    global_tag = get_global_tag(dataset_full_name, mcm_dir)
    release = get_cmssw_version(dataset_full_name, das_dir, mcm_dir)

    additional_title = 'Simulated dataset ' + dataset + ' in ' + dataset_format + ' format for ' + year_created + ' collision data'

    rec = {}

    rec['abstract'] = {
        'description':
        '<p>' + additional_title + '.</p>' +
        '<p>See the description of the simulated dataset names in: <a href="/about/CMS-Simulated-Dataset-Names">About CMS simulated dataset names</a>.</p>' +
        '<p>These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.</p>',
    }

    rec['accelerator'] = "CERN-LHC"

    rec['collaboration'] = {
        'name': 'CMS Collaboration',
        'recid': get_author_list_recid(dataset_full_name),
    }

    rec['collections'] = ['CMS-Simulated-Datasets']

    rec['collision_information'] = {
        'energy': get_dataset_energy(dataset_full_name, mcm_dir),
        'type': 'pp',  # FIXME do not hardcode
    }

    # FIXME cross section not working
    # we should try to get the cross section from the parent, and the parent-parent, and so on...
    generator_parameters = get_generator_parameters_from_mcm(
        dataset_full_name, mcm_dir)
    if generator_parameters:
        # (record key, generator parameter name) pairs, copied in order.
        # NOTE(review): several record keys end with ':' and one contains a
        # space; they look like typos but are kept as-is since consumers of
        # the record may depend on them -- confirm before changing.
        cross_section_fields = (
            ('value', 'cross_section'),
            ('filter_efficiency:', 'filter_efficiency'),
            ('filter_efficiency_error:', 'filter_efficiency_error'),
            ('match_efficiency:', 'match_efficiency'),
            ('match_efficiency error:', 'match_efficiency_error'),
        )
        rec['cross_section'] = {}
        for record_key, parameter_name in cross_section_fields:
            rec['cross_section'][record_key] = generator_parameters.get(
                parameter_name, None)

    rec['date_created'] = [year_created]
    rec['date_published'] = year_published
    rec['date_reprocessed'] = year_created

    rec['distribution'] = {
        'formats': [dataset_format.lower(), 'root'],
        'number_events': get_number_events(dataset_full_name, das_dir),
        'number_files': get_number_files(dataset_full_name, das_dir),
        'size': get_size(dataset_full_name, das_dir),
    }

    #if not dataset_full_name in doi_info: FIXME
    #    rec['distribution']['availability'] = 'ondemand'

    doi = get_doi(dataset_full_name, doi_info)
    if doi:
        rec['doi'] = doi

    rec['experiment'] = 'CMS'

    rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
    if rec_files:
        # TODO if no files: "Dataset available under request"
        file_entries = []
        # All '.json' index files are listed first, then all '.txt' ones;
        # numbering restarts for each index type.
        for index_type in ('.json', '.txt'):
            matching = [
                entry for entry in rec_files
                if entry[0].endswith(index_type)
            ]
            total = str(len(matching))
            for position, (file_uri, file_size,
                           file_checksum) in enumerate(matching, 1):
                file_entries.append({
                    'checksum': 'adler32:' + file_checksum,
                    'description': dataset + dataset_format + ' dataset file index (' + str(position) + ' of ' + total + ') for access to data via CMS virtual machine',
                    'size': file_size,
                    'type': 'index' + index_type,
                    'uri': file_uri,
                })
        rec['files'] = file_entries

    rec['license'] = {'attribution': 'CC0'}

    rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir,
                                                mcm_dir, conffiles_dir)

    # Walk up the chain of parent datasets until a pile-up dataset name is
    # found or no parent is left. The parent is looked up via
    # get_parent_dataset first, falling back to get_parent_dataset_from_mcm
    # when that result is falsy.
    pileup_dataset_name = ''
    parent = dataset_full_name
    while parent and not pileup_dataset_name:
        pileup_dataset_name = get_pileup_from_mcm(parent, mcm_dir)
        parent = (get_parent_dataset(parent, das_dir)
                  or get_parent_dataset_from_mcm(parent, das_dir, mcm_dir))

    # Known pile-up datasets and their record ids; unknown names give 0.
    known_pileup_recids = {
        '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM':
        36,  # 2011
        '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM':
        37,  # 2012
        '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM':
        22314,  # 2015
        #'/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-magnetOffBS0T_MCRUN2_71_V1-v1/GEN-SIM': {recid}, # 2015 TODO
        '/MinBias_TuneCP5_13TeV-pythia8/RunIIFall18GS-IdealGeometry_102X_upgrade2018_design_v9-v1/GEN-SIM':
        12302  # 2018
    }
    pileup_dataset_recid = known_pileup_recids.get(pileup_dataset_name, 0)

    if pileup_dataset_name:
        if pileup_dataset_recid:
            # Known pile-up dataset: link to its record.
            rec['pileup'] = {
                'description':
                "<p>To make these simulated data comparable with the collision data, <a href=\"/docs/cms-guide-pileup-simulation\">pile-up events</a> are added to the simulated event in this step.</p>",
                'links': [{
                    "recid": str(pileup_dataset_recid),
                    "title": pileup_dataset_name
                }],
            }
        else:
            # No record id available: only name the dataset in the text.
            rec['pileup'] = {
                'description':
                "<p>To make these simulated data comparable with the collision data, <a href=\"/docs/cms-guide-pileup-simulation\">pile-up events</a> from the dataset <code>" +
                pileup_dataset_name +
                "</code> are added to the simulated event in this step.</p>",
            }

    rec['publisher'] = 'CERN Open Data Portal'

    rec['recid'] = str(recid_info[dataset_full_name])

    # rec['relations'] = []
    # rec['relations']['title'] = ''  # FIXME
    # rec['relations']['type'] = 'isChildOf'

    rec['run_period'] = run_period

    # recommended global tag and cmssw release for analysis
    # The looked-up values are not used; the record carries the hard-coded
    # values below instead.
    recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name)
    recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name)
    rec['system_details'] = {
        'global_tag': "76X_mcRun2_asymptotic_RunIIFall15DR76_v1",  # FIXME
        'release': "CMSSW_7_6_7",  # FIXME
    }

    rec['title'] = dataset_full_name

    rec['title_additional'] = additional_title

    # The topic is "category" or "category/subcategory"; any other number
    # of '/'-separated parts yields no subcategory.
    topic_parts = guess_title_category(dataset_full_name).split('/')
    category = topic_parts[0]
    subcategory = topic_parts[1] if len(topic_parts) == 2 else None
    rec['categories'] = {'primary': category}
    if subcategory:
        rec['categories']['secondary'] = [subcategory]
    rec['categories']['source'] = 'CMS Collaboration'

    rec['type'] = {
        'primary': 'Dataset',
        'secondary': ['Simulated'],
    }

    # Not used below: the usage links are hard-coded to the 2015 guides.
    year_getting_started = {
        '2010': 2010,
        '2011': 2011,
        '2012': 2011
    }.get(year_created, 2011)
    rec['usage'] = {
        'description':
        "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in",  # FIXME
        'links': [  # FIXME
            {
                "description": "Running CMS analysis code using Docker",
                "url": "/docs/cms-guide-docker"
            },
            {
                "description": "How to install the CMS Virtual Machine",
                "url": "/docs/cms-virtual-machine-2015"
            },
            {
                "description": "Getting started with CMS open data",
                "url": "/docs/cms-getting-started-2015"
            },
        ],
    }

    rec['validation'] = {
        'description':
        "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures.",
    }
    #rec['validation']['links'] = 'FIXME'

    return rec

Python get_dataset_format примеры использования