Example No. 1
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from pax import units
from pax.configuration import load_configuration

from hax.utils import flatten_dict

##
# Load the PMT data from the pax configuration
##

# Convert PMT/channel map to record array
pax_config = load_configuration('XENON1T')      # TODO: depends on experiment, should do after init
pmt_data = pd.DataFrame([flatten_dict(info, separator=':')
                         for info in pax_config['DEFAULT']['pmts']
                         if 'array' in info])
pmt_numbering_start = pmt_data['pmt_position'].min()


##
# Plotting functions
##

def _pad_to_length_of(a, b):
    """Pads a with zeros until it has the length of b"""
    lendiff = len(b) - len(a)
    if lendiff < 0:
        raise ValueError("Cannot pad a negative number of zeros!")
    elif lendiff > 0:
        # Append zeros until a has the same length as b
        a = np.concatenate((a, np.zeros(lendiff)))
    return a
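
The snippet builds pmt_data as a flat DataFrame with one row per PMT and then starts a set of small array helpers for plotting. Below is a minimal usage sketch, restricted to the columns the snippet itself references ('array' and 'pmt_position') and to the completed _pad_to_length_of above; it is illustrative and not part of the original module:

import numpy as np

# Count PMTs per array (top/bottom) and show where the numbering starts
print(pmt_data.groupby('array')['pmt_position'].count())
print("PMT numbering starts at", pmt_numbering_start)

# _pad_to_length_of appends zeros until its first argument matches the length of the second
padded = _pad_to_length_of(np.array([1.0, 2.0]), np.zeros(5))
print(padded)  # [1. 2. 0. 0. 0.]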
Example No. 2
File: runs.py  Project: skazama/hax
def update_datasets(query=None):
    """Update hax.runs.datasets to contain latest datasets.
    Currently just loads XENON100 run 10 runs from a csv file.
    query: custom query, in case you only want to update partially??
    """
    global datasets
    experiment = hax.config['experiment']

    version_policy = hax.config['pax_version_policy']

    if not hax.config.get('use_runs_db', True):
        hax.log.info("Not looking for datasets in runs, db since you put use_runs_db = False")
        return

    if experiment == 'XENON100':
        # Fetch runs information from static csv files in runs info
        for rundbfile in glob(os.path.join(hax.config['runs_info_dir'], '*.csv')):
            tpc, run = os.path.splitext(os.path.basename(rundbfile))[0].split('_')
            dsets = pd.read_csv(rundbfile)
            dsets = pd.concat((dsets, pd.DataFrame([{'tpc': tpc, 'run': run}] * len(dsets))), axis=1)
            if datasets is not None and len(datasets):
                datasets = pd.concat((datasets, dsets), ignore_index=True)
            else:
                datasets = dsets

    elif experiment == 'XENON1T':
        collection = get_rundb_collection()
        docs = []

        if query is None:
            query = {}
        query['detector'] = hax.config.get('detector', hax.config['detector'])

        log.debug("Updating datasets from runs database... ")
        cursor = collection.find(query,
                                 ['name', 'number', 'start', 'end', 'source',
                                  'reader.self_trigger', 'reader.ini.name',
                                  'trigger.events_built', 'trigger.status',
                                  'tags.name',
                                  'data'])
        for doc in cursor:
            # Process and flatten the doc
            # Convert tags to single string
            doc['tags'] = ','.join([t['name'] for t in doc.get('tags', [])])
            doc = flatten_dict(doc, separator='__')
            del doc['_id']  # Remove the Mongo document ID
            if 'data' in doc:
                data_docs = doc['data']
                del doc['data']
            else:
                data_docs = []
            doc = flatten_dict(doc, separator='__')

            if version_policy != 'loose':

                # Does the run db know where to find the processed data at this host?
                processed_data_docs = [d for d in data_docs
                                       if (d['type'] == 'processed'
                                           and hax.config['cax_key'] in d['host']
                                           and d['status'] == 'transferred')]

                if version_policy != 'latest':
                    # Filter out versions not consistent with the version policy.
                    # We will take the latest of the remaining ones later
                    processed_data_docs = [
                        d for d in processed_data_docs if version_is_consistent_with_policy(d['pax_version'])]

                # If there is a processed data consistent with the version
                # policy, set its location
                doc['location'] = ''
                doc['pax_version'] = ''
                if len(processed_data_docs):
                    # Take the data doc with the most recent policy-consistent
                    # pax version
                    data_we_take = max(processed_data_docs, key=lambda x: LooseVersion(x['pax_version']))
                    doc['location'] = data_we_take['location']
                    doc['pax_version'] = data_we_take['pax_version'][1:]

            docs.append(doc)

        datasets = pd.DataFrame(docs)
        log.debug("... done.")

    # These may or may not have been set already:
    if 'pax_version' not in datasets:
        datasets['pax_version'] = [''] * len(datasets)
    if 'location' not in datasets:
        datasets['location'] = [''] * len(datasets)
    if 'raw_data_subfolder' not in datasets:
        datasets['raw_data_subfolder'] = [''] * len(datasets)
    if 'raw_data_found' not in datasets:
        datasets['raw_data_found'] = [False] * len(datasets)
    if 'raw_data_used_local_path' not in datasets:
        datasets['raw_data_used_local_path'] = [''] * len(datasets)
    dataset_names = datasets['name'].values

    if version_policy == 'loose':
        # Walk through main_data_paths, looking for root files
        # Reversed, since if we find a dataset again, we overwrite, and
        # usually people put first priority stuff at the front.
        for data_dir in reversed(hax.config.get('main_data_paths', [])):
            for candidate in glob(os.path.join(data_dir, '*.root')):
                # What dataset is this file for?
                dsetname = os.path.splitext(os.path.basename(candidate))[0]
                bla = np.where(dataset_names == dsetname)[0]
                if len(bla):
                    # Dataset was found, index is in bla[0]
                    datasets.loc[bla[0], 'location'] = candidate

    # For the raw data, we may need to look in subfolders ('run_10' etc).
    # Don't call os.path.exists for each dataset: it would take minutes, at
    # least over sshfs.
    if hax.config['raw_data_access_mode'] == 'local':
        for raw_data_path in hax.config['raw_data_local_path']:
            for subfolder, dsets_in_subfolder in datasets.groupby(
                    'raw_data_subfolder'):
                subfolder_path = os.path.join(raw_data_path, subfolder)
                if not os.path.exists(subfolder_path):
                    log.debug(
                        "Folder %s not found when looking for raw data" %
                        subfolder_path)
                    continue
                for candidate in os.listdir(subfolder_path):
                    bla = np.where(dataset_names == candidate)[0]
                    if len(bla):
                        if not datasets.loc[bla[0], 'raw_data_found']:
                            datasets.loc[bla[0],
                                         'raw_data_used_local_path'] = raw_data_path
                        datasets.loc[bla[0], 'raw_data_found'] = True
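
For context, a hedged sketch of how update_datasets is usually driven from hax. It assumes the standard hax.init entry point fills hax.config with the experiment and runs-db settings used above; the printed columns (name, location, pax_version) are ones this function fills in:

import hax

hax.init(experiment='XENON1T')    # assumed entry point; fills hax.config (settings are site-specific)
hax.runs.update_datasets()        # refresh hax.runs.datasets from the runs db (or csv files for XENON100)
print(hax.runs.datasets[['name', 'location', 'pax_version']].head())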
Example No. 3
File: runs.py  Project: XENON1T/hax
def update_datasets(query=None):
    """Update hax.runs.datasets to contain latest datasets.
    Currently just loads XENON100 run 10 runs from a csv file.
    query: custom query, in case you only want to update partially??
    """
    global datasets
    experiment = hax.config['experiment']

    version_policy = hax.config['pax_version_policy']

    if experiment == 'XENON100':
        # Fetch runs information from static csv files in runs info
        for rundbfile in glob(os.path.join(hax.config['runs_info_dir'], '*.csv')):
            tpc, run = os.path.splitext(os.path.basename(rundbfile))[0].split('_')
            dsets = pd.read_csv(rundbfile)
            dsets = pd.concat((dsets, pd.DataFrame([{'tpc': tpc, 'run': run}] * len(dsets))),
                              axis=1)
            if datasets is not None and len(datasets):
                # Append to the runs already loaded; keep a clean 0..n-1 index for .loc below
                datasets = pd.concat((datasets, dsets), ignore_index=True)
            else:
                datasets = dsets

    elif experiment == 'XENON1T':
        collection = get_rundb_collection()
        docs = []

        if query is None:
            query = {}
        query['detector'] = hax.config.get('detector', hax.config['detector'])

        log.debug("Updating datasets from runs database... ")
        cursor = collection.find(query,
                                ['name', 'number', 'start', 'end', 'source',
                                 'reader.self_trigger',
                                 'trigger.events_built', 'trigger.status',
                                 'tags.name',
                                 'data'])
        for doc in cursor:
            # Process and flatten the doc
            doc['tags'] = ','.join([t['name'] for t in doc.get('tags', [])])   # Convert tags to single string
            doc = flatten_dict(doc, separator='__')
            del doc['_id']   # Remove the Mongo document ID
            if 'data' in doc:
                data_docs = doc['data']
                del doc['data']
            else:
                data_docs = []
            doc = flatten_dict(doc, separator='__')

            if version_policy != 'loose':

                # Does the run db know where to find the processed data at this host?
                processed_data_docs = [d for d in data_docs
                                       if (d['type'] == 'processed'
                                           and hax.config['cax_key'] in d['host']
                                           and d['status'] == 'transferred')]

                # Choose whether to use this data / which data to use, based on the version policy
                doc['location'] = ''
                if processed_data_docs:
                    if version_policy == 'latest':
                        doc['location'] = max(processed_data_docs,
                                              key=lambda x: LooseVersion(x['pax_version']))['location']
                    else:
                        for dd in processed_data_docs:
                            if dd['pax_version'][1:] == hax.config['pax_version_policy']:
                                doc['location'] = dd['location']

            docs.append(doc)

        datasets = pd.DataFrame(docs)
        log.debug("... done.")

    # These may or may not have been set already:
    if 'location' not in datasets:
        datasets['location'] = [''] * len(datasets)
    if 'raw_data_subfolder' not in datasets:
        datasets['raw_data_subfolder'] = [''] * len(datasets)
    if 'raw_data_found' not in datasets:
        datasets['raw_data_found'] = [False] * len(datasets)
    dataset_names = datasets['name'].values

    if version_policy == 'loose':
        # Walk through main_data_paths, looking for root files
        # Reversed, since if we find a dataset again, we overwrite, and 
        # usually people put first priority stuff at the front.
        for data_dir in reversed(hax.config.get('main_data_paths', [])):
            for candidate in glob(os.path.join(data_dir, '*.root')):
                # What dataset is this file for?
                dsetname = os.path.splitext(os.path.basename(candidate))[0]
                bla = np.where(dataset_names == dsetname)[0]
                if len(bla):
                    # Dataset was found, index is in bla[0]
                    datasets.loc[bla[0], 'location'] = candidate

    # For the raw data, we may need to look in subfolders ('run_10' etc).
    # Don't call os.path.exists for each dataset: it would take minutes, at least over sshfs.
    if hax.config['raw_data_access_mode'] == 'local':
        for subfolder, dsets_in_subfolder in datasets.groupby('raw_data_subfolder'):
            subfolder_path = os.path.join(hax.config['raw_data_local_path'], subfolder)
            if not os.path.exists(subfolder_path):
                log.debug("Folder %s not found when looking for raw data" % subfolder_path)
                continue
            for candidate in os.listdir(subfolder_path):
                bla = np.where(dataset_names == candidate)[0]
                if len(bla):
                    datasets.loc[bla[0], 'raw_data_found'] = True
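
The version-policy handling above boils down to comparing pax_version strings as versions rather than plain strings and taking the maximum. A standalone illustration with made-up entries (LooseVersion is what the code uses; it comes from distutils.version, which is deprecated in recent Python releases):

from distutils.version import LooseVersion

# Hypothetical processed-data entries, shaped like the docs returned by the runs db
processed_data_docs = [{'pax_version': 'v6.2.0', 'location': '/data/run_a'},
                       {'pax_version': 'v6.10.1', 'location': '/data/run_b'}]

best = max(processed_data_docs, key=lambda d: LooseVersion(d['pax_version']))
print(best['location'])  # /data/run_b: 6.10.1 sorts above 6.2.0 as a version, below it as a string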
Example No. 4
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from pax.configuration import load_configuration
from hax.utils import flatten_dict

##
# Load the PMT data from the pax configuration
##

# Convert PMT/channel map to record array
# TODO: depends on experiment, should do after init
pax_config = load_configuration('XENON1T')
pmt_data = pd.DataFrame([
    flatten_dict(info, separator=':') for info in pax_config['DEFAULT']['pmts']
    if 'array' in info
])
pmt_numbering_start = pmt_data['pmt_position'].min()

##
# Plotting functions
##


def _pad_to_length_of(a, b):
    """Pads a with zeros until it has the length of b"""
    lendiff = len(b) - len(a)
    if lendiff < 0:
        raise ValueError("Cannot pad a negative number of zeros!")
    elif lendiff > 0:
        # Append zeros until a has the same length as b
        a = np.concatenate((a, np.zeros(lendiff)))
    return a