import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from pax import units
from pax.configuration import load_configuration
from hax.utils import flatten_dict

##
# Load the PMT data from the pax configuration
##

# Convert PMT/channel map to record array
pax_config = load_configuration('XENON1T')   # TODO: depends on experiment, should do after init
pmt_data = pd.DataFrame([flatten_dict(info, separator=':')
                         for info in pax_config['DEFAULT']['pmts']
                         if 'array' in info])
pmt_numbering_start = pmt_data['pmt_position'].min()


##
# Plotting functions
##

def _pad_to_length_of(a, b):
    """Pads a with zeros until it has the length of b"""
    lendiff = len(b) - len(a)
    if lendiff < 0:
        raise ValueError("Cannot pad a negative number of zeros!")
    elif lendiff > 0:
        # Append zeros on the right until a has the same length as b
        return np.concatenate((a, np.zeros(lendiff)))
    else:
        return a

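# Illustrative example (not part of the original module): pad a hypothetical
# per-PMT array with zeros on the right so it matches the number of PMTs
# loaded above, e.g. before plotting one value per PMT position.
example_values = np.array([5.0, 3.0, 2.0])                 # hypothetical data
example_padded = _pad_to_length_of(example_values, np.zeros(len(pmt_data)))
assert len(example_padded) == len(pmt_data)
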
def update_datasets(query=None):
    """Update hax.runs.datasets to contain the latest datasets.

    Currently just loads the XENON100 run 10 runs from a csv file.

    query: custom query, in case you only want to update partially.
    """
    global datasets
    experiment = hax.config['experiment']
    version_policy = hax.config['pax_version_policy']

    if not hax.config.get('use_runs_db', True):
        hax.log.info("Not looking for datasets in the runs db, since you put use_runs_db = False")
        return

    if experiment == 'XENON100':
        # Fetch runs information from static csv files in runs info
        for rundbfile in glob(os.path.join(hax.config['runs_info_dir'], '*.csv')):
            tpc, run = os.path.splitext(os.path.basename(rundbfile))[0].split('_')
            dsets = pd.read_csv(rundbfile)
            dsets = pd.concat((dsets, pd.DataFrame([{'tpc': tpc, 'run': run}] * len(dsets))),
                              axis=1)
            if datasets is not None and len(datasets):
                datasets = pd.concat((datasets, dsets), ignore_index=True)
            else:
                datasets = dsets

    elif experiment == 'XENON1T':
        collection = get_rundb_collection()
        docs = []

        if query is None:
            query = {}
        query['detector'] = hax.config.get('detector', hax.config['detector'])

        log.debug("Updating datasets from runs database... ")
        cursor = collection.find(query,
                                 ['name', 'number', 'start', 'end', 'source',
                                  'reader.self_trigger', 'reader.ini.name',
                                  'trigger.events_built', 'trigger.status',
                                  'tags.name', 'data'])

        for doc in cursor:
            # Process and flatten the doc
            # Convert tags to single string
            doc['tags'] = ','.join([t['name'] for t in doc.get('tags', [])])
            doc = flatten_dict(doc, separator='__')
            del doc['_id']   # Remove the Mongo document ID

            if 'data' in doc:
                data_docs = doc['data']
                del doc['data']
            else:
                data_docs = []
            doc = flatten_dict(doc, separator='__')

            if version_policy != 'loose':
                # Does the run db know where to find the processed data at this host?
                processed_data_docs = [d for d in data_docs
                                       if (d['type'] == 'processed'
                                           and hax.config['cax_key'] in d['host']
                                           and d['status'] == 'transferred')]

                if version_policy != 'latest':
                    # Filter out versions not consistent with the version policy.
                    # We will take the latest of the remaining ones later.
                    processed_data_docs = [
                        d for d in processed_data_docs
                        if version_is_consistent_with_policy(d['pax_version'])]

                # If there is processed data consistent with the version policy,
                # set its location
                doc['location'] = ''
                doc['pax_version'] = ''
                if len(processed_data_docs):
                    # Take the data doc with the most recent policy-consistent pax version
                    data_we_take = max(processed_data_docs,
                                       key=lambda x: LooseVersion(x['pax_version']))
                    doc['location'] = data_we_take['location']
                    doc['pax_version'] = data_we_take['pax_version'][1:]

            docs.append(doc)

        datasets = pd.DataFrame(docs)
        log.debug("... done.")

    # These may or may not have been set already:
    if 'pax_version' not in datasets:
        datasets['pax_version'] = [''] * len(datasets)
    if 'location' not in datasets:
        datasets['location'] = [''] * len(datasets)
    if 'raw_data_subfolder' not in datasets:
        datasets['raw_data_subfolder'] = [''] * len(datasets)
    if 'raw_data_found' not in datasets:
        datasets['raw_data_found'] = [False] * len(datasets)
    if 'raw_data_used_local_path' not in datasets:
        datasets['raw_data_used_local_path'] = [''] * len(datasets)
    dataset_names = datasets['name'].values

    if version_policy == 'loose':
        # Walk through main_data_paths, looking for root files.
        # Reversed, since if we find a dataset again, we overwrite, and
        # usually people put first-priority stuff at the front.
        for data_dir in reversed(hax.config.get('main_data_paths', [])):
            for candidate in glob(os.path.join(data_dir, '*.root')):
                # What dataset is this file for?
                dsetname = os.path.splitext(os.path.basename(candidate))[0]
                bla = np.where(dataset_names == dsetname)[0]
                if len(bla):
                    # Dataset was found, index is in bla[0]
                    datasets.loc[bla[0], 'location'] = candidate

    # For the raw data, we may need to look in subfolders ('run_10' etc.).
    # Don't do os.path.exists for each dataset: it will take minutes, at least over sshfs.
    if hax.config['raw_data_access_mode'] == 'local':
        for raw_data_path in hax.config['raw_data_local_path']:
            for subfolder, dsets_in_subfolder in datasets.groupby('raw_data_subfolder'):
                subfolder_path = os.path.join(raw_data_path, subfolder)
                if not os.path.exists(subfolder_path):
                    log.debug("Folder %s not found when looking for raw data" % subfolder_path)
                    continue
                for candidate in os.listdir(subfolder_path):
                    bla = np.where(dataset_names == candidate)[0]
                    if len(bla):
                        if not datasets.loc[bla[0], 'raw_data_found']:
                            datasets.loc[bla[0], 'raw_data_used_local_path'] = raw_data_path
                        datasets.loc[bla[0], 'raw_data_found'] = True

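# version_is_consistent_with_policy() is called above but not defined in this
# snippet. A minimal sketch of what such a helper could look like, assuming
# pax_version_policy is either 'latest', 'loose', or a single version string,
# and that pax versions in the run db carry a leading 'v' (the actual hax.runs
# helper may differ):
def version_is_consistent_with_policy(version):
    """Return True if a data doc's pax version is acceptable under
    hax.config['pax_version_policy']."""
    policy = hax.config['pax_version_policy']
    if policy in ('latest', 'loose'):
        # No specific version requested: anything goes
        return True
    return version.lstrip('v') == str(policy).lstrip('v')
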
def update_datasets(query=None):
    """Update hax.runs.datasets to contain the latest datasets.

    Currently just loads the XENON100 run 10 runs from a csv file.

    query: custom query, in case you only want to update partially.
    """
    global datasets
    experiment = hax.config['experiment']
    version_policy = hax.config['pax_version_policy']

    if experiment == 'XENON100':
        # Fetch runs information from static csv files in runs info
        for rundbfile in glob(os.path.join(hax.config['runs_info_dir'], '*.csv')):
            tpc, run = os.path.splitext(os.path.basename(rundbfile))[0].split('_')
            dsets = pd.read_csv(rundbfile)
            dsets = pd.concat((dsets, pd.DataFrame([{'tpc': tpc, 'run': run}] * len(dsets))),
                              axis=1)
            if datasets is not None and len(datasets):
                datasets = pd.concat((datasets, dsets))
            else:
                datasets = dsets

    elif experiment == 'XENON1T':
        collection = get_rundb_collection()
        docs = []

        if query is None:
            query = {}
        query['detector'] = hax.config.get('detector', hax.config['detector'])

        log.debug("Updating datasets from runs database... ")
        cursor = collection.find(query,
                                 ['name', 'number', 'start', 'end', 'source',
                                  'reader.self_trigger',
                                  'trigger.events_built', 'trigger.status',
                                  'tags.name', 'data'])

        for doc in cursor:
            # Process and flatten the doc
            doc['tags'] = ','.join([t['name'] for t in doc.get('tags', [])])   # Convert tags to single string
            doc = flatten_dict(doc, separator='__')
            del doc['_id']   # Remove the Mongo document ID

            if 'data' in doc:
                data_docs = doc['data']
                del doc['data']
            else:
                data_docs = []
            doc = flatten_dict(doc, separator='__')

            if version_policy != 'loose':
                # Does the run db know where to find the processed data at this host?
                processed_data_docs = [d for d in data_docs
                                       if (d['type'] == 'processed'
                                           and hax.config['cax_key'] in d['host']
                                           and d['status'] == 'transferred')]

                # Choose whether to use this data / which data to use,
                # based on the version policy
                doc['location'] = ''
                if processed_data_docs:
                    if version_policy == 'latest':
                        doc['location'] = max(processed_data_docs,
                                              key=lambda x: LooseVersion(x['pax_version']))['location']
                    else:
                        for dd in processed_data_docs:
                            if dd['pax_version'][1:] == hax.config['pax_version_policy']:
                                doc['location'] = dd['location']

            docs.append(doc)

        datasets = pd.DataFrame(docs)
        log.debug("... done.")

    # These may or may not have been set already:
    if 'location' not in datasets:
        datasets['location'] = [''] * len(datasets)
    if 'raw_data_subfolder' not in datasets:
        datasets['raw_data_subfolder'] = [''] * len(datasets)
    if 'raw_data_found' not in datasets:
        datasets['raw_data_found'] = [False] * len(datasets)
    dataset_names = datasets['name'].values

    if version_policy == 'loose':
        # Walk through main_data_paths, looking for root files.
        # Reversed, since if we find a dataset again, we overwrite, and
        # usually people put first-priority stuff at the front.
        for data_dir in reversed(hax.config.get('main_data_paths', [])):
            for candidate in glob(os.path.join(data_dir, '*.root')):
                # What dataset is this file for?
                dsetname = os.path.splitext(os.path.basename(candidate))[0]
                bla = np.where(dataset_names == dsetname)[0]
                if len(bla):
                    # Dataset was found, index is in bla[0]
                    datasets.loc[bla[0], 'location'] = candidate

    # For the raw data, we may need to look in subfolders ('run_10' etc.).
    # Don't do os.path.exists for each dataset: it will take minutes, at least over sshfs.
    if hax.config['raw_data_access_mode'] == 'local':
        for subfolder, dsets_in_subfolder in datasets.groupby('raw_data_subfolder'):
            subfolder_path = os.path.join(hax.config['raw_data_local_path'], subfolder)
            if not os.path.exists(subfolder_path):
                log.debug("Folder %s not found when looking for raw data" % subfolder_path)
                continue
            for candidate in os.listdir(subfolder_path):
                bla = np.where(dataset_names == candidate)[0]
                if len(bla):
                    datasets.loc[bla[0], 'raw_data_found'] = True

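# Usage sketch (assumptions: hax.init accepts these configuration overrides as
# keyword arguments and triggers an initial update_datasets itself; calling
# update_datasets() again simply refreshes hax.runs.datasets):
if __name__ == '__main__':
    import hax
    hax.init(experiment='XENON1T', pax_version_policy='loose')
    hax.runs.update_datasets()
    print(hax.runs.datasets[['name', 'location', 'raw_data_found']].head())
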