def load_single_dataset(run_id, treemakers, preselection=None, force_reload=False, event_list=None):
    """Run multiple treemakers on a single run

    :returns: (pandas DataFrame, list of dicts describing cut histories)
    :param run_id: name or number of the run to load
    :param treemakers: list of treemaker classes / names to load
    :param preselection: String or list of strings passed to pandas.eval. Should return bool array,
                         to be used for pre-selecting events to load for each dataset.
    :param force_reload: always remake the minitrees, never load any from disk.
    :param event_list: List of event numbers to visit. Disables load from / save to file.
    """
    if isinstance(treemakers, (type, str)):
        treemakers = [treemakers]
    if isinstance(preselection, str):
        preselection = [preselection]
    if preselection is None:
        preselection = []

    dataframes = []
    for treemaker in treemakers:
        try:
            dataset_frame = load_single_minitree(run_id, treemaker,
                                                 force_reload=force_reload,
                                                 event_list=event_list)
        except NoMinitreeAvailable as e:
            log.debug(str(e))
            return pd.DataFrame([], columns=['event_number', 'run_number']), []
        dataframes.append(dataset_frame)

    # Merge mini-trees of all types by inner join
    # (propagating "cuts" applied by skipping rows in MultipleRowExtractor)
    if not len(dataframes):
        raise RuntimeError("No data was extracted? What's going on??")
    result = dataframes[0]
    for i in range(1, len(dataframes)):
        result = _merge_minitrees(result, dataframes[i])

    # Apply the unblinding selection if required.
    # Normally this is already done by minitrees.load, but perhaps someone
    # calls load_single_dataset directly.
    if (hax.unblinding.unblinding_selection not in preselection and
            ('Corrections' in treemakers or hax.treemakers.corrections.Corrections in treemakers) and
            hax.unblinding.is_blind(run_id)):
        preselection = [hax.unblinding.unblinding_selection] + preselection

    # Apply the pre-selection cuts
    for ps in preselection:
        result = cuts.eval_selection(result, ps, quiet=True)

    return result, cuts._get_history(result)
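
# Usage sketch (not part of the module API): how load_single_dataset is
# typically called. Assumes hax.init() has already run and the run exists;
# the run number, treemaker names, and selection string below are
# illustrative values, not defaults taken from this code.
def _example_load_single_dataset():
    df, cut_history = load_single_dataset(
        6731,                               # hypothetical run number
        ['Fundamentals', 'Basics'],         # treemaker names (classes also work)
        preselection='s1 > 0')              # illustrative pandas.eval expression
    print(len(df), 'events pass the preselection')
    return df, cut_history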
def save_cache_file(data, cache_file, **kwargs):
    """Save minitree dataframe + cut history to a cache file

    Any kwargs will be passed to pandas HDFStore. Defaults are:
        complib='blosc'
        complevel=9
    """
    kwargs.setdefault('complib', 'blosc')
    kwargs.setdefault('complevel', 9)
    dirname = os.path.dirname(cache_file)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    store = pd.HDFStore(cache_file, **kwargs)
    store.put('data', data)

    # Store the cuts history for the data
    store.get_storer('data').attrs.cut_history = cuts._get_history(data)
    store.close()
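
# Sketch of the cache round-trip (the filename is illustrative). Writing goes
# through save_cache_file; reading back mirrors the load-from-cache branch of
# load() below, including recovering the cut history from the HDF5 attributes.
def _example_cache_roundtrip():
    df, _ = load_single_dataset(6731, ['Fundamentals', 'Basics'])
    save_cache_file(df, './minitree_cache/run_6731.hdf5', complevel=5)  # override a default

    store = pd.HDFStore('./minitree_cache/run_6731.hdf5')
    df2 = store['data']                                    # the cached dataframe
    history = store.get_storer('data').attrs.cut_history   # the stored cut history
    store.close()
    return df2, history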
def load(datasets, treemakers=('Fundamentals', 'Basics'), preselection=None, force_reload=False,
         delayed=False, num_workers=1, compute_options=None, cache_file=None, remake_cache=False):
    """Return pandas DataFrame with minitrees of several datasets and treemakers.

    :param datasets: names or numbers of datasets (without .root) to load
    :param treemakers: treemaker class (or string with name of class) or list of these to load.
    :param preselection: string or list of strings parseable by pd.eval. Should return bool array,
                         to be used for pre-selecting events to load for each dataset.
    :param force_reload: if True, will force mini-trees to be re-made whether they are outdated or not.
    :param delayed: Instead of computing a pandas DataFrame, return a dask DataFrame (default False)
    :param num_workers: Number of dask workers to use in computation (if delayed=False)
    :param compute_options: Dictionary of extra options passed to dask.compute
    :param cache_file: Save/load the result to an hdf5 file with filename specified by cache_file.
                       Useful if you load in a large volume of data with many preselections.
    :param remake_cache: If True, and a cache file is given, reload (don't remake) the minitrees
                         and overwrite the cache file.
    """
    if cache_file and not remake_cache and os.path.exists(cache_file):
        # We don't have to do anything and can just load from the cache file
        store = pd.HDFStore(cache_file)
        result = store['data']
        result.cut_history = store.get_storer('data').attrs.cut_history
        store.close()
        return result

    # np.integer covers all numpy integer scalar types (np.int was removed from numpy)
    if isinstance(datasets, (str, int, np.integer)):
        datasets = [datasets]
    if compute_options is None:
        compute_options = {}
    compute_options.setdefault('get', dask.multiprocessing.get)

    partial_results = []
    partial_histories = []
    for dataset in datasets:
        mashup = dask.delayed(load_single_dataset)(dataset, treemakers, preselection,
                                                   force_reload=force_reload)
        partial_results.append(dask.delayed(lambda x: x[0])(mashup))
        partial_histories.append(dask.delayed(lambda x: x[1])(mashup))

    result = dask.dataframe.from_delayed(partial_results, meta=partial_results[0].compute())

    if not delayed:
        # Dask doesn't seem to want to descend into the lists beyond the first.
        # So we mash things into one list before calling compute, then split it again.
        mashedup_result = dask.compute(*([result] + partial_histories),
                                       num_workers=num_workers, **compute_options)
        result = mashedup_result[0]

        if 'index' in result.columns:
            # Clean up the index and remove the 'index' column.
            # Probably we're doing something weird with pandas,
            # this doesn't seem like the well-trodden path...
            log.debug("Removing weird index column")
            result.drop('index', axis=1, inplace=True)
        result = result.reset_index()
        result.drop('index', axis=1, inplace=True)

        # Combine the histories of the partial results.
        # For unavailable minitrees the histories will be empty: filter these out.
        partial_histories = mashedup_result[1:]
        partial_histories = [x for x in partial_histories if len(x)]
        if len(partial_histories):
            cuts.record_combined_histories(result, partial_histories)
    else:
        # Magic for tracking of cut histories while using dask.dataframe here...
        pass

    if cache_file:
        # Save the result (data + cut history) to the cache file
        save_cache_file(result, cache_file)

    return result
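
# End-to-end usage sketch for load() (dataset numbers, treemaker names, and cut
# strings are examples, not defaults from this module). A second call with the
# same cache_file skips the dask computation entirely and reads straight from
# the HDF5 store, unless remake_cache=True is passed.
def _example_load():
    data = load([6731, 6732],
                treemakers=['Fundamentals', 'Basics'],
                preselection=['s1 > 0', 's2 > 100'],
                num_workers=4,
                cache_file='./minitree_cache/my_selection.hdf5')
    return data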