Example No. 1
def load_single_dataset(run_id, treemakers, preselection=None, force_reload=False, event_list=None):
    """Run multiple treemakers on a single run

    :returns: (pandas DataFrame, list of dicts describing cut histories)

    :param run_id: name or number of the run to load

    :param treemakers: list of treemaker classes / names to load

    :param preselection: String or list of strings passed to pandas.eval. Should return bool array, to be used
                         for pre-selecting events to load for each dataset.

    :param force_reload: always remake the minitrees, never load any from disk.

    :param event_list: List of event numbers to visit. Disables load from / save to file.

    """
    if isinstance(treemakers, (type, str)):
        treemakers = [treemakers]
    if isinstance(preselection, str):
        preselection = [preselection]
    if preselection is None:
        preselection = []
    dataframes = []

    for treemaker in treemakers:
        try:
            dataset_frame = load_single_minitree(
                run_id, treemaker, force_reload=force_reload, event_list=event_list)
        except NoMinitreeAvailable as e:
            log.debug(str(e))
            return pd.DataFrame([], columns=['event_number', 'run_number']), []
        dataframes.append(dataset_frame)

    # Merge mini-trees of all types by inner join
    # (propagating "cuts" applied by skipping rows in MultipleRowExtractor)
    if not len(dataframes):
        raise RuntimeError("No data was extracted? What's going on??")
    result = dataframes[0]
    for i in range(1, len(dataframes)):
        result = _merge_minitrees(result, dataframes[i])

    # Apply the unblinding selection if required.
    # Normally this is already done by minitrees.load, but perhaps someone calls
    # load_single_dataset_directly.
    if (hax.unblinding.unblinding_selection not in preselection and
        ('Corrections' in treemakers or
         hax.treemakers.corrections.Corrections in treemakers) and
            hax.unblinding.is_blind(run_id)):
        preselection = [hax.unblinding.unblinding_selection] + preselection

    # Apply pre-selection cuts before moving on to the next dataset
    for ps in preselection:
        result = cuts.eval_selection(result, ps, quiet=True)

    return result, cuts._get_history(result)
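
A minimal usage sketch for load_single_dataset, assuming hax.init() has already been called; the run number, treemaker names and selection strings below are illustrative, not taken from the source:

# Hypothetical call: the run number, treemaker names and cut expressions are made up for illustration.
df, cut_history = load_single_dataset(
    run_id=6731,
    treemakers=['Fundamentals', 'Basics'],
    preselection=['s1 > 0', 's2 > 0'])
print("%d events passed the preselection" % len(df))
for entry in cut_history:
    print(entry)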
Example No. 2
def save_cache_file(data, cache_file, **kwargs):
    """Save minitree dataframe + cut history to a cache file
    Any kwargs will be passed to pandas HDFStore. Defaults are:
        complib='blosc'
        complevel=9
    """
    kwargs.setdefault('complib', 'blosc')
    kwargs.setdefault('complevel', 9)
    dirname = os.path.dirname(cache_file)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    store = pd.HDFStore(cache_file, **kwargs)
    store.put('data', data)

    # Store the cuts history for the data
    store.get_storer('data').attrs.cut_history = cuts._get_history(data)
    store.close()
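
A usage sketch for save_cache_file; the DataFrame and file path are made up for illustration, and reading the cache back mirrors what load() does when a cache file already exists:

import pandas as pd

# Illustrative data and path; any minitree DataFrame works the same way.
df = pd.DataFrame({'run_number': [6731, 6731], 'event_number': [0, 1]})
save_cache_file(df, './minitree_cache/example.hdf5')

# Read it back, including the stored cut history:
store = pd.HDFStore('./minitree_cache/example.hdf5')
data = store['data']
cut_history = store.get_storer('data').attrs.cut_history
store.close()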
Example No. 3
def load_single_dataset(run_id, treemakers, preselection, force_reload=False):
    """Return pandas DataFrame resulting from running multiple treemakers on run_id (name or number),
    list of dicts describing cut histories.
    :param run_id: name or number of the run to load
    :param treemakers: list of treemaker class / instances to load
    :param preselection: String or list of strings passed to pandas.eval. Should return bool array, to be used
    for pre-selecting events to load for each dataset.
    :param force_reload: always remake the minitrees, never load any from disk.
    """
    if isinstance(treemakers, (type, str)):
        treemakers = [treemakers]
    if isinstance(preselection, str):
        preselection = [preselection]
    if preselection is None:
        preselection = []
    dataframes = []

    for treemaker in treemakers:
        try:
            dataset_frame = load_single_minitree(run_id, treemaker, force_reload=force_reload)
        except NoMinitreeAvailable as e:
            log.debug(str(e))
            return pd.DataFrame([], columns=['event_number', 'run_number']), []
        dataframes.append(dataset_frame)

    # Merge mini-trees of all types by inner join
    # (propagating "cuts" applied by skipping rows in MultipleRowExtractor)
    if not len(dataframes):
        raise RuntimeError("No data was extracted? What's going on??")
    result = dataframes[0]
    for i in range(1, len(dataframes)):
        d = dataframes[i]
        # To avoid creation of duplicate columns (which will get _x and _y suffixes),
        # look which column names already exist and do not include them in the merge
        cols_to_use = ['run_number', 'event_number'] + d.columns.difference(result.columns).tolist()
        result = pd.merge(d[cols_to_use], result, on=['run_number', 'event_number'], how='inner')

    # Apply pre-selection cuts before moving on to the next dataset
    for ps in preselection:
        result = cuts.eval_selection(result, ps, quiet=True)

    return result, cuts._get_history(result)
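
The column filtering in the merge loop above is what prevents pandas from creating duplicate columns with _x/_y suffixes. A standalone sketch of that join, with made-up frames:

import pandas as pd

left = pd.DataFrame({'run_number': [1, 1, 1], 'event_number': [0, 1, 2], 's1': [10., 20., 30.]})
right = pd.DataFrame({'run_number': [1, 1], 'event_number': [1, 2], 's1': [99., 98.], 'cs1': [11., 21.]})

# Keep the key columns plus only the columns the running result lacks,
# so the shared 's1' column is taken from `left` and no suffixes appear.
cols_to_use = ['run_number', 'event_number'] + right.columns.difference(left.columns).tolist()
merged = pd.merge(right[cols_to_use], left, on=['run_number', 'event_number'], how='inner')
# merged contains events 1 and 2 only (inner join), with columns
# run_number, event_number, cs1 and s1 (the latter taken from `left`).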
Example No. 4
def load(datasets, treemakers=tuple(['Fundamentals', 'Basics']), preselection=None, force_reload=False,
         delayed=False, num_workers=1, compute_options=None, cache_file=None, remake_cache=False):
    """Return pandas DataFrame with minitrees of several datasets and treemakers.
    :param datasets: names or numbers of datasets (without .root) to load
    :param treemakers: treemaker class (or string with name of class) or list of these to load.
    :param preselection: string or list of strings parseable by pd.eval. Should return bool array, to be used
    for pre-selecting events to load for each dataset.
    :param force_reload: if True, will force mini-trees to be re-made whether they are outdated or not.
    :param delayed:  Instead of computing a pandas DataFrame, return a dask DataFrame (default False)
    :param num_workers: Number of dask workers to use in computation (if delayed=False)
    :param compute_options: Dictionary of extra options passed to dask.compute
    :param cache_file: Save/load the result to an hdf5 file with filename specified by cache_file.
                       Useful if you load in a large volume of data with many preselections.
    :param remake_cache: If True, and cache file given, reload (don't remake) minitrees and overwrite the cache file.
    """
    if cache_file and not remake_cache and os.path.exists(cache_file):
        # We don't have to do anything and can just load from the cache file
        store = pd.HDFStore(cache_file)
        result = store['data']
        result.cut_history = store.get_storer('data').attrs.cut_history
        store.close()
        return result

    if isinstance(datasets, (str, int, np.integer)):
        datasets = [datasets]
    if compute_options is None:
        compute_options = {}
    compute_options.setdefault('get', dask.multiprocessing.get)

    partial_results = []
    partial_histories = []
    for dataset in datasets:
        mashup = dask.delayed(load_single_dataset)(dataset, treemakers, preselection, force_reload=force_reload)
        partial_results.append(dask.delayed(lambda x: x[0])(mashup))
        partial_histories.append(dask.delayed(lambda x: x[1])(mashup))

    result = dask.dataframe.from_delayed(partial_results, meta=partial_results[0].compute())

    if not delayed:
        # Dask doesn't seem to want to descend into the lists beyond the first.
        # So we mash things into one list before calling compute, then split it again
        mashedup_result = dask.compute(*([result] + partial_histories),
                                       num_workers=num_workers, **compute_options)
        result = mashedup_result[0]

        if 'index' in result.columns:
            # Clean up index, remove 'index' column
            # Probably we're doing something weird with pandas, this doesn't seem like the well-trodden path...
            log.debug("Removing weird index column")
            result.drop('index', axis=1, inplace=True)
            result = result.reset_index()
            result.drop('index', axis=1, inplace=True)

        # Combine the histories of partial results.
        # For unavailable minitrees, the histories will be empty: filter these empty histories out
        partial_histories = mashedup_result[1:]
        partial_histories = [x for x in partial_histories if len(x)]
        if len(partial_histories):
            cuts.record_combined_histories(result, partial_histories)

    else:
        # Magic for tracking of cut histories while using dask.dataframe here...
        pass

    if cache_file:
        # Save the result to the cache file
        dirname = os.path.dirname(cache_file)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        store = pd.HDFStore(cache_file)
        store.put('data', result)
        # Store the cuts history for the data
        store.get_storer('data').attrs.cut_history = cuts._get_history(result)
        store.close()

    return result
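
A usage sketch for load(), assuming hax.init() has been called; the run numbers, treemaker names, preselection column and cache path are illustrative:

# Hypothetical call; a second identical call would read the cached result from disk
# instead of recomputing the minitrees (pass remake_cache=True to rebuild the cache).
df = load([6731, 6732],
          treemakers=['Fundamentals', 'Basics'],
          preselection='s1 > 0',
          num_workers=4,
          cache_file='./caches/basics.hdf5')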