예제 #1
0
def test_columnrming():
    """Check that the column-removal helpers drop the requested columns."""
    ds1 = from_root(
        ["tests/data/test_file.root"],
        name="myds",
        branches=["met", "sumet", "pT_jet2", "reg2j2b"],
        auxweights=["pT_lep1", "pT_lep2", "pT_jet1"],
    )

    # Explicit removal: only the two branches not named should remain.
    ds1.rm_columns(["met", "sumet"])
    remaining = list(ds1.df.columns)
    assert len(remaining) == 2
    assert "pT_jet2" in remaining
    assert "reg2j2b" in remaining

    # Reload without an explicit branch list and exercise the bulk removers.
    ds1 = from_root(["tests/data/test_file.root"], name="myds")
    columns_before = list(ds1.df.columns)
    chargeflavor_names = ("OS", "SS", "elmu", "elel", "mumu")
    for column in chargeflavor_names:
        assert column in columns_before
    region_names = [column for column in columns_before if "reg" in column]

    ds1.rm_chargeflavor_columns()
    ds1.rm_region_columns()
    ds1.rm_weight_columns()

    columns_after = list(ds1.df.columns)
    for column in chargeflavor_names:
        assert column not in columns_after
    assert "reg1j1b" not in columns_after
    for column in region_names:
        assert column not in columns_after
예제 #2
0
def test_scale_weight_sum():
    """After ``scale_weight_sum(ds1, ds2)`` the two weight sums agree to 1e-4."""
    root_file = "tests/data/test_file.root"
    ds1 = from_root([root_file], name="myds", branches=branches)
    ds2 = from_root([root_file], name="ds2", branches=branches)

    # Replace the second dataset's weights with random values so the two
    # sums differ before scaling.
    # NOTE(review): the random draw is unseeded, so inputs differ per run.
    ds2.weights = np.random.randn(len(ds1)) * 10

    scale_weight_sum(ds1, ds2)

    sum_ratio = ds2.weights.sum() / ds1.weights.sum()
    assert abs(1.0 - sum_ratio) < 1.0e-4
예제 #3
0
def test_auxweights():
    """Exercise auxiliary weights through append, add, swap and HDF5 round trip.

    Checks that ``append`` and ``+`` give the same ``phi_lep1`` auxiliary
    weights, that ``change_weights`` swaps the main weight with an auxiliary
    one, and that the swapped layout survives ``to_pytables`` /
    ``from_pytables``.
    """
    root_file = "tests/data/test_file.root"
    branches = ["pT_lep1", "pT_lep2", "eta_lep1", "eta_lep2"]

    def load(name):
        # Each call gets its own file and auxweights lists.
        return from_root(
            [root_file],
            name=name,
            branches=branches,
            auxweights=["phi_lep1", "phi_lep2"],
        )

    # In-place append ...
    ds1 = load("myds")
    ds2 = load("ds2")
    ds1.append(ds2)

    # ... must agree with the result of the ``+`` operator.
    dsa = load("myds")
    dsb = load("ds2")
    dsc = dsa + dsb

    np.testing.assert_array_almost_equal(ds1.auxweights["phi_lep1"],
                                         dsc.auxweights["phi_lep1"], 5)

    dsc.change_weights("phi_lep2")
    assert dsc.weight_name == "phi_lep2"

    # Reference arrays read straight from the ROOT tree.
    tree = uproot.open(root_file)["WtLoop_nominal"]
    pl2 = tree.array("phi_lep2")
    nw2 = tree.array("weight_nominal")

    ds2.change_weights("phi_lep2")
    np.testing.assert_array_almost_equal(ds2.weights, pl2, 5)
    assert "phi_lep2" not in ds2.auxweights
    assert "weight_nominal" in ds2.auxweights

    out_path = "outfile1.h5"
    try:
        ds2.to_pytables(out_path)
        ds2pt = from_pytables(out_path, "ds2", weight_name="phi_lep2")
        np.testing.assert_array_almost_equal(
            ds2pt.auxweights["weight_nominal"].to_numpy(), nw2)
    finally:
        # Always clean up so a failed assertion does not leave the file
        # behind; the existence check avoids masking an earlier error when
        # the file was never written.
        if os.path.exists(out_path):
            os.remove(out_path)
예제 #4
0
def test_selection_masks():
    """Masks from ``selection_masks`` must match masks built by hand with uproot."""
    root_file = "tests/data/test_file.root"
    ds2 = from_root(
        root_file,
        auxweights=["pT_lep1", "pT_lep2", "pT_jet1"],
        name="myds",
    )

    selections = {
        "s1": "(pT_lep2 > 30) & (pT_jet1 < 50)",
        "s2": "(reg2j1b==True)"
    }
    # The second return value is not needed here.
    masks, _ = ds2.selection_masks(selections)

    # Build the same selections manually from the raw tree.
    tree = uproot.open(root_file)["WtLoop_nominal"]
    lep2_above_30 = tree.array("pT_lep2") > 30
    jet1_below_50 = tree.array("pT_jet1") < 50
    # Elementwise comparison on an array, so ``== True`` is intentional.
    in_reg2j1b = tree.array("reg2j1b") == True
    raw_pT_lep1 = tree.array("pT_lep1")

    from_mask_s1 = ds2[masks["s1"]].df.pT_lep1.to_numpy()
    from_mask_s2 = ds2[masks["s2"]].df.pT_lep1.to_numpy()

    np.testing.assert_allclose(from_mask_s1,
                               raw_pT_lep1[lep2_above_30 & jet1_below_50])
    np.testing.assert_allclose(from_mask_s2, raw_pT_lep1[in_reg2j1b])
예제 #5
0
def test_append():
    """``append`` must match a manual concatenation of weights and frames."""
    branches = ["pT_lep1", "pT_lep2", "eta_lep1", "eta_lep2"]
    ds1 = from_root(["tests/data/test_file.root"],
                    name="myds",
                    branches=branches)
    ds2 = from_root(["tests/data/test_file.root"],
                    name="ds2",
                    branches=branches)
    ds2.weights = ds2.weights * 5

    # Reference values, computed before ds1 is modified in place.
    comb_w = np.concatenate([ds1.weights, ds2.weights])
    comb_df = pd.concat([ds1.df, ds2.df])

    ds1.append(ds2)

    np.testing.assert_array_almost_equal(comb_w, ds1.weights, 5)
    # ``DataFrame.get_values()`` was removed in pandas 1.0; ``to_numpy()``
    # is the supported equivalent.
    np.testing.assert_array_almost_equal(comb_df.to_numpy(),
                                         ds1.df.to_numpy(), 5)
예제 #6
0
def test_label():
    """An unset label is ``None``; a set label expands to a constant int64 array."""
    ds2 = from_root(["tests/data/test_file.root"],
                    name="ds2",
                    branches=branches)

    # Nothing assigned yet.
    assert ds2.label is None
    assert ds2.label_asarray() is None

    ds2.label = 6
    label_array = ds2.label_asarray()
    expected = np.full_like(ds2.weights, 6, dtype=np.int64)
    np.testing.assert_array_equal(label_array, expected)
예제 #7
0
def test_selection():
    """Weights of a dataset loaded with ``selection=`` match a manual uproot cut."""
    root_file = "tests/data/test_file.root"
    ds2 = from_root(
        [root_file],
        name="ds2",
        selection="(reg2j2b==True) & (OS == True) & (pT_lep1 > 50)",
    )

    # Reproduce the same three-way cut directly on the raw tree.
    tree = uproot.open(root_file)["WtLoop_nominal"]
    region_flag = tree.array("reg2j2b")
    os_flag = tree.array("OS")
    lep1_pt = tree.array("pT_lep1")
    region_and_os = np.logical_and(region_flag, os_flag)
    passes = np.logical_and(region_and_os, lep1_pt > 50)

    expected_weights = tree.array("weight_nominal")[passes]
    assert np.allclose(expected_weights, ds2.weights)
예제 #8
0
def test_add():
    """``ds + ds2`` concatenates payloads and keeps the left operand's metadata."""
    ds2 = from_root(["tests/data/test_file.root"],
                    name="ds2",
                    branches=branches)
    ds2.weights = ds2.weights * 22

    combined = ds + ds2

    # Reference values built by hand from the two operands.
    comb_w = np.concatenate([ds.weights, ds2.weights])
    comb_df = pd.concat([ds.df, ds2.df])
    np.testing.assert_array_almost_equal(comb_w, combined.weights, 5)
    # ``DataFrame.get_values()`` was removed in pandas 1.0; ``to_numpy()``
    # is the supported equivalent.
    np.testing.assert_array_almost_equal(comb_df.to_numpy(),
                                         combined.df.to_numpy(), 5)

    # Name, tree name and label come from the left-hand dataset.
    assert ds.name == combined.name
    assert ds.tree_name == combined.tree_name
    assert ds.label == combined.label
예제 #9
0
def test_columnkeeping():
    """``keep_columns`` / ``keep_weights`` leave exactly the requested columns."""
    ds1 = from_root(
        ["tests/data/test_file.root"],
        name="myds",
        branches=["met", "sumet", "pT_jet2", "reg2j2b"],
        auxweights=["pT_lep1", "pT_lep2", "pT_jet1"],
    )

    wanted_columns = ["reg2j2b", "pT_jet2"]
    wanted_weights = ["pT_lep1", "pT_jet1"]
    ds1.keep_columns(wanted_columns)
    ds1.keep_weights(wanted_weights)

    # List equality also checks that the order matches the request.
    assert wanted_columns == list(ds1.df.columns)
    assert wanted_weights == list(ds1.auxweights.columns)
예제 #10
0
파일: _apps.py 프로젝트: chnzhangrui/twaml
def _load_selection_map(selection):
    """Return the mapping of named selections requested by ``selection``.

    Returns ``None`` when ``selection`` is ``None`` or a plain selection
    string, i.e. when no per-selection output files are requested.

    Raises
    ------
    ValueError
        If a YAML file is given but does not contain a mapping.
    """
    if selection is None:
        return None
    if selection == "freq_shortcut":
        return {
            "r1j1b": twaml.utils.SELECTION_1j1b,
            "r2j1b": twaml.utils.SELECTION_2j1b,
            "r2j2b": twaml.utils.SELECTION_2j2b,
            "r3j1b": twaml.utils.SELECTION_3j1b,
            "r3jHb": twaml.utils.SELECTION_3jHb,
        }
    if selection.endswith((".yml", ".yaml")):
        # NOTE(review): yaml.full_load runs on a user-supplied file; a plain
        # mapping of strings would also load with yaml.safe_load -- consider
        # switching if the file may come from an untrusted source.
        with open(selection) as f:
            loaded = yaml.full_load(f)
        if not isinstance(loaded, dict):
            raise ValueError(
                f"{selection} must contain a YAML mapping of selections"
            )
        return loaded
    # Anything else is an ordinary selection string.
    return None


def root2pytables():
    """command line application which converts a set of ROOT files into a
    pytables HDF5 file via the :meth:`twaml.data.from_root` function and
    the :meth:`twaml.data.dataset.to_pytables` member function of the :class:`twaml.data.dataset`
    class.

    When ``--selection`` is ``freq_shortcut`` or a ``.yml``/``.yaml`` file,
    one output file per named selection is written as
    ``<out-file stem>_<selection name>.h5``. Otherwise a single file is
    written, with the selection string (or ``None``) passed to ``from_root``.

    Returns 0 on completion. Raises ``ValueError`` if ``--out-file`` does not
    end in ``.h5`` or a YAML selection file does not contain a mapping.

    """
    parser = argparse.ArgumentParser(
        description=(
            "Convert ROOT files to a pytables hdf5 dataset "
            "via twaml.data.root_dataset and "
            "twaml.data.dataset.to_pytables"
        )
    )

    # fmt: off
    parser.add_argument("-i", "--input-files", type=str, nargs="+", required=True, help="input ROOT files")
    parser.add_argument("-n", "--name", type=str, required=True,
                        help="dataset name (required when reading back into twaml.data.dataset)")
    parser.add_argument("-o", "--out-file", type=str, required=True,
                        help="Output h5 file (existing file will be overwritten)")
    parser.add_argument("-b", "--branches", type=str, nargs="+", required=False,
                        help="branches to save (defaults to all)")
    parser.add_argument("--tree-name", type=str, required=False, default="WtLoop_nominal", help="tree name")
    parser.add_argument("--weight-name", type=str, required=False, default="weight_nominal", help="weight branch name")
    parser.add_argument("--auxweights", type=str, nargs="+", required=False, help="extra auxiliary weights to save")
    parser.add_argument("--selection", type=str, required=False,
                        help=("A selection string or YAML file containing a map of selections "
                              "(see `selection` argument docs in `twaml.data.from_root`)"))
    parser.add_argument("--detect-weights", action="store_true",
                        help="detect weights in the dataset, --auxweights overrides this")
    parser.add_argument("--nthreads", type=int, default=1, required=False,
                        help="number of threads to use via ThreadPoolExecutor")
    parser.add_argument("--aggro-strip", action="store_true",
                        help="call the `aggressively_strip()` function on the dataset before saving")
    parser.add_argument("--table-format", action="store_true",
                        help="Use the 'table' format keyword when calling DataFrame's to_hdf function")
    parser.add_argument("--use-lz4", action="store_true", help="Use lz4 compression")
    # fmt: on

    args = parser.parse_args()

    if not args.out_file.endswith(".h5"):
        raise ValueError("--out-file argument must end in .h5")

    to_hdf_kw = {}
    if args.table_format:
        to_hdf_kw["format"] = "table"
    if args.use_lz4:
        to_hdf_kw["complib"] = "blosc:lz4"

    # Keyword arguments shared by both from_root calls below.
    common_kw = dict(
        name=args.name,
        tree_name=args.tree_name,
        weight_name=args.weight_name,
        branches=args.branches,
        auxweights=args.auxweights,
        detect_weights=args.detect_weights,
        nthreads=args.nthreads if args.nthreads > 1 else None,
        wtloop_meta=False,
    )

    ## the "freq_shortcut" special case or a .yml/.yaml file yields a map
    ## of named selections; each one is written to its own output file.
    selection_map = _load_selection_map(args.selection)
    if selection_map is not None:
        full_ds = from_root(args.input_files, **common_kw)

        selected_masks, sel_logics = full_ds.selection_masks(selection_map)
        # Strip only the trailing ".h5" (guaranteed by the check above) so a
        # ".h5" earlier in the path does not truncate the output name.
        anchor = args.out_file[: -len(".h5")]
        for (sdk, sdv), sdl in zip(selected_masks.items(), sel_logics):
            temp_ds = full_ds[sdv]
            if args.aggro_strip:
                temp_ds.aggressively_strip()
            temp_ds.selection_formula = sdl
            temp_ds.to_pytables(f"{anchor}_{sdk}.h5", to_hdf_kw=to_hdf_kw)
            del temp_ds
        return 0

    ## otherwise just take the string or None
    ds = from_root(
        args.input_files,
        selection=args.selection,
        aggressively_strip=args.aggro_strip,
        **common_kw,
    )
    ds.to_pytables(args.out_file, to_hdf_kw=to_hdf_kw)

    return 0
예제 #11
0
def test_with_executor():
    """Loading with ``nthreads=4`` yields the same weights as the module-level ``ds``."""
    threaded_ds = from_root(
        ["tests/data/test_file.root"],
        branches=branches,
        nthreads=4,
    )
    np.testing.assert_array_almost_equal(threaded_ds.weights, ds.weights, 8)
예제 #12
0
def test_nothing():
    """The first entry of ``files`` on a loaded dataset points at an existing file."""
    loaded = from_root(["tests/data/test_file.root"], branches=branches)
    first_file = loaded.files[0]
    assert first_file.exists()
예제 #13
0
def test_no_name():
    """Without an explicit ``name`` the dataset is named ``test_file.root``."""
    unnamed = from_root(["tests/data/test_file.root"], branches=branches)
    expected_name = "test_file.root"
    assert unnamed.name == expected_name
예제 #14
0
import os
import pandas as pd
import numpy as np
import uproot
import h5py
from twaml.data import dataset
from twaml.data import scale_weight_sum
from twaml.data import from_root, from_pytables, from_h5

# Branch list and dataset used by several tests in this module.
branches = [
    "pT_lep1",
    "pT_lep2",
    "eta_lep1",
    "eta_lep2",
]
ds = from_root(
    ["tests/data/test_file.root"],
    name="myds",
    branches=branches,
    TeXlabel=r"$t\bar{t}$",
)


def test_name():
    """The module-level dataset keeps the name and TeX label it was given."""
    expected_name = "myds"
    expected_label = "$t\\bar{t}$"
    assert ds.name == expected_name
    assert ds.TeXlabel == expected_label


def test_no_name():
    """Without an explicit ``name`` the dataset is named ``test_file.root``."""
    unnamed = from_root(["tests/data/test_file.root"], branches=branches)
    expected_name = "test_file.root"
    assert unnamed.name == expected_name


def test_content():
    ts = [uproot.open(f)[ds.tree_name] for f in ds.files]
    raws = [t.array("pT_lep1") for t in ts]
    raw = np.concatenate([raws])
    bins = np.linspace(0, 800, 21)