Exemplo n.º 1
0
def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Test the TreeHandler class functionalities.
    """
    # define the working directory
    test_dir = Path(__file__).resolve().parent

    # initialize TreeHandler test
    test_data, references = init_tree_handler_test_workspace(test_dir)

    # instantiate tree handler objects
    data_hdlr = TreeHandler(test_data[0], 'treeMLDplus')
    prompt_hdlr = TreeHandler(test_data[1], 'treeMLDplus')
    data_pq_hdlr = TreeHandler(test_data[2])
    prompt_pq_hdlr = TreeHandler(test_data[3])
    mult_hdlr = TreeHandler(test_data[:2], 'treeMLDplus')
    mult_pq_hdlr = TreeHandler(test_data[2:])

    # open refernces objects
    reference_data_slice_df = pd.read_pickle(references[0])
    reference_prompt_slice_df = pd.read_pickle(references[1])
    with open(references[2], 'rb') as handle:
        reference_dict = pickle.load(handle)

    terminate_tree_handler_test_workspace(test_dir)

    # test that data is the same in root and parquet
    assert data_hdlr.get_data_frame().equals(data_pq_hdlr.get_data_frame()), \
        'data Dataframe from parquet file differs from the root file one!'
    assert prompt_hdlr.get_data_frame().equals(prompt_pq_hdlr.get_data_frame()), \
        'prompt Dataframe from parquet file differs from the root file one!'

    # test loading from multiple files
    merged_df = pd.concat(
        [data_hdlr.get_data_frame(),
         prompt_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_hdlr.get_data_frame().equals(
        merged_df), 'loading of multiple root files not working!'
    merged_pq_df = pd.concat(
        [data_pq_hdlr.get_data_frame(),
         prompt_pq_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_pq_hdlr.get_data_frame().equals(
        merged_pq_df), 'loading of multiple parquet files not working!'

    # define the info dict that will be compared with the reference
    info_dict = {}

    # get the number of candidates in the original data sample
    info_dict['n_data'] = data_hdlr.get_n_cand()
    info_dict['n_prompt'] = prompt_hdlr.get_n_cand()

    # get the original variable list
    info_dict['data_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_var_list'] = prompt_hdlr.get_var_names()

    # shuffle dataframes
    new_hndl = data_hdlr.shuffle_data_frame(size=10,
                                            random_state=5,
                                            inplace=False)
    copied_hndl = copy.deepcopy(data_hdlr)
    copied_hndl.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert copied_hndl.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after shuffling'

    # apply preselections
    preselections_data = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    preselections_prompt = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

    new_hndl = data_hdlr.apply_preselections(preselections_data, inplace=False)
    data_hdlr.apply_preselections(preselections_data)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after the preselections'

    prompt_hdlr.apply_preselections(preselections_prompt)

    # get the number of selected data
    info_dict['n_data_preselected'] = data_hdlr.get_n_cand()
    info_dict['n_prompt_preselected'] = prompt_hdlr.get_n_cand()

    # get the preselections
    info_dict['data_preselections'] = data_hdlr.get_preselections()
    info_dict['prompt_preselections'] = prompt_hdlr.get_preselections()

    # apply dummy eval() on the underlying data frame
    d_len_z_def = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    new_hndl = data_hdlr.eval_data_frame(d_len_z_def, inplace=False)
    data_hdlr.eval_data_frame(d_len_z_def)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after eval'

    prompt_hdlr.eval_data_frame(d_len_z_def)

    # get the new variable list
    info_dict['data_new_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_new_var_list'] = prompt_hdlr.get_var_names()

    # get a random subset of the original data
    data_hdlr = data_hdlr.get_subset(size=3000, rndm_state=SEED)
    prompt_hdlr = prompt_hdlr.get_subset(size=55, rndm_state=SEED)

    # slice both data and prompt data frame respect to the pT
    bins = [[0, 2], [2, 10], [10, 25]]

    data_hdlr.slice_data_frame('pt_cand', bins)
    prompt_hdlr.slice_data_frame('pt_cand', bins)

    # store projection variable and binning
    info_dict['data_proj_variable'] = data_hdlr.get_projection_variable()
    info_dict['prompt_proj_variable'] = prompt_hdlr.get_projection_variable()

    info_dict['data_binning'] = data_hdlr.get_projection_binning()
    info_dict['prompt_binning'] = prompt_hdlr.get_projection_binning()

    # get info from a single data slice
    data_slice_df = data_hdlr.get_slice(2)
    prompt_slice_df = prompt_hdlr.get_slice(2)

    info_dict['n_data_slice'] = len(data_slice_df)
    info_dict['n_prompt_slice'] = len(prompt_slice_df)

    # test info_dict reproduction

    assert info_dict == reference_dict, 'dictionary with the data info differs from the reference!'

    # test sliced data frames reproduction
    assert data_slice_df.equals(
        reference_data_slice_df
    ), 'data sliced DataFrame differs from the reference!'
    assert prompt_slice_df.equals(
        reference_prompt_slice_df
    ), 'prompt sliced DataFrame differs from the reference!'
Exemplo n.º 2
0
PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

DATA_HDLR.apply_preselections(PRESEL_DATA)
PROMPT_HDLR.apply_preselections(PRESEL_PROMPT)

# store number of selcted data
INFO_DICT['n_data_preselected'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt_preselected'] = PROMPT_HDLR.get_n_cand()

# store preselections
INFO_DICT['data_preselections'] = DATA_HDLR.get_preselections()
INFO_DICT['prompt_preselections'] = PROMPT_HDLR.get_preselections()

# apply dummy eval() on the underlying data frame
DATA_HDLR.eval_data_frame('d_len_z = sqrt(d_len**2 - d_len_xy**2)')
PROMPT_HDLR.eval_data_frame('d_len_z = sqrt(d_len**2 - d_len_xy**2)')

# store new variable list
INFO_DICT['data_new_var_list'] = PROMPT_HDLR.get_var_names()
INFO_DICT['prompt_new_var_list'] = PROMPT_HDLR.get_var_names()

# get a random subset of the original data
DATA_HDLR = DATA_HDLR.get_subset(size=3000, rndm_state=42)
PROMPT_HDLR = PROMPT_HDLR.get_subset(size=55, rndm_state=42)

# slice both data and prompt data frame respect to the pT
BINS = [[0, 2], [2, 10], [10, 25]]

DATA_HDLR.slice_data_frame('pt_cand', BINS)
PROMPT_HDLR.slice_data_frame('pt_cand', BINS)