def generate_efp_graphs():
    """Render every selected EFP graph to disk in pdf, png and eps formats.

    Uses two EFP sets: all connected (p==1) prime graphs with degree d <= 7,
    plus the chromatic-number-4 graphs up to degree 8. Each graph is written
    to graphs/<ext>/efp_<n>_<d>_<k>.<ext>.
    """
    # Grab graphs
    prime_d7 = ef.EFPSet("d<=7", "p==1")
    chrom_4 = ef.EFPSet("d<=8", "p==1", "c==4")
    efpsets = [prime_d7, chrom_4]
    for efpset in efpsets:
        graphs = efpset.graphs()
        for efp_ix, graph in enumerate(graphs):
            # specs columns: n, e, d, v, k, c, p, h (only n, d, k used here)
            n, e, d, v, k, c, p, h = efpset.specs[efp_ix]
            # One call per output format instead of three copy-pasted calls.
            for ext in ("pdf", "png", "eps"):
                plot_graph(graph, n, d, k, f"graphs/{ext}/efp_{n}_{d}_{k}.{ext}")
Пример #2
0
def test_get_graph_components():
    """The component count stored in each spec must match a direct computation."""
    efpset = ef.EFPSet()
    n_components = np.array(
        [len(ef.utils.get_components(g)) for g in efpset.graphs()])

    # index 0 is skipped: EFPSet records the empty graph as having one
    # connected component, which a direct count would not reproduce
    assert np.all(n_components[1:] == efpset.specs[1:, -2])
Пример #3
0
def main():
    """Load LHC Olympics events and smoke-test d<=7 EFP computation.

    Reads the anomaly-detection HDF5 dataset, reshapes the flat particle
    columns into (pT, eta, phi) triplets, and batch-computes EFPs on a
    1000-event slice, printing the result.
    """
    f = pd.read_hdf(
        "/data/t3home000/spark/LHCOlympics/data/events_anomalydetection.h5")
    dt = f.values
    dt2 = dt[:, :2100]  # first 2100 columns: 700 particles x (pT, eta, phi)
    dt3 = dt[:, -1]     # last column: event label (unused in this smoke test)

    dt2 = dt2.reshape((len(dt2), len(dt2[0]) // 3, 3))

    # data controls
    num_data = 1100000
    test_frac = 0.2

    # efp parameters
    dmax = 7
    measure = 'hadr'
    beta = 0.5

    print('Calculating d <= {} EFPs for {} jets... '.format(dmax, num_data),
          end='')
    # BUG FIX: pass the `measure` variable instead of a duplicated 'hadr'
    # literal, so the parameter block above actually controls the measure.
    efpset = ef.EFPSet(('d<=', dmax), measure=measure, beta=beta)

    # Only a 1000-event slice is computed here.
    test = dt2[:1000]
    # drop zero-padded particles (pT == 0) from each event
    test_X = [x[x[:, 0] > 0] for x in test]
    X = efpset.batch_compute(test_X)
    print(X)
    print('done')
Пример #4
0
def generate_EFP():
    """Compute EFP features for every (dtype, split, graph, kappa, beta) combo.

    Reads pickled features/targets for each data type and split, evaluates
    each EFP graph for all (kappa, beta) pairs, and writes one feather file
    per combination under data/efp/<split>/. Existing output files are
    skipped, so an interrupted run can be resumed.
    """
    dtypes = ["et", "ht"]
    splits = ["test", "train", "valid"]

    # Choose kappa, beta values (energy and angular exponents of the measure)
    kappas = [-1, 0, 0.5, 1, 2]
    betas = [0.5, 1, 2]

    # Grab graphs once: the EFP sets do not depend on dtype/split, so build
    # them outside the loops instead of recreating them for every split.
    prime_d7 = ef.EFPSet("d<=7", "p==1")
    chrom_4 = ef.EFPSet("d<=8", "p==1", "c==4")
    efpsets = [prime_d7, chrom_4]

    for dtype in dtypes:
        for split in splits:
            print(f"Processing data type: {dtype}, {split}")
            X = pd.read_pickle(
                f"data/processed/{split}_{dtype}.pkl")["features"]
            y = pd.read_pickle(
                f"data/processed/y_{split}_{dtype}.pkl")["targets"]

            for efpset in efpsets:
                graphs = efpset.graphs()
                t = tqdm(graphs)
                for efp_ix, graph in enumerate(t):
                    # specs columns: n, e, d, v, k, c, p, h — invariant over
                    # (kappa, beta), so unpack once per graph
                    n, e, d, v, k, c, p, h = efpset.specs[efp_ix]
                    for kappa in kappas:
                        for beta in betas:
                            file_name = f"data/efp/{split}/{dtype}_efp_{n}_{d}_{k}_k_{kappa}_b_{beta}.feather"
                            if not os.path.exists(file_name):
                                t.set_description(
                                    f"Processing: EFP[{n},{d},{k}](k={kappa},b={beta})"
                                )
                                efp_val = efp(
                                    data=X,
                                    graph=graph,
                                    kappa=kappa,
                                    beta=beta,
                                    normed=False,
                                )
                                efp_df = pd.DataFrame({
                                    "features": efp_val,
                                    "targets": y
                                })
                                efp_df.to_feather(file_name)
Пример #5
0
def test_batch_compute_vs_compute(measure, beta, kappa, normed):
    """batch_compute must agree with per-event compute for every EFP."""
    # combinations the library does not support
    if kappa == 'pf' and measure == 'hadr':
        pytest.skip('hadr does not do pf')
    if beta != 2 and 'efm' in measure:
        pytest.skip('only test efm when beta=2')

    events = ef.gen_random_events(10, 15)
    efpset = ef.EFPSet('d<=6', measure=measure, beta=beta, kappa=kappa, normed=normed)
    batched = efpset.batch_compute(events)
    one_by_one = np.asarray([efpset.compute(ev) for ev in events])
    assert epsilon_percent(batched, one_by_one, 10**-14)
Пример #6
0
def test_efpset_vs_efps(measure, beta, kappa, normed, event):
    """An EFPSet must yield the same values as the individual EFP objects."""
    # skip unsupported parameter combinations
    if kappa == 'pf' and measure == 'hadr':
        pytest.skip('hadr does not do pf')
    if beta != 2 and 'efm' in measure:
        pytest.skip('only test efm when beta=2')

    efpset = ef.EFPSet('d<=6', measure=measure, beta=beta, kappa=kappa, normed=normed)
    singles = [
        ef.EFP(g, measure=measure, beta=beta, kappa=kappa, normed=normed)
        for g in efpset.graphs()
    ]
    set_vals = efpset.compute(event)
    single_vals = np.asarray([single.compute(event) for single in singles])
    assert epsilon_percent(set_vals, single_vals, 10**-12)
Пример #7
0
def test_batch_compute_vs_compute(measure, beta, kappa, normed):
    """Serial batch_compute (n_jobs=1) must match event-by-event compute."""
    # unsupported parameter combinations -> skip
    if measure == 'hadr' and kappa == 'pf':
        pytest.skip('hadr does not do pf')
    if normed and kappa == 'pf':
        pytest.skip('normed not supported with kappa=pf')
    if 'efm' in measure and beta != 2:
        pytest.skip('only beta=2 can use efm measure')

    events = ef.gen_random_events(10, 15)
    efpset = ef.EFPSet('d<=6', measure=measure, beta=beta, kappa=kappa, normed=normed)
    from_batch = efpset.batch_compute(events, n_jobs=1)
    from_loop = np.asarray([efpset.compute(ev) for ev in events])
    assert epsilon_percent(from_batch, from_loop, 10**-14)
Пример #8
0
def test_efpset_vs_efps(measure, beta, kappa, normed, event):
    """EFPSet.compute must equal computing each EFP graph individually."""
    # skip parameter combinations the library rejects
    if measure == 'hadr' and kappa == 'pf':
        pytest.skip('hadr does not do pf')
    if kappa == 'pf' and normed:
        pytest.skip('normed not supported with kappa=pf')
    if 'efm' in measure and beta != 2:
        pytest.skip('only beta=2 can use efm measure')

    full_set = ef.EFPSet('d<=6', measure=measure, beta=beta, kappa=kappa, normed=normed)
    individual = [
        ef.EFP(graph, measure=measure, beta=beta, kappa=kappa, normed=normed)
        for graph in full_set.graphs()
    ]
    combined = full_set.compute(event)
    separate = np.asarray([one.compute(event) for one in individual])
    assert epsilon_percent(combined, separate, 10**-12)
Пример #9
0
def generate_EFP():
    """Compute EFP features for every (graph, kappa, beta) combination.

    Loads preprocessed features/targets from HDF5, evaluates each EFP graph
    for all (kappa, beta) pairs, and writes one feather file per combination
    under data/efp/. Existing output files are skipped, so an interrupted
    run can be resumed.
    """
    hdf_file = path.join(data_path, "processed", "prep_data.h5")
    X = pd.read_hdf(hdf_file, "features").features.to_numpy()
    y = pd.read_hdf(hdf_file, "targets").targets.values

    # Choose kappa, beta values (energy and angular exponents of the measure)
    kappas = [-1, 0, 0.25, 0.5, 1, 2]
    betas = [0.25, 0.5, 1, 2, 3, 4]

    # Grab graphs: all connected primes with d<=7 plus chromatic-4 with d<=8
    prime_d7 = ef.EFPSet("d<=7", "p==1")
    chrom_4 = ef.EFPSet("d<=8", "p==1", "c==4")
    efpsets = [prime_d7, chrom_4]
    for efpset in efpsets:
        graphs = efpset.graphs()
        t = tqdm(graphs)
        for efp_ix, graph in enumerate(t):
            # specs columns: n, e, d, v, k, c, p, h — invariant over
            # (kappa, beta), so unpack once per graph
            n, e, d, v, k, c, p, h = efpset.specs[efp_ix]
            for kappa in kappas:
                for beta in betas:
                    file_name = f"data/efp/efp_{n}_{d}_{k}_k_{kappa}_b_{beta}.feather"
                    if not path.isfile(file_name):
                        t.set_description(
                            f"Processing: EFP[{n},{d},{k}](k={kappa},b={beta})")
                        efp_val = efp(
                            data=X,
                            graph=graph,
                            kappa=kappa,
                            beta=beta,
                            normed=False,
                        )
                        efp_df = pd.DataFrame({
                            "features": efp_val,
                            "targets": y
                        })
                        efp_df.to_feather(file_name)
Пример #10
0
def efp(args, jets, mask=None, real=True):
    """Batch-compute the n=4, d=4 connected prime EFPs for a set of jets."""
    # all connected (p==1) prime multigraphs with 4 vertices and 4 edges
    efpset = ef.EFPSet(('n==', 4), ('d==', 4), ('p==', 1),
                       measure='hadr',
                       beta=1,
                       normed=None,
                       coords='ptyphim')

    efp_format = ef_format(jets)

    # for generated (non-real) jets with masking enabled, zero out all four
    # coordinates of every hit whose mask entry is falsy
    if not real and args.mask:
        for jet_ix in range(jets.shape[0]):
            for hit_ix in range(args.num_hits):
                if mask[jet_ix][hit_ix]:
                    continue
                for coord in range(4):
                    efp_format[jet_ix][hit_ix][coord] = 0

    logging.info("Batch Computing")

    return efpset.batch_compute(efp_format)
Пример #11
0
    # real_means.append(np.mean(np.array(real_w1s), axis=0))
    # real_stds.append(np.std(np.array(real_w1s), axis=0))
    gen_means.append(np.mean(np.array(gen_w1s), axis=0))
    gen_stds.append(np.std(np.array(gen_w1s), axis=0))

# Notebook-style bare expressions: in a Jupyter cell these echo the
# accumulated W1 statistics; in a plain script they have no effect.
real_means
real_stds
gen_means
gen_stds

# Get all prime EFPs with n=4, d=4
# Specify EFPs set
efpset = ef.EFPSet(('n==', 4), ('d==', 4), ('p==', 1),
                   measure='hadr',
                   beta=1,
                   normed=None,
                   coords='ptyphim')

N = 100000
# Reorder features to put column 2 first and append a zero fourth column,
# matching the EFPSet's 'ptyphim' coordinate layout.
gen_out_efp_format = np.concatenate(
    (gen_out[:, :, 2:3],
     gen_out[:, :, :2],
     np.zeros((gen_out.shape[0], gen_out.shape[1], 1))),
    axis=2)
X_efp_format = np.concatenate(
    (Xplot[:, :, 2:3],
     Xplot[:, :, :2],
     np.zeros((N, 30, 1))),
    axis=2)

gen_out_efp = efpset.batch_compute(gen_out_efp_format)
X_efp = efpset.batch_compute(X_efp_format)
Пример #12
0
# Load particle-level jets; allow_pickle is needed for an object array of
# per-jet constituent lists.
_jetsPL = np.load('jet_input.npy', allow_pickle=True)
# _jetsPL = _jetsPL[:2000]
print(len(_jetsPL))

# Bin edges for jet images spanning [-0.4, 0.4].
BINS = np.linspace(-0.4, 0.4, num=utils.N_IMAGE_BINS + 1)
#print(len(bins))

jetsPL_train = []
# jetsDL_train = []
jetsPL_test = []
# jetsDL_test = []

jet_images_pl_test = []
# jet_images_dl_test = []

# All EFPs with degree d <= 4. Column 0 corresponds to the empty graph
# (constant), dropped by the [:, 1:] slice below.
efpset = energyflow.EFPSet(('d<=', 4), measure='hadr', beta=0.5)
# masked_X = [x[x[:,0] > 0] for x in _jetsPL]
# X = efpset.compute(_jetsPL[3])
#print(len(X))
# print(X.shape)
efps_pl = efpset.batch_compute(_jetsPL, n_jobs=2)[:, 1:]

# Scale each EFP column by its dataset-wide maximum; the prints bracket the
# division to confirm the post-normalization maxima are all 1.
normalization = np.max(efps_pl, axis=0)
print(np.max(efps_pl, axis=0))
efps_pl = np.divide(efps_pl, normalization)
print(np.max(efps_pl, axis=0))

efps_pl_train = []
efps_pl_test = []
for efp in efps_pl:
Пример #13
0
    def __init__(self, input_path, store_n_jets, jet_delta_r,
                 max_n_constituents, efp_degree):
        """
        Reads input trees, recognizes input types, initializes EFP processor and prepares all arrays needed to
        store output variables.

        Args:
            input_path: path/pattern handed to set_input_paths_and_selections.
            store_n_jets: number of jets to store per event.
            jet_delta_r: jet radius parameter, stored for later use.
            max_n_constituents: per-jet constituent cap; a non-positive value
                falls back to an internal cap of 100, and a negative value
                additionally disables saving constituents (see save_outputs).
            efp_degree: maximum EFP degree d; a negative value disables EFP
                computation entirely.
        """

        self.set_input_paths_and_selections(input_path=input_path)

        # read files, trees and recognize input type
        self.files = {
            path: uproot.open(path)
            for path in self.input_file_paths
        }
        self.trees = {}
        self.input_types = {}
        self.read_trees()
        # total entries across all trees vs. number of selected events;
        # the +1 reserves one extra output row (the print below subtracts it
        # back out) — NOTE(review): confirm the extra row is intentional
        self.n_all_events = sum(
            [tree.num_entries for tree in self.trees.values()])
        self.n_events = sum(map(len, list(self.selections.values()))) + 1

        print("Found {0} file(s)".format(len(self.files)))
        print("Found {0} tree(s)".format(len(self.trees)))
        print("Found ", self.n_events - 1,
              " selected events, out of a total of ", self.n_all_events)

        # set internal parameters
        self.jet_delta_r = jet_delta_r
        # non-positive cap -> use an internal array size of 100 constituents
        self.max_n_constituents = max_n_constituents if max_n_constituents > 0 else 100
        self.max_n_jets = store_n_jets
        self.EFP_size = 0

        # initialize EFP set (skipped entirely for negative degrees)
        if efp_degree >= 0:
            print(
                "\n\n=======================================================")
            print("Creating energyflow particle set with degree d <= {0}...".
                  format(efp_degree))
            self.efpset = ef.EFPSet("d<={0}".format(efp_degree),
                                    measure='hadr',
                                    beta=1.0,
                                    normed=True,
                                    verbose=True)
            self.EFP_size = self.efpset.count()
            print("EFP set is size: {}".format(self.EFP_size))
            print(
                "=======================================================\n\n")

        # prepare arrays for event & jet features, EFPs and jet constituents
        # (the EFP array has zero width when EFPs are disabled)

        self.output_arrays = {
            OutputTypes.EventFeatures:
            np.empty((self.n_events, len(Event.get_features_names()))),
            OutputTypes.JetFeatures:
            np.empty((self.n_events, self.max_n_jets,
                      len(Jet.get_feature_names()))),
            OutputTypes.JetConstituents:
            np.empty((self.n_events, self.max_n_jets, self.max_n_constituents,
                      len(Jet.get_constituent_feature_names()))),
            OutputTypes.EPFs:
            np.empty((self.n_events, self.max_n_jets, self.EFP_size))
        }

        # dataset names used when writing each output
        # NOTE(review): the "EPFs" member spelling comes from OutputTypes,
        # defined elsewhere — left as-is here
        self.output_names = {
            OutputTypes.EventFeatures: "event_features",
            OutputTypes.JetFeatures: "jet_features",
            OutputTypes.JetConstituents: "jet_constituents",
            OutputTypes.EPFs: "jet_eflow_variables"
        }

        # per-column labels matching each output array's last axis
        self.output_labels = {
            OutputTypes.EventFeatures: Event.get_features_names(),
            OutputTypes.JetFeatures: Jet.get_feature_names(),
            OutputTypes.JetConstituents: Jet.get_constituent_feature_names(),
            OutputTypes.EPFs: [str(i) for i in range(self.EFP_size)]
        }

        # which outputs to persist: constituents/EFPs only when enabled above
        self.save_outputs = {
            OutputTypes.EventFeatures: True,
            OutputTypes.JetFeatures: True,
            OutputTypes.JetConstituents:
            False if max_n_constituents < 0 else True,
            OutputTypes.EPFs: False if efp_degree < 0 else True
        }
Пример #14
0
# angular weighting exponent for the EFP measure
beta = 0.5

# plotting
colors = ['tab:red', 'tab:orange', 'tab:olive', 'tab:green', 'tab:blue']

################################################################################

# load data (num_data is set earlier in the script — not visible here)
X, y = qg_jets.load(num_data)

print('Loaded quark and gluon jets')

# calculate EFPs for all graphs with degree d <= dmax
print('Calculating d <= {} EFPs for {} jets... '.format(dmax, num_data),
      end='')
efpset = ef.EFPSet(('d<=', dmax), measure='hadr', beta=beta)
# drop zero-padded constituents (pT == 0) from each jet before computing
masked_X = [x[x[:, 0] > 0] for x in X]
X = efpset.batch_compute(masked_X)
print('Done')

# train models with different numbers of EFPs as input
rocs = []
for d in range(1, dmax + 1):

    # build architecture
    model = LinearClassifier(linclass_type='lda')

    # select EFPs with degree <= d
    X_d = X[:, efpset.sel(('d<=', d))]

    # do train/val/test split
Пример #15
0
def test_linear_relations(measure):
    """Check linear identities among low-degree EFPs and leafless-graph counts.

    Evaluates a catalog of small multigraphs as EFPs, verifies the linear
    relations quoted from the EFM paper on a 2-particle event and a
    25-particle 4-dimensional event, then checks the published per-degree
    counts of leafless multigraphs (all and connected-only).
    """
    # catalog of small multigraphs, keyed by a mnemonic name, grouped by
    # total edge count d; each graph is a list of (vertex, vertex) edges
    graphs ={# d=0
        'dot': [],

        # d=1
        'line': [(0,1)],

        # d=2
        'dumbbell': [(0,1), (0,1)],
        'wedge': [(0,1),(1,2)],
        'linesqd' : [(0,1),(2,3)],

        # d = 3
        'tribell' : [(0,1),(0,1),(0,1)],
        'triangle' : [(0,1),(1,2),(2,0)],
        'asymwedge' : [(0,1),(0,1),(1,2)],
        'birdfoot' : [(0,1),(0,2),(0,3)],
        'chain' : [(0,1),(1,2),(2,3)],
        'linedumbbell' : [(0,1),(2,3),(2,3)],
        'linewedge' : [(0,1),(2,3),(3,4)],
        'linecbd'  : [(0,1),(2,3),(4,5)],

        # d = 4
        'quadbell' : [(0,1),(0,1),(0,1),(0,1)],
        'doublewedge' : [(0,1),(0,1),(1,2),(1,2)],
        'icecreamcone' : [(0,1),(0,1),(1,2),(2,0)],
        'asymwedge2' : [(0,1),(0,1),(0,1),(1,2)],
        'square' : [(0,1),(1,2),(2,3),(3,0)],
        'flyswatter' : [(0,1),(1,2),(2,3),(3,1)],
        'chain2mid' : [(0,1),(1,2),(1,2),(2,3)],
        'chain2end' : [(0,1),(1,2),(2,3),(2,3)],
        'asymbirdfoot' : [(0,1),(0,1),(1,2),(1,3)],
        'bigbirdfoot' : [(0,1),(0,2),(0,3),(0,4)],
        'dog' : [(0,1),(1,2),(2,3),(2,4)],
        'bigchain' : [(0,1),(1,2),(2,3),(3,4)],

        'dumbbellwedge' : [(0,1),(0,1),(2,3),(3,4)],
        'triangleline' : [(0,1),(1,2),(2,0),(3,4)],
        'dumbbellsqd' : [(0,1),(0,1),(2,3),(2,3)],

        # d = 5
        'pentagon' : [(0,1),(1,2),(2,3),(3,4),(4,0)],
        'triangledumbbell': [(0,1),(0,1),(2,3),(3,4),(4,2)]
        }
    
    # pick a random event with 2 particles
    event = ef.gen_random_events(1, 2, dim=4)

    # compute the value of all of the EFPs on this event
    d = {name: ef.EFP(graph, measure=measure, coords='epxpypz')(event) for name,graph in graphs.items()}

    eps = 10**-8
    
    # check that the identities in the EFM paper are valid (i.e. = 0)
    # (these hold only for 2-particle events)
    assert epsilon_diff(2 * d['wedge'] - d['dumbbell'], 0, eps)
    assert epsilon_diff(2 * d['triangle'], 0, eps)
    assert epsilon_diff(d['tribell'] - 2 * d['asymwedge'], 0, eps)
    assert epsilon_diff(2 * d['chain'] - d['linedumbbell'] - d['triangle'], 0, eps)
    assert epsilon_diff(d['birdfoot'] + d['chain'] - d['asymwedge'], 0, eps)
    
    # Four Dimensions
    # pick a random event in 4 dimensions
    event = ef.gen_random_events(1, 25, dim=4)

    # compute the value of all of the EFPs on this event
    d = {name: ef.EFP(graph, measure=measure, coords='epxpypz')(event) for name,graph in graphs.items()}

    # check that the identity in the paper is valid (i.e. = 0)
    assert epsilon_percent(6*d['pentagon'], 5*d['triangledumbbell'], 10**-11)
    
    # count the number of leafless multigraphs (all or just connected) with degree d
    ds = np.arange(11)
    counts_all, counts_con = [], []

    # for each degree, get the graphs with edges<=d and check whether they are leafless
    for d in ds:
        counts_all.append(np.sum([leafless(graph) for graph in ef.EFPSet(('d<=',d)).graphs()]))
        counts_con.append(np.sum([leafless(graph) for graph in ef.EFPSet(('d<=',d), ('p==',1)).graphs()]))

    # note: computed counts are cumulative, must take the difference to get individual d    
    counts_all = np.asarray(counts_all[1:]) - np.asarray(counts_all[:-1])
    counts_con = np.asarray(counts_con[1:]) - np.asarray(counts_con[:-1])
    
    # ensure agreement with the table in the paper
    assert epsilon_diff(counts_all, [0,1,2,5,11,34,87,279,897,3129], eps)
    assert epsilon_diff(counts_con, [0,1,2,4,9,26,68,217,718,2553], eps)