Example #1
    def __call__(self):  # -> generator object yielding np.ndarray, np.ndarray
        '''
            generate single(!) data-sample (batching done in tf.Dataset)
        '''

        # create a new file data-reader each time the data generator is called (otherwise the file reader's generator state is not reset)
        generator = dare.DataReader(self.path).generate_event_parts_from_dir(
            parts_n=self.sample_part_n, **self.cuts)

        samples_read_n = 0
        # loop through whole dataset, reading sample_part_n events at a time
        for constituents, features in generator:
            samples = events_to_input_samples(constituents, features)
            indices = list(range(len(samples)))
            samples_read_n += len(samples)
            while indices:
                index = indices.pop(0)
                next_sample = samples[index]  #.copy()
                yield next_sample
            if self.sample_max_n is not None and (samples_read_n >=
                                                  self.sample_max_n):
                break

        print('[DataGenerator]: __call__() yielded {} samples'.format(
            samples_read_n))
        generator.close()
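For context, a minimal sketch of how such a __call__ generator is typically consumed, following the tf.data pattern of Examples #10 and #11 further down; the path, part/total counts and the (100, 3) per-sample shape are assumptions borrowed from those examples, not part of this snippet.

import tensorflow as tf
import util.data_generator as dage

gen = dage.DataGenerator(path='/path/to/qcd/side/band',  # placeholder path
                         sample_part_n=int(1e4), sample_max_n=int(1e6))
# wrap the callable in a batched tf.data.Dataset; shape and batch size as in Examples #10 / #11
ds = tf.data.Dataset.from_generator(gen, output_types=tf.float32,
                                    output_shapes=(100, 3)).batch(256, drop_remainder=True)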
Example #2
    def from_input_file(cls, name, path, **cuts):
        reader = dare.DataReader(path)
        constituents, jet_features = reader.read_events_from_file(**cuts)
        constituents_feature_names, jet_feature_names = reader.read_labels_from_file()
        return cls(name, constituents, jet_features,
                   constituents_feature_names, jet_feature_names)
Example #3
    def from_input_dir(cls, name, path, read_n=None, **cuts):
        df, _ = dr.DataReader(path).read_jet_features_from_dir(
            read_n=read_n, features_to_df=True, **cuts)
        # convert any QR-selection columns from 0/1 to bool
        sel_cols = [c for c in df if c.startswith('sel')]
        for sel in sel_cols:
            df[sel] = df[sel].astype(bool)
        return cls(name, df)
Example #4
    def setUp(self):
        self.dir_path = '/eos/user/k/kiwoznia/data/VAE_data/baby_events/qcd_sqrtshatTeV_13TeV_PU40'
        self.reader = dare.DataReader(self.dir_path)
        self.file_paths = [
            os.path.join(self.dir_path,
                         'qcd_sqrtshatTeV_13TeV_PU40_' + str(n) + '.h5')
            for n in [34, 721, 96]
        ]
        self.total_num_events_in_dir = 57964 + 36 + 58096
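A hedged sketch of a test method that could follow this setUp, using only count_files_events_in_dir, which appears in Example #9 below; the method name is illustrative.

    def test_count_events_in_dir(self):
        # compare the counted events against the hard-coded total from setUp
        _, n_events = self.reader.count_files_events_in_dir()
        self.assertEqual(n_events, self.total_num_events_in_dir)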
Example #5
    def from_input_dir(cls, name, path, read_n=None, **cuts):
        ''' read data from all files in 'path' into an event sample '''
        reader = dare.DataReader(path)

        constituents, constituents_feature_names, jet_features, jet_feature_names = reader.read_events_from_dir(
            read_n=read_n, **cuts)
        return cls(name, constituents, jet_features,
                   constituents_feature_names, jet_feature_names)
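For illustration, one way this constructor might be called; EventSample stands in for the enclosing class (its name is not shown here), and the path and read_n value are placeholders.

sample = EventSample.from_input_dir('qcdSide', '/path/to/qcd/side/band', read_n=int(1e5))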
Example #6
    def __call__(
            self):  # -> generator object yielding (np.ndarray, np.ndarray)

        # make background and signal sample generators
        generator_bg = dare.DataReader(
            self.path_bg).generate_constituents_parts_from_dir(
                parts_n=self.sample_bg_part_n)
        generator_sig = dare.DataReader(
            self.path_sig).generate_constituents_parts_from_dir(
                parts_n=self.sample_sig_part_n)

        # assumes the total background/signal sample counts are instance attributes
        sig_every_n_bg_samples = self.sample_bg_total_n // self.sample_sig_total_n

        samples_read_bg_n, samples_read_sig_n = 0, 0

        while True:
            bg_constituents = constituents_to_input_samples(next(generator_bg))
            sig_constituents = constituents_to_input_samples(
                next(generator_sig))
Example #7
    def get_mean_and_stdev(
            self):  # -> np.ndarray [num-features], np.ndarray [num-features]
        '''
            get per-feature mean and standard deviation of the input-sample constituents (first 1 million events)
        '''
        data_reader = dare.DataReader(self.path)

        constituents = data_reader.read_constituents_from_dir(read_n=int(1e6))
        constituents_j1j2 = np.vstack(
            [constituents[:, 0, :, :], constituents[:, 1, :, :]])
        return utfu.get_mean_and_stdev(constituents_j1j2)
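These per-feature statistics feed a normalization step (see 'stats for normalization layer' in Example #11). A minimal sketch of one way to use them, assuming a Keras Normalization layer rather than whatever custom layer the codebase actually provides:

import tensorflow as tf

mean, stdev = data_generator.get_mean_and_stdev()  # per-feature statistics
norm_layer = tf.keras.layers.Normalization(axis=-1, mean=mean, variance=stdev ** 2)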
Example #8
def sample_from_file_generator(max_loops=10, samples_in_parts_n=1e4):
    path = os.path.join(sdi.path_dict['base_dir'],
                        sdi.path_dict['sample_dir']['qcdSide'])
    data_reader = dare.DataReader(path)

    for loop_n, (constituents, features) in enumerate(
            data_reader.generate_event_parts_from_dir(
                parts_n=samples_in_parts_n)):
        indices = list(range(len(constituents)))
        while indices:
            index = indices.pop(0)
            next_sample = constituents[index]  #.copy()
            yield next_sample, next_sample  # x == y in autoencoder
        if loop_n >= max_loops:
            break
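A sketch of feeding this (x, y) generator to Keras through tf.data, mirroring Example #10 below; the (100, 3) constituent shape and batch size of 256 are taken from that example and may differ here.

import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    sample_from_file_generator,
    output_types=(tf.float32, tf.float32),
    output_shapes=((100, 3), (100, 3))).batch(256, drop_remainder=True)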
Example #9
def count_number_events_recursively(base_dir, **cuts):

    sample_dirs = glob.glob(base_dir + '/*')

    print(
        '*' * 10 +
        '\ncounting number of events and files with cuts {} in subdirectories of {}\n'
        .format(cuts or 'none', base_dir) + '*' * 10)

    num_files = []
    num_events = []

    for sample_dir in sample_dirs:
        print('reading events in {}'.format(sample_dir))
        reader = dare.DataReader(sample_dir)
        n_files, n_events = reader.count_files_events_in_dir(**cuts)
        num_files.append(n_files)
        num_events.append(n_events)

    for sample_dir, nn, ff in zip(sample_dirs, num_events, num_files):
        print("{: <52}: {: >10} events in {: >10} files".format(
            os.path.basename(sample_dir), nn, ff))
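Example invocation; the base directory is a placeholder, and any cut keywords are passed through **cuts to count_files_events_in_dir.

count_number_events_recursively('/eos/user/k/kiwoznia/data/VAE_data')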
Example #10
import os

import tensorflow as tf

import sarewt.data_reader as dare
import util.data_generator as dage
import utilities as ut
# `sdi` (the sample-path dictionary used below) is defined elsewhere; its import is not part of this snippet

# number setup
samples_max_n = int(1e6) # was 1m
batch_sz = 256
steps_per_epoch = samples_max_n // batch_sz 
events_valid_n = int(1e3) # was 1e3
steps_valid_per_epoch = events_valid_n * 2 // batch_sz  # (n events x 2 jets = 2n inputs)
gen_part_train_n = int(1e3) # was 1e4
gen_part_valid_n = int(1e3) # was 1e2

# data generator
path = os.path.join(sdi.path_dict['base_dir'], sdi.path_dict['sample_dir']['qcdSide'])
gen = dage.DataGenerator(path=path, sample_part_n=gen_part_train_n, sample_max_n=samples_max_n) # generate samples_max_n jet samples
# tf dataset from generator
tfds = tf.data.Dataset.from_generator(gen, output_types=(tf.float32, tf.float32), output_shapes=((100,3),(100,3))).batch(batch_sz, drop_remainder=True)

# validation data
path = os.path.join(sdi.path_dict['base_dir'], sdi.path_dict['sample_dir']['qcdSideExt'])
#gen_valid = dage.DataGenerator(path=path, samples_in_parts_n=gen_part_train_n, samples_max_n=events_valid_n)
data_valid = dage.constituents_to_input_samples(dare.DataReader(path=path).read_constituents_from_dir(read_n=events_valid_n))
tfds_valid = tf.data.Dataset.from_tensor_slices((data_valid, data_valid)).batch(batch_sz, drop_remainder=True)

# DNN
model = ut.get_simple_autoencoder()
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),
    tf.keras.callbacks.TerminateOnNaN()
]
model.fit(tfds, epochs=100, verbose=2, validation_data=tfds_valid, callbacks=callbacks)
# model.fit(tfds, epochs=100, verbose=2, validation_data=(x_valid, x_valid), validation_steps=10)
Example #11
print('>>> Preparing training dataset generator')
data_train_generator = dage.DataGenerator(
    path=paths.sample_dir_path('qcdSide'),
    sample_part_n=params.gen_part_n,
    sample_max_n=params.train_total_n,
    **cuts.global_cuts)  # generate 10 M jet samples
train_ds = tf.data.Dataset.from_generator(
    data_train_generator,
    output_types=tf.float32,
    output_shapes=params.input_shape).batch(
        params.batch_n, drop_remainder=True)  # already shuffled

# validation (full tensor, 1M events -> 2M samples)
print('>>> Preparing validation dataset')
const_valid, _, features_valid, _ = dare.DataReader(
    path=paths.sample_dir_path('qcdSideExt')).read_events_from_dir(
        read_n=params.valid_total_n, **cuts.global_cuts)
data_valid = dage.events_to_input_samples(const_valid, features_valid)
valid_ds = tf.data.Dataset.from_tensor_slices(data_valid).batch(
    params.batch_n, drop_remainder=True)

# stats for normalization layer
mean_stdev = data_train_generator.get_mean_and_stdev()

# *******************************************************
#                       training options
# *******************************************************

optimizer = tf.keras.optimizers.Adam(learning_rate=params.learning_rate)
loss_fn = losses.threeD_loss
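The snippet stops after defining the optimizer and the custom reconstruction loss. As one possible continuation (not the author's actual training loop), a hedged sketch of a GradientTape training step wiring them together; model is a placeholder autoencoder and loss_fn is assumed to return per-sample values.

@tf.function
def train_step(x_batch):
    with tf.GradientTape() as tape:
        x_reco = model(x_batch, training=True)
        loss = tf.reduce_mean(loss_fn(x_batch, x_reco))  # mean over the batch
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss

for x_batch in train_ds:  # one pass over the generated training samples
    train_step(x_batch)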