def __call__(self):  # -> generator object yielding np.ndarray, np.ndarray
    ''' generate single(!) data-sample (batching done in tf.Dataset) '''
    # create a new file data-reader each time the data-generator is called
    # (otherwise the file-data-reader generator is not reset)
    generator = dare.DataReader(self.path).generate_event_parts_from_dir(
        parts_n=self.sample_part_n, **self.cuts)
    samples_read_n = 0
    # loop through the whole dataset, reading sample_part_n events at a time
    for constituents, features in generator:
        samples = events_to_input_samples(constituents, features)
        indices = list(range(len(samples)))
        samples_read_n += len(samples)
        while indices:
            index = indices.pop(0)
            next_sample = samples[index]  # .copy()
            yield next_sample
        # stop once the requested maximum number of samples has been yielded
        if self.sample_max_n is not None and (samples_read_n >= self.sample_max_n):
            break
    print('[DataGenerator]: __call__() yielded {} samples'.format(samples_read_n))
    generator.close()
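# A minimal usage sketch for the generator above (assumptions: the DataGenerator constructor
# arguments mirror the training scripts further down, the per-jet constituent shape is (100, 3),
# and the data path is a placeholder):
import tensorflow as tf
import util.data_generator as dage

gen = dage.DataGenerator(path='/path/to/qcdSide', sample_part_n=int(1e3), sample_max_n=int(1e6))
dataset = tf.data.Dataset.from_generator(gen,
                                         output_types=tf.float32,
                                         output_shapes=(100, 3)).batch(256, drop_remainder=True)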
def from_input_file(cls, name, path, **cuts):
    ''' read constituents and jet features of a single file into an event sample '''
    reader = dare.DataReader(path)
    constituents, jet_features = reader.read_events_from_file(**cuts)
    constituents_feature_names, jet_feature_names = reader.read_labels_from_file()
    return cls(name, constituents, jet_features, constituents_feature_names, jet_feature_names)
def from_input_dir(cls, name, path, read_n=None, **cuts):
    ''' read jet features of all files in 'path' into a dataframe-backed sample '''
    df, _ = dr.DataReader(path).read_jet_features_from_dir(
        read_n=read_n, features_to_df=True, **cuts)
    # convert any QR-selection columns from 0/1 to bool
    sel_cols = [c for c in df if c.startswith('sel')]
    for sel in sel_cols:
        df[sel] = df[sel].astype(bool)
    return cls(name, df)
def setUp(self):
    self.dir_path = '/eos/user/k/kiwoznia/data/VAE_data/baby_events/qcd_sqrtshatTeV_13TeV_PU40'
    self.reader = dare.DataReader(self.dir_path)
    self.file_paths = [
        os.path.join(self.dir_path, 'qcd_sqrtshatTeV_13TeV_PU40_' + str(n) + '.h5')
        for n in [34, 721, 96]
    ]
    self.total_num_events_in_dir = 57964 + 36 + 58096
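# A possible check built on this fixture (a sketch only; it assumes the directory contents stay
# fixed and that count_files_events_in_dir() returns (n_files, n_events), as it is used in
# count_number_events_recursively further down):
def test_count_files_events_in_dir(self):
    n_files, n_events = self.reader.count_files_events_in_dir()
    self.assertEqual(n_events, self.total_num_events_in_dir)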
def from_input_dir(cls, name, path, read_n=None, **cuts):
    ''' read data from all files in 'path' into an event sample '''
    reader = dare.DataReader(path)
    constituents, constituents_feature_names, jet_features, jet_feature_names = \
        reader.read_events_from_dir(read_n=read_n, **cuts)
    return cls(name, constituents, jet_features, constituents_feature_names, jet_feature_names)
def __call__(self):  # -> generator object yielding (np.ndarray, np.ndarray)
    # make background and signal sample generators
    generator_bg = dare.DataReader(self.path_bg).generate_constituents_parts_from_dir(
        parts_n=self.sample_bg_part_n)
    generator_sig = dare.DataReader(self.path_sig).generate_constituents_parts_from_dir(
        parts_n=self.sample_sig_part_n)
    # intended mixing ratio: one signal sample per sig_every_n_bg_samples background samples
    sig_every_n_bg_samples = self.sample_bg_total_n // self.sample_sig_total_n
    samples_read_bg_n, samples_read_sig_n = 0, 0
    while True:
        bg_constituents = constituents_to_input_samples(next(generator_bg))
        sig_constituents = constituents_to_input_samples(next(generator_sig))
def get_mean_and_stdev(self):  # -> np.ndarray [num-features], np.ndarray [num-features]
    ''' get mean and standard deviation of the input-sample constituents
        (first 1 million events) for each feature '''
    data_reader = dare.DataReader(self.path)
    constituents = data_reader.read_constituents_from_dir(read_n=int(1e6))
    # stack jet-1 and jet-2 constituents so the statistics cover both jets per event
    constituents_j1j2 = np.vstack([constituents[:, 0, :, :], constituents[:, 1, :, :]])
    return utfu.get_mean_and_stdev(constituents_j1j2)
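# Sketch of how the returned statistics could seed a Keras Normalization layer (an assumption
# about how the "stats for normalization layer" in the training script below are consumed;
# 'sample_generator' is a hypothetical instance, and the layer expects a variance, hence stdev**2):
import tensorflow as tf

mean, stdev = sample_generator.get_mean_and_stdev()
norm_layer = tf.keras.layers.Normalization(axis=-1, mean=mean, variance=stdev**2)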
def sample_from_file_generator(max_loops=10, samples_in_parts_n=int(1e4)):
    path = os.path.join(sdi.path_dict['base_dir'], sdi.path_dict['sample_dir']['qcdSide'])
    data_reader = dare.DataReader(path)
    for loop_n, (constituents, features) in enumerate(
            data_reader.generate_event_parts_from_dir(parts_n=samples_in_parts_n)):
        indices = list(range(len(constituents)))
        while indices:
            index = indices.pop(0)
            next_sample = constituents[index]  # .copy()
            yield next_sample, next_sample  # x == y in autoencoder
        if loop_n >= max_loops:
            break
def count_number_events_recursively(base_dir, **cuts):
    ''' count events and files in every sample directory below base_dir '''
    sample_dirs = glob.glob(base_dir + '/*')
    print('*' * 10 +
          '\ncounting number of events and files with {} cuts in subdirectories of {}\n'.format(
              cuts or 'no', base_dir) + '*' * 10)
    num_files = []
    num_events = []
    for sample_dir in sample_dirs:
        print('reading events in {}'.format(sample_dir))
        reader = dare.DataReader(sample_dir)
        n_files, n_events = reader.count_files_events_in_dir(**cuts)
        num_files.append(n_files)
        num_events.append(n_events)
    for sample_dir, nn, ff in zip(sample_dirs, num_events, num_files):
        print("{: <52}: {: >10} events in {: >10} files".format(os.path.basename(sample_dir), nn, ff))
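# Example invocations (the base directory and the cut keyword are hypothetical; any cuts are
# forwarded unchanged to DataReader.count_files_events_in_dir):
count_number_events_recursively('/eos/user/k/kiwoznia/data/VAE_data/events')
count_number_events_recursively('/eos/user/k/kiwoznia/data/VAE_data/events', mJJ=1100.)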
import os

import tensorflow as tf

import sarewt.data_reader as dare
import util.data_generator as dage
import utilities as ut

# number setup
samples_max_n = int(1e6)  # was 1m
batch_sz = 256
steps_per_epoch = samples_max_n // batch_sz
events_valid_n = int(1e3)  # was 1e3
steps_valid_per_epoch = events_valid_n * 2 // batch_sz  # (n events x 2 jets = 2n inputs)
gen_part_train_n = int(1e3)  # was 1e4
gen_part_valid_n = int(1e3)  # was 1e2

# data generator yielding samples_max_n jet samples
path = os.path.join(sdi.path_dict['base_dir'], sdi.path_dict['sample_dir']['qcdSide'])
gen = dage.DataGenerator(path=path, sample_part_n=gen_part_train_n, sample_max_n=samples_max_n)

# tf dataset from generator
tfds = tf.data.Dataset.from_generator(gen,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=((100, 3), (100, 3))).batch(batch_sz, drop_remainder=True)

# validation data (read into memory as a full tensor)
path = os.path.join(sdi.path_dict['base_dir'], sdi.path_dict['sample_dir']['qcdSideExt'])
# gen_valid = dage.DataGenerator(path=path, samples_in_parts_n=gen_part_train_n, samples_max_n=events_valid_n)
data_valid = dage.constituents_to_input_samples(
    dare.DataReader(path=path).read_constituents_from_dir(read_n=events_valid_n))
tfds_valid = tf.data.Dataset.from_tensor_slices((data_valid, data_valid)).batch(batch_sz, drop_remainder=True)

# DNN
model = ut.get_simple_autoencoder()
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),
    tf.keras.callbacks.TerminateOnNaN(),
]

model.fit(tfds, epochs=100, verbose=2, validation_data=tfds_valid, callbacks=callbacks)
# model.fit(tfds, epochs=100, verbose=2, validation_data=(x_valid, x_valid), validation_steps=10)
print('>>> Preparing training dataset generator')
data_train_generator = dage.DataGenerator(
    path=paths.sample_dir_path('qcdSide'),
    sample_part_n=params.gen_part_n,
    sample_max_n=params.train_total_n,
    **cuts.global_cuts)  # generate 10M jet samples
train_ds = tf.data.Dataset.from_generator(
    data_train_generator,
    output_types=tf.float32,
    output_shapes=params.input_shape).batch(params.batch_n, drop_remainder=True)  # already shuffled

# validation (full tensor, 1M events -> 2M samples)
print('>>> Preparing validation dataset')
const_valid, _, features_valid, _ = dare.DataReader(
    path=paths.sample_dir_path('qcdSideExt')).read_events_from_dir(
        read_n=params.valid_total_n, **cuts.global_cuts)
data_valid = dage.events_to_input_samples(const_valid, features_valid)
valid_ds = tf.data.Dataset.from_tensor_slices(data_valid).batch(params.batch_n, drop_remainder=True)

# stats for normalization layer
mean_stdev = data_train_generator.get_mean_and_stdev()

# *******************************************************
#                   training options
# *******************************************************
optimizer = tf.keras.optimizers.Adam(learning_rate=params.learning_rate)
loss_fn = losses.threeD_loss