def run(self, infile, outfile=None):
    """
    Run the pipeline on one input file and finalize the resulting file.

    Parameters
    ----------
    infile : str
        Path to the input file.
    outfile : str, optional
        Path to the output file. If None, the name is derived from the
        base name of the input file (last extension stripped, "_dl.h5"
        appended) and placed in the current working directory.

    Raises
    ------
    FileExistsError
        If self.overwrite is falsy and outfile is an existing file.

    """
    if outfile is None:
        stem, _ = os.path.splitext(os.path.basename(infile))
        outfile = os.path.join(os.getcwd(), "{}_dl.h5".format(stem))

    if not self.overwrite and os.path.isfile(outfile):
        raise FileExistsError(f"File exists: {outfile}")

    # Truthiness check: a falsy seed (None, but also 0) leaves the
    # global random state untouched.
    if self.seed:
        km.GlobalRandomState(seed=self.seed)

    summary = self.build_pipe(infile, outfile).drain()

    # Reopen the written file in append mode for the finishing step.
    with h5py.File(outfile, "a") as f:
        self.finish_file(f, summary)
def shuffle_h5(filepath_input, tool=False, seed=42, delete=False,
               chunksize=None, complib=None, complevel=None,
               legacy_mode=False, shuffle=True, event_skipper=None,
               filepath_output=None):
    """
    Shuffle a .h5 file in which every dataset has the same number of rows.

    The result is written to a new .h5 file. Events can also be skipped
    if an event_skipper is given (standard mode only).

    Parameters
    ----------
    filepath_input : str
        Filepath of the unshuffled input file.
    tool : bool
        Specifies if the function is accessed from the shuffle_h5_tool.
        Unless this is exactly False, the open output file is returned.
    seed : int
        Sets a fixed random seed for the shuffling.
    delete : bool
        Specifies if the old, unshuffled file should be deleted after
        extracting the data.
    chunksize : None/int
        Chunksize for axis_0 in the shuffled output file.
        If None, the chunksize is read from the input file.
    complib : None/str
        Compression library used for saving the shuffled output file
        ('gzip' or 'lzf'). If None, it is read from the input file.
    complevel : None/int
        Compression level used for saving the shuffled output file.
        Ignored (set to None) for lzf compression.
        If None, it is read from the input file.
    legacy_mode : bool
        If True, the whole file is loaded into memory and shuffled with
        numpy instead of running a km3pipe pipeline. In this mode the
        arguments shuffle and event_skipper are not used.
    shuffle : bool
        If false, events will not be shuffled.
    event_skipper : func, optional
        Function that takes the blob as an input, and returns a bool.
        If the bool is true, the blob will be skipped.
    filepath_output : str, optional
        If given, this will be the name of the output file. Otherwise,
        a name is auto generated.

    Returns
    -------
    output_file_shuffled : h5py.File or None
        Open h5py file instance of the shuffled output file if tool is
        not False. Otherwise the file is closed and None is returned.

    Raises
    ------
    ValueError
        If neither shuffling nor an event_skipper is requested.

    """
    if event_skipper is None and not shuffle:
        raise ValueError("Either event_skipper or shuffle has to be set")

    # Everything not given explicitly falls back to the input file settings.
    complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking(
        filepath_input)
    if chunksize is None:
        chunksize = chunksize_f
    if complib is None:
        complib = complib_f
    if complevel is None:
        complevel = complevel_f
    if complib == 'lzf':
        # lzf has no compression level
        complevel = None

    if filepath_output is None:
        filepath_output = get_filepath_output(
            filepath_input, shuffle, event_skipper)

    if legacy_mode:
        output_file_shuffled = _shuffle_h5_legacy(
            filepath_input, filepath_output, seed, delete,
            chunksize, complib, complevel)
    else:
        output_file_shuffled = _shuffle_h5_pipeline(
            filepath_input, filepath_output, seed, delete,
            chunksize, complib, complevel, shuffle, event_skipper)

    # Only hand out the open file when called with tool set.
    if tool is False:
        output_file_shuffled.close()
        return None
    return output_file_shuffled


def _shuffle_h5_pipeline(filepath_input, filepath_output, seed, delete,
                         chunksize, complib, complevel, shuffle,
                         event_skipper):
    """Shuffle/skip events with a km3pipe pipeline; return the open output file."""
    # set random km3pipe (=numpy) seed
    print(f'Setting a Global Random State with the seed < {seed} >.')
    km.GlobalRandomState(seed=seed)

    # km3pipe uses pytables for saving the shuffled output file,
    # which has the name 'zlib' for the 'gzip' filter
    if complib == 'gzip':
        complib = 'zlib'

    pipe = kp.Pipeline(timeit=True)
    pipe.attach(km.common.StatusBar, every=200)
    pipe.attach(km.common.MemoryObserver, every=200)
    pipe.attach(kp.io.hdf5.HDF5Pump, filename=filepath_input,
                shuffle=shuffle, reset_index=True)
    if event_skipper is not None:
        pipe.attach(EventSkipper, event_skipper=event_skipper)
    pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output,
                complib=complib, complevel=complevel,
                chunksize=chunksize, flush_frequency=1000)
    pipe.drain()

    # copy the used_files dataset to the new file
    copy_used_files(filepath_input, filepath_output)
    if delete:
        os.remove(filepath_input)

    # Delete the '_i_' folders found in the sink output, we don't need them.
    # Iterate over a snapshot of the names: removing members from the file
    # while iterating over the file itself can skip entries or fail.
    output_file = h5py.File(filepath_output, 'r+')
    for folder_name in list(output_file):
        if folder_name.startswith('_i_'):
            del output_file[folder_name]
    return output_file


def _shuffle_h5_legacy(filepath_input, filepath_output, seed, delete,
                       chunksize, complib, complevel):
    """Shuffle all datasets in memory with numpy; return the open output file."""
    # Load every dataset completely into memory so that the input file
    # can be closed (and possibly deleted) before writing the output.
    with h5py.File(filepath_input, 'r') as input_file:
        datasets = {name: input_file[name][()] for name in input_file}

    if delete:
        os.remove(filepath_input)

    output_file = h5py.File(filepath_output, 'w')

    # Restore the same generator state before each shuffle, so that all
    # datasets are permuted identically.
    rng = np.random.RandomState(seed)
    initial_state = rng.get_state()
    for name, dataset in datasets.items():
        rng.set_state(initial_state)
        rng.shuffle(dataset)

        chunks = (chunksize,) + dataset.shape[1:]
        output_file.create_dataset(
            name, data=dataset, dtype=dataset.dtype, chunks=chunks,
            compression=complib, compression_opts=complevel)
    return output_file
def make_nn_images(fname, detx_filepath, config):
    """
    Main code with config parameters. Reads raw .hdf5 files and creates
    2D/3D/4D histogram projections that can be used for a CNN.

    Parameters
    ----------
    fname : str
        Filename (full path!) of the input file.
    detx_filepath : str
        String with the full filepath to the corresponding .detx file of
        the input file. Used for the binning and for the hits calibration
        if the input file is not calibrated yet
        (e.g. hits do not contain pos_x/y/z, time, ...).
    config : dict
        Dictionary that contains all configuration options of the
        make_nn_images function. An explanation of the config parameters
        can be found in orcasong/default_config.toml.

    Raises
    ------
    KeyError
        If a required option is missing from config.

    """
    def _optional(key):
        # The config encodes "not set" as the string 'None'.
        value = config[key]
        return value if value != 'None' else None

    # Load all parameters from the config
    # TODO put everything in a config class
    output_dirpath = config['output_dirpath']
    chunksize = config['chunksize']
    complib = config['complib']
    complevel = config['complevel']
    flush_freq = config['flush_freq']
    n_bins = tuple(config['n_bins'])
    timecut = (config['timecut_mode'], config['timecut_timespan'])
    do_mc_hits = config['do_mc_hits']
    det_geo = config['det_geo']
    do2d = config['do2d']
    do2d_plots = (config['do2d_plots'], config['do2d_plots_n'])
    do3d = config['do3d']
    do4d = (config['do4d'], config['do4d_mode'])
    prod_ident = _optional('prod_ident')

    data_cuts = {
        'triggered': config['data_cut_triggered'],
        'energy_lower_limit': _optional('data_cut_e_low'),
        'energy_upper_limit': _optional('data_cut_e_high'),
        'throw_away_prob': _optional('data_cut_throw_away'),
        'custom_skip_function': _optional('data_cut_custom_func'),
    }

    make_output_dirs(output_dirpath, do2d, do3d, do4d)

    filename = os.path.basename(os.path.splitext(fname)[0])
    filename_output = filename.replace('.', '_')

    # set random km3pipe (=numpy) seed
    print('Setting a Global Random State with the seed < 42 >.')
    km.GlobalRandomState(seed=42)

    geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(
        n_bins, det_geo, detx_filepath, do4d)

    # Output files to write, as (sub directory, projection) in attach order.
    projections = []
    if do2d:
        projections.extend(
            ('4dTo2d', proj) for proj in ['xy', 'xz', 'yz', 'xt', 'yt', 'zt'])
    if do3d:
        projections.extend(
            ('4dTo3d', proj) for proj in ['xyz', 'xyt', 'xzt', 'yzt', 'rzt'])
    if do4d[0]:
        proj_4d = 'xyzc' if do4d[1] == 'channel_id' else 'xyzt'
        projections.append(('4dTo4d', proj_4d))

    pdf_2d_plots = None
    if do2d_plots[0] is True:
        pdf_2d_plots = PdfPages(
            output_dirpath + '/orcasong_output/4dTo2d/'
            + filename_output + '_plots.pdf')

    # The try/finally makes sure the pdf is closed even if building or
    # running the pipeline fails.
    try:
        file_particle_type = get_file_particle_type(fname)

        print('Generating histograms from the hits for files based on '
              + fname)

        # Initialize OrcaSong Event Pipeline
        pipe = kp.Pipeline()  # add timeit=True argument for profiling
        pipe.attach(km.common.StatusBar, every=200)
        pipe.attach(km.common.MemoryObserver, every=400)
        pipe.attach(kp.io.hdf5.HDF5Pump, filename=fname)
        pipe.attach(km.common.Keep, keys=[
            'EventInfo', 'Header', 'RawHeader', 'McTracks', 'Hits', 'McHits'
        ])
        pipe.attach(EventDataExtractor,
                    file_particle_type=file_particle_type,
                    geo=geo,
                    do_mc_hits=do_mc_hits,
                    data_cuts=data_cuts,
                    do4d=do4d,
                    prod_ident=prod_ident)
        pipe.attach(km.common.Keep, keys=['event_hits', 'event_track'])
        pipe.attach(EventSkipper, data_cuts=data_cuts)
        pipe.attach(HistogramMaker,
                    x_bin_edges=x_bin_edges,
                    y_bin_edges=y_bin_edges,
                    z_bin_edges=z_bin_edges,
                    n_bins=n_bins,
                    timecut=timecut,
                    do2d=do2d,
                    do2d_plots=do2d_plots,
                    pdf_2d_plots=pdf_2d_plots,
                    do3d=do3d,
                    do4d=do4d)
        pipe.attach(km.common.Delete, keys=['event_hits'])

        # One sink per requested projection.
        for subdir, proj in projections:
            savestr = (output_dirpath + '/orcasong_output/' + subdir + '/'
                       + proj + '/' + filename_output + '_' + proj + '.h5')
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

        # Execute Pipeline
        pipe.drain()
    finally:
        if pdf_2d_plots is not None:
            pdf_2d_plots.close()
def postproc_file(input_file, output_file=None, shuffle=True,
                  event_skipper=None, delete=False, seed=42,
                  statusbar_every=1000):
    """
    Run a km3pipe pipeline over a file to shuffle and/or filter its events.

    Parameters
    ----------
    input_file : str
        Path of the file that will be processed.
    output_file : str, optional
        Name of the output file. If None, a name is auto generated.
    shuffle : bool
        Shuffle order of events.
    event_skipper : func, optional
        Function that takes the blob as an input, and returns a bool.
        If the bool is true, the event will be skipped.
    delete : bool
        If True, the input file is removed after processing.
    seed : int
        Seed for the global random state used for the shuffling.
    statusbar_every : int or None
        Interval handed to the km3pipe status bar and memory observer.
        If None, neither of them is attached.

    Returns
    -------
    output_file : str
        Path to the output file.

    Raises
    ------
    FileExistsError
        If the output file exists already.

    """
    if output_file is None:
        output_file = get_filepath_output(
            input_file, shuffle=shuffle, event_skipper=event_skipper)
    if os.path.exists(output_file):
        raise FileExistsError(output_file)

    print(f'Setting a Global Random State with the seed < {seed} >.')
    km.GlobalRandomState(seed=seed)

    # km3pipe uses pytables for saving the shuffled output file,
    # which has the name 'zlib' for the 'gzip' filter
    sink_opts = get_compopts(input_file)
    if sink_opts["complib"] == 'gzip':
        sink_opts["complib"] = 'zlib'

    # Collect the pipeline stages first, then attach them in order.
    stages = []
    if statusbar_every is not None:
        stages.append((km.common.StatusBar, {"every": statusbar_every}))
        stages.append((km.common.MemoryObserver, {"every": statusbar_every}))
    stages.append((kp.io.hdf5.HDF5Pump, {
        "filename": input_file,
        "shuffle": shuffle,
        "reset_index": True,
    }))
    if event_skipper is not None:
        stages.append((EventSkipper, {"event_skipper": event_skipper}))
    stages.append((kp.io.hdf5.HDF5Sink, {
        "filename": output_file,
        "complib": sink_opts["complib"],
        "complevel": sink_opts["complevel"],
        "chunksize": sink_opts["chunksize"],
        "flush_frequency": 1000,
    }))

    pipe = kp.Pipeline()
    for module, options in stages:
        pipe.attach(module, **options)
    pipe.drain()

    copy_used_files(input_file, output_file)
    copy_attrs(input_file, output_file)

    if delete:
        print("Deleting original file")
        os.remove(input_file)

    print("Done!")
    return output_file