Example no. 1
    def run(self, infile, outfile=None):
        """
        Process the events from the infile, and save them to the outfile.

        Parameters
        ----------
        infile : str
            Path to the input file.
        outfile : str, optional
            Path to the output file (will be created). If None is given,
            the name is auto-generated and the file is saved in the cwd.

        """
        if outfile is None:
            outfile = os.path.join(
                os.getcwd(),
                "{}_dl.h5".format(
                    os.path.splitext(os.path.basename(infile))[0]),
            )
        if not self.overwrite:
            if os.path.isfile(outfile):
                raise FileExistsError(f"File exists: {outfile}")
        if self.seed:
            km.GlobalRandomState(seed=self.seed)
        pipe = self.build_pipe(infile, outfile)
        summary = pipe.drain()
        with h5py.File(outfile, "a") as f:
            self.finish_file(f, summary)
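
For reference, here is a minimal sketch of how the default outfile name is derived when outfile is None (the input path is made up for illustration):

import os

infile = "/data/runs/run_00123.h5"  # hypothetical input path
stem = os.path.splitext(os.path.basename(infile))[0]  # -> "run_00123"
outfile = os.path.join(os.getcwd(), "{}_dl.h5".format(stem))
print(outfile)  # -> <cwd>/run_00123_dl.h5
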
Example no. 2
def shuffle_h5(filepath_input, tool=False, seed=42, delete=False, chunksize=None,
               complib=None, complevel=None, legacy_mode=False, shuffle=True,
               event_skipper=None, filepath_output=None):
    """
    Shuffles a .h5 file where each dataset needs to have the same number of rows (axis_0).
    The shuffled data is saved to a new .h5 file with the suffix < _shuffled.h5 >.

    Can also skip certain events if an event_skipper is given.

    Parameters
    ----------
    filepath_input : str
        Filepath of the unshuffled input file.
    tool : bool
        Specifies if the function is accessed from the shuffle_h5_tool.
        In this case, the shuffled .h5 file instance is returned instead of being closed.
    seed : int
        Sets a fixed random seed for the shuffling.
    delete : bool
        Specifies if the old, unshuffled file should be deleted after extracting the data.
    chunksize : None/int
        Specifies the chunksize for axis_0 in the shuffled output files.
        If None, the chunksize is read from the input files.
        Else, a custom chunksize will be used.
    complib : None/str
        Specifies the compression library that should be used for saving the shuffled output files.
        If None, the compression library is read from the input files.
        Else, a custom compression library will be used.
        Currently available: 'gzip', or 'lzf'.
    complevel : None/int
        Specifies the compression level that should be used for saving the shuffled output files.
        A compression level is only available for gzip compression, not lzf!
        If None, the compression level is read from the input files.
        Else, a custom compression level will be used.
    legacy_mode : bool
        Boolean flag that specifies if the legacy shuffle mode should be used instead of the standard one.
        A more detailed description of this mode can be found in the summary at the top of this Python file.
    shuffle : bool
        If False, events will not be shuffled.
    event_skipper : func, optional
        Function that takes the blob as an input and returns a bool.
        If the bool is True, the blob will be skipped.
    filepath_output : str, optional
        If given, this will be the name of the output file. Otherwise, a name
        is auto generated.

    Returns
    -------
    output_file_shuffled : h5py.File
        H5py file instance of the shuffled output file.
        Only returned if tool is True; otherwise, the file is closed.

    """
    if event_skipper is None and not shuffle:
        raise ValueError("Either event_skipper or shuffle has to be set")

    complib_f, complevel_f, chunksize_f = get_f_compression_and_chunking(filepath_input)

    chunksize = chunksize_f if chunksize is None else chunksize
    complib = complib_f if complib is None else complib
    complevel = complevel_f if complevel is None else complevel

    if complib == 'lzf':
        complevel = None

    if filepath_output is None:
        filepath_output = get_filepath_output(filepath_input, shuffle,
                                              event_skipper)

    if not legacy_mode:
        # set random km3pipe (=numpy) seed
        print(f'Setting a Global Random State with the seed < {seed} >.')
        km.GlobalRandomState(seed=seed)

        # km3pipe uses pytables for saving the shuffled output file, which has the name 'zlib' for the 'gzip' filter
        if complib == 'gzip':
            complib = 'zlib'

        pipe = kp.Pipeline(timeit=True)  # timeit=True enables profiling
        pipe.attach(km.common.StatusBar, every=200)
        pipe.attach(km.common.MemoryObserver, every=200)
        pipe.attach(kp.io.hdf5.HDF5Pump, filename=filepath_input, shuffle=shuffle, reset_index=True)

        if event_skipper is not None:
            pipe.attach(EventSkipper, event_skipper=event_skipper)

        pipe.attach(kp.io.hdf5.HDF5Sink, filename=filepath_output, complib=complib, complevel=complevel, chunksize=chunksize, flush_frequency=1000)
        pipe.drain()

        # copy the used_files dataset to the new file
        copy_used_files(filepath_input, filepath_output)

        if delete:
            os.remove(filepath_input)

        # delete folders with '_i_' that are created by pytables in the HDF5Sink; we don't need them
        output_file_shuffled = h5py.File(filepath_output, 'r+')
        for folder_name in output_file_shuffled:
            if folder_name.startswith('_i_'):
                del output_file_shuffled[folder_name]

    else:
        input_file = h5py.File(filepath_input, 'r')
        folder_data_array_dict = {}

        for folder_name in input_file:
            folder_data_array = input_file[folder_name][()]  # get whole numpy array into memory
            folder_data_array_dict[folder_name] = folder_data_array  # workaround in order to be able to close the input file at the next step

        input_file.close()

        if delete:
            os.remove(filepath_input)

        output_file_shuffled = h5py.File(filepath_output, 'w')
        for n, dataset_key in enumerate(folder_data_array_dict):

            dataset = folder_data_array_dict[dataset_key]

            if n == 0:
                # snapshot the RandomState before the first shuffle, so that
                # every dataset gets shuffled with the same permutation
                r = np.random.RandomState(seed)
                state = r.get_state()
                r.shuffle(dataset)

            else:
                r.set_state(state)  # restore the snapshot: same permutation as for the first dataset
                r.shuffle(dataset)

            chunks = (chunksize,) + dataset.shape[1:]
            output_file_shuffled.create_dataset(dataset_key, data=dataset, dtype=dataset.dtype, chunks=chunks,
                                                compression=complib, compression_opts=complevel)

    # close the file, unless it should be returned (tool=True)
    if tool is False:
        output_file_shuffled.close()
    else:
        return output_file_shuffled
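
The legacy branch above hinges on a numpy RandomState trick: snapshotting the generator state and restoring it before each shuffle applies the identical permutation to every dataset, so rows stay aligned across them. A standalone sketch (the arrays are illustrative):

import numpy as np

a = np.arange(10)
b = np.arange(10) * 100  # a second "dataset" with the same number of rows

r = np.random.RandomState(42)
state = r.get_state()  # snapshot the generator state before shuffling
r.shuffle(a)
r.set_state(state)     # restore the snapshot ...
r.shuffle(b)           # ... so b is shuffled with the exact same permutation

assert np.array_equal(b, a * 100)  # rows still line up across datasets
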
Example no. 3
def make_nn_images(fname, detx_filepath, config):
    """
    Main code with config parameters. Reads raw .hdf5 files and creates 2D/3D histogram projections that can be used
    for a CNN.

    Parameters
    ----------
    fname : str
        Filename (full path!) of the input file.
    detx_filepath : str
        String with the full filepath to the corresponding .detx file of the input file.
        Used for the binning and for the hits calibration if the input file is not calibrated yet
        (e.g. hits do not contain pos_x/y/z, time, ...).
    config : dict
        Dictionary that contains all configuration options of the make_nn_images function.
        An explanation of the config parameters can be found in orcasong/default_config.toml.

    """
    # Load all parameters from the config # TODO put everything in a config class, this is horrible
    output_dirpath = config['output_dirpath']
    chunksize = config['chunksize']
    complib = config['complib']
    complevel = config['complevel']
    flush_freq = config['flush_freq']
    n_bins = tuple(config['n_bins'])
    timecut = (config['timecut_mode'], config['timecut_timespan'])
    do_mc_hits = config['do_mc_hits']
    det_geo = config['det_geo']
    do2d = config['do2d']
    do2d_plots = (config['do2d_plots'], config['do2d_plots_n'])
    do3d = config['do3d']
    do4d = (config['do4d'], config['do4d_mode'])
    # Config values that are the string 'None' are treated as not set
    prod_ident = config['prod_ident'] if config['prod_ident'] != 'None' else None
    data_cuts = dict()
    data_cuts['triggered'] = config['data_cut_triggered']
    data_cuts['energy_lower_limit'] = (
        config['data_cut_e_low'] if config['data_cut_e_low'] != 'None' else None)
    data_cuts['energy_upper_limit'] = (
        config['data_cut_e_high'] if config['data_cut_e_high'] != 'None' else None)
    data_cuts['throw_away_prob'] = (
        config['data_cut_throw_away'] if config['data_cut_throw_away'] != 'None' else None)
    data_cuts['custom_skip_function'] = (
        config['data_cut_custom_func'] if config['data_cut_custom_func'] != 'None' else None)

    make_output_dirs(output_dirpath, do2d, do3d, do4d)

    filename = os.path.basename(os.path.splitext(fname)[0])
    filename_output = filename.replace('.', '_')

    # set random km3pipe (=numpy) seed
    print('Setting a Global Random State with the seed < 42 >.')
    km.GlobalRandomState(seed=42)

    geo, x_bin_edges, y_bin_edges, z_bin_edges = calculate_bin_edges(
        n_bins, det_geo, detx_filepath, do4d)
    if do2d_plots[0] is True:
        pdf_2d_plots = PdfPages(output_dirpath + '/orcasong_output/4dTo2d/'
                                + filename_output + '_plots.pdf')
    else:
        pdf_2d_plots = None

    file_particle_type = get_file_particle_type(fname)

    print('Generating histograms from the hits of the file ' + fname)

    # Initialize OrcaSong Event Pipeline

    pipe = kp.Pipeline()  # add timeit=True argument for profiling
    pipe.attach(km.common.StatusBar, every=200)
    pipe.attach(km.common.MemoryObserver, every=400)
    pipe.attach(kp.io.hdf5.HDF5Pump, filename=fname)
    pipe.attach(km.common.Keep,
                keys=[
                    'EventInfo', 'Header', 'RawHeader', 'McTracks', 'Hits',
                    'McHits'
                ])
    pipe.attach(EventDataExtractor,
                file_particle_type=file_particle_type,
                geo=geo,
                do_mc_hits=do_mc_hits,
                data_cuts=data_cuts,
                do4d=do4d,
                prod_ident=prod_ident)
    pipe.attach(km.common.Keep, keys=['event_hits', 'event_track'])
    pipe.attach(EventSkipper, data_cuts=data_cuts)
    pipe.attach(HistogramMaker,
                x_bin_edges=x_bin_edges,
                y_bin_edges=y_bin_edges,
                z_bin_edges=z_bin_edges,
                n_bins=n_bins,
                timecut=timecut,
                do2d=do2d,
                do2d_plots=do2d_plots,
                pdf_2d_plots=pdf_2d_plots,
                do3d=do3d,
                do4d=do4d)
    pipe.attach(km.common.Delete, keys=['event_hits'])

    if do2d:
        for proj in ['xy', 'xz', 'yz', 'xt', 'yt', 'zt']:
            savestr = output_dirpath + '/orcasong_output/4dTo2d/' + proj + '/' + filename_output + '_' + proj + '.h5'
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

    if do3d:
        for proj in ['xyz', 'xyt', 'xzt', 'yzt', 'rzt']:
            savestr = output_dirpath + '/orcasong_output/4dTo3d/' + proj + '/' + filename_output + '_' + proj + '.h5'
            pipe.attach(kp.io.HDF5Sink,
                        filename=savestr,
                        blob_keys=[proj, 'event_track'],
                        complib=complib,
                        complevel=complevel,
                        chunksize=chunksize,
                        flush_frequency=flush_freq)

    if do4d[0]:
        proj = 'xyzc' if do4d[1] == 'channel_id' else 'xyzt'
        savestr = output_dirpath + '/orcasong_output/4dTo4d/' + proj + '/' + filename_output + '_' + proj + '.h5'
        pipe.attach(kp.io.HDF5Sink,
                    filename=savestr,
                    blob_keys=[proj, 'event_track'],
                    complib=complib,
                    complevel=complevel,
                    chunksize=chunksize,
                    flush_frequency=flush_freq)

    # Execute Pipeline
    pipe.drain()

    if do2d_plots[0] is True:
        pdf_2d_plots.close()
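
For orientation, a hedged sketch of a config dict covering the keys this function reads; the values below are illustrative guesses, not defaults (the real ones live in orcasong/default_config.toml):

config = {
    'output_dirpath': './out',
    'chunksize': 32, 'complib': 'zlib', 'complevel': 1,
    'flush_freq': 1000,
    'n_bins': [11, 13, 18, 60],
    'timecut_mode': 'trigger_cluster', 'timecut_timespan': 'tight-0',
    'do_mc_hits': False,
    'det_geo': 'Orca_115l_23m_h_9m_v',
    'do2d': False, 'do2d_plots': False, 'do2d_plots_n': 10,
    'do3d': False,
    'do4d': True, 'do4d_mode': 'time',
    'prod_ident': 'None',  # the string 'None' means "not set"
    'data_cut_triggered': False,
    'data_cut_e_low': 'None', 'data_cut_e_high': 'None',
    'data_cut_throw_away': 'None', 'data_cut_custom_func': 'None',
}
# make_nn_images('/path/to/infile.h5', '/path/to/detector.detx', config)
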
Example no. 4
def postproc_file(input_file,
                  output_file=None,
                  shuffle=True,
                  event_skipper=None,
                  delete=False,
                  seed=42,
                  statusbar_every=1000):
    """
    Postprocess a file using km3pipe after it has been preprocessed in OrcaSong.

    Parameters
    ----------
    input_file : str
        Path of the file that will be processed.
    output_file : str, optional
        If given, this will be the name of the output file.
        Otherwise, a name is auto generated.
    shuffle : bool
        Shuffle order of events.
    event_skipper : func, optional
        Function that takes the blob as an input and returns a bool.
        If the bool is True, the event will be skipped.
    delete : bool
        Specifies if the input file should be deleted after processing.
    seed : int
        Sets a fixed random seed for the shuffling.
    statusbar_every : int or None
        After how many events a km3pipe status update should be printed.

    Returns
    -------
    output_file : str
        Path to the output file.

    """
    if output_file is None:
        output_file = get_filepath_output(input_file,
                                          shuffle=shuffle,
                                          event_skipper=event_skipper)
    if os.path.exists(output_file):
        raise FileExistsError(output_file)

    print(f'Setting a Global Random State with the seed < {seed} >.')
    km.GlobalRandomState(seed=seed)

    comptopts = get_compopts(input_file)
    # km3pipe uses pytables for saving the shuffled output file,
    # which has the name 'zlib' for the 'gzip' filter
    if comptopts["complib"] == 'gzip':
        comptopts["complib"] = 'zlib'

    pipe = kp.Pipeline()
    if statusbar_every is not None:
        pipe.attach(km.common.StatusBar, every=statusbar_every)
        pipe.attach(km.common.MemoryObserver, every=statusbar_every)
    pipe.attach(
        kp.io.hdf5.HDF5Pump,
        filename=input_file,
        shuffle=shuffle,
        reset_index=True,
    )
    if event_skipper is not None:
        pipe.attach(EventSkipper, event_skipper=event_skipper)
    pipe.attach(
        kp.io.hdf5.HDF5Sink,
        filename=output_file,
        complib=comptopts["complib"],
        complevel=comptopts["complevel"],
        chunksize=comptopts["chunksize"],
        flush_frequency=1000,
    )
    pipe.drain()

    copy_used_files(input_file, output_file)
    copy_attrs(input_file, output_file)
    if delete:
        print("Deleting original file")
        os.remove(input_file)

    print("Done!")
    return output_file
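
A short usage sketch with a custom event_skipper; the blob key and field used in the skipper are assumptions, adapt them to what your file actually contains:

def skip_low_energy(blob):
    # Hypothetical cut: returning True means the event is dropped.
    return blob['EventInfo'].energy[0] < 1.0

out_path = postproc_file(
    'my_orcasong_file.h5',  # hypothetical input
    shuffle=True,
    event_skipper=skip_low_energy,
    statusbar_every=500,
)
print(out_path)
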