Example #1
def split_telescope_data(input_path, output_basename, fraction, name):

    array_events = read_data(input_path, key='array_events')
    telescope_events = read_data(input_path, key='telescope_events')
    runs = read_data(input_path, key='runs')

    # split by runs
    ids = set(runs.run_id)
    log.debug(f'All runs: {ids}')
    n_total = len(ids)

    log.info(f'Found a total of {n_total} runs in the file')
    num_runs = split_indices(ids, n_total, fractions=fraction)

    for n, part_name in zip(num_runs, name):
        selected_run_ids = np.random.choice(list(ids), size=n, replace=False)
        selected_runs = runs[runs.run_id.isin(selected_run_ids)]
        selected_array_events = array_events[array_events.run_id.isin(selected_run_ids)]
        selected_telescope_events = telescope_events[telescope_events.run_id.isin(selected_run_ids)]

        path = output_basename + '_' + part_name + '.hdf5'
        log.info('Writing {} runs events to: {}'.format(n, path))
        write_data(selected_runs, path, key='runs', use_h5py=True, mode='w')
        write_data(selected_array_events, path, key='array_events',
                   use_h5py=True, mode='a')
        write_data(selected_telescope_events, path, key='telescope_events',
                   use_h5py=True, mode='a')
        log.debug(f'selected runs {set(selected_run_ids)}')
        log.debug(f'Runs minus selected runs {ids - set(selected_run_ids)}')
        ids = ids - set(selected_run_ids)
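
Note: the split_indices helper used above (and in the next example) is not part of this listing. A minimal sketch of what it could look like, assuming it only converts the fractions into integer counts that sum to n_total:

def split_indices(ids, n_total, fractions):
    '''Hypothetical helper: number of elements to draw for each fraction.'''
    # ids is accepted to match the call sites above, but only the total matters here
    num_ids = [int(round(n_total * f)) for f in fractions]
    # hand any rounding remainder to the last part
    num_ids[-1] = n_total - sum(num_ids[:-1])
    return num_ids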
Example #2
def split_single_telescope_data(input_path, output_basename, fmt, inkey, key, fraction, name):

    if fmt in ['hdf5', 'hdf', 'h5']:
        data = read_data(input_path, key=inkey)
    elif fmt == 'csv':
        data = read_data(input_path)
    else:
        raise ValueError('Unsupported format: {}'.format(fmt))

    assert len(fraction) == len(name), 'You must give a name for each fraction'

    if sum(fraction) != 1:
        warnings.warn('Fractions do not sum up to 1')

    ids = data.index.values
    n_total = len(data)

    log.info('Found a total of {} single-telescope events in the file'.format(len(data)))

    num_ids = split_indices(ids, n_total, fractions=fraction)

    for n, part_name in zip(num_ids, name):
        selected_ids = np.random.choice(ids, size=n, replace=False)
        selected_data = data.loc[selected_ids]

        if fmt in ['hdf5', 'hdf', 'h5']:
            path = output_basename + '_' + part_name + '.hdf5'
            log.info('Writing {} single-telescope events to: {}'.format(n, path))
            write_data(selected_data, path, key=key, use_h5py=True, mode='w')

        elif fmt == 'csv':
            filename = output_basename + '_' + part_name + '.csv'
            log.info('Writing {} single-telescope events to: {}'.format(n, filename))
            selected_data.to_csv(filename, index=False)

        data = data.loc[list(set(data.index.values) - set(selected_data.index.values))]
        ids = data.index.values
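
A hypothetical invocation of this function, assuming an HDF5 file whose event table is stored under the key 'events':

split_single_telescope_data(
    'gammas.hdf5', 'gammas', fmt='hdf5', inkey='events', key='events',
    fraction=[0.5, 0.25, 0.25], name=['train', 'validation', 'test'],
)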
Example #3
def read_telescope_data(path,
                        config,
                        columns,
                        feature_generation_config=None,
                        n_sample=None,
                        first=None,
                        last=None):
    '''
    Read given columns from data and perform a random sample if n_sample is supplied.
    Returns a single pandas data frame
    '''
    telescope_event_columns = None
    array_event_columns = None

    join_keys = [config.run_id_column, config.array_event_id_column]
    if columns:
        with h5py.File(path, 'r') as f:
            array_event_columns = set(
                f[config.array_events_key].keys()) & set(columns)
            telescope_event_columns = set(
                f[config.telescope_events_key].keys()) & set(columns)
            array_event_columns |= set(join_keys)
            telescope_event_columns |= set(join_keys)

    telescope_events = read_data(
        file_path=path,
        key=config.telescope_events_key,
        columns=telescope_event_columns,
        first=first,
        last=last,
    )
    array_events = read_data(
        file_path=path,
        key=config.array_events_key,
        columns=array_event_columns,
    )

    df = pd.merge(left=array_events,
                  right=telescope_events,
                  left_on=join_keys,
                  right_on=join_keys)

    if n_sample is not None:
        if n_sample > len(df):
            raise ValueError(
                'number of sampled events {} must be smaller than the '
                'number of events in file {} ({})'.format(n_sample, path, len(df)))
        log.info('Randomly sample {} events'.format(n_sample))
        state = np.random.RandomState()
        state.set_state(np.random.get_state())
        df = df.sample(n_sample, random_state=state)

    # generate features if given in config
    if feature_generation_config:
        feature_generation(df, feature_generation_config, inplace=True)

    return df
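
The RandomState idiom above copies the global NumPy random state into a local generator, so the sample is reproducible under np.random.seed(...) without advancing the global stream. A minimal self-contained demonstration:

import numpy as np
import pandas as pd

np.random.seed(0)
df = pd.DataFrame({'x': range(10)})
state = np.random.RandomState()
state.set_state(np.random.get_state())
# the same three rows are drawn on every run with the same seed
print(df.sample(3, random_state=state))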
Example #4
def read_telescope_data(path, aict_config, columns, feature_generation_config=None, n_sample=None, first=None, last=None):
    '''
    Read given columns from data and perform a random sample if n_sample is supplied.
    Returns a single pandas data frame
    '''
    telescope_event_columns = None
    array_event_columns = None
    if aict_config.has_multiple_telescopes:
        join_keys = [aict_config.run_id_column, aict_config.array_event_id_column]
        if columns:
            with h5py.File(path, 'r') as f:
                array_event_columns = set(f[aict_config.array_events_key].keys()) & set(columns)
                telescope_event_columns = set(f[aict_config.telescope_events_key].keys()) & set(columns)
                array_event_columns |= set(join_keys)
                telescope_event_columns |= set(join_keys)

        telescope_events = read_data(
            file_path=path,
            key=aict_config.telescope_events_key,
            columns=telescope_event_columns,
            first=first,
            last=last,
        )
        array_events = read_data(
            file_path=path,
            key=aict_config.array_events_key,
            columns=array_event_columns,
        )

        df = pd.merge(left=array_events, right=telescope_events, left_on=join_keys, right_on=join_keys)

    else:
        df = read_data(
            file_path=path,
            key=aict_config.telescope_events_key,
            columns=columns,
            first=first,
            last=last,
        )

    if n_sample is not None:
        if n_sample > len(df):
            raise ValueError(
                'number of sampled events {} must be smaller than the '
                'number of events in file {} ({})'.format(n_sample, path, len(df))
            )
        log.info('Randomly sample {} events'.format(n_sample))
        state = np.random.RandomState()
        state.set_state(np.random.get_state())
        df = df.sample(n_sample, random_state=state)

    # generate features if given in config
    if feature_generation_config:
        feature_generation(df, feature_generation_config, inplace=True)

    return df
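
A hypothetical call of this variant, assuming aict_config is a configuration object providing the attributes accessed above (has_multiple_telescopes, the HDF5 keys, and the join key column names); the column names are placeholders:

df = read_telescope_data(
    'gammas.hdf5',
    aict_config,
    columns=['width', 'length', 'intensity'],
    n_sample=1000,
)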
Example #5
def main(configuration_path, input_path, output_path, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        length:
          - '<'
          - 0.06
    ```
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    array_events = read_data(input_path, key='array_events')
    telescope_events = read_data(input_path, key='telescope_events')

    mask_telescope = create_mask_h5py(input_path,
                                      selection,
                                      key='telescope_events')
    selected_telescope_events = telescope_events[mask_telescope]

    array_events['idx'] = array_events.index
    merge = pd.merge(selected_telescope_events[['run_id', 'array_event_id']],
                     array_events[['run_id', 'array_event_id', 'idx']],
                     on=['run_id', 'array_event_id'],
                     how='left')
    selected_array_events = array_events[array_events.idx.isin(merge.idx)]

    write_data(selected_telescope_events,
               output_path,
               key='telescope_events',
               use_h5py=True,
               mode='w')
    write_data(selected_array_events,
               output_path,
               key='array_events',
               use_h5py=True,
               mode='a')

    with h5py.File(input_path, mode='r') as infile, \
            h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile.keys():
            log.info('Copying runs group to outputfile')
            infile.copy('/runs', outfile['/'])
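
create_mask_h5py is not shown in this listing. A minimal sketch, assuming each selection entry maps a column name to an (operator, value) pair as in the docstring example:

import operator

import h5py
import numpy as np

OPERATORS = {
    '<': operator.lt, '<=': operator.le,
    '>': operator.gt, '>=': operator.ge,
    '==': operator.eq, '!=': operator.ne,
}

def create_mask_h5py(input_path, selection, key='telescope_events'):
    with h5py.File(input_path, 'r') as f:
        group = f[key]
        # all columns in the group have the same length
        n_events = len(group[next(iter(group.keys()))])
        mask = np.ones(n_events, dtype=bool)
        for column, (op, value) in selection.items():
            mask &= OPERATORS[op](group[column][:], value)
    return mask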
Example #6
def load_signal_events(gammas_path,
                       assumed_obs_time=30 * u.min,
                       columns=DEFAULT_COLUMNS,
                       calculate_weights=True):
    # crab_spectrum = spectrum.CrabSpectrum()
    crab_spectrum = spectrum.CrabLogParabola()

    gamma_runs = read_data(gammas_path, key='runs')

    if gamma_runs.mc_diffuse.std() != 0:
        print(
            Fore.RED +
            f'Data given at {gammas_path} contains a mix of diffuse and pointlike gammas.'
        )
        print(Fore.RESET)
        raise ValueError

    is_diffuse = (gamma_runs.mc_diffuse == 1).all()
    gammas = read_data(gammas_path, key='array_events', columns=columns)
    mc_production_gamma = spectrum.MCSpectrum.from_cta_runs(gamma_runs)

    if is_diffuse:
        source_az = gammas.mc_az.values * u.deg
        source_alt = gammas.mc_alt.values * u.deg
    else:
        source_az = gammas.mc_az.iloc[0] * u.deg
        source_alt = gammas.mc_alt.iloc[0] * u.deg

    gammas['theta'] = (calculate_distance_to_point_source(
        gammas, source_alt=source_alt, source_az=source_az).to(u.deg).value)

    if calculate_weights:
        if is_diffuse:
            print(
                Fore.RED +
                f'Data given at {gammas_path} is diffuse. Cannot calculate weights according to the pointlike Crab spectrum.'
            )
            print(Fore.RESET)
            raise ValueError

        gammas['weight'] = mc_production_gamma.reweigh_to_other_spectrum(
            crab_spectrum,
            gammas.mc_energy.values * u.TeV,
            t_assumed_obs=assumed_obs_time)

    return gammas, source_alt, source_az
Example #7
def main(gamma_file, proton_file, electron_file, output, n_bins, threshold):

    t_assumed_obs = 50 * u.h
    bins, bin_center, bin_widths = make_energy_bins(e_min=0.008 * u.TeV,
                                                    e_max=200 * u.TeV,
                                                    bins=n_bins)

    spectra = [CrabSpectrum(), CTAProtonSpectrum(), CTAElectronSpectrum()]
    labels = ['Gamma (Crab)', 'Proton', 'Electrons']
    iterator = zip([gamma_file, proton_file, electron_file], spectra,
                   color_cycle, labels)
    for input_file, spectrum, color, label in iterator:

        events = read_data(input_file, key='array_events')
        runs = read_data(input_file, key='runs')
        mc_production = MCSpectrum.from_cta_runs(runs)

        if threshold > 0:
            events = events.loc[events.gamma_prediction_mean >= threshold]

        estimated_energies = events.gamma_energy_prediction_mean.values * u.TeV
        weights = mc_production.reweigh_to_other_spectrum(
            spectrum, estimated_energies, t_assumed_obs=t_assumed_obs)

        plt.hist(estimated_energies,
                 bins=bins,
                 weights=weights,
                 histtype='step',
                 lw=2,
                 color=color,
                 label=label)

    plt.legend()

    # plt.ylim([100, 1E8])
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel(r'$E_{\mathrm{Reco}} /  \mathrm{TeV}$')
    plt.ylabel(f'Triggered Counts in {t_assumed_obs}')
    plt.tight_layout()
    if output:
        plt.savefig(output)
    else:
        plt.show()
Example #8
def load_crab_training_data(N=-1, prediction_threshold=0.8):
    '''
    Returns an array of images X and one-hot encoded labels Y, with both classes equally sampled.
    '''

    dl3 = fio.read_data('./data/dl3/open_crab_sample_dl3.hdf5', key='events')
    dl3 = dl3.set_index(['night', 'run_id', 'event_num'])

    f = h5py.File('./data/crab_images.hdf5', 'r')
    night = f['events/night'][:]
    run = f['events/run_id'][:]
    event = f['events/event_num'][:]

    df = pd.DataFrame({'night': night, 'run_id': run, 'event_num': event})
    df['int_index'] = df.index
    df = df.set_index(['night', 'run_id', 'event_num'])

    data = df.join(dl3, how='inner')

    indices = data.int_index.values
    data = data.set_index(indices)

    indices = list(sorted(indices))

    if N > 0:
        indices = indices[:N]
    else:
        N = len(indices)

    print('loading {} images'.format(len(indices)))
    images = load_images_with_index(indices)

    data = data.loc[indices]
    data = data.reset_index()

    gammas = data[data.gamma_prediction >= prediction_threshold]
    protons = data[data.gamma_prediction < prediction_threshold]

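    # note: np.random.choice samples with replacement by default, so duplicates are possible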
    ids_gamma = np.random.choice(gammas.index.values, N // 2)
    ids_proton = np.random.choice(protons.index.values, N // 2)
    ids = np.append(ids_gamma, ids_proton)

    X = images[ids]
    Y = np.where(data.loc[ids].gamma_prediction >= prediction_threshold, 1.0, 0.0)
    df = data.copy().loc[ids]

    print('Loaded {} positive labels and {} negative labels'.format(
        np.sum(Y), N - np.sum(Y)))

    Y = OneHotEncoder().fit_transform(Y.reshape(-1, 1)).toarray()

    df, X, Y = shuffle(df, X, Y)
    X = scale_images(X)

    return df, X, Y
Example #9
def apply(ctx, out_file, data, number_of_images):
    import fact.io as fio
    network = ctx.obj['network']
    model = load_model(network)

    p = '{}.index'.format(model_path)
    if not os.path.exists(p):
        print('No model trained yet. Do so first.')
        return

    if os.path.exists(out_file):
        click.confirm(
            'Do you want to overwrite existing file {}?'.format(out_file),
            abort=True)
        os.remove(out_file)

    if data == 'crab':
        df = image_io.apply_to_observation_data(model)

    elif data == 'gamma':
        df = image_io.apply_to_mc(model,
                                  path='./data/gamma_images.hdf5',
                                  N=number_of_images)
        shower_truth = fio.read_data('./data/gamma_images.hdf5', key='showers')
        fio.write_data(shower_truth,
                       file_path=out_file,
                       key='showers',
                       use_h5py=True)

    elif data == 'proton':
        df = image_io.apply_to_mc(model,
                                  path='./data/proton_images.hdf5',
                                  N=number_of_images)
        shower_truth = fio.read_data('./data/proton_images.hdf5',
                                     key='showers')
        fio.write_data(shower_truth,
                       file_path=out_file,
                       key='showers',
                       use_h5py=True)

    print('Writing {} events to file {}'.format(len(df), out_file))
    fio.write_data(df, out_file, key='events')
Example #10
def main(dl3_path, output_path, n_pix, source_name, background):
    '''
    Takes FACT dl3 output and plots a skymap, which is saved to output_path.
    '''
    runs = fio.read_data(dl3_path, key='runs')
    dl3 = fio.read_data(dl3_path, key='events')

    data = pd.merge(runs, dl3, on=['run_id', 'night'])

    timestamps = pd.to_datetime(data.timestamp).values
    total_ontime = estimate_exposure_time(timestamps)
    print('Total estimated exposure time: {}'.format(total_ontime.to(u.h)))

    ra_pointing = data.right_ascension.values * u.hourangle
    dec_pointing = data.declination.values * u.deg
    pointing = SkyCoord(ra=ra_pointing, dec=dec_pointing)

    img = None
    wcs = None
    if background:
        img_center = SkyCoord(ra=pointing.ra.mean(), dec=pointing.dec.mean())
        img, wcs = get_sdss_sky_image(img_center=img_center,
                                      n_pix=n_pix,
                                      fov=9 * u.deg)

    mask, wcs = build_exposure_map(pointing, timestamps, shape=(n_pix, n_pix))

    ax = plot_exposure(mask, wcs, image=img)
    if source_name:
        source = SkyCoord.from_name(source_name)
        ax.scatter(
            source.ra.deg,
            source.dec.deg,
            transform=ax.get_transform('icrs'),
            label=source_name,
            s=10**2,
            facecolors='none',
            edgecolors='r',
        )
        ax.legend()

    plt.savefig(output_path, dpi=200)
Example #11
def load_background_events(protons_path,
                           electrons_path,
                           source_alt,
                           source_az,
                           assumed_obs_time=50 * u.h,
                           columns=DEFAULT_COLUMNS,
                           return_rate=False):
    # cosmic_ray_spectrum = spectrum.CosmicRaySpectrumPDG()
    cosmic_ray_spectrum = spectrum.CosmicRaySpectrum()
    electron_spectrum = spectrum.CTAElectronSpectrum()

    protons = read_data(protons_path, key='array_events', columns=columns)
    proton_runs = read_data(protons_path, key='runs')

    mc_production_proton = spectrum.MCSpectrum.from_cta_runs(proton_runs)
    protons['weight'] = mc_production_proton.reweigh_to_other_spectrum(
        cosmic_ray_spectrum,
        protons.mc_energy.values * u.TeV,
        t_assumed_obs=assumed_obs_time)
    protons['theta'] = (calculate_distance_to_point_source(
        protons, source_alt=source_alt, source_az=source_az).to(u.deg).value)
    protons['type'] = PROTON_TYPE

    electrons = read_data(electrons_path, key='array_events', columns=columns)
    electron_runs = read_data(electrons_path, key='runs')

    mc_production_electrons = spectrum.MCSpectrum.from_cta_runs(electron_runs)
    electrons['weight'] = mc_production_electrons.reweigh_to_other_spectrum(
        electron_spectrum,
        electrons.mc_energy.values * u.TeV,
        t_assumed_obs=assumed_obs_time)
    electrons['theta'] = (calculate_distance_to_point_source(
        electrons, source_alt=source_alt, source_az=source_az).to(u.deg).value)
    electrons['type'] = ELECTRON_TYPE

    background = pd.concat([protons, electrons], sort=False)
    if return_rate:
        event_rate = background['weight'].sum() / assumed_obs_time.to(u.s)
        # print(f'Background event rate :{event_rate}')
        return background, event_rate
    else:
        return background
Example #12
def main(predictions, threshold, bins):
    df = fio.read_data(predictions, key='events')
    df = df[df.predictions_convnet >= threshold]
    selected_event_energies = df.energy.values

    df = fio.read_data(predictions, key='showers')
    all_event_energies = df.energy.values
    ax = plt.gca()

    bins = np.logspace(
        np.log10(all_event_energies.min()),
        np.log10(all_event_energies.max()),
        bins + 1,
    )

    ret = collection_area(
        all_event_energies,
        selected_event_energies,
        impact=270 * u.m,
        bins=bins,
        log=False,
        sample_fraction=1.0,
    )
    area, bin_centers, bin_width, lower_conf, upper_conf = ret

    ax.errorbar(
        bin_centers,
        area.value,
        xerr=bin_width / 2,
        yerr=[(area - lower_conf).value, (upper_conf - area).value],
    )

    plt.yscale('log')
    plt.xscale('log')

    plt.show()
Example #13
def test_theta_offs():
    from fact.io import read_data
    from fact.analysis import calc_theta_offs_camera

    df = read_data('tests/resources/gammas.hdf5', key='events')

    theta_offs = calc_theta_offs_camera(df.source_x_prediction,
                                        df.source_y_prediction,
                                        df.zd_source_calc,
                                        df.az_source_calc,
                                        df.zd_tracking,
                                        df.az_tracking,
                                        n_off=5)

    assert len(theta_offs) == 5
    assert all(len(theta_off) == len(df) for theta_off in theta_offs)
Example #14
def test_theta():
    from fact.io import read_data
    from fact.analysis import calc_theta_camera

    df = read_data('tests/resources/gammas.hdf5', key='events')

    theta = calc_theta_camera(
        df.source_x_prediction,
        df.source_y_prediction,
        df.zd_source_calc,
        df.az_source_calc,
        df.zd_tracking,
        df.az_tracking,
    )

    assert len(theta) == len(df)
Example #15
def test_read_data_h5py():
    '''
    Create a h5py hdf5 file from a dataframe and read it back.
    '''
    from fact.io import write_data, read_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50).astype('float32'),
        'N': np.random.randint(0, 10, dtype='uint8', size=50)
    }).sort_index(axis=1)

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        write_data(df, f.name, use_h5py=True, key='lecker_daten')

        df_from_file = read_data(f.name, key='lecker_daten').sort_index(axis=1)
        assert set(df.columns) == set(df_from_file.columns)
        assert df.equals(df_from_file)
Example #16
def test_read_data_csv():
    '''
    Write a csv file from a dataframe and then read it back again.
    '''
    from fact.io import write_data, read_data

    df = pd.DataFrame({
        'x': np.random.normal(size=50).astype('float32'),
        'N': np.random.randint(0, 10, dtype='uint8', size=50)
    })

    with tempfile.NamedTemporaryFile(suffix='.csv') as f:
        write_data(df, f.name)

        dtypes = {'x': 'float32', 'N': 'uint8'}
        df_from_file = read_data(f.name, dtype=dtypes)

        assert df.equals(df_from_file)
Example #17
def main(inputfile, outputfile, input_key, output_key, verbose):
    """
    Convert a pandas style hdf5 file to a h5py style hdf5 file

    INPUTFILE: A pandas style hdf5 file
    OUTPUTFILE: path for the output file in h5py format
    """

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    log.info("Reading input data")
    df = read_data(inputfile, key=input_key)
    log.info("done")

    with h5py.File(outputfile, "w") as f:
        g = f.create_group(output_key)

        for column, data in df.items():

            if data.dtype == object:
                log.debug("Columns has dtype object: {}".format(column))

                if isinstance(data.iloc[0], str):
                    log.debug("Columns is str: {}".format(column))
                    dt = h5py.special_dtype(vlen=str)
                    g.create_dataset(column,
                                     data=data.values,
                                     dtype=dt,
                                     maxshape=(None, ))

                elif isinstance(data.iloc[0], list):
                    log.debug("Columns is list: {}".format(column))
                    array = np.array([o for o in data.values])
                    shape = list(array.shape)
                    shape[0] = None
                    g.create_dataset(column, data=array, maxshape=tuple(shape))

                else:
                    log.warning(
                        "skipping object type column {}".format(column))
            else:
                log.debug("Writing out {}".format(column))
                g.create_dataset(column, data=data.values, maxshape=(None, ))
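
Reading a single column back from the h5py-style file written above could look like this (assuming output_key='events' and a column named 'width'):

import h5py

with h5py.File('outputfile.hdf5', 'r') as f:
    width = f['events/width'][:]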
Example #18
def main(inputfile, outputfile, input_key, output_key, verbose):
    '''
    Convert a pandas style hdf5 file to a h5py style hdf5 file

    INPUTFILE: A pandas style hdf5 file
    OUTPUTFILE: path for the output file in h5py format
    '''

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    log.info('Reading input data')
    df = read_data(inputfile, key=input_key)
    log.info('done')

    with h5py.File(outputfile, 'w') as f:
        g = f.create_group(output_key)

        for column, data in df.items():

            if data.dtype == object:
                log.debug('Column has dtype object: {}'.format(column))

                if isinstance(data.iloc[0], str):
                    log.debug('Column is str: {}'.format(column))
                    dt = h5py.special_dtype(vlen=str)
                    g.create_dataset(
                        column, data=data.values, dtype=dt, maxshape=(None, )
                    )

                elif isinstance(data.iloc[0], list):
                    log.debug('Column is list: {}'.format(column))
                    array = np.array([o for o in data.values])
                    shape = list(array.shape)
                    shape[0] = None
                    g.create_dataset(column, data=array, maxshape=tuple(shape))

                else:
                    log.warning('skipping object type column {}'.format(column))
            else:
                log.debug('Writing out {}'.format(column))
                g.create_dataset(column, data=data.values, maxshape=(None, ))
Example #19
def main(method):

    phs = read_data('/home/ksedlaczek/OUT_DBSCAN/crab_data_precuts.hdf5',
                    key='events')

    df = phs
    #  df = phs.join(
    #      std,
    #      how='inner',
    #      rsuffix='_std',
    #      lsuffix='_phs',
    #      on=('run', 'event', 'reuse'),
    #  )
    df.sort_index(axis=1, inplace=True)

    with PdfPages('Energy_vs_size_{}.pdf'.format(method)) as pdf:
        print("Plotting 2D")
        plot_comparison(df, log=True)
        pdf.savefig()
        plt.close()
Example #20
def load_crab_data(
    start=0,
    end=1000,
):
    dl3 = fio.read_data('./data/dl3/open_crab_sample_dl3.hdf5', key='events')
    dl3 = dl3.set_index(['night', 'run_id', 'event_num'])

    f = h5py.File('./data/crab_images.hdf5', 'r')
    night = f['events/night'][start:end]
    run = f['events/run_id'][start:end]
    event = f['events/event_num'][start:end]
    images = f['events/image'][start:end]

    df = pd.DataFrame({'night': night, 'run_id': run, 'event_num': event})
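    # remember each row's position in the HDF5 file so the images array can be indexed after the join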
    df['int_index'] = df.index
    df = df.set_index(['night', 'run_id', 'event_num'])

    data = df.join(dl3, how='inner')
    images = scale_images(images[data.int_index])

    return data, images
Example #21
def main(method, type):
    std = read_data(
        '/home/ksedlaczek/Packages/open_crab_sample_analysis/dl2/{}.hdf5'.
        format(type),
        key='events',
        columns=[
            'run_id',
            #  'corsika_event_header_event_number',
            #  'corsika_event_header_num_reuse',
            'width',
            'length',
            'delta',
            'size',
            'cog_x',
            'cog_y',
            # 'pointing_position_az',
            'skewness_long',
            'skewness_trans',
            'night',
            'event_num',
        ])

    std.rename(columns={
        'run_id': 'run',
        'event_num': 'event',
    }, inplace=True)
    std.set_index(['night', 'run', 'event'], inplace=True)

    phs = read_data(
        '/net/big-tank/POOL/projects/fact/photon-stream/features/{}/{}_data.hdf5'
        .format(method, type),
        key='events')

    df = phs.join(
        std,
        how='inner',
        rsuffix='_std',
        lsuffix='_phs',
        on=('night', 'run', 'event'),
    )
    df.sort_index(axis=1, inplace=True)

    with PdfPages(
            '/home/ksedlaczek/OUT_{}/pdf/std_phs_comparison_{}_{}.pdf'.format(
                method, method, type)) as pdf:
        print("Plotting width")
        plot_comparison(df, 'width')
        pdf.savefig()
        plt.close()

        print("Plotting length")
        plot_comparison(df, 'length')
        pdf.savefig()
        plt.close()

        #        print("Plotting delta")
        #        plot_comparison(df, 'delta')
        #        pdf.savefig()
        #        plt.close()

        #        print("Plotting skewness_long")
        #        plot_comparison(df, 'skewness_long')
        #        pdf.savefig()
        #        plt.close()

        #       print("Plotting skewness_trans")
        #       plot_comparison(df, 'skewness_trans', logz=False)
        #       pdf.savefig()
        #       plt.close()

        print("Plotting size")
        plot_comparison(df, 'size', log=True)
        pdf.savefig()
        plt.close()

#        print("Plotting cog x")
#        plot_comparison(df, 'cog_x', logz=False)
#        pdf.savefig()
#        plt.close()
#
#        print("Plotting cog y")
#        plot_comparison(df, 'cog_y', logz=False)
#        pdf.savefig()
#        plt.close()

#         print("Plotting 2D")
#         plot_comparison2(df, log=True)
#         pdf.savefig()
#         plt.close()
    with PdfPages(
            '/home/ksedlaczek/OUT_{}/pdf/std_phs_comparison_hist_same_{}_{}.pdf'
            .format(method, method, type)) as pdf:
        print("Plotting width")
        plot_comparison_hist(df, 'width')
        pdf.savefig()
        plt.close()

        print("Plotting length")
        plot_comparison_hist(df, 'length')
        pdf.savefig()
        plt.close()
        #
        #        print("Plotting delta")
        #        plot_comparison_hist(df, 'delta')
        #        pdf.savefig()
        #        plt.close()

        #         print("Plotting skewness_long")
        #         plot_comparison_hist(df, 'skewness_long')
        #         pdf.savefig()
        #         plt.close()

        #       print("Plotting skewness_trans")
        #       plot_comparison(df, 'skewness_trans', logz=False)
        #       pdf.savefig()
        #       plt.close()

        print("Plotting size")
        plot_comparison_hist(df, 'size', log=True)
        pdf.savefig()
        plt.close()

#        print("Plotting cog x")
#        plot_comparison_hist(df, 'cog_x', logz=False)
#        pdf.savefig()
#        plt.close()
#
#        print("Plotting cog y")
#        plot_comparison_hist(df, 'cog_y', logz=False)
#        pdf.savefig()
#        plt.close()

    with PdfPages(
            '/home/ksedlaczek/OUT_{}/pdf/std_phs_comparison_hist_all_{}_{}.pdf'
            .format(method, method, type)) as pdf:
        print("Plotting width")
        plot_comparison_hist_all(std, phs, 'width')
        pdf.savefig()
        plt.close()

        print("Plotting length")
        plot_comparison_hist_all(std, phs, 'length')
        pdf.savefig()
        plt.close()
        #
        #        print("Plotting delta")
        #        plot_comparison_hist_all(std, phs, 'delta')
        #        pdf.savefig()
        #        plt.close()
        #
        #        print("Plotting skewness_long")
        #        plot_comparison_hist_all(std, phs, 'skewness_long')
        #        pdf.savefig()
        #        plt.close()

        #       print("Plotting skewness_trans")
        #       plot_comparison(df, 'skewness_trans', logz=False)
        #       pdf.savefig()
        #       plt.close()

        print("Plotting size")
        plot_comparison_hist_all(std, phs, 'size', log=True)
        pdf.savefig()
        plt.close()
Example #22
def main(predictions, threshold, theta_cut, net):
    bins = 40
    alpha = 0.2
    limits = [0, 0.3]
    df = fio.read_data(predictions, key='events')
    print(df.columns)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    if net:
        print('using cnn predictions')
        selected = df.query('predictions_convnet > {}'.format(threshold))
        ax.set_title('Neural Net predictions')
    else:
        print('using standard predictions')
        selected = df.query('gamma_prediction > {}'.format(threshold))
        ax.set_title('RF predictions')

    theta_on = selected.theta_deg
    theta_off = pd.concat(
        [selected['theta_deg_off_{}'.format(i)] for i in range(1, 6)])
    h_on, bin_edges = np.histogram(theta_on.apply(lambda x: x**2).values,
                                   bins=bins,
                                   range=limits)
    h_off, bin_edges, _ = ax.hist(
        theta_off.apply(lambda x: x**2).values,
        bins=bin_edges,
        range=limits,
        weights=np.full(len(theta_off), alpha),
        histtype='stepfilled',
        color='lightgray',
    )

    bin_center = bin_edges[1:] - np.diff(bin_edges) * 0.5
    bin_width = np.diff(bin_edges)

    ax.errorbar(
        bin_center,
        h_on,
        yerr=np.sqrt(h_on) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='On',
    )
    ax.errorbar(
        bin_center,
        h_off,
        yerr=alpha * np.sqrt(h_off) / 2,
        xerr=bin_width / 2,
        linestyle='',
        label='Off',
        color='darkgray',
    )

    ax.axvline(theta_cut**2, color='black', alpha=0.3, linestyle='--')

    n_on = np.sum(theta_on < theta_cut)
    n_off = np.sum(theta_off < theta_cut)
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    print('N_on', n_on)
    print('N_off', n_off)
    print('Li&Ma: {}'.format(significance))

    ax.text(
        0.5,
        0.95,
        stats_box_template.format(
            n_on=n_on,
            n_off=n_off,
            alpha=alpha,
            n_excess=n_on - alpha * n_off,
            n_excess_err=np.sqrt(n_on + alpha**2 * n_off),
            significance=significance,
        ),
        transform=ax.transAxes,
        va='top',
        ha='center',
    )

    ax.set_xlim(*limits)
    ax.legend(loc='lower right')
    fig.tight_layout(pad=0)

    plt.show()
Example #23
from fact.io import read_data
import numpy as np
import matplotlib.pyplot as plt


size_cuts = np.logspace(0, 5, 500)


for eps in [0.03, 0.05, 0.06, 0.07, 0.1, None]:
    if eps:
        d = read_data(f'/net/big-tank/POOL/projects/fact/photon-stream/features/{eps:.2f}/crab_data.hdf5', key='events', columns=['size'])
    else:
        d = read_data('/net/big-tank/POOL/projects/fact/data/open/dl2/FACT-Tools/v1.1.2/open_crab_sample_facttools_dl2.hdf5', key='events', columns=['size'])

    n_events = []
    size = d['size'].values
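    # size_cuts is increasing, so repeatedly filtering the already-filtered array is safe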
    for cut in size_cuts:
        size = size[size >= cut]
        n_events.append(len(size))

    plt.plot(size_cuts, n_events, label=r'$\varepsilon =$ {:.2}'.format(eps) if eps else 'FACT-Tools')

plt.yscale('log')
plt.xscale('log')
plt.ylabel('events')
plt.xlabel('size-cut')
plt.grid()
plt.title(r'Data rates for different $\varepsilon$')
plt.legend()
plt.tight_layout()
plt.savefig('data_rates_eps.pdf')
Example #24
def main(method, path, file, feat, number):

    border_pix = get_border_pixel_mask()

    print("Reading in facttools dl1 file...")
    t = Table.read('/net/big-tank/POOL/projects/fact/photon-stream/facttools/crab/{}_dl1.fits'.format(file))
    print("Reading in facttools dl2 file...")
    dl2 = read_data('/home/ksedlaczek/Packages/open_crab_sample_analysis/dl2/crab.hdf5', key='events')
    print("Reading in PhotonStream data file...")
    reader = ps.EventListReader('/net/big-tank/POOL/projects/fact/photon-stream/stream_data/{}/{}.phs.jsonl.gz'.format(path, file))
    print("Done...")
    event = next(reader)

    fig, axs = plt.subplots(3, 1, figsize=(4, 10), constrained_layout=True)

    plots = [camera(np.zeros(1440), cmap='inferno', ax=ax) for ax in axs]
    cbars = [fig.colorbar(plot, ax=ax) for plot, ax in zip(plots, axs)]
    plots[1].set_cmap('RdBu_r')

    with PdfPages('pe_difference_{}_{}.pdf'.format(feat, file)) as pdf:
        for i in tqdm(range(number)):

            if is_simulation_event(event):
                event_num_phs = event.simulation_truth.event
                reuse_phs = event.simulation_truth.reuse
                run_id_phs = event.simulation_truth.run
            else:
                event_num_phs = event.observation_info.event
                reuse_phs = 42
                run_id_phs = event.observation_info.run

            if path != 'crab':
                run_id = file
                event_num = t[i]['MCorsikaEvtHeader.fEvtNumber']
                reuse = t[i]['MCorsikaEvtHeader.fNumReuse']
            else:
                run_id = file
                event_num = t[i]['EventNum']
                reuse = 42

            assert run_id != run_id_phs

            while (event_num_phs != event_num or reuse != reuse_phs):
                event = next(reader)
                if is_simulation_event(event):
                    event_num_phs = event.simulation_truth.event
                    reuse_phs = event.simulation_truth.reuse
                    run_id_phs = event.simulation_truth.run
                else:
                    event_num_phs = event.observation_info.event
                    reuse_phs = 42
                    run_id_phs = event.observation_info.run

            lol = event.photon_stream.list_of_lists
            # cut away unwanted time slices
            # lol = [[t for t in l if ((35 <= t) & (t < 75))] for l in lol]
            image = phs2image(lol)

            if method == 'DBSCAN':
                clustering = ps.photon_cluster.PhotonStreamCluster(event.photon_stream)
                biggest_cluster = np.argmax(np.bincount(clustering.labels[clustering.labels != -1]))
                mask = clustering.labels == biggest_cluster

                cleaned_pix_phs = np.zeros(len(image), dtype=bool)
                k = 0
                cleaned_img = np.zeros(len(image))
                for h in range(len(lol)):
                    for j in range(len(lol[h])):
                        k += 1
                        if mask[k-1]:
                            cleaned_pix_phs[h] = True
            else:
                cleaned_pix_phs = facttools_cleaning(image, lol, picture_thresh, boundary_thresh)
                if sum(cleaned_pix_phs) < 1:
                    continue

            cleaned_pix = t[i]['shower']

            t[i]['photoncharge'][t[i]['photoncharge'] < 0] = 0.0
            pe_difference = image - t[i]['photoncharge']

            max_abs = np.max(np.abs(pe_difference))

            plots[0].set_array(image)
            plots[1].set_array(pe_difference)
            plots[2].set_array(t[i]['photoncharge'])

            plots[0].autoscale()
            plots[1].set_clim(-max_abs, max_abs)
            plots[2].autoscale()
            # mark_pixel(t[i]['shower'], color=(128/255, 186/255, 38/255), linewidth=2.5)
            for ax in axs:
                ax.axis('off')

            for cbar, plot in zip(cbars, plots):
                cbar.update_bruteforce(plot)

            # embed()
            if is_simulation_event(event):
                fig.suptitle('run {} event {} reuse {} mean {:.2f}'.format(run_id, event_num, reuse, np.mean(pe_difference)))
            else:
                # fig.suptitle('{} event {} mean {:.2f}'.format(file, event_num, np.mean(pe_difference)))
                fig.suptitle('{} event {}'.format(file[:8] + ' ' + file[9:], event_num))
            pdf.savefig(fig)
Example #25
gamma_1_300['sign_prediction'] = np.sign(gamma_1_300.disp_prediction)
gamma_1_300_cuts = gamma_1_300.query('sign_prediction == disp_sign')
gamma_1_300_cuts = gamma_1_300_cuts.query(f'gammaness > {gammaness_threshold}')

figures.append(plt.figure())
ax = figures[-1].add_subplot(1, 1, 1)
plotting.angular_res(gamma_1_150_cuts, 'mc_energy', ax, label='v0.5.1 and intensity > 150')
plotting.angular_res(gamma_1_300_cuts, 'mc_energy', ax, label='v0.5.1 and intensity > 300')
plotting.angular_res(gamma_2_150_cuts, 'mc_energy', ax, label='v0.5.2 and intensity > 150')
plotting.angular_res(gamma_2_300_cuts, 'mc_energy', ax, label='v0.5.2 and intensity > 300')
#ax.set_title(rf'correct sign and $p_\gamma > {gammaness_threshold}$')


# energy performance
energy_2_150 = read_data('../build/cv_regressor.h5', key='data')
energy_2_300 = read_data('../HDD/build_scaling_300/cv_regressor.h5', key='data')
energy_1_150 = read_data('../HDD/build_noscaling/cv_regressor.h5', key='data')
energy_1_300 = read_data('../HDD/build_noscaling_300/cv_regressor.h5', key='data')

figures.append(plt.figure())
ax = figures[-1].add_subplot(1, 1, 1)
plot_bias_resolution(energy_1_150, key='bias', label='v0.5.1 and intensity > 150', ax=ax)
plot_bias_resolution(energy_1_300, key='bias', label='v0.5.1 and intensity > 300', ax=ax)
plot_bias_resolution(energy_2_150, key='bias', label='v0.5.2 and intensity > 150', ax=ax)
plot_bias_resolution(energy_2_300, key='bias', label='v0.5.2 and intensity > 300', ax=ax)
#ax.set_title('Bias')

figures.append(plt.figure())
ax = figures[-1].add_subplot(1, 1, 1)
plot_bias_resolution(energy_1_150, key='resolution_quantiles', label='v0.5.1 and intensity > 150', ax=ax)
Example #26
from fact.analysis import li_ma_significance, split_on_off_source_independent
from fact.io import read_data


df = read_data('crab_gammas_dl3.hdf5', key='events')


on, off = split_on_off_source_independent(
    df.query('gamma_prediction > 0.85'),
    0.025,
)

with open('build/significance.tex', 'w') as f:
    f.write(r'\SI{')
    f.write(
        '{:.1f}'.format(li_ma_significance(len(on), len(off), 0.2))
    )
    f.write(r'}{σ}')
Example #27
def load_runs(path):
    return read_data(path, key='runs')
Example #28
def main(method, path, file, feat, number):

    border_pix = get_border_pixel_mask()

    print("Reading in facttools dl1 file...")
    t = Table.read('/net/big-tank/POOL/projects/fact/photon-stream/facttools/{}/{}_dl1.fits'.format(path, file))
    print("Reading in facttools dl2 file...")
    dl2 = read_data('/home/ksedlaczek/Packages/open_crab_sample_analysis/dl2/{}.hdf5'.format(path), key='events')
    print("Reading in PhotonStream data file...")
    reader = ps.EventListReader('/net/big-tank/POOL/projects/fact/photon-stream/stream_data/{}/{}.phs.jsonl.gz'.format(path, file))
    print("Done...")
    event = next(reader)

    all_pe_diff_mean = []
    all_pe_diff = []
    delta_delta = []
    delta_delta_diff = []
    delta_delta_diff_perc = []
    d_delta = []

    for i in tqdm(range(number)):

        if is_simulation_event(event):
            event_num_phs = event.simulation_truth.event
            reuse_phs = event.simulation_truth.reuse
            run_id_phs = event.simulation_truth.run
        else:
            event_num_phs = event.observation_info.event
            reuse_phs = 42
            run_id_phs = event.observation_info.run

        if path != 'crab':
            run_id = file
            event_num = t[i]['MCorsikaEvtHeader.fEvtNumber']
            reuse = t[i]['MCorsikaEvtHeader.fNumReuse']
        else:
            run_id = file
            event_num = t[i]['EventNum']
            reuse = 42

        assert run_id != run_id_phs

        while (event_num_phs != event_num or reuse != reuse_phs):
            event = next(reader)
            if is_simulation_event(event):
                event_num_phs = event.simulation_truth.event
                reuse_phs = event.simulation_truth.reuse
                run_id_phs = event.simulation_truth.run
            else:
                event_num_phs = event.observation_info.event
                reuse_phs = 42
                run_id_phs = event.observation_info.run

        lol = event.photon_stream.list_of_lists
        # cut away unwanted time slices
        # lol = [[t for t in l if ((35 <= t) & (t < 75))] for l in lol]
        image = phs2image(lol)

        if method == 'DBSCAN':
            clustering = ps.photon_cluster.PhotonStreamCluster(event.photon_stream)
            if clustering.number < 1:
                continue
            biggest_cluster = np.argmax(np.bincount(clustering.labels[clustering.labels != -1]))
            mask = clustering.labels == biggest_cluster

            cleaned_pix_phs = np.zeros(len(image), dtype=bool)
            k = 0
            cleaned_img = np.zeros(len(image))
            for h in range(len(lol)):
                for j in range(len(lol[h])):
                    k += 1
                    if mask[k-1]:
                        cleaned_pix_phs[h] = True
        else:
            cleaned_pix_phs = facttools_cleaning(image, lol, picture_thresh, boundary_thresh)
            if sum(cleaned_pix_phs) < 1:
                continue

        cleaned_pix = t[i]['shower']

        t[i]['photoncharge'][t[i]['photoncharge'] < 0] = 0.0
        pe_difference = image - t[i]['photoncharge']

        delta = np.rad2deg(dl2.query('night == 20131104 & run_id == 162 & event_num == {}'.format(t[i]['EventNum']))['delta'].values[0])


        delta_diff_same = calc_delta(image, cleaned_pix) - delta
        delta_diff_whole = calc_delta(image, cleaned_pix_phs) - delta
        delta_diff_whole_perc = calc_delta_perc(image, cleaned_pix_phs) - delta
        for val in pe_difference:
            all_pe_diff.append(val)
        all_pe_diff_mean.append(np.mean(pe_difference))
        delta_delta.append(delta_diff_same)
        delta_delta_diff.append(delta_diff_whole)
        delta_delta_diff_perc.append(delta_diff_whole_perc)
        d_delta.append(calc_delta_delta(event, cleaned_pix))


    plt.figure()
    plt.hist(all_pe_diff_mean, bins=100, histtype='step')
    plt.title('Means of pe differences per image')
    plt.tight_layout()
    plt.savefig('means_hist_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(all_pe_diff, bins=100, histtype='step', density=True)
    plt.title('PE differences per pixel')
    plt.tight_layout()
    plt.savefig('diffs_hist_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(all_pe_diff, bins=100, histtype='step')
    plt.title('PE differences per pixel')
    plt.semilogy()
    plt.ylabel('events')
    plt.xlabel(r'$\mathrm{\Delta}$PE')
    plt.tight_layout()
    plt.savefig('diffs_hist_{}_{}_{}_logy.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(delta_delta, bins=70, histtype='step', density=True)
    plt.title(r'$\mathrm{\Delta}\delta$ between phs and facttools')
    plt.tight_layout()
    plt.savefig('delta_hist_same_pixels_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(delta_delta_diff, bins=70, histtype='step', density=True)
    plt.title(r'$\mathrm{\Delta}\delta$ between phs and facttools')
    plt.xlabel(r'$\mathrm{\Delta}\delta$ / deg')
    plt.tight_layout()
    plt.savefig('delta_diff_hist_different_cleanings_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(delta_delta_diff_perc, bins=70, histtype='step', density=True)
    plt.title(r'$\mathrm{\Delta}\delta$ between phs and facttools')
    plt.xlabel(r'$\mathrm{\Delta}\delta$ / deg')
    plt.tight_layout()
    plt.savefig('delta_diff_hist_perc_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(delta_delta_diff, bins=70, histtype='step', density=True, label='DBSCAN')
    plt.hist(delta_delta_diff_perc, bins=70, histtype='step', density=True, label='DBSCAN + > 1%')
    plt.title(r'$\mathrm{\Delta}\delta$ between phs and facttools')
    plt.xlabel(r'$\mathrm{\Delta}\delta$ / deg')
    plt.legend()
    plt.tight_layout()
    plt.savefig('delta_diff_hist_perc_and_normal_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()

    plt.figure()
    plt.hist(d_delta, bins=100, histtype='step', density=True)
    plt.title(r'$\mathrm{\Delta}\delta$ of phs with facttools cleaning relative to $\delta_{\mathrm{true}}$ per image')
    plt.tight_layout()
    plt.savefig('delta_true_diff_hist_{}_{}_{}.pdf'.format(method, feat, file))
    plt.clf()
Example #29
def runs():
    return fio.read_data(os.path.join(FIXTURE_DIR, 'crab_dl3_sample.hdf5'),
                         key='runs')
Example #30
from fact.io import read_data
from astropy.coordinates import SkyCoord, AltAz
from fact.instrument.constants import LOCATION
from fact.coordinates.utils import to_astropy_time
import pandas as pd
from argparse import ArgumentParser
from IPython import embed

parser = ArgumentParser()
parser.add_argument('inputfile')
parser.add_argument('-o', '--outputfile')
parser.add_argument('-t', '--threshold', type=float)

args = parser.parse_args()

facttools = read_data(
    '/home/ksedlaczek/Packages/open_crab_sample_analysis/build/crab_precuts.hdf5',
    key='events')

facttools = facttools.query('gamma_prediction >= 0.8').copy()

phs = read_data(args.inputfile, key='events')

facttools.set_index(['run_id', 'event_num', 'night'], inplace=True)
df = phs.join(
    facttools,
    how='inner',
    rsuffix='_std',
    lsuffix='_phs',
    on=('run', 'event', 'night'),
)
df.sort_index(axis=1, inplace=True)
Example #31
def main(method, path, file, feat, number):

    border_pix = get_border_pixel_mask()
    if method == "thresholds":
        reader = ps.EventListReader(
            '/net/big-tank/POOL/projects/fact/photon-stream/stream_data/{}/{}.phs.jsonl.gz'
            .format(path, file))
        with PdfPages('cleaning_thresh_{}_{}.pdf'.format(feat, file)) as pdf:
            for i in tqdm(range(number)):
                fig = plt.figure()
                ax = fig.add_axes([0.05, 0.05, 0.9, 0.9])
                #ax.set_axis_off()
                event = next(reader)

                lol = event.photon_stream.list_of_lists
                lol = [[t for t in l if ((35 <= t) & (t < 75))] for l in lol]
                image = phs2image(lol)  #, lower=30, upper=70)
                cleaned_pix = facttools_cleaning(image, lol, 35, 75,
                                                 picture_thresh,
                                                 boundary_thresh)
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    arrival_times = np.array([np.nanmedian(l) for l in lol])
                # cleaned_pix = cleaning(image, lol, picture_thresh, boundary_thresh)
                if len(cleaned_pix[cleaned_pix != 0]) > 1:
                    # border_ph = [(border_pix[i] and cleaned_pix[i]) for i in range(1440)]
                    # leakage = image[border_ph].sum()/image[cleaned_pix].sum()
                    df = calc_hillas_features_image(image, cleaned_pix)
                    # ell = Ellipse(
                    #     [df['cog_x'], df['cog_y']],
                    #     df['length']*2,
                    #     df['width']*2,
                    #     angle=np.rad2deg(df['delta']),
                    #     fill=False, linewidth=2, color='b'
                    # )
                    # ax.add_patch(ell)
                    ell = Ellipse([df['cog_x'], df['cog_y']],
                                  df['length'] * 4,
                                  df['width'] * 4,
                                  angle=np.rad2deg(df['delta']),
                                  fill=False,
                                  linewidth=1.5,
                                  color='b')
                    # ax.add_patch(ell)
                    if is_simulation_event(event):
                        fig.suptitle('run {} event {} reuse {}'.format(
                            event.simulation_truth.run,
                            event.simulation_truth.event,
                            event.simulation_truth.reuse))
                    else:
                        fig.suptitle('{} event {} delta {}'.format(
                            file, event.observation_info.event, df['delta']))
                    if feat == 'arrival_times':
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            x = arrival_times - np.nanmean(arrival_times)
                        x[np.isnan(x)] = 0
                        c = camera(x, cmap='Spectral', ax=ax)
                        mark_pixel(cleaned_pix, color='k', linewidth=2.5)
                    else:
                        c = camera(image, cmap='viridis', ax=ax)
                        mark_pixel(cleaned_pix,
                                   color=(128 / 255, 186 / 255, 38 / 255),
                                   linewidth=2.5)
                    ax.axis('off')
                    fig.colorbar(c)
                    pdf.savefig(fig)
                    ax.cla()
                plt.close(fig)

    if method == "DBSCAN":
        reader = ps.EventListReader(
            '/net/big-tank/POOL/projects/fact/photon-stream/stream_data/{}/{}.phs.jsonl.gz'
            .format(path, file))
        with PdfPages('cleaning_DBSCAN_biggest_{}_{}.pdf'.format(feat,
                                                                 file)) as pdf:
            for i in tqdm(range(number)):
                fig = plt.figure()
                ax = fig.add_axes([0.05, 0.05, 0.9, 0.9])
                event = next(reader)

                # clustering of events
                clustering = ps.photon_cluster.PhotonStreamCluster(
                    event.photon_stream)

                if clustering.number > 0:

                    lol = event.photon_stream.list_of_lists
                    image = phs2image(lol)
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        arrival_times = np.array(
                            [np.nanmedian(l) for l in lol])
                    # biggest cluster:
                    biggest_cluster = np.argmax(
                        np.bincount(
                            clustering.labels[clustering.labels != -1]))

                    mask = clustering.labels == biggest_cluster
                    # mask = clustering.labels != -1

                    xyt = event.photon_stream.point_cloud
                    x, y, t = xyt.T
                    cleaned_pix = np.zeros(len(image), dtype=bool)
                    k = 0
                    cleaned_img = np.zeros(len(image))
                    for i in range(len(lol)):
                        for j in range(len(lol[i])):
                            k += 1
                            if mask[k - 1]:
                                cleaned_pix[i] = True
                                cleaned_img[i] += 1

                    cleaned_pix_perc = np.zeros(1440, dtype=bool)
                    for i in range(1440):
                        if cleaned_pix[i] and (cleaned_img[i] >
                                               mask.sum() / 200):
                            cleaned_pix_perc[i] = True
                    df = calc_hillas_features_phs(event.photon_stream,
                                                  clustering)
                    # ell = Ellipse(
                    #     [df['cog_x'], df['cog_y']],
                    #     df['length']*2,
                    #     df['width']*2,
                    #     angle=np.rad2deg(df['delta']),
                    #     fill=False, linewidth=2, color='b'
                    # )
                    # ax.add_patch(ell)
                    ell = Ellipse([df['cog_x'], df['cog_y']],
                                  df['length'] * 4,
                                  df['width'] * 4,
                                  angle=np.rad2deg(df['delta']),
                                  fill=False,
                                  linewidth=1.5,
                                  color='b')
                    # ax.add_patch(ell)
                    if is_simulation_event(event):
                        fig.suptitle('run {} event {} reuse {}'.format(
                            event.simulation_truth.run,
                            event.simulation_truth.event,
                            event.simulation_truth.reuse))
                    else:
                        fig.suptitle('{} event {} delta {:.2f}'.format(
                            file, event.observation_info.event,
                            np.rad2deg(df['delta'])))
                    if feat == 'arrival_times':
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            x = arrival_times - np.nanmean(arrival_times)
                        c = camera(x, cmap='viridis', ax=ax)
                        mark_pixel(cleaned_pix,
                                   color=(128 / 255, 186 / 255, 38 / 255),
                                   linewidth=2.5)
                    else:
                        c = camera(image, cmap='viridis', ax=ax)
                        mark_pixel(cleaned_pix,
                                   color=(128 / 255, 186 / 255, 38 / 255),
                                   linewidth=2.5)
                        mark_pixel(cleaned_pix_perc,
                                   color='red',
                                   linewidth=1.5)
                    ax.axis('off')
                    fig.colorbar(c)
                    pdf.savefig(fig)
                    ax.cla()
                plt.close(fig)

    if method == "facttools":
        print('facttools')
        with PdfPages('cleaning_facttools_{}_{}.pdf'.format(feat,
                                                            file)) as pdf:
            t = Table.read(
                '/net/big-tank/POOL/projects/fact/photon-stream/facttools/{}/{}_dl1.fits'
                .format(path, file))
            dl2 = read_data(
                '/home/ksedlaczek/Packages/open_crab_sample_analysis/dl2/crab.hdf5',
                key='events')

            for i in tqdm(range(number)):
                fig = plt.figure()
                ax = fig.add_axes([0.05, 0.05, 0.9, 0.9])
                #                if path != 'crab':
                #                    fig.suptitle('run {} event {} reuse {}'.format(file, t[i]['MCorsikaEvtHeader.fEvtNumber'], t[i]['MCorsikaEvtHeader.fNumReuse']))
                #                else:
                #
                #                    fig.suptitle('{} event {} delta {:.4f}'.format(file, t[i]['EventNum'], dl2.query('night == 20131104 & run_id == 162 & event_num == {}'.format(t[i]['EventNum']))['delta'].values[0]))

                t[i]['photoncharge'][t[i]['photoncharge'] < 0] = 0.0
                if feat == 'arrival_times':
                    c = camera(t[i]['arrivalTime'] -
                               t[i]['arrivalTime'].mean(),
                               cmap='Spectral',
                               ax=ax)
                    # mark_pixel(t[i]['shower'], color='k', linewidth=2.5)
                    ax.axis('off')
                    cb = fig.colorbar(c)
                    cb.set_label(label=r'$t-\bar{t}$ / ns', fontsize=16)
                else:
                    c = camera(t[i]['photoncharge'], cmap='viridis', ax=ax)
                    ax.axis('off')
                    cb = fig.colorbar(c)
                    cb.set_label(label=r'Number of Photons', fontsize=16)

                    #mark_pixel(t[i]['shower'], color=(128/255, 186/255, 38/255), linewidth=2.5)
                # mark_pixel(t[i]['shower'], color=(128/255, 186/255, 38/255), linewidth=2.5)
                pdf.savefig(fig)
                ax.cla()
                plt.close(fig)
Example #32
from fact.io import read_data
from fact.coordinates import horizontal_to_camera
from astropy.coordinates import SkyCoord, AltAz
from fact.instrument.constants import LOCATION
from fact.coordinates.utils import to_astropy_time
import pandas as pd
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('inputfile')
parser.add_argument('-o', '--outputfile')
parser.add_argument('-t', '--threshold', type=float)

args = parser.parse_args()

print("Reading in data...")
df = read_data(args.inputfile, key='events')
print("Done!")

if args.threshold:
    df = df.query(f'gamma_prediction >= {args.threshold}').copy()

if 'source_position_zd' not in df.columns:
    frame = AltAz(location=LOCATION, obstime=to_astropy_time(pd.to_datetime(df['timestamp'])))

    crab = SkyCoord.from_name('Crab')
    crab_altaz = crab.transform_to(frame)

    df['source_position_zd'] = crab_altaz.zen.deg
    df['source_position_az'] = crab_altaz.az.deg

df['source_x'], df['source_y'] = horizontal_to_camera(