Example #1
def test_to_h5py():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')

        with h5py.File(f.name, 'r') as hf:

            assert 'test' in hf.keys()

            g = hf['test']

            assert 'x' in g.keys()
            assert 'N' in g.keys()

        df2 = read_h5py(f.name, key='test')
        df2.sort_index(axis=1, inplace=True)
        df.sort_index(axis=1, inplace=True)

        assert all(df.dtypes == df2.dtypes)
        assert all(df['x'] == df2['x'])
        assert all(df['N'] == df2['N'])
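Since the test above checks that each DataFrame column becomes its own dataset inside the chosen group, a single column can also be read back directly with h5py without touching the rest. A minimal sketch, assuming the same kind of to_h5py call as above; the path events.hdf5 is a placeholder:

import h5py
import numpy as np
import pandas as pd
from fact.io import to_h5py

df = pd.DataFrame({'x': np.random.normal(size=50)})
to_h5py(df, 'events.hdf5', key='test', mode='w')

with h5py.File('events.hdf5', 'r') as hf:
    x = hf['test']['x'][:]  # plain NumPy array containing only the 'x' column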
Example #2
def main(infile, outfile, tel_name):

    sim_runs = None
    parameters = pd.read_hdf(infile,
                             key=f'dl1/event/telescope/parameters/{tel_name}')
    focal_length = pd.read_hdf(
        infile, key='instrument/telescope/optics').drop_duplicates().set_index(
            'name').loc['LST', 'equivalent_focal_length']

    # renaming for simulations
    if 'mc_az' in parameters.columns:
        with tables.open_file(infile) as f:
            sim_runs = f.root.simulation.run_config[:]

        sim_runs_df = pd.DataFrame()
        for name in sim_runs.dtype.names:
            if name != 'run_array_direction':
                sim_runs_df[name] = sim_runs[name]

        parameters['az_tel'] = parameters.mc_az_tel
        parameters['alt_tel'] = parameters.mc_alt_tel

    parameters['focal_length'] = focal_length
    to_h5py(parameters, outfile, key='events', mode='w')

    if sim_runs is not None:
        to_h5py(sim_runs_df, outfile, key='corsika_runs', mode='a')
Example #3
def test_write_lists_h5py():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({'x': [[1.0, 2.0], [3.0, 4.0]]})

    with tempfile.NamedTemporaryFile(suffix='.hdf5') as f:
        to_h5py(df, f.name)

        df = read_h5py(f.name, columns=['x'])

        assert df['x_0'].iloc[0] == 1.0
Example #4
def test_multiple_config():
    from aict_tools.apply import create_mask_h5py

    config = [{"b": [">", 0]}, {"b": ["<", 5]}]

    with tempfile.NamedTemporaryFile(prefix="test_aict_", suffix=".hdf5") as f:
        to_h5py(df, f.name, key="events")

        mask = create_mask_h5py(h5py.File(f.name, "r"),
                                n_events=len(df),
                                selection_config=config)
        assert all(mask == [False, True, False, True])
Example #5
def main(infile, outfile, tel_name):
    
    parameters = pd.read_hdf(infile, key=f'dl1/event/telescope/parameters/{tel_name}')
    focal_length = pd.read_hdf(
        infile, key='instrument/telescope/optics').drop_duplicates().set_index(
            'name').loc['LST', 'equivalent_focal_length']

    # renaming for simulations
    if 'mc_az' in parameters.columns:
        parameters['az_tel'] = parameters.mc_az_tel
        parameters['alt_tel'] = parameters.mc_alt_tel

    parameters['focal_length'] = focal_length
    to_h5py(parameters, outfile, key='events', mode='w')
Example #6
def test_dict_config():
    from aict_tools.apply import create_mask_h5py

    config = {'a': ['>', 2], 'b': ['<', 5]}

    with tempfile.NamedTemporaryFile(prefix='test_aict_', suffix='.hdf5') as f:
        to_h5py(df, f.name, key='events')

        mask = create_mask_h5py(h5py.File(f.name, 'r'),
                                n_events=len(df),
                                selection_config=config)
        assert all(mask == [False, False, False, True])
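The dict config above pairs a column name with an operator and a threshold, and the per-column cuts are combined with a logical AND, which is why only rows passing both a > 2 and b < 5 survive. create_mask_h5py itself operates on the open HDF5 file inside aict_tools; the following is only a sketch of the equivalent pandas logic for such a config, with an assumed operator table, not the library's implementation:

import operator
import pandas as pd

OPERATORS = {'<': operator.lt, '<=': operator.le, '==': operator.eq, '>=': operator.ge, '>': operator.gt}

def combined_mask(df, selection_config):
    # start with all rows selected, then AND in one comparison per configured column
    mask = pd.Series(True, index=df.index)
    for column, (op, value) in selection_config.items():
        mask &= OPERATORS[op](df[column], value)
    return mask

# e.g. combined_mask(df, {'a': ['>', 2], 'b': ['<', 5]})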
Example #7
def test_to_h5py_string():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        'name': ['Mrk 501', 'Mrk 421', 'Crab'],
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        assert all(df.dtypes == df2.dtypes)
        assert all(df['name'] == df2['name'])
Example #8
def main(outputfile, inputdir):
    inputfiles = []
    for d in inputdir:
        inputfiles.extend(glob(os.path.join(d, 'cer*')))

    for f in inputfiles[:]:
        if f + '.gz' in inputfiles:
            inputfiles.remove(f + '.gz')

    print('Processing', len(inputfiles), 'files')

    with Pool(cpu_count()) as pool:
        results = pool.imap_unordered(get_headers, inputfiles)

        run_headers = []
        run_ends = []

        for run_header, event_headers, run_end in tqdm(results,
                                                       total=len(inputfiles)):

            run_headers.append(run_header)
            run_ends.append(run_end)

            df = pd.DataFrame(event_headers[event_columns])
            to_h5py(df, outputfile, key='corsika_events', mode='a')

        print('saving runwise information')
        runs = pd.DataFrame(np.array(run_headers)[run_header_columns])

        # some runs might have failed and thus no run end block
        for run_end in run_ends:
            if run_end is not None:
                dtype = run_end.dtype
                break
        else:
            raise IOError('All run_end blocks are None, all runs failed.')

        dummy = np.array([(b'RUNE', np.nan, np.nan)], dtype=dtype)[0]
        run_ends = [r if r is not None else dummy for r in run_ends]
        run_ends = np.array(run_ends)

        print('Number of failed runs:',
              np.count_nonzero(np.isnan(run_ends['n_events'])))

        runs['n_events'] = run_ends['n_events']

        to_h5py(runs, outputfile, key='corsika_runs', mode='a')
        print('done')
Example #9
def main(output_file, input_file, n_jobs):

    if n_jobs == -1:
        n_jobs = 15

    print('Calculating features using', n_jobs, 'cores')

    if is_simulation_file(input_file[0]):
        print('Received simulation files as input.')
    else:
        print('Received data files as input.')

    with Pool(n_jobs) as pool:
        results = pool.imap_unordered(cluster_labels, input_file)
        for df in tqdm(results, total=len(input_file)):
            to_h5py(df, output_file, key="events", mode='a', index=False)
Example #10
def test_to_h5py_append():
    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8')
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='test', index=False)
        to_h5py(df2, f.name, key='test', mode='a', index=False)

        df_read = read_h5py(f.name, key='test')
        df_written = pd.concat([df1, df2], ignore_index=True)

        for col in df_written.columns:
            assert all(df_read[col] == df_written[col])
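This append behaviour is what the chunked converters further down rely on: the first chunk is written with mode='w' (or guarded by an 'initialised' flag) and every later chunk is appended with mode='a'. A minimal sketch of that pattern; the chunk contents and the output path are placeholders:

import numpy as np
import pandas as pd
from fact.io import to_h5py

chunks = [
    pd.DataFrame({'x': np.random.normal(size=10)}),
    pd.DataFrame({'x': np.random.normal(size=10)}),
]

for i, chunk in enumerate(chunks):
    # first chunk creates the file, later chunks extend the 'events' group
    to_h5py(chunk, 'events.hdf5', key='events', mode='w' if i == 0 else 'a', index=False)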
Example #11
def write(
        typename,
        output_path,
        site_location,
        array_events_data,
        telescope_events_data,
        runs_all,
        positions,
        stereo,
        id_no):

    print('Writing ' + typename + ' data...', datetime.now().time().strftime('%H:%M:%S'))

    telescope_events = pd.DataFrame(telescope_events_data)
    array_events = pd.DataFrame(array_events_data)
    runs = pd.DataFrame(runs_all)
    
    # Calculate and add telescope location to telescope_events
    telescope_events = add_tel_location(
        telescope_events, site_location, positions)

    if typename == 'gamma-diffuse':
        output_file = output_path + 'gammas-diffuse' + str(id_no) + '.hdf5'
    else:
        output_file = output_path + typename + 's' + str(id_no) + '.hdf5'

    # Save to hdf5 file
    to_h5py(telescope_events, output_file, key='telescope_events', mode='w')
    to_h5py(array_events, output_file, key='array_events', mode='a')
    to_h5py(runs, output_file, key='runs', mode='a')
Example #12
def test_to_h5py_datetime():
    from fact.io import to_h5py, read_h5py

    df = pd.DataFrame({
        't_ns': pd.date_range('2017-01-01', freq='1ns', periods=100),
        't_us': pd.date_range('2017-01-01', freq='1us', periods=100),
        't_ms': pd.date_range('2017-01-01', freq='1ms', periods=100),
        't_s': pd.date_range('2017-01-01', freq='1s', periods=100),
        't_d': pd.date_range('2017-01-01', freq='1d', periods=100),
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df, f.name, key='test')
        df2 = read_h5py(f.name, key='test')

        for col in df.columns:
            assert all(df[col] == df2[col])
Example #13
def main(inputfile, outputfile):
    logging.info('Opening file')
    f = uproot.open(inputfile)

    logging.info('Getting tree')
    tree = f['Events']
    branches = set(k.decode('ascii') for k in tree.keys())

    ids = np.arange(tree.numentries)
    dfs = []

    logging.info('Start reading telescope events')
    telescope_id = 1
    while f'MHillas_{telescope_id}.' in branches:
        columns = {
            k.format(telescope_id=telescope_id): v
            for k, v in TELESCOPE_COLUMNS.items()
        }

        df = tree.pandas.df(columns.keys())
        df.rename(columns=columns, inplace=True)
        df['event_id'] = ids
        df['telescope_id'] = telescope_id
        dfs.append(df)
        telescope_id += 1

    df = pd.concat(dfs)
    df = df[df.trigger_time != -100]

    logging.info(f'Writing {len(df)} telescope events to hdf5 file')
    to_h5py(df, outputfile, mode='w', key='telescope_events')
    logging.info('done')

    df = tree.pandas.df(ARRAY_COLUMNS.keys())
    df.rename(columns=ARRAY_COLUMNS, inplace=True)
    df['event_id'] = ids
    logging.info(f'Writing {len(df)} array events to hdf5 file')
    to_h5py(df, outputfile, mode='a', key='array_events')
    logging.info('done')
Example #14
def main():
    args = parser.parse_args()
    runs = pd.read_csv(args.runlist)
    runs['night_date'] = pd.to_datetime(runs['night'].astype(str),
                                        format='%Y%m%d')

    initialised = False
    for idx, run in tqdm(runs.iterrows(), total=len(runs)):
        night = int('{:%Y%m%d}'.format(run.night_date))
        base = datepath(args.ganymed_base, run.night_date)

        ganymed_file = os.path.join(
            base, '{}_{:03d}-summary.root'.format(night, run.run_id))

        df = read_mars(ganymed_file, tree='Events')
        df['night'] = night
        df['run_id'] = run.run_id

        if not initialised:
            to_h5py(args.outputfile, df, key='events', mode='w')
            initialised = True
        else:
            to_h5py(args.outputfile, df, key='events', mode='a')
Example #15
File: io.py Project: fact-project/erna
    def append(self, df):
        if self.outputfile is None:
            return

        if self.fmt == 'jsonl':
            if self._file is None:
                self._file = open(self.outputfile, 'w')
            df.to_json(self._file,
                       lines=True,
                       date_format='iso',
                       orient='records')
            self._file.write('\n')

        elif self.fmt == 'csv':
            if self._file is None:
                self._file = open(self.outputfile, 'w')
            df.to_csv(self._file, header=not self.header_written)

        elif self.fmt == 'hdf5':
            mode = 'a' if self.header_written else 'w'
            to_h5py(df, self.outputfile, key='events', mode=mode)

        self.header_written = True
Example #16
def main(output_file, input_file, eps, n_jobs, lower, upper):

    if n_jobs == -1:
        n_jobs = 48  # cpu_count()

    print('Calculating features using', n_jobs, 'cores')

    if is_simulation_file(input_file[0]):
        print('Received simulation files as input.')
    else:
        print('Received data files as input.')

    with Pool(n_jobs) as pool:
        results = [
            pool.apply_async(gen_features_norm,
                             kwds={
                                 'data_file': f,
                                 'lower': lower,
                                 'upper': upper
                             }) for f in input_file
        ]
        for res in tqdm(results, total=len(input_file)):
            df = res.get()  # wait for this worker to finish and fetch its DataFrame
            to_h5py(df, output_file, key="events", mode='a', index=False)
Example #17
def test_to_h5py_append_second_group():
    from fact.io import to_h5py, read_h5py

    df1 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8')
    })
    df2 = pd.DataFrame({
        'x': np.random.normal(size=50),
        'N': np.random.randint(0, 10, size=50, dtype='uint8')
    })

    with tempfile.NamedTemporaryFile() as f:
        to_h5py(df1, f.name, key='g1', index=False)
        to_h5py(df2, f.name, key='g2', index=False)

        df_g1 = read_h5py(f.name, key='g1')
        df_g2 = read_h5py(f.name, key='g2')

        for col in df_g1.columns:
            assert all(df_g1[col] == df1[col])

        for col in df_g2.columns:
            assert all(df_g2[col] == df2[col])
Example #18
def main(outputfile, inputdir, infile_re, n_jobs):
    inputfiles = []
    file_re = re.compile(infile_re)

    for d in tqdm(inputdir):
        for root, dirs, files in os.walk(os.path.abspath(d)):
            for f in files:
                if file_re.match(f):
                    inputfiles.append(os.path.join(root, f))

    print('Processing', len(inputfiles), 'files')

    with ProcessPoolExecutor(n_jobs) as pool:
        futures = [pool.submit(get_headers, f) for f in inputfiles]

        run_headers = []
        run_ends = []

        reuses = []
        for future in tqdm(as_completed(futures), total=len(inputfiles)):
            run_header, event_headers, run_end = future.result()

            run_headers.append(run_header)
            run_ends.append(run_end)

            df = pd.DataFrame(event_headers[event_columns])
            to_h5py(df, outputfile, key='corsika_events', mode='a')
            reuses.append(df['n_reuse'].iloc[0])

        print('saving runwise information')
        runs = pd.DataFrame(np.array(run_headers)[run_header_columns])
        runs['n_events'] = np.array(run_ends)['n_events']
        runs['n_reuse'] = reuses

        to_h5py(runs, outputfile, key='corsika_runs', mode='a')
        print('done')
Example #19
def main():
    description = ('Convert hillas file to h5py.')
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=Formatter)
    parser.add_argument('-f',
                        '--files',
                        dest='input_path',
                        help='path to the HDF5 hillas files')
    parser.add_argument('-o',
                        dest='output_path',
                        required=True,
                        help='output path to store the h5py file')
    args = parser.parse_args()

    input_path = args.input_path
    output_path = args.output_path

    with HDF5Reader(input_path) as reader:
        tel = reader.read('data')
        arr = reader.read('mc')
        pnt = reader.read('pointing')
        run = reader.read('mcheader')

    arr = arr.rename(
        columns={
            'energy': 'mc_energy',
            'alt': 'mc_alt',
            'az': 'mc_az',
            'core_x': 'mc_core_x',
            'core_y': 'mc_core_y',
            'h_first_int': 'mc_h_first_int',
            'shower_primary_id': 'mc_shower_primary_id',
            'x_max': 'mc_x_max',
            #'iobs':'run_id',
            #'iev': 'array_event_id'
        })
    pnt = pnt.drop(columns=['t_cpu'])
    arr = pd.merge(arr, pnt, on=['iobs', 'iev'])
    arr = arr.rename(columns={'iobs': 'run_id', 'iev': 'array_event_id'})

    tel['array_event_id'] = tel.iev.values
    tel = tel.rename(columns={'iev': 'telescope_event_id', 'iobs': 'run_id'})
    tel = tel.drop(columns=['t_cpu'])
    plate_scale = 37.56
    tel.x = tel.x * plate_scale
    tel.y = tel.y * plate_scale

    run = run.rename(columns={'iobs': 'run_id'})

    to_h5py(tel, output_path, key='telescope_events', mode='w')
    to_h5py(arr, output_path, key='array_events', mode='a')
    to_h5py(run, output_path, key='runs', mode='a')
Example #20
def main():
    args = parser.parse_args()
    df = read_mars(args.inputfile, tree=args.tree, verbose=True)
    to_h5py(args.outputfile, df, key=args.tree, mode='w')
Example #21
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply given model to data. Two columns are added to the file, energy_prediction
    and energy_prediction_std

    CONFIGURATION_PATH: Path to the config yaml file

    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output

    SEPARATOR_MODEL_PATH: Path to the pickled separation model.

    ENERGY_MODEL_PATH: Path to the pickled energy regression model.

    DISP_MODEL_PATH: Path to the pickled disp model.

    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading model')
    separator_model = joblib.load(separator_model_path)
    energy_model = joblib.load(energy_model_path)
    disp_model = joblib.load(disp_model_path)
    sign_model = joblib.load(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)
    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source'
            )
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError) as e:
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features], separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features], disp_model, sign_model
        )

        source_x = df.cog_x + disp * np.cos(df.delta)
        source_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = source_x
        df['source_y_prediction'] = source_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(parallelize_array_computation(
                partial(to_altaz, source=source),
                obstime,
                n_jobs=n_jobs,
            ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                source_x,
                source_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:

            result = parallelize_array_computation(
                calc_source_features_sim,
                source_x,
                source_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    if source:
        log.info('Copying "runs" group')
        to_h5py(runs, output, key='runs', mode='a')
Example #22
def main(file):
    run_meta = read_data('~/phs_analysis/open_crab_sample_runs.csv')
    to_h5py(run_meta.iloc[:], file, key='runs', mode='a')
Example #23
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    '''
    Apply given model to data. Two columns are added to the file, energy_prediction
    and energy_prediction_std

    CONFIGURATION_PATH: Path to the config yaml file

    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output

    SEPARATOR_MODEL_PATH: Path to the pickled separation model.

    ENERGY_MODEL_PATH: Path to the pickled energy regression model.

    DISP_MODEL_PATH: Path to the pickled disp model.

    SIGN_MODEL_PATH: Path to the pickled sign model.
    '''
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                'Outputfile {} exists. Overwrite?'.format(output),
                abort=True,
            )
        open(output, 'w').close()

    log.info('Loading model')
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info('Done')

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ('separator', 'energy', 'disp'):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)
    try:
        runs = read_h5py(data_path, key='runs')
        sources = runs['source'].unique()
        if len(sources) > 1:
            raise click.ClickException(
                'to_dl3 only supports files with a single source')
        source = SkyCoord.from_name(sources[0])
        columns.update(['timestamp', 'night'])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info('Predicting on data...')
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df['gamma_prediction'] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df['gamma_energy_prediction'] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df['source_x_prediction'] = prediction_x
        df['source_y_prediction'] = prediction_y
        df['disp_prediction'] = disp

        if source:
            obstime = Time(
                pd.to_datetime(df['timestamp'].values).to_pydatetime())
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                obstime,
                n_jobs=n_jobs,
            )
        else:

            if random_source:
                zd, az = calc_random_source(
                    df['pointing_position_zd'],
                    df['pointing_position_az'],
                    wobble_distance,
                )
                df['source_position_zd'] = zd
                df['source_position_az'] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df['source_position_zd'].values,
                df['source_position_az'].values,
                df['pointing_position_zd'].values,
                df['pointing_position_az'].values,
                df['cog_x'].values,
                df['cog_y'].values,
                df['delta'].values,
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key='events', mode='a')
        else:
            to_h5py(df[dl3_columns_sim], output, key='events', mode='a')

    with h5py.File(data_path, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_runs_group(data_path, output)
Example #24
def main(
    configuration_path,
    data_path,
    separator_model_path,
    energy_model_path,
    disp_model_path,
    sign_model_path,
    output,
    random_source,
    wobble_distance,
    key,
    chunksize,
    n_jobs,
    yes,
    verbose,
):
    """
    Apply given model to data. Two columns are added to the file, energy_prediction
    and energy_prediction_std

    CONFIGURATION_PATH: Path to the config yaml file

    DATA_PATH: path to the FACT data in a h5py hdf5 file, e.g. erna_gather_fits output

    SEPARATOR_MODEL_PATH: Path to the pickled separation model.

    ENERGY_MODEL_PATH: Path to the pickled energy regression model.

    DISP_MODEL_PATH: Path to the pickled disp model.

    SIGN_MODEL_PATH: Path to the pickled sign model.
    """
    log = setup_logging()

    config = AICTConfig.from_yaml(configuration_path)

    if os.path.isfile(output):
        if not yes:
            click.confirm(
                "Outputfile {} exists. Overwrite?".format(output),
                abort=True,
            )
        open(output, "w").close()

    log.info("Loading model")
    separator_model = load_model(separator_model_path)
    energy_model = load_model(energy_model_path)
    disp_model = load_model(disp_model_path)
    sign_model = load_model(sign_model_path)
    log.info("Done")

    if n_jobs:
        separator_model.n_jobs = n_jobs
        energy_model.n_jobs = n_jobs
        disp_model.n_jobs = n_jobs
        sign_model.n_jobs = n_jobs

    columns = set(needed_columns)
    for model in ("separator", "energy", "disp"):
        model_config = getattr(config, model)
        columns.update(model_config.columns_to_read_apply)
    try:
        runs = read_h5py(data_path, key="runs")
        sources = runs["source"].unique()
        if len(sources) > 1:
            raise click.ClickException(
                "to_dl3 only supports files with a single source")
        source = SkyCoord.from_name(sources[0])
        columns.update(["timestamp", "night"])
    except (KeyError, OSError):
        source = None
        columns.update(dl3_columns_sim_read)

    df_generator = read_telescope_data_chunked(
        data_path,
        config,
        chunksize=chunksize,
        columns=columns,
    )

    log.info("Predicting on data...")
    for df, start, end in tqdm(df_generator):
        df_sep = feature_generation(df, config.separator.feature_generation)
        df["gamma_prediction"] = predict_separator(
            df_sep[config.separator.features],
            separator_model,
        )

        df_energy = feature_generation(df, config.energy.feature_generation)
        df["gamma_energy_prediction"] = predict_energy(
            df_energy[config.energy.features],
            energy_model,
            log_target=config.energy.log_target,
        )

        df_disp = feature_generation(df, config.disp.feature_generation)
        disp = predict_disp(
            df_disp[config.disp.features],
            disp_model,
            sign_model,
            log_target=config.disp.log_target,
        )

        prediction_x = df.cog_x + disp * np.cos(df.delta)
        prediction_y = df.cog_y + disp * np.sin(df.delta)
        df["source_x_prediction"] = prediction_x
        df["source_y_prediction"] = prediction_y
        df["disp_prediction"] = disp

        if source:
            obstime = Time(df["timestamp"].to_numpy().astype("U"))
            source_altaz = concat_results_altaz(
                parallelize_array_computation(
                    partial(to_altaz, source=source),
                    obstime,
                    n_jobs=n_jobs,
                ))

            result = parallelize_array_computation(
                calc_source_features_obs,
                prediction_x,
                prediction_y,
                source_altaz.zen.deg,
                source_altaz.az.deg,
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                obstime,
                n_jobs=n_jobs,
            )
        else:

            if random_source:
                zd, az = calc_random_source(
                    df["pointing_position_zd"],
                    df["pointing_position_az"],
                    wobble_distance,
                )
                df["source_position_zd"] = zd
                df["source_position_az"] = az

            result = parallelize_array_computation(
                calc_source_features_sim,
                prediction_x,
                prediction_y,
                df["source_position_zd"].to_numpy(),
                df["source_position_az"].to_numpy(),
                df["pointing_position_zd"].to_numpy(),
                df["pointing_position_az"].to_numpy(),
                df["cog_x"].to_numpy(),
                df["cog_y"].to_numpy(),
                df["delta"].to_numpy(),
                project_disp=config.disp.project_disp,
                n_jobs=n_jobs,
            )

        for k in result[0].keys():
            df[k] = np.concatenate([r[k] for r in result])

        if source:
            to_h5py(df[dl3_columns_obs], output, key="events", mode="a")
        else:
            to_h5py(df[dl3_columns_sim], output, key="events", mode="a")

    with h5py.File(data_path, "r") as f:
        sample_fraction = f.attrs.get("sample_fraction", 1.0)

    set_sample_fraction(output, sample_fraction)
    copy_group(data_path, output, "runs")
    copy_group(data_path, output, "corsika_runs")
Example #25
def main(xml_name, ft_version, outputfile, config, start, end, source,
         datacheck, runlist, run_type):
    '''
    Gather the fits outputfiles of the erna automatic processing into a hdf5 file.
    The hdf5 file is written using h5py and contains the level 2 features in the
    `events` group and some metadata for each run in the `runs` group.

    It is possible to only gather files that pass a given datacheck with the --datacheck
    option. The possible conditions are implemented in erna.datacheck_conditions/

    XML_NAME: name of the xml for which you want to gather output
    FT_VERSION: FACT Tools version for which you want to gather output
    OUTPUTFILE: the outputfile
    '''
    config = load_config(config)
    database.init(**config['processing_database'])
    database.connect()

    if datacheck and runlist:
        print('Only one of datacheck or runlist allowed')
        sys.exit(1)

    if datacheck is not None:
        if not (datacheck in datacheck_conditions
                or os.path.isfile(datacheck)):
            print('Conditions must be a file or any of: ')
            for key in datacheck_conditions:
                print(key)
            sys.exit(1)

    processing_db = create_mysql_engine(**config['processing_database'])
    fact_db = create_mysql_engine(**config['fact_database'])

    try:
        jar = (Jar.select(Jar.id,
                          Jar.version).where(Jar.version == ft_version).get())
    except Jar.DoesNotExist:
        print('FACT-Tools version not found, available jars are:')
        for jar in Jar.select(Jar.version):
            print(jar.version)
        sys.exit(1)

    try:
        xml = XML.get(jar=jar, name=xml_name)
    except XML.DoesNotExist:
        print('XML not found, available xmls are:')
        for xml in XML.select(
                XML.name).join(Jar).where(Jar.version == ft_version):
            print(xml.name)
        sys.exit(1)

    job_query = (Job.select(
        RawDataFile.night.alias('night'),
        RawDataFile.run_id.alias('run_id'), Job.result_file,
        ProcessingState.description.alias('status')).join(RawDataFile).switch(
            Job).join(ProcessingState).where(
                Job.jar == jar,
                Job.xml == xml,
                RawDataFile.run_type_name == run_type,
            ))
    if start:
        start = dateutil.parser.parse(start).date()
        job_query = job_query.where(RawDataFile.night >= start)
    if end:
        end = dateutil.parser.parse(end).date()
        job_query = job_query.where(RawDataFile.night <= end)

    sql, params = job_query.sql()

    with processing_db.connect() as conn:
        jobs = pd.read_sql_query(sql, conn, params=params)
    if runlist is None:
        conditions = [
            'fNight <= {}'.format(jobs.night.max()),
            'fNight >= {}'.format(jobs.night.min()),
            'fSourceName = "{}"'.format(source),
        ]
    else:
        wanted_runs = pd.read_csv(runlist)
        conditions = [
            'fNight <= {}'.format(wanted_runs.night.max()),
            'fNight >= {}'.format(wanted_runs.night.min()),
        ]

    if datacheck is not None:
        if os.path.isfile(datacheck):
            with open(datacheck, 'r') as f:
                conditions.extend(f.read().splitlines())
        else:
            conditions.extend(datacheck_conditions[datacheck])

    runs = get_runs(fact_db,
                    conditions=conditions).set_index(['night', 'run_id'])
    jobs = jobs.join(runs, on=['night', 'run_id'], how='inner')

    if runlist is not None:
        jobs = wanted_runs.join(
            jobs.set_index(['night', 'run_id']),
            on=['night', 'run_id'],
            how='inner',
            lsuffix='user_input_',
        )

    successful_jobs = jobs.query('status == "success"')
    total = len(jobs)
    successful = len(successful_jobs)

    if runlist is not None:
        if len(wanted_runs) != len(jobs):
            click.confirm(
                'Only {} of {} runs available, continue?'.format(
                    len(jobs), len(wanted_runs)),
                abort=True,
            )

    if total != successful:
        click.confirm(
            'Only {} of {} jobs successful, continue?'.format(
                successful, total),
            abort=True,
        )

    print('Found {} runs with a total ontime of {:1.2f} h'.format(
        len(jobs),
        jobs.ontime.sum() / 3600))

    if os.path.isfile(outputfile):
        a = input('Outputfile exists! Overwrite? [y, N]: ')
        if not a.lower().startswith('y'):
            sys.exit()

    columns = [
        'night',
        'run_id',
        'source',
        'ontime',
        'right_ascension',
        'declination',
        'zenith',
        'azimuth',
        'run_start',
        'run_stop',
    ]
    to_h5py(successful_jobs[columns], outputfile, key='runs', mode='w')

    with h5py.File(outputfile, 'a') as f:
        if runlist is not None:
            f['runs'].attrs['datacheck'] = 'RUNLIST'
        else:
            f['runs'].attrs['datacheck'] = ' AND '.join(conditions)

    write_fits_to_hdf5(outputfile, successful_jobs.result_file, mode='a')