def make_data():
    dbs = [PATH_TRAIN_DB, PATH_VAL_DB, PATH_TEST_DB]
    cleaned_key = 'srt_in_ice_pulses_event_length'
    energy_key = 'true_primary_energy'
    data_d = {cleaned_key: [], energy_key: []}
    transformer = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle',
             'rb'))[energy_key]
    for db_path in dbs:
        # Load seq lengths and transformed energies
        db = SqliteFetcher(db_path)
        data_dicts = db.fetch_features(all_events=db.ids,
                                       meta_features=[cleaned_key],
                                       scalar_features=[energy_key])
        data_d[cleaned_key].extend(
            [d[cleaned_key] for i, d in data_dicts.items()])
        data_d[energy_key].extend(
            [d[energy_key] for i, d in data_dicts.items()])
    # Inverse-transform the energies back to their original scale
    data_d[energy_key] = np.squeeze(
        transformer.inverse_transform(
            np.array(data_d[energy_key]).reshape(-1, 1)))
    return data_d[energy_key], data_d[cleaned_key]
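# --- Usage sketch (added; not part of the original script): one hedged way to
# use the arrays returned by make_data() above, binning event length against
# true energy with plain numpy. The bin counts are illustrative guesses.
def inspect_length_vs_energy():
    energies, lengths = make_data()
    counts, e_edges, l_edges = np.histogram2d(energies, lengths, bins=(20, 50))
    print('2D histogram shape:', counts.shape)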
def make_data():
    dbs = [PATH_TRAIN_DB, PATH_VAL_DB, PATH_TEST_DB]
    particles = ['electron_neutrino', 'muon_neutrino', 'tau_neutrino']
    suffixes = ['_train.pickle', '_val.pickle', '_test.pickle']
    key = 'true_primary_energy'
    transformer = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle', 'rb'))[key]
    data_d = {}
    i = 0
    for particle in particles:
        all_energies = np.array([])
        for db_path, suffix in zip(dbs, suffixes):
            # Load the event mask for this flavour and set
            path = PATH_DATA_OSCNEXT + '/masks/' + particle + suffix
            mask = [str(e) for e in pickle.load(open(path, 'rb'))]
            # Load the (transformed) energies
            db = SqliteFetcher(db_path)
            data_dicts = db.fetch_features(all_events=mask,
                                           scalar_features=[key])
            energies_trans = np.array([d[key] for e, d in data_dicts.items()])
            # Inverse-transform them back to their original scale
            energies = np.squeeze(
                transformer.inverse_transform(energies_trans.reshape(-1, 1)))
            # Append to the running array
            all_energies = np.append(all_energies, energies)
            print(i)  # crude progress counter
            i += 1
        data_d[particle] = all_energies
    # Decide on bin size with the Freedman-Diaconis rule, 2*IQR/n^(1/3)
    iqr = np.percentile(data_d['muon_neutrino'], 75) - np.percentile(
        data_d['muon_neutrino'], 25)
    n_data = data_d['muon_neutrino'].shape[0]
    bin_width = 2 * iqr / (n_data**0.3333)
    n_bins = int(4 / bin_width)
    hist_vals = {}
    for particle, data in data_d.items():
        hist_vals[particle], edges = np.histogram(data,
                                                  bins=n_bins,
                                                  range=(0.0, 4.0))
    hist_vals['edges'] = edges
    # Save the histograms next to this script
    path = Path(os.path.realpath(__file__))
    with open(str(path.parent) + '/data.pickle', 'wb') as f:
        pickle.dump(hist_vals, f)
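# --- Usage sketch (added; not part of the original script): reading the saved
# pickle back and drawing the per-flavour spectra from the stored counts and
# shared edges. plt.stairs draws a histogram outline from precomputed values
# (matplotlib >= 3.4). The x-label assumes the energies are log10-binned,
# consistent with the (0, 4) histogram range above.
def plot_saved_hists(pickle_path):
    import pickle
    from matplotlib import pyplot as plt
    hist_vals = pickle.load(open(pickle_path, 'rb'))
    edges = hist_vals.pop('edges')
    fig, ax = plt.subplots()
    for particle, counts in hist_vals.items():
        ax.stairs(counts, edges, label=particle)
    ax.set_xlabel(r'$\log_{10}(E)$')
    ax.legend()
    return fig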
def make_data():
    db_path = PATH_TRAIN_DB
    key = 'dom_charge'
    transformer = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle', 'rb'))[key]
    db = SqliteFetcher(db_path)
    # Aim for ~1M pulses: at roughly 50 pulses per event, that is ~20k events
    ids = [str(e) for e in range(20000)]
    all_data = db.fetch_features(all_events=ids, seq_features=[key])
    data_lists = [data[key] for event_id, data in all_data.items()]
    data_transformed = np.array(flatten_list_of_lists(data_lists))
    # Inverse-transform the pulses back to their original scale
    data = np.squeeze(
        transformer.inverse_transform(data_transformed.reshape(-1, 1)))
    return data, data_transformed
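# --- Background sketch (added; not part of the original script): a
# self-contained illustration of the inverse_transform round trip used above,
# with a freshly fitted sklearn QuantileTransformer standing in for the
# pickled dom_charge transformer (whose exact type is an assumption here).
import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(0)
raw = rng.exponential(scale=1.0, size=(10000, 1))  # skewed, charge-like data
qt = QuantileTransformer(output_distribution='normal').fit(raw)
transformed = qt.transform(raw)                # approximately standard normal
recovered = qt.inverse_transform(transformed)  # back to the original scale
print('max round-trip error:', np.abs(raw - recovered).max())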
def make_data():
    dbs = [PATH_TRAIN_DB, PATH_VAL_DB, PATH_TEST_DB]
    cleaned_key = 'srt_in_ice_pulses_event_length'
    uncleaned_key = 'split_in_ice_pulses_event_length'
    data_d = {cleaned_key: [], uncleaned_key: []}
    for db_path in dbs:
        # Load seq lengths
        db = SqliteFetcher(db_path)
        data_dicts = db.fetch_features(
            all_events=db.ids, meta_features=[cleaned_key, uncleaned_key])
        data_d[cleaned_key].extend(
            [d[cleaned_key] for i, d in data_dicts.items()])
        data_d[uncleaned_key].extend(
            [d[uncleaned_key] for i, d in data_dicts.items()])
    # Decide on bin size
    maxlen = 200
    minlen = 0
    bins = maxlen - minlen + 1
    hist_vals = {}
    for key, data in data_d.items():
        data_clipped = np.clip(data, 0, maxlen)
        hist_vals[key], edges = np.histogram(data_clipped,
                                             bins=bins,
                                             range=(minlen - 0.5,
                                                    maxlen + 0.5))
    hist_vals['edges'] = edges
    path = Path(os.path.realpath(__file__))
    # Save data
    with open(str(path.parent) + '/data.pickle', 'wb') as f:
        pickle.dump(hist_vals, f)
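# --- Note (added; not part of the original script): with maxlen - minlen + 1
# = 201 bins on the range (-0.5, 200.5), every bin has unit width and is
# centred on an integer pulse count, so no integer length falls on a bin edge.
# A quick self-contained check:
import numpy as np
_, edges = np.histogram([0, 1, 200], bins=201, range=(-0.5, 200.5))
centers = 0.5 * (edges[:-1] + edges[1:])
print(centers[:3], centers[-1])  # [0. 1. 2.] ... 200.0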
Path(weights_dir).mkdir()
for name in names:
    if args.interpolator:
        path = PATH_DATA_OSCNEXT + '/weights/' + name + '.pickle'
        interpolator = pickle.load(open(path, 'rb'))
    else:
        interpolator = None
    all_weights = {}
    for path, keyword in zip(
        [PATH_TRAIN_DB, PATH_VAL_DB],
        ['train', 'val'],
    ):
        # Get DB and mask
        db = SqliteFetcher(path)
        db_specific_masks = [e + '_' + keyword for e in args.masks]
        ids = load_pickle_mask(PATH_DATA_OSCNEXT, db_specific_masks)
        ids = [str(i) for i in ids]
        # If developing, use fewer events
        if args.dev:
            USE_N_EVENTS = 1000
            PRINT_EVERY = 100
            ids = ids[:USE_N_EVENTS]
        # Calculate weights and potentially an interpolator
        if not interpolator:
import h5py as h5
import numpy as np
from matplotlib import pyplot as plt

from src.modules.classes import SqliteFetcher
from src.modules.constants import *
from src.modules.reporting import *
from src.modules.thesis_plotting import *

p = '/home/bjoernhm/CubeML/models/oscnext-genie-level5-v01-01-pass2/regression/full_reg/2020-04-16-11.34.16/data/predictions.h5'
with h5.File(p, 'r') as f:
    en = f['true_primary_energy'][:]
    ids = np.array([str(i) for i in f['index'][:]])

# Keep only predictions whose event ids exist in the validation DB
db = SqliteFetcher(PATH_VAL_DB)
db_ids = db.ids
overlap_ids = np.isin(ids, db_ids)
f_i = ids[overlap_ids]
true_e = db.fetch_features(all_events=f_i,
                           scalar_features=['true_primary_energy'])
e_t = np.array([d['true_primary_energy'] for i, d in true_e.items()])
e_p = en[overlap_ids]

# Bin the prediction errors in predicted energy
error = e_p - e_t
e_p, error = sort_pairs(e_p, error)
bins = np.linspace(min(e_p), max(e_p), num=20)
e_p_bins, error_bins = bin_data(e_p, error, bins)
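# --- Sketch (added; not part of the original script): bin_data is a
# project-specific helper; a minimal stand-in that computes the median error
# per predicted-energy bin with np.digitize could look like this.
def bin_median(x, y, edges):
    # Map each x to a bin index 0..len(edges)-2 using the inner edges
    idx = np.digitize(x, edges[1:-1])
    return np.array([
        np.median(y[idx == b]) if np.any(idx == b) else np.nan
        for b in range(len(edges) - 1)
    ])

# e.g. median_error_per_bin = bin_median(e_p, error, bins)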
min_doms = args.min_doms
max_doms = args.max_doms
min_energy = args.min_energy
max_energy = args.max_energy
mask_dict = {
    'mask_name': mask_name,
    'min_doms': min_doms,
    'max_doms': max_doms,
    'min_energy': min_energy,
    'max_energy': max_energy
}

# If the mask directory doesn't exist, make it
mask_dir = '/'.join([PATH_DATA_OSCNEXT, 'masks'])
if not Path(mask_dir).exists():
    Path(mask_dir).mkdir()

# Loop over the different DBs
for path, ext in zip([PATH_TRAIN_DB, PATH_VAL_DB],
                     ['_train.pickle', '_val.pickle']):
    db = SqliteFetcher(path)
    print(get_time(), '%s mask calculation begun.' % (mask_dict['mask_name']))
    mask, mask_name = make_mask(db, **mask_dict)
    mask_path = '/'.join([mask_dir, mask_name + ext])
    with open(mask_path, 'wb') as f:
        pickle.dump(mask, f)
    print(get_time(), 'Mask created at', mask_path, '\n')
# Locate the model directory
model = locate_model(model_dir)
model_name = Path(model_dir).name
print('')
print(get_time(), 'Used model: %s' % (model_name))

for path in [PATH_TRAIN_DB, PATH_VAL_DB]:
    preds, indices = calc_raw_predictions(
        model, n_predictions_wanted=args.n_predictions_wanted, db_path=path)
    predictions = {}
    for key in args.prediction_keys:
        predictions[key] = preds[key]
    indices = [str(entry) for entry in indices]
    db = SqliteFetcher(path)
    keys = list(predictions)
    new_keys = [
        key + '_' + remove_dots_and_lines(model_name) for key in predictions
    ]
    predictions_newnames = convert_keys(predictions, keys, new_keys)
    print(get_time(), 'Saving to db...')
    for name, values in predictions_newnames.items():
        db.write('scalar', name, indices, values, astype='REAL')
    print(get_time(), 'Data saved.')
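# --- Verification sketch (added; not part of the original script): the newly
# written columns can be read back through the same fetch_features interface
# used elsewhere, e.g. to spot-check a few events after the loop. Assumes db,
# indices and new_keys still hold the values from the last iteration.
check_key = new_keys[0]
check = db.fetch_features(all_events=indices[:5], scalar_features=[check_key])
for event_id, data in check.items():
    print(event_id, data[check_key])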
def make_data():
    seq_keys = [
        'dom_charge', 'dom_x', 'dom_y', 'dom_z', 'dom_time', 'dom_atwd',
        'dom_pulse_width'
    ]
    target_keys = [
        'true_primary_energy', 'true_primary_position_x',
        'true_primary_position_y', 'true_primary_position_z',
        'true_primary_time', 'true_primary_direction_x',
        'true_primary_direction_y', 'true_primary_direction_z'
    ]
    db_path = PATH_TRAIN_DB
    transformers = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle', 'rb'))
    db = SqliteFetcher(db_path)
    # Use the first 1000 events
    ids = [str(e) for e in range(1000)]
    all_data = db.fetch_features(all_events=ids,
                                 seq_features=seq_keys,
                                 scalar_features=target_keys)
    data_d = {key: [] for key in all_data['0']}
    for key in target_keys:
        data_d[key] = [data[key] for event_id, data in all_data.items()]
    for key in seq_keys:
        data_d[key].extend(
            flatten_list_of_lists(
                [data[key] for event_id, data in all_data.items()]))
    # Calculate means and stds before and after transformation
    table = np.empty((5, len(seq_keys) + len(target_keys)), dtype=object)
    for i_key, key in enumerate(data_d):
        data = data_d[key]
        if key in transformers:
            if isinstance(transformers[key],
                          sklearn.preprocessing.QuantileTransformer):
                name = 'ToNormal'
            elif isinstance(transformers[key],
                            sklearn.preprocessing.RobustScaler):
                if key == 'true_primary_energy':
                    name = 'LogRobust'
                else:
                    name = 'Robust'
            table[0, i_key] = name
            table[3, i_key] = r'%.2f' % (np.mean(data))
            table[4, i_key] = r'%.2f' % (np.std(data))
            data_pre = np.squeeze(
                transformers[key].inverse_transform(
                    np.array(data).reshape(-1, 1)))
            if key == 'true_primary_energy':
                # The energy was log10-transformed before scaling
                table[1, i_key] = r'%.2e' % (np.mean(10**data_pre))
                table[2, i_key] = r'%.2e' % (np.std(10**data_pre))
            else:
                table[1, i_key] = r'%.2e' % (np.mean(data_pre))
                table[2, i_key] = r'%.2e' % (np.std(data_pre))
        else:
            table[0, i_key] = 'None'
            table[1, i_key] = r'%.2f' % (np.mean(data))
            table[2, i_key] = r'%.2f' % (np.std(data))
            table[3, i_key] = r'-'
            table[4, i_key] = r'-'
    index = [
        r'Transformation', r'$\mu$, before', r'$\sigma$, before',
        r'$\mu$, after', r'$\sigma$, after'
    ]
    # Escape underscores in the feature names for LaTeX
    columns = [r'\_'.join(col.split('_')) for col in data_d]
    table_pd = pd.DataFrame(np.transpose(table),
                            index=columns,
                            columns=index)
    return table_pd
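# --- Usage sketch (added; not part of the original script): the returned
# DataFrame can be rendered straight to LaTeX; escape=False keeps the raw
# $\mu$/$\sigma$ and \_ markup produced above intact.
if __name__ == '__main__':
    table_pd = make_data()
    print(table_pd.to_latex(escape=False))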
import os
import pickle

import numpy as np
from matplotlib import pyplot as plt

from src.modules.classes import *
from src.modules.constants import *
from src.modules.helper_functions import *
from src.modules.reporting import *
from src.modules.thesis_plotting import *

# ! INSERT MASK NAMES HERE
masks = ['tau_neutrino']
ids = [str(e) for e in load_sqlite_mask(PATH_DATA_OSCNEXT, masks, 'val')]

# ! INSERT VARIABLE NAMES FOR INSPECTION HERE
scalar_var = ['energy_balanced_alpha70', 'true_primary_energy']

db = SqliteFetcher(PATH_VAL_DB)
events = db.fetch_features(all_events=ids[:10], scalar_features=scalar_var)
for event, data in events.items():
    print(event)
    print(data)
    print('')
import joblib

from src.modules.classes import SqliteFetcher
from src.modules.constants import *
from src.modules.db_utils import *

old_db = SqliteFetcher(PATH_DATA_OSCNEXT + '/val_set_sqlite.db')
new_db = SqliteFetcher(PATH_DATA_OSCNEXT + '/val_transformed.db')
transformers = joblib.load(open(PATH_TRANSFORMERS, 'rb'))
create_transformed_db(old_db, new_db, transformers)