def unpack_remove_tars():
    """Script to unpack .tars holding directories with pickled events.

    Uses multiprocessing to unpack tars with bash:
    > tar -xf <tar_location> -C <pickle_dir_location>
    For now, the source (tarball_dir) and destination (pickle_dir) have to be
    hardcoded in the script.
    """
    # * Where are the tars located?
    tarball_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/tarballs'
    tarballs = [path for path in Path(tarball_dir).iterdir()]

    # * Where should they be put?
    pickle_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/pickles/'
    if not Path(pickle_dir).exists():
        Path(pickle_dir).mkdir()

    # * Multiprocess
    available_cores = cpu_count()
    pickle_dir_list = [pickle_dir] * len(tarballs)
    packed = list(zip(tarballs, pickle_dir_list))
    with Pool(available_cores + 2) as p:
        p.map(unpack_tar_remove, packed)
    print(get_time(), 'Finished unpacking tarballs!')
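# NOTE: unpack_tar_remove is defined elsewhere in the project. A minimal
# sketch of what the worker might look like, assuming it unpacks one tarball
# with `tar -xf` and deletes it afterwards (name and behaviour are
# assumptions, not the project's actual implementation):
def unpack_tar_remove_sketch(pack):
    import subprocess
    from pathlib import Path
    tar_path, pickle_dir = pack
    # Unpack the tarball into the pickle directory
    subprocess.run(['tar', '-xf', str(tar_path), '-C', pickle_dir], check=True)
    # Remove the tarball once it has been unpacked
    Path(tar_path).unlink()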
def make_mask(data_path, mask_name='any', min_doms=0, max_doms=np.inf):
    # * Make mask directory if it doesn't exist
    data_path = hf.get_project_root() + hf.get_path_from_root(data_path)
    name = hf.get_dataset_name(data_path)
    dir_path = hf.get_project_root() + '/data/masks/' + name
    if not Path(dir_path).is_dir():
        Path(dir_path).mkdir(parents=True)

    if mask_name == 'dom_interval':
        make_dom_interval_mask(data_path, dir_path, min_doms, max_doms)

    # * Make a .dvc-file to track the mask
    dvc_path = hf.get_project_root() + '/data'
    subprocess.run(['dvc', 'add', 'masks'], cwd=dvc_path)
def move_tars():
    """Script used to move tarballs of pickled data from HEP to gpulab.

    Script must be run on gpulab - cannot ssh from HEP to gpulab, only the
    other way around. Uses rsync to move tarballs. Source, destination and
    how many must be hardcoded for now.
    """
    # * Setup - where to load data, how many events
    n_pickle_dirs = 1131
    data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    if not Path(data_dir).exists():
        Path(data_dir).mkdir()
        print(get_time(), 'Created directory %s' % (data_dir))

    from_ = '[email protected]:/groups/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/tarballs/'
    to_ = data_dir + 'tarballs/'
    if not Path(to_).exists():
        Path(to_).mkdir()
        print(get_time(), 'Created directory %s' % (to_))

    from_tarballs = [from_ + str(i) + '.tar' for i in range(n_pickle_dirs)]
    to_list = [to_ + str(i) + '.tar' for i in range(n_pickle_dirs)]

    # * Zip and multiprocess
    packed = list(zip(from_tarballs, to_list))
    with Pool() as p:
        p.map(move_tar, packed)
    print(get_time(), 'Finished copying tarballs!')
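# NOTE: move_tar is defined elsewhere. A minimal sketch of the worker,
# assuming it simply shells out to rsync for a single (source, destination)
# pair (the actual implementation may differ):
def move_tar_sketch(pack):
    import subprocess
    from_tar, to_tar = pack
    # -a preserves permissions/timestamps while copying
    subprocess.run(['rsync', '-a', from_tar, to_tar], check=True)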
def find_nearest_doms(data_dir_path=get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/',
                      multiprocess=True, d_name='dom_geom.pickle'):
    # * Load precalculated geometry dictionary
    d_geom = pickle.load(open(data_dir_path + d_name, 'rb'))

    # * For each entry, calculate distances to all other DOMs
    # * Extract coordinates and pair with ID
    dom_ids = [dom_id for dom_id in d_geom]
    coords = {key: items['coordinates'] for key, items in d_geom.items()}
    own_coords = [items['coordinates'] for key, items in d_geom.items()]

    print(get_time(), 'Calculation of nearest DOMs begun...')
    if multiprocess:
        # * Prepare for multiprocessing - we loop over DOM IDs
        coords_list = [coords] * len(dom_ids)
        packed = list(zip(dom_ids, own_coords, coords_list))
        with Pool() as p:
            dicts = p.map(find_nearest_doms_multi, packed)
    else:
        raise ValueError('Only multiprocessing implemented!')
    print(get_time(), 'Calculation finished!')

    # * Update the geometry dictionary with the closest DOMs
    for dom_id, d in zip(dom_ids, dicts):
        d_geom[dom_id].update(d)
    return d_geom
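# NOTE: Only the tail of find_nearest_doms_multi appears later in this
# section (the `d = {'closest': ...}` construction). A minimal self-contained
# sketch consistent with that fragment, assuming Euclidean distances between
# DOM coordinates:
def find_nearest_doms_multi_sketch(pack):
    import numpy as np
    dom_id, own_coords, coords = pack
    dists = {key: float(np.linalg.norm(np.asarray(own_coords) - np.asarray(xyz)))
             for key, xyz in coords.items()}
    # Drop the first entry, since that is the DOM itself (distance 0)
    return {'closest': [key for key, _ in
                        sorted(dists.items(), key=lambda kv: kv[1])][1:]}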
def make_geom_dict(data_dir_path=get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/',
                   multiprocess=True, d_name='dom_geom.pickle'):
    print(get_time(), 'Making geometry dictionary...')
    shelve_path = data_dir_path + 'shelve/oscnext-genie-level5-v01-01-pass2'

    # * Get filenames
    with shelve.open(shelve_path) as f:
        filenames = [key for key in f]

    # * Prepare for multiprocessing
    path_list = [shelve_path] * len(filenames)
    packed = list(zip(filenames, path_list))

    # * Multiprocess
    if multiprocess:
        with Pool() as p:
            all_dicts = p.map(find_unique_ids, packed)

        # * Combine dictionaries
        print(get_time(), 'Combining dictionaries...')
        dom_geom_dict = {}
        for d in all_dicts:
            dom_geom_dict.update(d)
        print(get_time(), 'Dictionaries combined!')
    else:
        dom_geom_dict = {}
        for pack in packed:
            dom_geom_dict.update(find_unique_ids(pack))
    return dom_geom_dict
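# NOTE: find_unique_ids is defined elsewhere. A minimal sketch under the
# assumption that each shelve entry holds an event with per-DOM IDs and
# (x, y, z) positions, and that the worker returns {dom_id: {'coordinates': xyz}}.
# The field names 'dom_id' and 'dom_xyz' are assumptions, not the project's
# actual event layout:
def find_unique_ids_sketch(pack):
    import shelve
    import numpy as np
    filename, shelve_path = pack
    d = {}
    with shelve.open(shelve_path, 'r') as db:
        event = db[filename]['raw']
        for dom_id, xyz in zip(event['dom_id'], event['dom_xyz']):
            d[dom_id] = {'coordinates': np.asarray(xyz)}
    return d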
def fit_feature_transformers(pack):
    # * Unpack
    key, d, clip_dict, file_list, \
        n_wanted_sample, n_wanted_histogram, particle_code, transformer = pack

    # * Read some data
    all_data = []
    for file in file_list:
        # * Once enough data has been read, break out
        if len(all_data) > n_wanted_sample:
            break
        data = hf.read_h5_dataset(file, key, prefix='raw/')
        if data[0].shape:
            for entry in data:
                all_data.extend(entry)
        else:
            all_data.extend(data)

    # * Data read. Now draw a random sample
    indices = np.array(range(len(all_data)))
    random.shuffle(indices)
    random_subsample = sorted(
        indices[:min(len(indices), int(n_wanted_histogram))])

    # * Draw histogram and save it
    plot_data = np.array(sorted(np.array(all_data)[random_subsample]))
    plot_data_unclipped = np.array(sorted(
        np.array(all_data)[random_subsample]))
    if clip_dict:
        minimum = clip_dict['min']
        maximum = clip_dict['max']
        plot_data = np.clip(plot_data, minimum, maximum)
    d['data'] = [plot_data]
    d['title'] = key + ' - Entries = %.1e' % (plot_data_unclipped.shape[0])
    path = hf.get_project_root() + '/reports/plots/features/'
    d['savefig'] = path + particle_code + '_' + key + '.png'
    fig = rpt.make_plot(d)

    # * Fit a transformer/scaler
    transformer.fit(plot_data_unclipped.reshape(-1, 1))

    # * Transform plot data
    plot_data_transformed = transformer.transform(
        plot_data_unclipped.reshape(-1, 1))

    # * Plot and save
    d_transformed = {'data': [plot_data_transformed]}
    d_transformed['title'] = key + ' transformed - Entries = %.1e' % (
        plot_data_unclipped.shape[0])
    d_transformed['savefig'] = path + particle_code + '_transformed_' + key + '.png'
    fig = rpt.make_plot(d_transformed)

    d_transformer = {key: transformer}
    return d_transformer
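# NOTE: Hypothetical driver for fit_feature_transformers; the real call site
# is not shown here. The pack order follows the unpacking at the top of the
# function, and RobustScaler is just an example transformer:
# from sklearn.preprocessing import RobustScaler
# packs = [(key, dict(plot_defaults), clip_dicts.get(key), h5_files,
#           int(1e6), int(1e5), '140000', RobustScaler()) for key in keys]
# with Pool() as p:
#     transformer_dicts = p.map(fit_feature_transformers, packs)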
def make_tars():
    """Script to pack pickle-directories with single events into .tars.

    Must hardcode where pickles are located and where tars should be put.
    """
    # * Setup - where to load data, how many events
    data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    from_ = data_dir + 'pickles'
    to_ = data_dir + 'tarballs'
    pickle_dirs = [path for path in Path(from_).iterdir()]

    # * Zip and multiprocess
    to_list = [to_] * len(pickle_dirs)
    packed = list(zip(pickle_dirs, to_list))
    available_cores = cpu_count()
    with Pool(available_cores + 2) as p:
        p.map(make_tar, packed)
    print(get_time(), 'Finished making tarballs!')
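# NOTE: make_tar is defined elsewhere. A minimal sketch, assuming it packs one
# pickle directory into <tarball_dir>/<dir_name>.tar via `tar -cf` (assumed
# behaviour, not the project's actual implementation):
def make_tar_sketch(pack):
    import subprocess
    pickle_dir, to_dir = pack
    tar_path = str(to_dir) + '/' + pickle_dir.name + '.tar'
    # -C: change to the parent directory so the archive holds relative paths
    subprocess.run(['tar', '-cf', tar_path, '-C', str(pickle_dir.parent),
                    pickle_dir.name], check=True)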
def make_mask(data_path, dirs, mask_name='all', min_doms=0, max_doms=np.inf,
              min_energy=-np.inf, max_energy=np.inf):
    data_path = get_project_root() + get_path_from_root(data_path)

    if mask_name == 'dom_interval':
        mask_path = make_dom_interval_mask(data_path, dirs, min_doms, max_doms)
    elif mask_name == 'dom_interval_SRTInIcePulses':
        mask_path = make_dom_interval_mask(data_path, dirs, min_doms, max_doms,
                                           dom_mask='SRTInIcePulses')
    elif mask_name == 'all':
        mask_path = make_all_mask(data_path, dirs)
    elif mask_name == 'muon_neutrino':
        mask_path = make_particle_mask(data_path, dirs, mask_name)
    elif mask_name == 'energy_interval':
        mask_path = make_energy_interval_mask(data_path, dirs, min_energy,
                                              max_energy)
    return mask_path
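# NOTE: Hypothetical invocation of make_mask (path and bounds are examples,
# not taken from the project):
# mask_path = make_mask('data/oscnext-genie-level5-v01-01-pass2', dirs,
#                       mask_name='dom_interval', min_doms=10, max_doms=200)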
import argparse
import multiprocessing
from pathlib import Path

from src.modules.main_funcs import run_experiment
from src.modules.helper_functions import get_project_root

if __name__ == '__main__':
    description = 'Runs an experiment from the experiments folder.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--gpu', nargs='+', default='0', type=str,
                        help='Sets the IDs of the GPUs to use')
    args = parser.parse_args()

    # * Fetch an experiment - run the oldest first.
    exp_dir = get_project_root() + '/experiments'
    exps = sorted(Path(exp_dir).glob('*.json'))

    # ! Someone online said to add the next line to ensure CUDA works with
    # ! multiprocessing.
    multiprocessing.set_start_method('spawn')

    run_experiment(exps[0], gpu_id=args.gpu[0])
parser.add_argument('--save',
                    help='Saves figure(s) in root directory',
                    action='store_true')
args = parser.parse_args()

if __name__ == '__main__':
    # * First create plot dictionaries
    plot_dicts = []
    for model in args.inputs:
        # * Locate the model directory
        paths = hf.find_files(model)
        for path in paths:
            if path.split('/')[-1] == model:
                break

        # * Make a plotting dictionary with the datasets from the different models
        plot_dicts = rprt.get_performance_plot_dicts(path, plot_dicts)

    # * Now display (or save) desired performance plots
    for i, plot_dict in enumerate(plot_dicts):
        if args.save:
            plot_dict['savefig'] = hf.get_project_root() + '/comparisons/' + plot_dict['title'] + '.png'
        try:
            fig = rprt.make_plot(plot_dict)
        except FileNotFoundError:
            Path(hf.get_project_root() + '/comparisons/').mkdir()
            fig = rprt.make_plot(plot_dict)
def load_and_fit_transformer(pack):
    ids, (key, feature_dict), db_path, n_data = pack
    with shelve.open(db_path, 'r') as db:
        id_iter = iter(ids)
        data = np.array([])
        loaded = 0
        transformer = feature_dict['transformer']
        clip_d = feature_dict.get('clip', None)

        # * If we are dealing with a feature that needs to be transformed, make the transformer!
        if transformer:
            # * Extract the function needed for derived features
            fnc = feature_dict['feature_calculator']

            # * Loop until we have enough samples for the transformer
            while loaded < n_data:
                # * If we iterated over all data, that's it - just exit the loop.
                try:
                    event = db[next(id_iter)]['raw']
                except StopIteration:
                    break

                # * If dealing with a derived feature, calculate it!
                if fnc:
                    new_data = fnc(event)
                # * If not, just load it
                else:
                    new_data = event[key]
                data = np.append(data, new_data)

                if isinstance(new_data, np.ndarray):
                    loaded += new_data.shape[0]
                elif isinstance(new_data, (float, int)):
                    loaded += 1
                else:
                    raise ValueError('load_and_fit_transformer: Unknown type (%s) encountered' % (type(new_data)))

            # * Save plot of pre-transformed data
            path = get_project_root() + '/reports/shelve_data'
            if not Path(path).exists():
                Path(path).mkdir()
            plot_d = {'data': [data], 'savefig': path + '/%s.png' % key}
            _ = make_plot(plot_d)

            # * Now fit a transformer
            transformer.fit(data.reshape(-1, 1))

            # * Save plot of transformed data
            if clip_d:
                data_transformed = np.clip(data, clip_d['min'], clip_d['max'])
                data_transformed = transformer.transform(data_transformed.reshape(-1, 1))
            else:
                data_transformed = transformer.transform(data.reshape(-1, 1))
            plot_d = {'data': [data_transformed],
                      'savefig': path + '/%s_transformed.png' % key}
            _ = make_plot(plot_d)
    return {key: transformer}
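# NOTE: Hypothetical driver for load_and_fit_transformer (names assumed; a
# real call site would pack one tuple per feature and map over a Pool, cf.
# fit_transformers in the script below):
# packs = [(ids, item, str(path_db), n_data) for item in feature_dicts.items()]
# with Pool(n_cpus) as p:
#     transformer_dicts = p.map(load_and_fit_transformer, packs)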
parser.add_argument('--n_transform', default=500000, type=int,
                    help='Sets the amount of datapoints to use in approximating their distribution during fitting of transformers')
parser.add_argument('--n_cpus', default=cpu_count(), type=int,
                    help='Sets the number of CPUs to use')
parser.add_argument('--path', default='None', type=str,
                    help='Path to shelve-file.')
parser.add_argument('--fit_transformers', action='store_true',
                    help='Whether or not to fit new transformers.')
parser.add_argument('--new_name', default='None', type=str,
                    help="Sets the new database's name.")
args = parser.parse_args()

if args.path == 'None':
    raise KeyError(r'A path must be supplied! Use flag --path')
if args.new_name == 'None':
    raise KeyError(r'A new name must be supplied! Use flag --new_name')

# * Setup - where to load data, how many events
path_db = Path(get_project_root() + '/' + get_path_from_root(args.path))
path_geom_dict = str(path_db.parent) + '/dom_geom.pickle'
path_transformer = str(path_db.parent) + '/transformers.pickle'
path_new_db = str(path_db.parent) + '/' + args.new_name
n_data = args.n_transform if not args.dev else 1000
chunksize = args.chunksize if not args.dev else 1000
n_cpus = args.n_cpus if not args.dev else 2
feature_dicts = get_feature_dicts()
geom_features = get_geom_features()

# * Fit and save transformers
if args.fit_transformers:
    transformers = fit_transformers(str(path_db), n_data, feature_dicts,
                                    n_cpus=n_cpus)
    with open(path_transformer, 'wb') as f:
        pickle.dump(transformers, f)
import joblib
from pathlib import Path
from multiprocessing import Pool, cpu_count

import src.modules.helper_functions as hf
import src.modules.preprocessing as pp

if __name__ == '__main__':
    # * For every datafile, make a new datafile so the originals aren't messed up
    data_dir = hf.get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2_copy'
    particle_code = '140000'
    prefix = 'transform1'

    # * Load transformers, keys and prepare filenames
    transformer_path = data_dir + '/transformers/' + particle_code + '_' + prefix + '.pickle'
    transformers = joblib.load(open(transformer_path, 'rb'))
    file_list = [
        str(file) for file in Path(data_dir).iterdir()
        if file.suffix == '.h5' and hf.confirm_particle_type(particle_code, file)
    ]
    keys = pp.get_feature_keys()

    # * Pack each filepath with transformers and keys for multiprocessing
    N_FILES = len(file_list)
    transformers_list = [transformers for i_file in range(N_FILES)]
    keys_list = [keys for i_file in range(N_FILES)]
    prefix_list = [prefix for i_file in range(N_FILES)]
    packed = [
        entry for entry in zip(file_list, transformers_list, keys_list, prefix_list)
    ]
    # * Drop the first entry, since this is itself
    d = {'closest': [key for key, value in
                     sorted(dists.items(), key=lambda kv: kv[1])][1:]}
    return d


if __name__ == '__main__':
    # * Parse arguments!
    description = 'Creates a dictionary of DOM-IDs and their positions by looping over all DOMs.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--multiprocess', action='store_true',
                        help='Enables multiprocessing.')
    parser.add_argument('--create_geom_dict', action='store_true',
                        help='Calculates a geometry dictionary.')
    parser.add_argument('--find_nearest', action='store_true',
                        help='Finds the nearest DOMs for each DOM.')
    args = parser.parse_args()

    data_dir_path = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    d_name = 'dom_geom.pickle'

    # * If the geometry dictionary does not exist, make it first
    if args.create_geom_dict or not Path(data_dir_path + d_name).exists():
        dom_geom_dict = make_geom_dict(data_dir_path=data_dir_path,
                                       multiprocess=args.multiprocess,
                                       d_name=d_name)

        # * Save geometry as a dict with DOM ID: np.array([x, y, z])
        n_doms_found = len([key for key in dom_geom_dict])
        pickle.dump(dom_geom_dict, open(data_dir_path + d_name, 'wb'))
        print(get_time(), 'Found %d DOMs in total.' % (n_doms_found))
        print(get_time(), 'Saved file at %s' % (get_path_from_root(data_dir_path + d_name)))

    # * Calculate distances to all other DOMs if they don't already exist
    if args.find_nearest:
        d_geom = find_nearest_doms(data_dir_path=data_dir_path,
                                   multiprocess=args.multiprocess,
                                   d_name=d_name)
    d['label'].append('Std. mean')

    # * Greatest fractional difference (GFD).
    # * Calculated by requiring (max_prod/min_prod)**n = 10, i.e.
    # * n = log(10)/log(max_prod/min_prod), so the ratio between the largest
    # * and smallest reweighted value becomes exactly 10.
    greatest_frac_diff = max(weights_prods) / min(weights_prods)
    new_exp = np.log(10.0) / np.log(greatest_frac_diff)
    gfd_10 = weights_prods**new_exp
    gfd_10_normed = gfd_10 / np.mean(gfd_10)
    d['x'].append(x_vals)
    d['y'].append(gfd_10_normed)
    d['label'].append('GFD=10')

    # * Make a spline
    interpolator_linear = interpolate.interp1d(x_vals, gfd_10_normed,
                                               fill_value="extrapolate",
                                               kind='quadratic')
    x_extrapolate = np.linspace(0.0, 3.0, 200)
    gfd_10_extrapolate = interpolator_linear(x_extrapolate)
    d['x'].append(x_extrapolate)
    d['y'].append(gfd_10_extrapolate)
    d['label'].append('GFD=10, quadratic interp.')

    # # * Print values
    # for count, e_sigma, gmean, prod, mean, gfd in zip(
    #         counts_weights_normed, energy_weights_normed, gmeans_normed,
    #         weights_prods_normed, weights_meaned, gfd_10_normed):
    #     print('%.3f, %.3f, %.3f, %.3f, %.3f, %.3f' % (count, e_sigma, gmean, prod, mean, gfd))

    d['yscale'] = 'log'
    d['savefig'] = get_project_root() + '/reports/plots/energyreg_weight_propositions.png'
    d['title'] = 'Combination of entries in each range + IceCube performance'
    d['xlabel'] = r'log(E) [E/GeV]'
    d['ylabel'] = r'Weight value'
    fig = make_plot(d)
def generate_gms_table_converters(losses="all"):
    """Generate converters for expected values of muon length <--> muon energy
    based on the tabulated muon energy loss model [1], spline-interpolated for
    smooth behavior within the range of tabulated energies / lengths.

    Note that "gms" in the name comes from the names of the authors of the
    table used.

    Parameters
    ----------
    losses : comma-separated str or iterable of strs
        Valid sub-values are {"all", "ionization", "brems", "photonucl",
        "pair_prod"}. If any value in the list is "all", or if all of
        {"ionization", "brems", "photonucl", "pair_prod"} are specified, this
        supersedes all other choices and the CSDA range values from the table
        are used.

    Returns
    -------
    muon_energy_to_length : callable
        Call with a muon energy to return its expected length

    muon_length_to_energy : callable
        Call with a muon length to return its expected energy

    energy_bounds : tuple of 2 floats
        (lower, upper) energy limits of table; below the lower limit, lengths
        are estimated to be 0 and above the upper limit, a ValueError is
        raised; corresponding behavior is enforced for lengths passed to
        `muon_length_to_energy` as well.

    References
    ----------
    [1] D. E. Groom, N. V. Mokhov, and S. I. Striganov, Atomic Data and
        Nuclear Data Tables, Vol. 78, No. 2, July 2001, p. 312. Table II-28.
    """
    if isinstance(losses, string_types):
        losses = tuple(x.strip().lower() for x in losses.split(","))
    VALID_MECHANISMS = ("ionization", "brems", "pair_prod", "photonucl", "all")
    for mechanism in losses:
        assert mechanism in VALID_MECHANISMS
    if "all" in losses or set(losses) == set(
            m for m in VALID_MECHANISMS if m != "all"):
        losses = ("all",)

    fpath = get_project_root() + "/src/modules/retro_data/muon_stopping_power.csv"
    table = np.loadtxt(fpath, delimiter=",")

    kinetic_energy = table[:, 0]  # (GeV)
    total_energy = kinetic_energy + MUON_REST_MASS

    mev_per_gev = 1e-3
    cm_per_m = 1e2

    if "all" in losses:
        # Continuous-slowing-down-approximation (CSDA) range (cm * g / cm^3)
        csda_range = table[:, 7]
        mask = np.isfinite(csda_range)
        csda_range = csda_range[mask]
        ice_csda_range_m = csda_range / NOMINAL_ICE_DENSITY / cm_per_m  # (m)
        energy_bounds = (np.min(total_energy[mask]), np.max(total_energy[mask]))
        _, muon_energy_to_length = generate_lerp(
            x=total_energy[mask],
            y=ice_csda_range_m,
            low_behavior="constant",
            high_behavior="extrapolate",
            low_val=0,
        )
        _, muon_length_to_energy = generate_lerp(
            x=ice_csda_range_m,
            y=total_energy[mask],
            low_behavior="constant",
            high_behavior="extrapolate",
            low_val=0,
        )
    else:
        from scipy.interpolate import UnivariateSpline

        # All stopping powers given in (MeV / cm * cm^3 / g)
        stopping_power_by_mechanism = dict(
            ionization=table[:, 2],
            brems=table[:, 3],
            pair_prod=table[:, 4],
            photonucl=table[:, 5],
        )
        stopping_powers = []
        mask = np.zeros(shape=table.shape[0], dtype=bool)
        for mechanism in losses:
            addl_stopping_power = stopping_power_by_mechanism[mechanism]
            mask |= np.isfinite(addl_stopping_power)
            stopping_powers.append(addl_stopping_power)
        stopping_power = np.nansum(stopping_powers, axis=0)[mask]
        stopping_power *= cm_per_m * mev_per_gev * NOMINAL_ICE_DENSITY
        valid_energies = total_energy[mask]
        energy_bounds = (valid_energies.min(), valid_energies.max())
        sample_energies = np.logspace(
            start=np.log10(valid_energies.min()),
            stop=np.log10(valid_energies.max()),
            num=1000,
        )
        spl = UnivariateSpline(x=valid_energies, y=1 / stopping_power, s=0, k=3)
        ice_range = np.array(
            [spl.integral(valid_energies.min(), e) for e in sample_energies])
        _, muon_energy_to_length = generate_lerp(
            x=sample_energies,
            y=ice_range,
            low_behavior="constant",
            high_behavior="extrapolate",
            low_val=0,
        )
        _, muon_length_to_energy = generate_lerp(
            x=ice_range,
            y=sample_energies,
            low_behavior="constant",
            high_behavior="extrapolate",
            low_val=0,
        )

    return muon_energy_to_length, muon_length_to_energy, energy_bounds
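# NOTE: Usage sketch of the returned converters (numbers illustrative only):
# muon_energy_to_length, muon_length_to_energy, (e_lo, e_hi) = \
#     generate_gms_table_converters(losses="all")
# length_m = muon_energy_to_length(100.0)       # expected length of a 100 GeV muon (m)
# energy_gev = muon_length_to_energy(length_m)  # should round-trip to ~100 GeV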
from pathlib import Path
import subprocess

import src.modules.helper_functions as hf

for path in Path(hf.get_project_root() + '/reports/thesis_plots').iterdir():
    if path.is_dir():
        if path.name == 'all_pgf':
            continue
        print(hf.get_time(), 'Running', path.name)
        runpath = str(path) + '/script.py'
        subprocess.call(['python', runpath])
        print('')
        )
    else:
        weights, interpolator, savename = make_weights(
            name,
            ids,
            db,
            debug=args.dev,
            interpolator=interpolator,
            alpha=args.alpha
        )

    # Save in DB
    ids_strings = [str(idx) for idx in ids]
    print(get_time(), 'Writing %s to database' % (savename))
    db.write('scalar', savename, ids_strings, weights)
    print(get_time(), 'Weights saved!')

    # Save a figure of the weights
    if args.make_plot:
        if name == 'uniform_direction_weights':
            x = np.linspace(-1.0, 1.0)
        else:
            x = np.linspace(0.0, 3.0)
        y = interpolator(x)
        d = {'x': [x], 'y': [y]}
        d['savefig'] = '/'.join([get_project_root(), 'reports/plots', savename + '.png'])
        d['yscale'] = 'log'
        _ = make_plot(d)

    if args.save_interpolator:
        path = PATH_DATA_OSCNEXT + '/weights/' + savename + '.pickle'
        with open(path, 'wb') as f:
            pickle.dump(interpolator, f)
    weights, interpolator = inverse_performance_muon_energy(
        masks, dataset_path, from_frac=from_frac, to_frac=to_frac, debug=debug)
    return weights, interpolator


if __name__ == '__main__':
    # ! Could use 2*n_cpus - only ~45 % of processors are used.
    # ! Update: Even with 2*n_cpus, only ~45 % of processors are used.

    # * Choose dataset, masks and size of subset to calculate weights from
    dataset_path = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2'
    masks = ['muon_neutrino']
    names = args.name
    if not names:
        raise KeyError('Names must be supplied!')

    # * Ensure the weight directory exists
    weights_dir = dataset_path + '/weights/'
    if not Path(weights_dir).exists():
        Path(weights_dir).mkdir()

    # * from_frac and to_frac are used for the spline calculation
    from_frac, to_frac = args.from_frac, args.to_frac
    if args.dev:
        from_frac, to_frac = 0.8, 0.81
    PRINT_EVERY = 100