def find_nearest_doms(data_dir_path=get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/',
                      multiprocess=True,
                      d_name='dom_geom.pickle'):
    # * Load precalculated geometry dictionary
    d_geom = pickle.load(open(data_dir_path + d_name, 'rb'))

    # * For each entry, calculate distances to all other DOMs
    # * Extract coordinates and pair with ID
    dom_ids = [dom_id for dom_id in d_geom]
    coords = {key: items['coordinates'] for key, items in d_geom.items()}
    own_coords = [items['coordinates'] for key, items in d_geom.items()]

    print(get_time(), 'Calculation of nearest DOMs begun...')
    if multiprocess:
        # * Prepare for multiprocessing - we loop over DOM IDs
        coords_list = [coords] * len(dom_ids)
        packed = [pack for pack in zip(dom_ids, own_coords, coords_list)]
        with Pool() as p:
            dicts = p.map(find_nearest_doms_multi, packed)
    else:
        raise ValueError('Only multiprocessing implemented!')
    print(get_time(), 'Calculation finished!')

    # * Update the geometry dictionary with the closest DOMs
    for dom_id, d in zip(dom_ids, dicts):
        d_geom[dom_id].update(d)
    return d_geom
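# The worker find_nearest_doms_multi used above is not shown in this section.
# A minimal sketch of what it presumably does - compute Euclidean distances
# from one DOM to every other DOM and keep the closest few - is given below.
# The cutoff n_nearest and the returned key name are illustrative assumptions,
# not taken from the original code.
def find_nearest_doms_multi_sketch(pack, n_nearest=8):
    dom_id, own_coord, coords = pack
    # Distance from this DOM to every DOM in the detector (including itself)
    distances = {other_id: np.linalg.norm(own_coord - other_coord)
                 for other_id, other_coord in coords.items()}
    # Sort by distance and drop the first entry (the DOM itself, distance 0)
    nearest = sorted(distances, key=distances.get)[1:n_nearest + 1]
    return {'nearest_doms': nearest}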
def move_tars():
    """Script used to move tarballs of pickled data from HEP to gpulab.

    Must be run on gpulab - one cannot ssh from HEP to gpulab, only the
    other way around. Uses rsync to move the tarballs. The source,
    destination and number of tarballs must be hardcoded for now.
    """
    # * Setup - where to load data, how many events
    n_pickle_dirs = 1131
    data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    if not Path(data_dir).exists():
        Path(data_dir).mkdir()
        print(get_time(), 'Created directory %s' % (data_dir))

    from_ = '[email protected]:/groups/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/tarballs/'
    to_ = data_dir + 'tarballs/'
    if not Path(to_).exists():
        Path(to_).mkdir()
        print(get_time(), 'Created directory %s' % (to_))

    from_tarballs = [from_ + str(i) + '.tar' for i in range(n_pickle_dirs)]
    to_list = [to_ + str(i) + '.tar' for i in range(n_pickle_dirs)]

    # * Zip and multiprocess
    packed = [entry for entry in zip(from_tarballs, to_list)]
    with Pool() as p:
        p.map(move_tar, packed)
    print(get_time(), 'Finished copying tarballs!')
def make_geom_dict(data_dir_path=get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/',
                   multiprocess=True,
                   d_name='dom_geom.pickle'):
    print(get_time(), 'Making geometry dictionary...')
    shelve_path = data_dir_path + 'shelve/oscnext-genie-level5-v01-01-pass2'

    # * Get filenames
    with shelve.open(shelve_path) as f:
        filenames = [key for key in f]

    # * Prepare for multiprocessing
    path_list = [shelve_path] * len(filenames)
    packed = [entry for entry in zip(filenames, path_list)]

    # * Multiprocess
    if multiprocess:
        with Pool() as p:
            all_dicts = p.map(find_unique_ids, packed)

        # * Combine dictionaries
        print(get_time(), 'Combining dictionaries...')
        dom_geom_dict = {}
        for d in all_dicts:
            dom_geom_dict.update(d)
        print(get_time(), 'Dictionaries combined!')
    else:
        dom_geom_dict = {}
        for pack in packed:
            dom_geom_dict.update(find_unique_ids(pack))
    return dom_geom_dict
def create_transformed_db(old_db, new_db, transformers, chunksize=100000):
    # Expects new_db, which is a copy of the old db.
    # If a key from old_db is in transformers -->
    # load --> transform --> save in new db
    db_tables = old_db.tables
    tables = ['sequential', 'scalar', 'meta']
    primary_keys = ['row', 'event_no', 'event_no']

    # Since different DBs start at different indices, find the beginning
    id0 = int(old_db.ids[0])
    for table, primary_key in zip(tables, primary_keys):
        for var in db_tables[table]:
            if var in transformers:
                print('')
                if var in [
                        'dom_x', 'dom_y', 'dom_z', 'dom_charge', 'dom_time',
                        'dom_pulse_width'
                ]:
                    continue
                print(get_time(), 'Transforming', var, '...')

                # We found a variable that needs transforming. Transform it!
                transformer = transformers[var]

                # Loop over the primary key in the table
                i = 0
                if primary_key == 'row':
                    start = 0
                else:
                    start = id0

                # Keep transforming until all have been transformed
                while True:
                    _from = start + i * chunksize
                    _to = start + (i + 1) * chunksize
                    indices = [str(e) for e in np.arange(_from, _to)]
                    print(get_time(),
                          'Transforming %s - %s' % (indices[0], indices[-1]))
                    # print(indices[:10], old_db.ids[:10])
                    fetched = old_db.read(table, var, primary_key, indices)
                    n_fetched = len(fetched)
                    transformed = np.squeeze(
                        transformer.transform(fetched.reshape(-1, 1)))

                    # Write to new db
                    new_db.write(table,
                                 var,
                                 indices[:n_fetched],
                                 transformed,
                                 primary_key=primary_key)

                    # Check if we reached the end
                    if n_fetched < chunksize:
                        print(get_time(),
                              'Transformation of %s finished.' % (var))
                        break
                    else:
                        i += 1
def save_png_pgf(path, f, width=1.0, height=1.0):
    FOTW = get_frac_of_textwidth(keyword='single_fig')
    w = get_figure_width(frac_of_textwidth=FOTW)
    h = get_figure_height(width=w)
    f.set_size_inches(w * width, h * height)
    name = path.split('/')[-1]
    f.savefig(path + '.png', bbox_inches='tight')
    print(get_time(), 'Saved %s.png' % (name))
    f.savefig(path + '.pgf', bbox_inches='tight')
    print(get_time(), 'Saved %s.pgf' % (name))
def transform_events(db_path,
                     ids,
                     feature_dicts,
                     transformers,
                     n_nearest_data,
                     geom_features,
                     n_cpus=cpu_count()):
    """Transforms events.

    For each ID, the data specified by feature_dicts is calculated and/or
    transformed and placed in a dictionary under event ID --> transformed.
    Furthermore, meta-information and masks are saved as well.

    Parameters
    ----------
    db_path : str
        Full path to the Shelve database
    ids : list
        List of IDs to process
    feature_dicts : dict
        Dictionary of dictionaries specifying e.g. which transformer to use
    transformers : dict
        Dictionary of fitted transformers, keyed by feature name
    n_nearest_data : dict
        Dictionary containing the geometry data to transform and add to the
        event
    geom_features : dict
        Dictionary containing the information required to transform the
        n_nearest_data

    Returns
    -------
    dict
        Dictionary containing the transformed events.
    """
    # * Chunk IDs for multiprocessing
    n_chunks = n_cpus
    id_chunks = np.array_split(ids, n_chunks)

    # * Repack the n_nearest_data so that the IDs match
    n_nearest_chunks = [{event_id: n_nearest_data[event_id]
                         for event_id in chunk} for chunk in id_chunks]

    # * Multiprocess - prep by zipping all the required stuff for each process
    db_path_list = [db_path] * n_chunks
    transformers_list = [transformers] * n_chunks
    feature_dicts_list = [feature_dicts] * n_chunks
    geom_features_list = [geom_features] * n_chunks
    packed = zip(id_chunks, n_nearest_chunks, db_path_list, transformers_list,
                 feature_dicts_list, geom_features_list)
    with Pool(processes=n_cpus) as p:
        print(get_time(), 'Transforming events...')
        events_transformed = p.map(transform_events_multiprocess, packed)
        print(get_time(), 'Events transformed!')

    events_unpacked = {}
    for events in events_transformed:
        events_unpacked.update(events)
    return events_unpacked
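# Hedged usage sketch (not from the original source): transform_events is
# presumably driven with transformers produced by fit_transformers and with
# per-event geometry data from get_n_nearest_data. The number of events and
# the content of the feature/geometry dictionaries below are placeholders,
# not the configuration actually used in the project.
def transform_events_example(db_path, feature_dicts, geom_features, geom_dict_path):
    n_data = 100000  # assumed number of events used to fit the transformers
    ids = [str(i) for i in range(n_data)]
    transformers = fit_transformers(db_path, n_data, feature_dicts)
    n_nearest_data = get_n_nearest_data(db_path, ids, geom_features, geom_dict_path)
    return transform_events(db_path, ids, feature_dicts, transformers,
                            n_nearest_data, geom_features)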
def find_dom_interval_passed_cands(pack):
    # Unpack
    ids, db, min_doms, max_doms, dom_mask = pack
    accepted = []

    # Split into chunks
    n_chunks = len(ids) // CHUNK_SIZE
    chunks = np.array_split(ids, n_chunks)

    # Loop over chunks
    for i_chunk, chunk in enumerate(chunks):
        # Retrieve the '<MASK_NAME>_event_length' key -
        # this value is the number of DOMs in an event
        if dom_mask == 'SplitInIcePulses':
            len_key = 'split_in_ice_pulses_event_length'
        elif dom_mask == 'SRTInIcePulses':
            len_key = 'srt_in_ice_pulses_event_length'
        data_dict = db.fetch_features(all_events=chunk,
                                      meta_features=[len_key])
        for event_id, event_dict in data_dict.items():
            n_doms = event_dict[len_key]
            if min_doms <= n_doms <= max_doms:
                accepted.append(int(event_id))

        # Print for sanity
        print(get_time(),
              'Processed chunk %d of %d' % (i_chunk + 1, n_chunks))
        sys.stdout.flush()
    return accepted
def find_particles(pack):
    # Unpack
    ids, db, particle_code = pack
    accepted = []

    # Split into chunks
    n_chunks = len(ids) // CHUNK_SIZE
    chunks = np.array_split(ids, n_chunks)

    # Loop over chunks
    for i_chunk, chunk in enumerate(chunks):
        # Retrieve the 'particle_code' from meta -
        # this value determines the particle
        code_name = 'particle_code'
        data_dict = db.fetch_features(all_events=chunk,
                                      meta_features=[code_name])
        for event_id, event_dict in data_dict.items():
            code = event_dict[code_name]
            if str(code) == particle_code:
                accepted.append(int(event_id))

        # Print for sanity
        print(get_time(),
              'Processed chunk %d of %d' % (i_chunk + 1, n_chunks))
        sys.stdout.flush()
    return accepted
def find_unique_ids(pack):
    # * Unpack and notify
    datafile, path = pack
    print(get_time(), 'Processing %s' % (datafile))
    sys.stdout.flush()

    # * all_doms will be a dictionary with dom_id: coordinates.
    all_doms = {}

    # * Retrieve the DOM-ID and coordinates of each event
    # * len(keys) = number of events in file,
    # * len(keys[0]) = number of DOMs in event 0
    with shelve.open(path) as f:
        keys = f[datafile]['dom_key']
        dom_xs = f[datafile]['dom_x']
        dom_ys = f[datafile]['dom_y']
        dom_zs = f[datafile]['dom_z']

        for key, dom_x, dom_y, dom_z in zip(keys, dom_xs, dom_ys, dom_zs):
            # * Convert x, y, z into one coordinate entry as a numpy array
            coords = [{'coordinates': np.array([x, y, z])}
                      for x, y, z in zip(dom_x, dom_y, dom_z)]
            # * Update the dictionary over all events
            all_doms.update(zip(key, coords))
    return all_doms
def find_particles(pack):
    # * Unpack
    dirs, particle_code = pack
    accepted = []
    i_file = 0

    # * Loop over the given directories
    for directory in dirs:
        # * Loop over the events in the subdirectory
        for file in directory.iterdir():
            # * Check each file.
            event = pickle.load(open(file, "rb"))
            if particle_code == event['meta']['particle_code']:
                accepted.append(int(file.stem))

            # * Print for sanity
            i_file += 1
            if i_file % PRINT_EVERY == 0:
                print(get_time(), 'Subprocess: Processed %d' % (i_file))
                sys.stdout.flush()
    return accepted
def find_energy_interval_passed_cands(pack):
    # * Unpack
    dirs, min_energy, max_energy = pack
    accepted = []
    i_file = 0

    # * Loop over the given directories
    for directory in dirs:
        # * Loop over the events in the subdirectory
        for file in directory.iterdir():
            # * Check each file.
            event = pickle.load(open(file, "rb"))
            energy = event['raw']['true_primary_energy']
            if min_energy <= energy <= max_energy:
                accepted.append(int(file.stem))

            # * Print for sanity
            i_file += 1
            if i_file % PRINT_EVERY == 0:
                print(get_time(), 'Subprocess: Processed %d' % (i_file))
                sys.stdout.flush()
    return accepted
def find_dom_interval_passed_cands(pack):
    # * Unpack
    dirs, min_doms, max_doms, dom_mask, process_ID = pack
    accepted = []
    i_file = 0

    # * Loop over the given directories
    for directory in dirs:
        # * Loop over the events in the subdirectory
        for file in directory.iterdir():
            # * Check each file.
            event = pickle.load(open(file, "rb"))
            dom_indices = event['masks'][dom_mask]
            n_doms = event['raw']['dom_charge'][dom_indices].shape[0]
            if min_doms <= n_doms <= max_doms:
                accepted.append(int(file.stem))

            # * Print for sanity
            i_file += 1
            if i_file % PRINT_EVERY == 0:
                print(get_time(),
                      'Subprocess %d: Processed %d' % (process_ID, i_file))
                sys.stdout.flush()
    return accepted
def pickle_events(pack):
    # * Unpack - assumes multiprocessing
    fname, new_names, data_dir, particle_code, n_per_dir = pack
    print(get_time(), 'Pickling %s' % (Path(fname).name))

    # * Loop over events in the file - each event is turned into a .pickle
    with h5.File(fname, 'r') as f:
        n_events = f['meta/events'][()]
        for i_event, new_name in zip(range(n_events), new_names):
            event = empty_pickle_event()

            # * Fill the pickle file.
            for group in event:
                for key, data in f[group].items():
                    # * Save in numpy.float32 format - this is the format
                    # * used in the models anyway.
                    if group != 'masks':
                        event[group][key] = data[i_event].astype(np.float32)
                    else:
                        event[group][key] = data[i_event]

            # * Assign meta values - where is the event from,
            # * what kind of particle?
            event['meta'] = {}
            event['meta']['file'] = Path(fname).name
            event['meta']['index'] = i_event
            event['meta']['particle_code'] = particle_code

            # * Save it in subdirs - put n_per_dir in each directory
            dir_name = str(new_name // n_per_dir)
            new_name = data_dir + '/' + dir_name + '/' + str(new_name) + '.pickle'
            pickle.dump(event, open(new_name, 'wb'))
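# empty_pickle_event() is referenced above but not defined in this section. A
# minimal sketch of the structure it presumably returns is shown below; the
# group names are inferred from how pickle_events fills the event ('masks' is
# treated specially and 'meta' is added afterwards) and should be read as an
# assumption rather than the actual implementation.
def empty_pickle_event_sketch():
    # One sub-dictionary per h5 group that pickle_events copies over.
    return {'raw': {}, 'masks': {}}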
def save_thesis_pgf(path, f, save_pgf=False, png_name=None, pgf_name=None):
    if pgf_name is None:
        pgf = str(path.parent.stem)
    else:
        pgf = str(path.parent.stem) + '_' + pgf_name
    all_figs_path = str(path.parent.parent) + '/all_pgf/' + pgf + '.pgf'

    if png_name:
        f.savefig(str(path.parent) + '/' + png_name + '.png',
                  bbox_inches='tight')
    else:
        f.savefig(str(path.parent) + '/fig.png', bbox_inches='tight')
    print(get_time(), 'Saved .png')

    if save_pgf:
        f.savefig(all_figs_path, bbox_inches='tight')
        print(get_time(), pgf + ' saved.')
def unpack_remove_tars():
    """Script to unpack .tars holding directories with pickled events.

    Uses multiprocessing to unpack the tars with bash:
    > tar -xf <tar_location> -C <pickle_dir_location>
    For now, the source (tarball_dir) and the destination (pickle_dir) have
    to be hardcoded in the script.
    """
    # * Where are the tars located?
    tarball_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/tarballs'
    tarballs = [path for path in Path(tarball_dir).iterdir()]

    # * Where should they be put?
    pickle_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/pickles/'
    if not Path(pickle_dir).exists():
        Path(pickle_dir).mkdir()

    # * Multiprocess
    available_cores = cpu_count()
    pickle_dir_list = [pickle_dir] * len(tarballs)
    packed = [entry for entry in zip(tarballs, pickle_dir_list)]
    with Pool(available_cores + 2) as p:
        p.map(unpack_tar_remove, packed)
    print(get_time(), 'Finished unpacking tarballs!')
def make_tar(pack):
    pickle_dir, tar_dir = pack
    print(get_time(), 'Making tar of %s' % (pickle_dir))
    sys.stdout.flush()
    tar_path = tar_dir + '/' + pickle_dir.name + '.tar'
    subprocess.run(['tar', '-cf', tar_path, pickle_dir])
def transform_features(pack):
    file, transformers, keys, prefix = pack
    start = time()
    name = Path(file).name
    with h5.File(file, 'a') as f:
        n_events = f['meta/events'][()]

        # * Loop over keys and do all transformations for the whole file.
        # * scikit-learn transformers expect 2D arrays, hence we reshape
        # * into a 2D array and flatten again.
        d = {}
        for key in keys:
            # * For each key, check if already transformed -
            # * if yes, don't do it again
            if f['raw/' + key]:  # and prefix+'/'+key not in f:
                transformer = transformers[key]
                # * Prepare an empty dataset
                if f['raw/' + key][0].shape:
                    d[key] = [[]] * n_events
                    # * We must loop due to the sequential nature of
                    # * DOM sequences
                    for i_event, event in enumerate(f['raw/' + key]):
                        d[key][i_event] = transformer.transform(
                            event.reshape(-1, 1)).flatten()
                else:
                    # * For non-sequential data, we can transform the
                    # * entire set in one go
                    d[key] = transformer.transform(
                        f['raw/' + key][:].reshape(-1, 1)).flatten()

        # * Now save
        for key, data in d.items():
            dataset_path = prefix + '/' + key
            # * Check if it is a DOM-variable or global event-variable
            if data[0].shape:
                # * If the dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]
                f.create_dataset(dataset_path,
                                 data=data,
                                 dtype=h5.special_dtype(vlen=data[0][0].dtype))
            else:
                # * If the dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]
                f.create_dataset(dataset_path,
                                 data=data,
                                 dtype=data[0].dtype)

    # * Print progress for sanity...
    finish_time = time() - start
    print(hf.get_time(), 'Finished %s in %.0f seconds' % (name, finish_time))
    print('Speed: %.0f Events per second\n' % (n_events / finish_time))
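# Hedged driver sketch (not from the original source): transform_features
# consumes one packed tuple per file, so it is presumably mapped over all h5
# files with a Pool, roughly as below. The 'transformed' prefix is an
# illustrative assumption.
def transform_features_all(files, transformers, keys, prefix='transformed'):
    packed = [(file, transformers, keys, prefix) for file in files]
    with Pool() as p:
        p.map(transform_features, packed)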
def move_tar(pack):
    from_hep, to_gpu = pack
    if not Path(to_gpu).exists():
        print(get_time(), 'Copying %s' % (from_hep))
        sys.stdout.flush()
        command = 'rsync'
        subprocess.run([command, from_hep, to_gpu])
def inverse_low_E(name, ids, db, debug=False, multiprocess=True,
                  interpolator=None):
    if interpolator is None:
        raise ValueError('Not implemented yet - interpolator must be supplied')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning energy weights...')
    if multiprocess:
        weights = assign_energy_balanced_weights_multiprocess(
            ids,
            db,
            interpolator,
            true_key=['true_primary_energy'],
            debug=debug
        )
        print(get_time(), 'Energy weights assigned!')
    return weights, interpolator
def uniform_direction(name, ids, db, multiprocess=True, debug=False,
                      interpolator=None):
    # Get indices used for interpolator-calculation
    if not interpolator:
        n_events = min(len(ids), USE_N_EVENTS)
        event_ids = ids[:n_events]
        print(get_time(), 'Calculating direction bins..')
        x, counts = calc_uniform_direction_weights(event_ids, db)
        weights_unscaled = 1.0 / np.array(counts)
        print(get_time(), 'Bins calculated!')
        print(get_time(), 'Fitting interpolator')
        interpolator = make_scaled_interpolator(weights_unscaled, counts, x)
        print(get_time(), 'Interpolator fitted!')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning direction weights...')
    if multiprocess:
        weights_dict = assign_uniform_direction_weights_multiprocess(
            ids,
            db,
            interpolator,
            true_key=['true_primary_direction_z'],
            debug=debug
        )
        print(get_time(), 'Direction weights assigned!')
    return weights_dict, interpolator
def energy_balanced(name, ids, db, multiprocess=True, debug=False,
                    interpolator=None, alpha=1.0):
    # Get indices used for interpolator-calculation
    if not interpolator:
        n_events = min(len(ids), USE_N_EVENTS)
        event_ids = ids[:n_events]
        print(get_time(), 'Calculating energy bins..')
        x, counts = calc_energy_balanced_weights(event_ids, db)
        weights_unscaled = np.power(1.0 / np.array(counts), alpha)
        print(get_time(), 'Bins calculated!')
        print(get_time(), 'Fitting interpolator')
        # In this case, MAX 10 for better gradients
        interpolator = make_scaled_interpolator(weights_unscaled, counts, x)
        print(get_time(), 'Interpolator fitted!')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning energy weights...')
    if multiprocess:
        weights = assign_energy_balanced_weights_multiprocess(
            ids,
            db,
            interpolator,
            true_key=['true_primary_energy'],
            debug=debug
        )
        print(get_time(), 'Energy weights assigned!')
    return weights, interpolator
def get_n_nearest_data(db_path, id_chunk, geom_features, geom_dict_path,
                       n_cpus=cpu_count()):
    """Finds and extracts data from the nearest n DOMs.

    Parameters
    ----------
    db_path : str
        Absolute path to the Shelve database
    id_chunk : list
        List of event IDs to extract data for
    geom_features : dict
        What geometry data to extract, e.g. the nearest DOMs' x-values
    geom_dict_path : str
        Full path to the geometry dictionary (the dictionary containing the
        nearest DOMs for each DOM)

    Returns
    -------
    dict
        Data of the nearest N DOMs for each event ID
    """
    # * Chunk IDs for multiprocessing
    n_chunks = n_cpus
    id_chunks = np.array_split(id_chunk, n_chunks)

    # * Multiprocess
    db_list = [db_path] * n_chunks
    geom_features_list = [geom_features] * n_chunks
    geom_dict_list = [geom_dict_path] * n_chunks
    packed = zip(id_chunks, db_list, geom_features_list, geom_dict_list)
    with Pool(processes=n_cpus) as p:
        print(get_time(), 'Finding n nearest DOMs...')
        data = p.map(get_n_nearest_data_multiprocess, packed)
        print(get_time(), 'N nearest DOMs found!')

    # * Unpack
    all_events = {}
    for events in data:
        all_events.update(events)
    return all_events
def fit_transformers(db_path, n_data, feature_dicts, n_cpus=cpu_count()):
    # * Assumes a RANDOMIZED DB!
    ids = [str(i) for i in range(n_data)]

    # * Load/calculate features, then transform
    keys = [key for key in feature_dicts]

    # * Multiprocess
    db_list = [db_path] * len(keys)
    ids_list = [ids] * len(keys)
    n_data_list = [n_data] * len(keys)
    packed = zip(ids_list, feature_dicts.items(), db_list, n_data_list)
    with Pool(processes=n_cpus) as p:
        print(get_time(), 'Fitting transformers...')
        transformers_list = p.map(load_and_fit_transformer, packed)
        print(get_time(), 'Transformers fitted!')

    # * Make a dictionary with the transformers
    transformers = {}
    for transformer in transformers_list:
        transformers.update(transformer)
    return transformers
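# Hedged pipeline sketch (not from the original source): the transformers
# fitted above can presumably be fed into create_transformed_db to produce a
# transformed copy of a database. Whether the same database serves both steps
# depends on the project's database wrappers (old_db/new_db must expose
# .tables, .ids, .read and .write as used in create_transformed_db); the
# arguments below are placeholders.
def make_transformed_copy(db_path, old_db, new_db, n_data, feature_dicts):
    transformers = fit_transformers(db_path, n_data, feature_dicts)
    create_transformed_db(old_db, new_db, transformers)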
def feature_engineer(pack):
    """Calculates desired features for an h5 datafile and appends the new
    datasets to the file. Multiprocessing-friendly.

    Arguments:
        pack {tuple} -- a tuple containing:
            i_file {int} -- File number i of N_FILES - to track progress
            file {str} -- absolute path to the h5 datafile
            N_FILES {int} -- Total number of files to process (via multi- or
                singleprocessing)
    """
    # * Unpack. One input is expected to be compatible with multiprocessing
    i_file, file, N_FILES = pack
    name = Path(file).name

    # * Print progress for our sanity..
    print(hf.get_time(),
          'Processing %s (file %d of %d)' % (name, i_file + 1, N_FILES))

    # * Retrieve the wanted engineers - they have to be predefined in
    # * get_wanted_feature_engineers (for now)
    functions = get_wanted_feature_engineers()

    # * Now calculate the features on a per-event basis.
    d = calc_features(functions, file)

    # * Append our calculations to the datafile
    with h5.File(file, 'a') as f:
        # * Make a 'raw/' group if it doesn't exist
        if 'raw' not in f:
            raw = f.create_group("raw")

        # * Now make the datasets
        for key, data in d.items():
            dataset_path = 'raw/' + key
            # * Check if it is a DOM-variable or global event-variable
            if data[0].shape:
                # * If the dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]
                f.create_dataset(dataset_path,
                                 data=data,
                                 dtype=h5.special_dtype(vlen=data[0][0].dtype))
            else:
                # * If the dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]
                f.create_dataset(dataset_path,
                                 data=data,
                                 dtype=data[0].dtype)
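# Hedged driver sketch (not from the original source): feature_engineer takes
# (i_file, file, N_FILES) tuples, so a full run over a set of h5 files would
# presumably look roughly like this. The file list is a placeholder.
def feature_engineer_all(files):
    n_files = len(files)
    packed = [(i_file, file, n_files) for i_file, file in enumerate(files)]
    with Pool() as p:
        p.map(feature_engineer, packed)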
def move_pickle(pack):
    integer, hep_dir, gpu_dir = pack
    n_per_dir = 10000
    path = hep_dir + str(integer)
    name_range = range(integer * n_per_dir, (integer + 1) * n_per_dir)
    print(get_time(), 'Moving %s' % (path))
    sys.stdout.flush()

    command = 'scp'
    for name in name_range:
        from_ = '[email protected]:' + hep_dir + str(integer) + '/' + str(name) + '.pickle'
        to = gpu_dir + str(integer) + '/' + str(name) + '.pickle'
        subprocess.run([command, from_, to])
        event = pickle.load(open(to, "rb"))
        if event['meta']['particle_code'] != '140000':
            Path(to).unlink()
def inverse_performance_muon_energy(name, ids, db, multiprocess=True,
                                    debug=False, interpolator=None):
    """Calculates a weight for each event, proportional to the inverse of
    IceCube's (retro reconstruction) performance in the corresponding energy
    range.

    If no interpolator is supplied, one is fitted to the inverse retro sigmas
    of (at most) USE_N_EVENTS events. The weights are normalized such that
    the average weight of an event in a batch is 1.

    Arguments:
        name {str} -- name of the weighting scheme
        ids {list} -- event IDs to calculate weights for
        db {object} -- database handle used to fetch event data

    Keyword Arguments:
        multiprocess {bool} -- Whether or not to use multiprocessing when
            assigning a weight to each event (default: {True})
        debug {bool} -- debug flag passed on to the weight assignment
            (default: {False})
        interpolator {object} -- pre-fitted interpolator; if None, a new one
            is fitted (default: {None})

    Returns:
        dict -- Weights for each event
    """
    # Get indices used for interpolator-calculation
    if not interpolator:
        n_events = min(len(ids), USE_N_EVENTS)
        event_ids = ids[:n_events]
        print(get_time(), 'Calculating performance..')
        x, counts, retro_sigmas = calc_energy_performance_weights(event_ids, db)
        weights_unscaled = 1.0 / np.array(retro_sigmas)
        print(get_time(), 'Performance calculated!')
        print(get_time(), 'Fitting interpolator')
        interpolator = make_scaled_interpolator(weights_unscaled, counts, x)
        print(get_time(), 'Interpolator fitted!')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning energy weights...')
    if multiprocess:
        weights_dict = assign_energy_weights_multiprocess(
            ids,
            db,
            interpolator,
            debug=debug
        )
        print(get_time(), 'Energy weights assigned!')
    return weights_dict, interpolator
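# Hedged usage sketch (not from the original source): the weighting functions
# above (uniform_direction, energy_balanced, inverse_performance_muon_energy)
# share the same calling convention, so a typical train/validation split would
# presumably reuse the interpolator fitted on the training IDs, roughly as
# below. The scheme name is a placeholder.
def weight_train_val_example(train_ids, val_ids, db):
    train_weights, interpolator = energy_balanced('energy_balanced', train_ids, db)
    val_weights, _ = energy_balanced('energy_balanced', val_ids, db,
                                     interpolator=interpolator)
    return train_weights, val_weights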
def unpack_tar_remove(pack):
    tarball, path = pack
    command = 'tar'
    flags_tar = '-xf'
    flags_dir = '-C'
    # * The tar was created in a silly way - it is deeply nested in
    # * lustre/hpc/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/pickles/.
    # * This is unwanted. Therefore, standing in ../pickles run:
    # * mv lustre/hpc/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/pickles/* .
    subprocess.run([command, flags_tar, tarball, flags_dir, path])

    # * Remove the tarball
    tarball.unlink()
    print(get_time(), 'Unpacked and removed %s' % (tarball))
    sys.stdout.flush()
def make_tars():
    """Script to pack pickle-directories with single events into .tars.

    Must hardcode where pickles are located and where tars should be put.
    """
    # * Setup - where to load data, how many events
    data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    from_ = data_dir + 'pickles'
    to_ = data_dir + 'tarballs'
    pickle_dirs = [path for path in Path(from_).iterdir()]

    # * Zip and multiprocess
    to_list = [to_] * len(pickle_dirs)
    packed = [entry for entry in zip(pickle_dirs, to_list)]
    available_cores = cpu_count()
    with Pool(available_cores + 2) as p:
        p.map(make_tar, packed)
    print(get_time(), 'Finished making tarballs!')
def calc_weights_multiprocess(pack):
    indices, interpolator, key, path, n_per_dir, subprocess_id = pack
    weights = [-1] * len(indices)
    n_indices = len(indices)
    for i_index, index in enumerate(indices):
        # * Check each file.
        full_path = path + '/pickles/' + str(index // n_per_dir) + '/' + str(index) + '.pickle'
        event = pickle.load(open(full_path, "rb"))
        energy = event['raw']['true_primary_energy']
        weights[i_index] = interpolator(energy)
        if i_index % PRINT_EVERY == 0:
            print(get_time(),
                  'Subprocess %d: Processed %d of %d' %
                  (subprocess_id, i_index, n_indices))
            sys.stdout.flush()
    return weights
def find_energy_interval_passed_cands(pack):
    # Unpack
    ids, db, min_energy, max_energy = pack
    accepted = []
    energy_key = 'true_primary_energy'

    # Split into chunks
    n_chunks = len(ids) // CHUNK_SIZE
    chunks = np.array_split(ids, n_chunks)

    # Load transformer
    transformer_path = '/'.join(
        [PATH_DATA_OSCNEXT, 'sqlite_transformers.pickle'])
    transformers = joblib.load(open(transformer_path, 'rb'))
    transformer = transformers[energy_key]

    # Loop over chunks
    for i_chunk, chunk in enumerate(chunks):
        # Fetch energy
        data_dict = db.fetch_features(all_events=chunk,
                                      scalar_features=[energy_key])
        energies_transformed = np.array(
            [data_d[energy_key] for event_id, data_d in data_dict.items()])

        # Inverse transform
        energies = np.squeeze(
            transformer.inverse_transform(energies_transformed.reshape(-1, 1)))

        # Add or discard
        for event_id, energy in zip(data_dict.keys(), energies):
            if min_energy <= energy <= max_energy:
                accepted.append(int(event_id))

        # Print for sanity
        print(get_time(),
              'Processed chunk %d of %d' % (i_chunk + 1, n_chunks))
        sys.stdout.flush()
    return accepted
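# Hedged driver sketch (not from the original source): the *_passed_cands and
# find_particles workers above each take a single packed tuple, so event
# selection across many CPUs presumably splits the IDs first and combines the
# accepted IDs afterwards, roughly as below. The energy interval is a
# placeholder.
def select_events_example(ids, db, n_cpus=cpu_count()):
    id_chunks = np.array_split(ids, n_cpus)
    packed = [(chunk, db, 0.0, 100.0) for chunk in id_chunks]  # assumed energy interval
    with Pool(processes=n_cpus) as p:
        accepted_chunks = p.map(find_energy_interval_passed_cands, packed)
    return sorted(set().union(*accepted_chunks))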