def select_ids_jug(self, data_filters=None):
    '''Save a set of all ids that match a set of data filters to a file.

    The work is set up as a chain of jug Tasks (select -> format -> save),
    after which jug.barrier() is called before the completion message is
    printed.

    Args:
        data_filters (dict of dicts, optional):
            The data filters to apply. If omitted, an empty dict is used.
    '''

    # Use a fresh dict per call instead of a shared mutable default argument.
    if data_filters is None:
        data_filters = {}

    banner = "########################################################################"

    print(banner)
    print("Selecting IDs")
    print(banner)
    sys.stdout.flush()

    # Each step consumes the Task produced by the previous one.
    selected_ids = self.get_selected_ids_jug(data_filters)
    selected_ids_formatted = jug.Task(self.format_selected_ids, selected_ids)
    jug.Task(self.save_selected_ids, selected_ids_formatted, data_filters)

    jug.barrier()

    print(banner)
    print("Done selecting IDs!")
    sys.stdout.flush()
def get_selected_ids_jug(self, data_filters):
    '''Parallel version of self.get_selected_ids().

    Requires a lot of memory, because it will have multiple snapshots open
    at once.

    One jug Task is created for every (snum, ptype) combination of
    self.snums and self.p_types, and a final Task merges their results.

    Args:
        data_filters (dict of dicts):
            The data filters to apply. Passed through unchanged to
            self.get_selected_ids_snapshot.

    Returns:
        selected_ids (jug.Task):
            Task wrapping the merge of all per-snapshot results. The merged
            value is a numpy array when the ids are one-dimensional, or a
            set when the per-snapshot results are two-dimensional
            (e.g. ids paired with child ids).
    '''

    results = []
    for snum in self.snums:
        for ptype in self.p_types:

            # Copy so that every task gets its own snapshot arguments.
            kwargs = dict(self.snapshot_kwargs)
            kwargs['snum'] = snum
            kwargs['ptype'] = ptype

            # The snapshot function receives its arguments as one tuple.
            result = jug.Task(
                self.get_selected_ids_snapshot,
                (data_filters, kwargs),
            )
            results.append(result)

    def unify_results(given_results):
        '''Helper function to get around jug formatting.

        Args:
            given_results (list): Per-snapshot collections of ids.

        Returns:
            The union of all the given results. An empty array if there
            were no results at all.
        '''

        combined = None
        for i, data in enumerate(given_results):

            data = np.array(list(data))

            # The np method (which is supposedly faster) doesn't work for
            # results that contain child IDs too
            if len(data.shape) == 2:
                return set.union(*given_results)

            print("Adding data set {}, consisting of {} ids".format(
                i,
                data.shape,
            ))

            # The first data set seeds the union; later ones are merged in.
            if combined is None:
                combined = data
            else:
                combined = np.union1d(combined, data)

        # No snapshots or particle types were given: nothing was selected.
        if combined is None:
            return np.array([])

        return combined

    selected_ids = jug.Task(unify_results, results)

    return selected_ids
def get_tracked_data_jug(self):
    '''Set up one jug Task per snapshot that finds the tracked particles.

    This is the parallelized version (using Jug) of looping over all
    snapshots.

    Modifies:
        self.snaps (np.ndarray):
            Snapshot numbers, counting down from self.snum_end to
            self.snum_start in steps of self.snum_step.
        self.ntrack (int):
            Number of target ids (self.target_ids.size).

    Returns:
        tracked_data_snapshots (list of jug.Task):
            One Task per entry of self.snaps, in the same order. The
            function wrapped by each Task returns the tuple
            (i, dfid, redshift, attrs, snum), where i is the index of the
            snapshot in self.snaps.
    '''

    self.snaps = np.arange(
        self.snum_end,
        self.snum_start - 1,
        -self.snum_step,
    )
    self.ntrack = self.target_ids.size

    print("Tracking {} particles...".format(self.ntrack))
    sys.stdout.flush()

    # NOTE(review): the name of this inner function is left untouched, since
    # jug presumably identifies tasks by function name -- confirm before
    # renaming.
    def get_tracked_data_snapshot(args):
        '''Find the target ids in a single snapshot.'''

        i, snum = args

        t_start = time.time()

        finder = IDFinder()
        dfid, redshift, attrs = finder.find_ids(
            self.sdir,
            snum,
            self.p_types,
            self.target_ids,
            target_child_ids=self.target_child_ids,
            custom_fns=self.custom_fns,
        )

        # Maybe helps stop leaking memory
        del finder
        gc.collect()

        elapsed = time.time() - t_start

        # Report progress for this snapshot.
        message = 'Snapshot {:>3} | redshift {:>7.3g} | done in {:.3g} seconds'
        print(message.format(snum, redshift, elapsed))
        sys.stdout.flush()

        return i, dfid, redshift, attrs, snum

    # Each task receives an (index, snapshot number) pair.
    return [
        jug.Task(get_tracked_data_snapshot, index_and_snum)
        for index_and_snum in enumerate(self.snaps)
    ]
def find_galaxies_for_particle_tracks_jug(self):
    '''Main function when using jug.

    Reads the data, sets up the galaxy identification tasks, hands their
    combined result to a Task that writes it out, and finally calls
    jug.barrier().
    '''

    self.read_data()

    # Task wrapping the galaxy ids for all snapshots.
    gal_ids_task = self.get_galaxy_identification_loop_jug()

    # Writing depends on the identification result above.
    jug.Task(self.write_galaxy_identifications, gal_ids_task)

    jug.barrier()
def save_particle_tracks_jug(self):
    '''Get the tracked data for all snapshots and save the particle tracks.

    The jug version: per-snapshot tracking tasks feed a formatting Task,
    which in turn feeds a writing Task. jug.barrier() is called at the end.
    '''

    banner = "#" * 80
    print(banner)
    print("Starting Tracking!")
    print(banner)

    # The ids of the particles to track
    self.get_target_ids()

    # One task per snapshot, then a single task that formats all of them
    snapshot_tasks = self.get_tracked_data_jug()
    formatted_task = jug.Task(self.format_tracked_data, snapshot_tasks)

    # Save the formatted particle data
    jug.Task(self.write_tracked_data, formatted_task)

    jug.barrier()
def get_galaxy_identification_loop_jug(self):
    '''Identify the galaxy of each particle in every snapshot, using Jug
    for parallelism.

    One Task is created per snapshot stored in self.ptrack, plus a final
    Task that gathers the per-snapshot results into arrays.

    Modifies:
        self.ptrack_gal_ids (dict):
            Where the galaxy IDs are stored. Filled in by the gathering
            task: one (n_particles, n_snaps) array per key returned by the
            galaxy linker. Only created if the attribute does not exist
            yet.

    Returns:
        jug.Task: Task wrapping the gathering step, whose function returns
            self.ptrack_gal_ids.
    '''

    # NOTE(review): the inner function names are left untouched, since jug
    # presumably identifies tasks by function name -- confirm before renaming.
    def get_galaxy_and_halo_ids(i):
        '''Get the galaxy and halo ids for a single snapshot.'''

        # Positions of all particles at snapshot index i
        positions = self.ptrack['P'][...][:, i]

        # Everything else GalaxyLinker needs
        linker_kwargs = dict(
            halo_data=None,
            galaxy_cut=self.galaxy_cut,
            length_scale=self.length_scale,
            mt_length_scale=self.mt_length_scale,
            ids_to_return=self.ids_to_return,
            minimum_criteria=self.minimum_criteria,
            minimum_value=self.minimum_value,
            redshift=self.ptrack['redshift'][...][i],
            snum=self.ptrack['snum'][...][i],
            hubble=self.ptrack.attrs['hubble'],
            halo_data_dir=self.halo_data_dir,
            mtree_halos_index=self.mtree_halos_index,
            main_mt_halo_id=self.main_mt_halo_id,
            halo_file_tag=self.halo_file_tag,
        )

        t_start = time.time()

        # Link the particles of this snapshot to galaxies
        linker = galaxy_linker.GalaxyLinker(positions, **linker_kwargs)
        found_ids = linker.find_ids()

        t_end = time.time()

        message = 'Snapshot {:>3} | redshift {:>7.3g} | done in {:.3g} seconds'
        print(
            message.format(
                linker_kwargs['snum'],
                linker_kwargs['redshift'],
                t_end - t_start,
            )
        )
        sys.stdout.flush()

        # Try to avoid memory leaks
        del linker_kwargs
        del linker
        gc.collect()

        return found_ids

    n_snaps = self.ptrack['snum'][...].size
    n_particles = self.ptrack['P'][...].shape[0]

    # Submit one Jug Task per included snapshot
    galaxy_and_halo_ids_all = [
        jug.Task(get_galaxy_and_halo_ids, i)
        for i in range(n_snaps)
    ]

    assert len(galaxy_and_halo_ids_all) == n_snaps

    def store_results(per_snapshot_ids):
        '''Gather the per-snapshot results into self.ptrack_gal_ids.'''

        for i, snapshot_ids in enumerate(per_snapshot_ids):

            # Allocate the storage arrays the first time around, taking
            # the dtype from the first value of each key.
            if not hasattr(self, 'ptrack_gal_ids'):
                self.ptrack_gal_ids = {}
                for key, values in snapshot_ids.items():
                    self.ptrack_gal_ids[key] = np.empty(
                        (n_particles, n_snaps),
                        dtype=type(values[0]),
                    )

            # Column i holds the results for snapshot index i
            for key, values in snapshot_ids.items():
                self.ptrack_gal_ids[key][:, i] = values

            # Try clearing up memory again, in case gal_linker
            # is hanging around
            del snapshot_ids
            gc.collect()

        return self.ptrack_gal_ids

    return jug.Task(store_results, galaxy_and_halo_ids_all)
def run_linefinder_jug(
    tag,
    out_dir = None,
    sim_data_dir = None,
    halo_data_dir = None,
    main_mt_halo_id = None,
    sim_name = None,
    galdef = None,
    selector_data_filters = None,
    selector_kwargs = None,
    sampler_kwargs = None,
    tracker_kwargs = None,
    gal_linker_kwargs = None,
    classifier_kwargs = None,
    visualization_kwargs = None,
    run_id_selecting = True,
    run_id_sampling = True,
    run_tracking = True,
    run_galaxy_linking = True,
    run_classifying = True,
    run_visualization = True,
):
    '''Main function for running linefinder.

    Args:
        tag (str):
            Filename identifier for data products.

        out_dir (str):
            Output directory to store the data in.

        sim_data_dir (str):
            Directory the simulation data is stored in.

        halo_data_dir (str):
            Directory the halo data (e.g. AHF output) is stored in.
            Halo data is necessary for linking particles to galaxies.

        main_mt_halo_id (int):
            Halo ID for the main merger tree halo that's being tracked.
            If not provided defaults to 0 (or whatever value is cataloged
            for the sim name).
            NOTE(review): this argument is not read anywhere in this
            function; the value used comes from gal_linker_kwargs or the
            config -- confirm whether it should be forwarded.

        sim_name (str):
            Name of the simulation this is being run for. If provided then
            linefinder will automatically choose the location of the
            simulation and halo data, according to the linefinder.config
            file. The sim_data_dir or halo_data_dir arguments directly
            overwrite this.

        galdef (str):
            Which set of parameters to use for the galaxy_linking and
            classification steps? Defaults to the parameters in
            linefinder.config

        selector_data_filters (dict):
            Data filters to pass to select.IDSelector.select_ids_jug()

        selector_kwargs (dict):
            Arguments to use when selecting what particles to track.
            Arguments will be passed to select.IDSelector

        sampler_kwargs (dict):
            Arguments to use when sampling from the selected particles.
            Arguments will be passed to select.IDSampler

        tracker_kwargs (dict):
            Arguments to use when tracking particles.
            Arguments will be passed to track.ParticleTracker

        gal_linker_kwargs (dict):
            Arguments to use when associating particles with galaxies.
            Arguments will be passed to
            galaxy_link.ParticleTrackGalaxyLinker

        classifier_kwargs (dict):
            Arguments to use when classifying particles.
            Arguments will be passed to classify.Classifier

        visualization_kwargs (dict):
            Arguments to use when visualizing the data.
            Arguments will be passed to visualize.export_to_firefly

        run_id_selecting (bool):
            If True, then run routines for selecting particles.

        run_id_sampling (bool):
            If True, then run routines for sampling from the full list of
            selected particles.

        run_tracking (bool):
            If True, then run routines for tracking particles.

        run_galaxy_linking (bool):
            If True, then run routines for associating particles with
            galaxies.

        run_classifying (bool):
            If True, then run routines for classifying particles.

        run_visualization (bool):
            If True, then run routines for visualizing the data.
    '''

    # Omitted dict arguments become fresh dicts, so that no mutable default
    # is shared between calls.
    if selector_data_filters is None:
        selector_data_filters = {}
    if selector_kwargs is None:
        selector_kwargs = {}
    if sampler_kwargs is None:
        sampler_kwargs = {}
    if tracker_kwargs is None:
        tracker_kwargs = {}
    if gal_linker_kwargs is None:
        gal_linker_kwargs = {}
    if classifier_kwargs is None:
        classifier_kwargs = {}

    # The visualization arguments are modified in place below, so work on a
    # copy rather than on the caller's dict.
    if visualization_kwargs is None:
        visualization_kwargs = {}
    else:
        visualization_kwargs = dict( visualization_kwargs )

    # Expand data dirs, if possible
    if out_dir is not None:
        out_dir = os.path.expandvars( out_dir )
    if sim_data_dir is not None:
        sim_data_dir = os.path.expandvars( sim_data_dir )
    if halo_data_dir is not None:
        halo_data_dir = os.path.expandvars( halo_data_dir )

    # Set up for auto-retrieval, if chosen
    file_manager = None
    if sim_name is not None:
        file_manager = file_management.FileManager()

        if out_dir is None:
            out_dir = file_manager.get_linefinder_dir( sim_name )

    # Setup for galaxy definitions, if chosen
    galdef_dict = None
    if galdef is not None:
        galdef_dict = linefinder_config.GALAXY_DEFINITIONS[galdef]

    # Setup jugdata
    jugdir_tail = '{}.jugdata'.format( tag )
    jug.set_jugdir( os.path.join( out_dir, jugdir_tail ) )

    print( "Starting jug thread..." )

    # These are kwargs that could be used at any stage of running linefinder.
    general_kwargs = {
        'out_dir': out_dir,
        'tag': tag,
    }

    def fill_snapshot_kwargs( stage_kwargs ):
        '''Return a completed copy of stage_kwargs['snapshot_kwargs'].

        Explicit sim_data_dir / halo_data_dir arguments take precedence,
        then values already in the snapshot kwargs, then the defaults
        looked up through sim_name.
        '''

        # Copy, so that a dict supplied by the caller is not modified.
        if 'snapshot_kwargs' in stage_kwargs:
            snapshot_kwargs = dict( stage_kwargs['snapshot_kwargs'] )
        else:
            snapshot_kwargs = {}

        # Add in sim data dir if given
        if sim_data_dir is not None:
            snapshot_kwargs['sdir'] = sim_data_dir

        # Add in halo data dir if given
        if halo_data_dir is not None:
            snapshot_kwargs['halo_data_dir'] = halo_data_dir

        # Use sim name to find defaults
        if sim_name is not None:
            if 'sdir' not in snapshot_kwargs:
                snapshot_kwargs['sdir'] = file_manager.get_sim_dir( sim_name )
            if 'halo_data_dir' not in snapshot_kwargs:
                snapshot_kwargs['halo_data_dir'] = \
                    file_manager.get_halo_dir( sim_name )
            if 'main_halo_id' not in snapshot_kwargs:
                snapshot_kwargs['main_halo_id'] = \
                    linefinder_config.MAIN_MT_HALO_ID[sim_name]

        return snapshot_kwargs

    # Run the ID Selecting
    if run_id_selecting:

        # Update arguments
        selector_kwargs = utilities.merge_two_dicts(
            selector_kwargs, general_kwargs )
        selector_kwargs['snapshot_kwargs'] = \
            fill_snapshot_kwargs( selector_kwargs )

        id_selector = select.IDSelector( **selector_kwargs )
        id_selector.select_ids_jug( selector_data_filters )

    # Run the ID Sampling
    if run_id_sampling:

        # Update arguments
        sampler_kwargs = utilities.merge_two_dicts(
            sampler_kwargs, general_kwargs )
        sampler_kwargs['snapshot_kwargs'] = \
            fill_snapshot_kwargs( sampler_kwargs )

        id_sampler = select.IDSampler( **sampler_kwargs )
        jug.Task( id_sampler.sample_ids )
        jug.barrier()

    # Run the Particle Tracking
    if run_tracking:

        # Update arguments
        tracker_kwargs = utilities.merge_two_dicts(
            tracker_kwargs, general_kwargs )

        # Add in sim data dir if given
        if sim_data_dir is not None:
            tracker_kwargs['sdir'] = sim_data_dir

        # Choose the sdir automatically, if possible
        if 'sdir' not in tracker_kwargs:

            # Try and load the default values if using the file manager.
            if sim_name is not None:
                tracker_kwargs['sdir'] = file_manager.get_sim_dir( sim_name )

            # Try to use the sdir passed to the selector kwargs. The lookup
            # must be done in the snapshot kwargs dict itself, not in the
            # string 'snapshot_kwargs'.
            elif 'snapshot_kwargs' in selector_kwargs:
                if 'sdir' in selector_kwargs['snapshot_kwargs']:
                    tracker_kwargs['sdir'] = \
                        selector_kwargs['snapshot_kwargs']['sdir']

        particle_tracker = track.ParticleTracker( **tracker_kwargs )
        particle_tracker.save_particle_tracks_jug()

    # Run the Galaxy Finding
    if run_galaxy_linking:

        # Update arguments
        gal_linker_kwargs = utilities.merge_two_dicts(
            gal_linker_kwargs, general_kwargs )

        # Add in halo data dir if given
        if halo_data_dir is not None:
            gal_linker_kwargs['halo_data_dir'] = halo_data_dir

        if sim_name is not None:
            if 'halo_data_dir' not in gal_linker_kwargs:
                gal_linker_kwargs['halo_data_dir'] = \
                    file_manager.get_halo_dir( sim_name )
            if 'main_mt_halo_id' not in gal_linker_kwargs:
                gal_linker_kwargs['main_mt_halo_id'] = \
                    linefinder_config.MAIN_MT_HALO_ID[sim_name]

        # Default to halo 0 if MT halo ID not given
        if 'main_mt_halo_id' not in gal_linker_kwargs:
            gal_linker_kwargs['main_mt_halo_id'] = 0

        if galdef is not None:
            for key in [ 'galaxy_cut', 'length_scale', 'mt_length_scale' ]:
                gal_linker_kwargs[key] = galdef_dict[key]

        particle_track_gal_linker = galaxy_link.ParticleTrackGalaxyLinker(
            **gal_linker_kwargs )
        particle_track_gal_linker.find_galaxies_for_particle_tracks_jug()

    # Run the Classification
    if run_classifying:

        # Update arguments
        classifier_kwargs = utilities.merge_two_dicts(
            classifier_kwargs, general_kwargs )

        # Add in halo data dir if given
        if halo_data_dir is not None:
            classifier_kwargs['halo_data_dir'] = halo_data_dir

        if sim_name is not None:
            if 'halo_data_dir' not in classifier_kwargs:
                classifier_kwargs['halo_data_dir'] = \
                    file_manager.get_halo_dir( sim_name )

        if galdef is not None:
            for key in [ 't_pro', 't_m', ]:
                classifier_kwargs[key] = galdef_dict[key]

        classifier = classify.Classifier( **classifier_kwargs )
        jug.Task( classifier.classify_particles )

    # Run Visualizing
    if run_visualization:

        # Add in halo data dir if given
        if halo_data_dir is not None:
            visualization_kwargs['halo_data_dir'] = halo_data_dir

        if sim_name is not None:
            if 'halo_data_dir' not in visualization_kwargs:
                visualization_kwargs['halo_data_dir'] = \
                    file_manager.get_halo_dir( sim_name )

            # Only fill in the cataloged halo ID when the caller gave none
            # under either spelling, so that a user-supplied 'main_halo_id'
            # is not overwritten.
            halo_id_given = (
                'main_mt_halo_id' in visualization_kwargs
                or 'main_halo_id' in visualization_kwargs
            )
            if not halo_id_given:
                visualization_kwargs['main_halo_id'] = \
                    linefinder_config.MAIN_MT_HALO_ID[sim_name]

        jug.Task(
            visualize.export_to_firefly,
            tag = tag,
            data_dir = out_dir,
            **visualization_kwargs
        )

        # Make a file indicating that the visualization completed.
        # NOTE(review): there is no jug.barrier() between creating the task
        # above and touching this file, so the file presumably marks that the
        # task was set up rather than that it finished -- confirm.
        marker_path = os.path.join( out_dir, 'visualized_{}'.format(tag ) )
        with open( marker_path, 'a' ):
            pass