def __dispatch_raw_gradient_interpolation(self, event: str, verbose=False):
    """
    Take the gradient out of the adjoint simulations and
    interpolate them to the inversion grid.

    Nothing is done when the gradient interpolation of the event is
    already flagged as submitted.

    :param event: Name of event
    :type event: str
    :param verbose: Print a message when the interpolation has already
        been submitted, defaults to False
    :type verbose: bool, optional
    """
    submitted, _ = self.__submitted_retrieved(event, "gradient_interp")
    if submitted:
        if verbose:
            self.print(
                f"Interpolation for gradient {event} "
                "has already been submitted"
            )
        return

    hpc_cluster = get_site(self.comm.project.interpolation_site)
    # The weights folder used to be computed in two identical branches
    # (local site vs. any other site); a single computation is enough.
    interp_folder = os.path.join(
        self.comm.project.remote_inversionson_dir,
        "INTERPOLATION_WEIGHTS",
        "GRADIENTS",
        event,
    )
    if not hpc_cluster.remote_exists(interp_folder):
        hpc_cluster.remote_mkdir(interp_folder)

    # Here I need to make sure that the correct layers are interpolated
    # I can just do this by specifying the layers, rather than saying
    # nocore. That's less nice though of course. Could be specified
    # in the config file. Then it should work fine.
    self.comm.multi_mesh.interpolate_gradient_to_model(
        event, smooth=False, interp_folder=interp_folder
    )
def __prepare_forward(self, event: str):
    """
    Interpolate model to a simulation mesh

    The remote folder for the model interpolation weights of the event
    is created when missing, then the preparation is handed over to the
    multi_mesh component and the iteration toml is updated. Returns
    immediately when the job is already flagged as submitted.

    :param event: Name of event
    :type event: str
    """
    already_submitted, _ = self.__submitted_retrieved(
        event, sim_type="prepare_forward"
    )
    if already_submitted:
        return

    site = get_site(self.comm.project.interpolation_site)
    weights_dir = os.path.join(
        self.comm.project.remote_inversionson_dir,
        "INTERPOLATION_WEIGHTS",
        "MODELS",
        event,
    )
    folder_missing = not site.remote_exists(weights_dir)
    if folder_missing:
        site.remote_mkdir(weights_dir)

    self.comm.multi_mesh.prepare_forward(event=event)
    self.comm.project.update_iteration_toml()
def construct_simulation_from_dict(self, event: str):
    """
    Build a local simulation object from a simulation dictionary that
    lives on the remote, without having any of the relevant data locally.

    The dictionary is downloaded from the output of the
    "prepare_forward" job unless a local copy already exists. All mesh
    related file names in it are pointed at the local master model
    before the simulation object is created.
    Only used to submit a job to the remote without having to store
    anything locally.

    :param event: Name of event
    :type event: str
    :return: Whatever self.simulation_from_dict returns for the patched
        dictionary
    """
    # Always write events to the same folder
    dict_path = (
        self.comm.lasif.lasif_comm.project.paths["salvus_files"]
        / "SIMULATION_DICTS"
        / event
        / "simulation_dict.toml"
    )
    dict_folder = dict_path.parent
    if not os.path.exists(dict_folder):
        os.makedirs(dict_folder)

    if not os.path.exists(dict_path):
        # No local copy yet: fetch it from the interpolation job output
        site = sapi.get_site(self.comm.project.site_name)
        prepare_job = self.get_job(event, sim_type="prepare_forward")
        remote_path = (
            prepare_job.stdout_path.parent / "output" / "simulation_dict.toml"
        )
        site.remote_get(remotepath=remote_path, localpath=dict_path)

    simulation_dict = toml.load(dict_path)

    master_model_path = self.comm.lasif.get_master_model()
    master_mesh = self.comm.lasif.get_master_mesh()
    domain = simulation_dict["domain"]
    for section in ("mesh", "model", "geometry"):
        domain[section]["filename"] = master_model_path

    return self.simulation_from_dict(simulation_dict, master_mesh)
def upload_stf(self, iteration: str, hpc_cluster=None):
    """
    Upload the source time function to the remote machine

    The local file is written first when it does not exist. On the
    remote, the iteration folder is created when missing and the file is
    only uploaded when it is not there yet.

    :param iteration: Name of iteration
    :type iteration: str
    :param hpc_cluster: Site object to upload to. When None, the site
        named by the project's site_name is used, defaults to None
    """
    stf_on_disk = self.find_stf(iteration=iteration)
    if not os.path.exists(stf_on_disk):
        write_custom_stf(stf_path=stf_on_disk, comm=self.lasif_comm)

    site = hpc_cluster
    if site is None:
        site = get_site(self.comm.project.site_name)

    remote_stf_dir = (
        self.comm.project.remote_inversionson_dir
        / "SOURCE_TIME_FUNCTIONS"
        / iteration
    )
    remote_stf = remote_stf_dir / "stf.h5"

    if not site.remote_exists(remote_stf_dir):
        site.remote_mkdir(remote_stf_dir)

    if site.remote_exists(remote_stf):
        # Already uploaded, nothing left to do
        return
    site.remote_put(stf_on_disk, remote_stf)
def __cut_and_clip_gradient(self, event, verbose=False):
    """
    Cut sources and receivers from gradient before summing or potential
    smoothing. We also clip the gradient to some percentile
    This can all be configured in information toml.

    The work is done by a script which is copied to the remote site and
    executed there through ssh. Nothing happens when the cut radius is
    0.0 and the clipping percentile is 1.0.

    :param event: name of the event
    :param verbose: Currently not used by this method
    :raises Exception: When the remote script does not report success;
        carries the captured stdout and stderr
    """
    if (
        self.comm.project.cut_source_radius == 0.0
        and self.comm.project.clip_gradient == 1.0
    ):
        return

    job = self.comm.salvus_flow.get_job(event, "adjoint")
    output_files = job.get_output_files()
    gradient_path = output_files[0][("adjoint", "gradient", "output_filename")]

    # Connect to the remote site
    hpc_cluster = get_site(self.comm.project.site_name)

    remote_inversionson_dir = os.path.join(
        self.comm.project.remote_inversionson_dir, "GRADIENT_PROCESSING"
    )
    if not hpc_cluster.remote_exists(remote_inversionson_dir):
        hpc_cluster.remote_mkdir(remote_inversionson_dir)

    # copy processing script to hpc
    remote_script = os.path.join(remote_inversionson_dir, "cut_and_clip.py")
    if not hpc_cluster.remote_exists(remote_script):
        hpc_cluster.remote_put(CUT_SOURCE_SCRIPT_PATH, remote_script)

    info = {
        "filename": str(gradient_path),
        "cutout_radius_in_km": self.comm.project.cut_source_radius,
        "source_location": self.comm.lasif.get_source(event_name=event),
        "clipping_percentile": self.comm.project.clip_gradient,
        "parameters": self.comm.project.inversion_params,
    }

    toml_filename = f"{event}_gradient_process.toml"
    remote_toml = os.path.join(remote_inversionson_dir, toml_filename)
    try:
        with open(toml_filename, "w") as fh:
            toml.dump(info, fh)
        # put toml on the remote
        hpc_cluster.remote_put(toml_filename, remote_toml)
    finally:
        # The local toml is only a vehicle for the upload; remove it even
        # when writing or uploading failed so it does not pile up locally.
        if os.path.exists(toml_filename):
            os.remove(toml_filename)

    # Call script
    _, stdout, stderr = hpc_cluster.run_ssh_command(
        f"python {remote_script} {remote_toml}"
    )
    if "Remote source cut completed successfully" in stdout:
        self.print(
            f"Source cut and clip completed for {event}.",
            emoji_alias=":scissors:",
        )
    else:
        print("Something went wrong in cutting and clipping on the remote.")
        raise Exception(stdout, stderr)
def __get_custom_job(self, event: str, sim_type: str):
    """
    A get_job function which handles job types which are not of type
    salvus.flow.sites.salvus_job.SalvusJob

    The job name stored in the project is looked up in the Salvus job
    database and a Job object is rebuilt from the stored entry without
    initializing it on the site.

    :param event: Name of event
    :type event: str
    :param sim_type: Type of simulation, one of "prepare_forward",
        "hpc_processing" or "gradient_interp"
    :type sim_type: str
    :raises InversionsonError: When the requested job has not been
        submitted yet, or when sim_type is not one of the supported types
    :return: The reconstructed job object
    """
    gradient = False
    if sim_type == "prepare_forward":
        if self.comm.project.prepare_forward_job[event]["submitted"]:
            job_name = self.comm.project.prepare_forward_job[event]["name"]
        else:
            raise InversionsonError(
                f"Model interpolation job for event: {event} "
                "has not been submitted"
            )
    elif sim_type == "hpc_processing":
        if self.comm.project.hpc_processing_job[event]["submitted"]:
            job_name = self.comm.project.hpc_processing_job[event]["name"]
        else:
            raise InversionsonError(
                f"HPC processing job for event: {event} "
                "has not been submitted"
            )
    elif sim_type == "gradient_interp":
        gradient = True
        if self.comm.project.gradient_interp_job[event]["submitted"]:
            job_name = self.comm.project.gradient_interp_job[event]["name"]
        else:
            raise InversionsonError(
                f"Gradient interpolation job for event: {event} "
                "has not been submitted"
            )
    else:
        # Without this branch job_name would be unbound further down and
        # the caller would only see an UnboundLocalError.
        raise InversionsonError(
            f"Job type: {sim_type} is not a supported custom job type"
        )

    site_name = self.comm.project.interpolation_site
    db_job = sapi._get_config()["db"].get_jobs(
        limit=1,
        site_name=site_name,
        job_name=job_name,
    )[0]

    job = s_job.Job(
        site=sapi.get_site(site_name=db_job.site.site_name),
        commands=self.comm.multi_mesh.get_interp_commands(event, gradient),
        job_type=db_job.job_type,
        job_info=db_job.info,
        jobname=db_job.job_name,
        job_description=db_job.description,
        wall_time_in_seconds=db_job.wall_time_in_seconds,
        working_dir=pathlib.Path(db_job.working_directory),
        tmpdir_root=pathlib.Path(db_job.temp_directory_root)
        if db_job.temp_directory_root
        else None,
        rundir_root=pathlib.Path(db_job.run_directory_root)
        if db_job.run_directory_root
        else None,
        job_groups=[i.group_name for i in db_job.groups],
        # The job already exists on the site, do not set it up again
        initialize_on_site=False,
    )
    return job
def get_simulation_time_step(self, event=None):
    """
    Get the timestep from a forward job if it does not exist yet.

    The result is stored in self.simulation_time_step; nothing is
    returned. When a cached "simulation_timestep.toml" exists it is read.
    When it does not exist and no event is passed, the attribute is set
    to False. When it does not exist and an event is passed, the stdout
    of that event's forward job is downloaded and the value following
    the "(CFL)" token is used as the time step, cached, and the folder
    with cached simulation dicts is removed.

    :param event: Name of an event with a forward job to read the time
        step from, defaults to None
    """
    misc_folder = os.path.join(self.comm.project.paths["documentation"], "MISC")
    if not os.path.exists(misc_folder):
        os.mkdir(misc_folder)
    timestep_file = os.path.join(misc_folder, "simulation_timestep.toml")

    if os.path.exists(timestep_file) or event is None:
        # Either there is a cached value, or there is no job to ask.
        if os.path.exists(timestep_file):
            time_dict = toml.load(timestep_file)
            self.simulation_time_step = time_dict["time_step"]
        else:
            self.simulation_time_step = False
        return

    local_stdout = os.path.join(
        self.comm.project.paths["documentation"], "stdout_for_timestep_test"
    )
    hpc_cluster = sapi.get_site(self.comm.project.site_name)
    forward_job = self.comm.salvus_flow.get_job(event=event, sim_type="forward")

    stdout = forward_job.path / "stdout"
    hpc_cluster.remote_get(stdout, local_stdout)
    with open(local_stdout, "r") as fh:
        stdout_str_split = fh.read().split()
    if os.path.exists(local_stdout):
        os.remove(local_stdout)

    if "(CFL)" not in stdout_str_split:
        return

    time_step_idx = stdout_str_split.index("(CFL)") + 1
    try:
        time_step = float(stdout_str_split[time_step_idx])
        # basic check to see if timestep makes some sense
        if 0.00001 < time_step < 1000:
            time_step_dict = dict(time_step=time_step)
            with open(timestep_file, "w") as fh:
                toml.dump(time_step_dict, fh)
            self.simulation_time_step = time_step

            simulation_dict_folder = (
                self.comm.lasif.lasif_comm.project.paths["salvus_files"]
                / "SIMULATION_DICTS"
            )
            # Clear cache of simulation dicts with the old checkpointing
            # settings. The folder may not exist yet; removing a missing
            # folder used to raise and thereby reset the valid time step
            # to False in the handler below.
            if os.path.exists(simulation_dict_folder):
                shutil.rmtree(simulation_dict_folder)
    except Exception as e:
        print(e)
        self.simulation_time_step = False
def _move_model_to_cluster(
    self,
    hpc_cluster=None,
    overwrite: bool = False,
    validation: bool = False,
):
    """
    The model is moved to a dedicated directory on cluster

    :param hpc_cluster: A Salvus site object, defaults to None
    :type hpc_cluster: salvus.flow.Site, optional
    :param overwrite: Overwrite mesh already there?, defaults to False
    :type overwrite: bool, optional
    :param validation: Move the model of the validation iteration
        ("validation_<iteration>") instead of the optimizer's current
        model, defaults to False
    :type validation: bool, optional
    """
    site = hpc_cluster
    if site is None:
        site = get_site(self.comm.project.interpolation_site)

    optimizer = self.comm.project.get_optimizer()
    iteration = optimizer.iteration_name
    if not validation:
        model_file = optimizer.model_path
    else:
        print("It's validation!")
        iteration = f"validation_{iteration}"
        model_file = self.comm.multi_mesh.find_model_file(iteration)

    already_there, remote_mesh = self.has_remote_mesh(
        event=None,
        interpolate_to=False,
        gradient=False,
        hpc_cluster=site,
        iteration=iteration,
        validation=validation,
    )

    if already_there and overwrite:
        site.remote_put(model_file, remote_mesh)
        return
    if already_there:
        self.print(
            f"Model for iteration {iteration} already on cluster",
            emoji_alias=":white_check_mark:",
        )
        return

    # The model is not on the cluster yet
    remote_folder = remote_mesh.parent
    if not site.remote_exists(remote_folder):
        self.print("Making the directory")
        self.print(f"Directory is: {remote_folder}")
        site.remote_mkdir(remote_folder)
    self.print(f"Path to mesh is: {remote_mesh}")
    site.remote_put(model_file, remote_mesh)
    self.print("Did it")
def preprocess_remote_gradient(comm, gradient_path: str, event: str):
    """
    Cut sources and receivers from gradient before smoothing.
    We also clip the gradient to some percentile
    This can all be configured in information toml.

    The processing script and a toml file with the settings are copied
    to the remote site and the script is executed there through ssh. The
    output of the ssh command is printed.

    :param comm: inversionson communicator
    :param gradient_path: gradient path on remote
    :type gradient_path: str
    :param event: name of the event
    :type event: str
    :raises InversionsonError: When a receiver cut radius larger than
        zero is configured, as that is not supported remotely
    """
    # Fail before touching the remote when the configuration asks for
    # something that is not supported.
    if comm.project.cut_receiver_radius > 0.0:
        raise InversionsonError("Remote receiver cutting not implemented yet.")

    # Connect to the remote site
    daint = get_site(comm.project.site_name)
    username = daint.config["ssh_settings"]["username"]

    # NOTE(review): the scratch root is hard coded, which ties this
    # function to one specific file system layout.
    remote_inversionson_dir = os.path.join(
        "/scratch/snx3000", username, "smoothing_info"
    )
    if not daint.remote_exists(remote_inversionson_dir):
        daint.remote_mkdir(remote_inversionson_dir)

    # copy processing script to the remote site
    remote_script = os.path.join(remote_inversionson_dir, "cut_and_clip.py")
    if not daint.remote_exists(remote_script):
        daint.remote_put(CUT_SOURCE_SCRIPT_PATH, remote_script)

    info = {
        "filename": str(gradient_path),
        "cutout_radius_in_km": comm.project.cut_source_radius,
        "source_location": comm.lasif.get_source(event_name=event),
        "clipping_percentile": comm.project.clip_gradient,
        "parameters": comm.project.inversion_params,
    }

    toml_filename = f"{event}_gradient_process.toml"
    remote_toml = os.path.join(remote_inversionson_dir, toml_filename)
    try:
        with open(toml_filename, "w") as fh:
            toml.dump(info, fh)
        # put toml on the remote site
        daint.remote_put(toml_filename, remote_toml)
    finally:
        # Remove the local toml, also when writing or uploading failed
        if os.path.exists(toml_filename):
            os.remove(toml_filename)

    # Call script
    print(daint.run_ssh_command(f"python {remote_script} {remote_toml}"))
def get_remote_field_moving_script_path(self):
    """
    Make sure the field moving script is present on the interpolation
    site and return its remote path.

    The script is stored as "move_fields.py" in a "scripts" folder below
    the user's folder in "/users"; the folder is created and the script
    uploaded when they are missing.

    :return: Path to the script on the remote site
    :rtype: str
    """
    site = get_site(self.comm.project.interpolation_site)
    username = site.config["ssh_settings"]["username"]

    # NOTE(review): the "/users" root is hard coded — confirm it matches
    # every site this is used with.
    remote_inversionson_scripts = os.path.join("/users", username, "scripts")
    if not site.remote_exists(remote_inversionson_scripts):
        site.remote_mkdir(remote_inversionson_scripts)

    # copy processing script to the remote site
    script_name = "move_fields.py"
    remote_script = os.path.join(remote_inversionson_scripts, script_name)
    if not site.remote_exists(remote_script):
        # The local and remote script must be the same file. Previously
        # "cut_and_clip.py" was uploaded under the name "move_fields.py".
        # NOTE(review): assumes move_fields.py is shipped in
        # REMOTE_SCRIPT_PATHS — confirm.
        site.remote_put(
            os.path.join(REMOTE_SCRIPT_PATHS, script_name), remote_script
        )
    return remote_script
def _move_mesh_to_cluster(self, event: str, gradient=False, hpc_cluster=None):
    """
    Move the mesh to the cluster for interpolation

    Without an event, either the example gradient or the model is moved,
    depending on the gradient flag. With an event, the mesh of that
    event is uploaded unless it is already on the cluster.

    :param event: Name of event
    :type event: str
    :param gradient: Only used when event is None: move the example
        gradient instead of the model, defaults to False
    :type gradient: bool, optional
    :param hpc_cluster: A site object, defaults to None
    :raises InversionsonError: When there is no local mesh for the event
    """
    if event is None:
        if gradient:
            message = "Moving example gradient to cluster"
            move = self.move_gradient_to_cluster
        else:
            # This happens when we want to move the model to the cluster
            message = "Moving model to cluster"
            move = self._move_model_to_cluster
        self.print(message, emoji_alias=":package:")
        move(hpc_cluster)
        return

    mesh_found, local_mesh = lapi.find_event_mesh(self.lasif_comm, event)
    if not mesh_found:
        raise InversionsonError(f"Mesh for event {event} does not exist.")

    # Get remote connection
    site = hpc_cluster
    if site is None:
        site = get_site(self.comm.project.interpolation_site)

    remote_mesh = self.find_remote_mesh(
        event=event,
        interpolate_to=True,
        check_if_exists=False,
        hpc_cluster=site,
    )
    if not site.remote_exists(remote_mesh.parent):
        site.remote_mkdir(remote_mesh.parent)

    if site.remote_exists(remote_mesh):
        return
    self.print(
        f"Moving mesh for event {event} to cluster", emoji_alias=":package:"
    )
    site.remote_put(local_mesh, remote_mesh)
def process_data(self, event: str):
    """
    Process the data for the periods specified in Lasif project.

    When remote data processing is enabled and the processed file of the
    event exists on the remote, it is downloaded instead of being
    computed. In every other case the data is processed locally through
    the LASIF api. Nothing is done for already processed events.

    :param event: Name of event to be processed
    :type event: str
    """
    if self._already_processed(event):
        return

    if self.comm.project.remote_data_processing:
        # Get local proc filename
        lasif_root = self.comm.project.lasif_root
        proc_filename = (
            f"preprocessed_{int(self.comm.project.min_period)}s_"
            f"to_{int(self.comm.project.max_period)}s.h5"
        )
        local_proc_folder = os.path.join(
            lasif_root, "PROCESSED_DATA", "EARTHQUAKES", event
        )
        local_proc_file = os.path.join(local_proc_folder, proc_filename)

        # makedirs instead of mkdir: the parent folders may be missing too,
        # in which case mkdir raises FileNotFoundError.
        os.makedirs(local_proc_folder, exist_ok=True)

        remote_proc_file_name = f"{event}_{proc_filename}"
        hpc_cluster = get_site(self.comm.project.site_name)

        remote_processed_dir = os.path.join(
            self.comm.project.remote_inversionson_dir, "PROCESSED_DATA"
        )
        remote_proc_path = os.path.join(
            remote_processed_dir, remote_proc_file_name
        )
        # Download under a temporary name so that an interrupted transfer
        # does not leave a partial file under the final name.
        tmp_local_path = local_proc_file + "_tmp"
        if hpc_cluster.remote_exists(remote_proc_path):
            hpc_cluster.remote_get(remote_proc_path, tmp_local_path)
            os.rename(tmp_local_path, local_proc_file)
            return  # Return if it got it and got it there.

    lapi.process_data(self.lasif_comm, events=[event])
def move_toml_to_hpc(
    self, toml_filename: pathlib.Path, event: str, hpc_cluster=None
):
    """
    Move information file to HPC so that it can perform mesh generation
    and interpolation

    The file ends up in the event's folder below the project's
    fast_mesh_dir, keeping its file name. The folder is created on the
    remote when missing.

    :param toml_filename: path to local toml
    :type toml_filename: pathlib.Path
    :param event: name of event
    :type event: str
    :param hpc_cluster: the cluster site object, defaults to None
    :type hpc_cluster: Salvus.site, optional
    :return: Path of the file on the remote
    :rtype: str
    """
    site = hpc_cluster
    if site is None:
        site = sapi.get_site(self.comm.project.interpolation_site)

    event_dir = pathlib.Path(self.comm.project.fast_mesh_dir) / event
    remote_toml = event_dir / toml_filename.name
    remote_folder = remote_toml.parent

    if not site.remote_exists(remote_folder):
        site.remote_mkdir(remote_folder)
    site.remote_put(toml_filename, remote_toml)
    return str(remote_toml)
def has_remote_mesh(
    self,
    event: str,
    gradient: bool,
    interpolate_to: bool = True,
    hpc_cluster=None,
    iteration: str = None,
    validation: bool = False,
):
    """
    Just to check if remote mesh exists

    The remote path is resolved through self.find_remote_mesh (without
    an existence check there) and then tested on the site.

    :param event: Name of event
    :type event: str
    :param gradient: Is it a gradient?
    :type gradient: bool
    :param interpolate_to: Mesh to interpolate to?, defaults to True
    :type interpolate_to: bool, optional
    :param hpc_cluster: you can pass the site object. Defaults to None
    :type hpc_cluster: salvus.flow.Site, optional
    :param iteration: Name of an iteration, defaults to None
    :type iteration: str, optional
    :param validation: Passed on to find_remote_mesh, defaults to False
    :type validation: bool, optional
    :return: Whether the mesh exists on the remote, and its remote path
    :rtype: tuple
    """
    site = hpc_cluster
    if site is None:
        site = get_site(self.comm.project.interpolation_site)

    lookup = dict(
        event=event,
        hpc_cluster=site,
        check_if_exists=False,
        iteration=iteration,
        validation=validation,
        interpolate_to=interpolate_to,
        gradient=gradient,
    )
    remote_mesh = self.find_remote_mesh(**lookup)
    is_there = site.remote_exists(remote_mesh)
    return is_there, remote_mesh
def move_gradient_to_cluster(self, hpc_cluster=None, overwrite: bool = False):
    """
    Empty gradient moved to a dedicated directory on cluster

    The empty gradient is a copy of the master model in which the
    inversion parameters are filled with zeroes. It is created locally
    and then uploaded.

    :param hpc_cluster: A Salvus site object, defaults to None
    :type hpc_cluster: salvus.flow.Site, optional
    :param overwrite: Upload again even when the gradient is already on
        the cluster, defaults to False
    :type overwrite: bool, optional
    """
    site = hpc_cluster
    if site is None:
        site = get_site(self.comm.project.interpolation_site)

    on_cluster, remote_grad = self.has_remote_mesh(
        event=None,
        interpolate_to=True,
        gradient=True,
        hpc_cluster=site,
        iteration=None,
        validation=False,
    )
    if on_cluster and not overwrite:
        self.print(
            "Empty gradient already on cluster",
            emoji_alias=":white_check_mark:",
        )
        return

    # Build the empty gradient locally from the master model
    gradient_file = self.lasif_comm.project.paths["models"] / "GRADIENT" / "mesh.h5"
    gradient_folder = gradient_file.parent
    if not os.path.exists(gradient_folder):
        os.makedirs(gradient_folder)
    master_model = self.get_master_model()
    shutil.copy(master_model, gradient_file)
    self.comm.salvus_mesher.fill_inversion_params_with_zeroes(gradient_file)

    # Ship it
    remote_folder = remote_grad.parent
    if not site.remote_exists(remote_folder):
        site.remote_mkdir(remote_folder)
    site.remote_put(gradient_file, remote_grad)
def _launch_hpc_processing_job(self, event):
    """
    Here, we launch a job to select windows and get adjoint sources
    for an event.

    The processed data is uploaded when it is not on the remote yet, a
    toml file describing the task and the processing script are put on
    the remote, and a job running the script is launched on the
    interpolation site. The job name and the submitted flag are stored
    in the project. Nothing is done when the job is already flagged as
    submitted.

    :param event: Name of event
    :raises ValueError: When the inversion parameters contain neither
        "VPV" nor "VP", so no parameterization can be chosen
    """
    submitted, _ = self.__submitted_retrieved(event, "hpc_processing")
    if submitted:
        return

    if not self.comm.project.remote_data_processing:
        self.__process_data(event)

    iteration = self.comm.project.current_iteration
    forward_job = sapi.get_job(
        site_name=self.comm.project.site_name,
        job_name=self.comm.salvus_flow.get_job_name(
            event=event, sim_type="forward"
        ),
    )

    # Get forward paths
    remote_syn_path = str(forward_job.output_path / "receivers.h5")
    forward_meta_json_filename = str(forward_job.output_path / "meta.json")

    # Get local proc filename
    lasif_root = self.comm.project.lasif_root
    proc_filename = (
        f"preprocessed_{int(self.comm.project.min_period)}s_"
        f"to_{int(self.comm.project.max_period)}s.h5"
    )
    local_proc_file = os.path.join(
        lasif_root, "PROCESSED_DATA", "EARTHQUAKES", event, proc_filename
    )
    remote_proc_file_name = f"{event}_{proc_filename}"

    hpc_cluster = get_site(self.comm.project.site_name)

    remote_processed_dir = os.path.join(
        self.comm.project.remote_inversionson_dir, "PROCESSED_DATA"
    )
    remote_adj_dir = os.path.join(
        self.comm.project.remote_inversionson_dir, "ADJOINT_SOURCES"
    )
    remote_receiver_dir = os.path.join(
        self.comm.project.remote_inversionson_dir, "RECEIVERS"
    )
    for dir_name in (remote_processed_dir, remote_adj_dir, remote_receiver_dir):
        if not hpc_cluster.remote_exists(dir_name):
            # Create the folder that is actually missing. This used to
            # always create remote_processed_dir, so the other two
            # folders were never made.
            hpc_cluster.remote_mkdir(dir_name)

    remote_proc_path = os.path.join(remote_processed_dir, remote_proc_file_name)
    # Upload under a temporary name and rename afterwards, so that a
    # partial upload is never found under the final name.
    tmp_remote_path = remote_proc_path + "_tmp"
    if not hpc_cluster.remote_exists(remote_proc_path):
        hpc_cluster.remote_put(local_proc_file, tmp_remote_path)
        hpc_cluster.run_ssh_command(f"mv {tmp_remote_path} {remote_proc_path}")

    if "VPV" in self.comm.project.inversion_params:
        parameterization = "tti"
    elif "VP" in self.comm.project.inversion_params:
        parameterization = "rho-vp-vs"
    else:
        # Previously this case surfaced as an unbound local variable
        raise ValueError(
            "Could not determine the parameterization: neither 'VPV' nor "
            "'VP' is part of the inversion parameters."
        )

    info = dict(
        processed_filename=remote_proc_path,
        synthetic_filename=remote_syn_path,
        forward_meta_json_filename=forward_meta_json_filename,
        parameterization=parameterization,
        event_name=event,
        delta=self.comm.project.simulation_dict["time_step"],
        npts=self.comm.project.simulation_dict["number_of_time_steps"],
        iteration_name=iteration,
        minimum_period=self.comm.project.min_period,
        maximum_period=self.comm.project.max_period,
        start_time_in_s=self.comm.project.simulation_dict["start_time"],
        receiver_json_path=os.path.join(
            remote_receiver_dir, f"{event}_receivers.json"
        ),
        ad_src_type=self.comm.project.ad_src_type,
    )

    toml_filename = f"{iteration}_{event}_adj_info.toml"
    remote_toml = os.path.join(remote_adj_dir, toml_filename)
    try:
        with open(toml_filename, "w") as fh:
            toml.dump(info, fh)
        # Put info toml on the remote
        hpc_cluster.remote_put(toml_filename, remote_toml)
    finally:
        # Remove the local toml, also when writing or uploading failed
        if os.path.exists(toml_filename):
            os.remove(toml_filename)

    # Copy processing script to hpc
    remote_script = os.path.join(remote_adj_dir, "window_and_calc_adj_src.py")
    if not hpc_cluster.remote_exists(remote_script):
        hpc_cluster.remote_put(PROCESS_OUTPUT_SCRIPT_PATH, remote_script)

    # Now submit the job
    description = f"HPC processing of {event} for iteration {iteration}"
    wall_time = self.comm.project.hpc_processing_wall_time

    from salvus.flow.sites import job, remote_io_site

    commands = [
        remote_io_site.site_utils.RemoteCommand(
            command="mkdir output", execute_with_mpi=False
        ),
        remote_io_site.site_utils.RemoteCommand(
            command=f"python {remote_script} {remote_toml}",
            execute_with_mpi=False,
        ),
    ]
    # Allow to set conda environment first
    if self.comm.project.remote_conda_env:
        conda_command = [
            remote_io_site.site_utils.RemoteCommand(
                command=f"conda activate {self.comm.project.remote_conda_env}",
                execute_with_mpi=False,
            )
        ]
        commands = conda_command + commands

    # Named hpc_job so the imported ``job`` module is not shadowed
    hpc_job = job.Job(
        site=sapi.get_site(self.comm.project.interpolation_site),
        commands=commands,
        job_type="hpc_processing",
        job_description=description,
        job_info={},
        wall_time_in_seconds=wall_time,
        no_db=False,
    )

    # Store the name before launching so the job can be found again
    self.comm.project.change_attribute(
        attribute=f'hpc_processing_job["{event}"]["name"]',
        new_value=hpc_job.job_name,
    )
    hpc_job.launch()
    self.comm.project.change_attribute(
        attribute=f'hpc_processing_job["{event}"]["submitted"]',
        new_value=True,
    )
    self.print(f"HPC Processing job for event {event} submitted")
    self.comm.project.update_iteration_toml()
def get_interp_commands(
    self,
    event: str,
    gradient: bool,
) -> list:
    """
    Get the interpolation commands needed to do remote interpolations.
    If not gradient, we will look for a smoothie mesh and create it if
    needed.

    As a side effect, the interpolation toml of the event is prepared
    and moved to the interpolation site.

    :param event: Name of event
    :type event: str
    :param gradient: Are the commands for a gradient interpolation?
    :type gradient: bool
    :return: The remote commands in the order they are to be executed
    :rtype: list
    """

    def shell(command_text):
        # All commands of this job run without MPI
        return remote_io_site.site_utils.RemoteCommand(
            command=command_text, execute_with_mpi=False
        )

    project = self.comm.project

    # TODO Add average model option here
    # This might be a validation model
    average_model = bool(
        project.is_validation_event(event)
        and project.use_model_averaging
        and "00000" not in project.current_iteration
    )

    optimizer = project.get_optimizer()
    if gradient:
        mesh_to_interpolate_from = self.comm.lasif.find_remote_mesh(
            event=event,
            gradient=True,
            interpolate_to=False,
            validation=False,
        )
    else:
        mesh_to_interpolate_from = optimizer.get_remote_model_path(
            model_average=average_model
        )

    interpolation_script = self.find_interpolation_script()
    interp_site = sapi.get_site(project.interpolation_site)
    local_toml = self.prepare_interpolation_toml(
        gradient=gradient, event=event, hpc_cluster=interp_site
    )
    remote_toml = self.move_toml_to_hpc(
        toml_filename=local_toml,
        event=event,
        hpc_cluster=interp_site,
    )

    commands = [
        shell(f"cp {remote_toml} ./interp_info.toml"),
        shell(f"cp {mesh_to_interpolate_from} ./from_mesh.h5"),
        shell(f"cp {interpolation_script} ./interpolate.py"),
        shell("mkdir output"),
        shell("python interpolate.py ./interp_info.toml"),
    ]

    if project.remote_data_processing and not gradient:
        data_site = get_site(project.site_name)
        remote_processed_dir = os.path.join(
            project.remote_inversionson_dir, "PROCESSED_DATA"
        )
        proc_filename = f"preprocessed_{int(self.comm.project.min_period)}s_to_{int(self.comm.project.max_period)}s.h5"
        remote_proc_file_name = f"{event}_{proc_filename}"
        remote_proc_path = os.path.join(
            remote_processed_dir, remote_proc_file_name
        )
        if not data_site.remote_exists(remote_proc_path):
            # The raw data has to be available in the run directory
            raw_file = os.path.join(project.remote_raw_data_dir, f"{event}.h5")
            commands.insert(0, shell(f"cp {raw_file} raw_event_data.h5"))

    # The environment set-up goes in front of everything else
    if project.remote_conda_env:
        commands.insert(
            0, shell(f"conda activate {self.comm.project.remote_conda_env}")
        )
    if project.remote_conda_source_location:
        commands.insert(
            0, shell(f"source {self.comm.project.remote_conda_source_location}")
        )
    return commands
def prepare_interpolation_toml(self, gradient, event, hpc_cluster=None):
    """
    Write the local toml file which tells the remote interpolation job
    what to do, and return its path.

    An already existing file for the event is loaded and its entries are
    refreshed, otherwise the information is built from scratch. For a
    model interpolation (not gradient) the remote folders for processed
    data and receivers are created when missing.

    :param gradient: Is the file for a gradient interpolation?
    :param event: Name of event
    :param hpc_cluster: Site object, only used when not gradient. When
        None, the project's interpolation site is used, defaults to None
    :return: Path to the written toml file
    """
    project = self.comm.project

    toml_name = "prepare_forward.toml"
    if gradient:
        toml_name = "gradient_interp.toml"
    toml_filename = project.inversion_root / "INTERPOLATION" / event / toml_name
    if not os.path.exists(toml_filename.parent):
        os.makedirs(toml_filename.parent)

    remote_weights_path = os.path.join(
        project.remote_inversionson_dir,
        "INTERPOLATION_WEIGHTS",
        "GRADIENTS" if gradient else "MODELS",
        event,
    )

    # if exists, we can update the important parameters. and skip the rest.
    if os.path.exists(toml_filename):
        information = toml.load(toml_filename)
    else:
        information = {}

    information["gradient"] = gradient
    information["mesh_info"] = {
        "event_name": event,
        "mesh_folder": str(project.fast_mesh_dir),
        "long_term_mesh_folder": str(project.remote_mesh_dir),
        "min_period": project.min_period,
        "elems_per_quarter": project.elem_per_quarter,
        "interpolation_weights": remote_weights_path,
        "elems_per_wavelength": project.elem_per_wavelength,
    }
    information["data_processing"] = bool(
        not gradient and project.remote_data_processing
    )
    information["multi-mesh"] = bool(project.meshes == "multi-mesh")

    if gradient:
        # Provide information for cut and clipping
        information["cutout_radius_in_km"] = project.cut_source_radius
        information["source_location"] = self.comm.lasif.get_source(
            event_name=event
        )
        information["clipping_percentile"] = project.clip_gradient
        information["parameters"] = project.inversion_params
    else:
        proc_filename = f"preprocessed_{int(self.comm.project.min_period)}s_to_{int(self.comm.project.max_period)}s.h5"
        remote_proc_name = f"{event}_{proc_filename}"
        if hpc_cluster is None:
            hpc_cluster = sapi.get_site(project.interpolation_site)

        remote_processed_dir = os.path.join(
            project.remote_inversionson_dir, "PROCESSED_DATA"
        )
        remote_proc_path = os.path.join(remote_processed_dir, remote_proc_name)
        if not hpc_cluster.remote_exists(remote_processed_dir):
            hpc_cluster.remote_mkdir(remote_processed_dir)

        information["processing_info"] = {
            "minimum_period": project.min_period,
            "maximum_period": project.max_period,
            "npts": project.simulation_dict["number_of_time_steps"],
            "dt": project.time_step,
            "start_time_in_s": project.start_time,
            "asdf_input_filename": "raw_event_data.h5",
            "asdf_output_filename": remote_proc_path,
            "preprocessing_tag": self.comm.lasif.lasif_comm.waveforms.preprocessing_tag,
        }

        remote_receiver_dir = os.path.join(
            project.remote_inversionson_dir, "RECEIVERS"
        )
        if not hpc_cluster.remote_exists(remote_receiver_dir):
            hpc_cluster.remote_mkdir(remote_receiver_dir)
        information["receiver_json_path"] = os.path.join(
            remote_receiver_dir, f"{event}_receivers.json"
        )

    # If we have a dict already, we can just update it with the proper
    # remote mesh files and also we don't need to create the simulation
    # dict again in the interpolation job.
    local_simulation_dict = (
        self.comm.lasif.lasif_comm.project.paths["salvus_files"]
        / "SIMULATION_DICTS"
        / event
        / "simulation_dict.toml"
    )
    # Only create simulation dict when we don't have it yet.
    information["create_simulation_dict"] = not os.path.exists(
        local_simulation_dict
    )

    if not gradient:
        if project.ellipticity:
            information["ellipticity"] = 0.0033528106647474805
        if project.topography["use"]:
            information["mesh_info"]["topography"] = project.topography
        if project.ocean_loading["use"]:
            information["mesh_info"]["ocean_loading"] = project.ocean_loading

        source_info = self.comm.lasif.get_source(event_name=event)
        if isinstance(source_info, list):
            source_info = source_info[0]
        if project.ocean_loading["use"] and project.meshes != "multi-mesh":
            source_info["side_set"] = "r1_ol"
        else:
            source_info["side_set"] = "r1"
        source_info["stf"] = str(
            project.remote_inversionson_dir
            / "SOURCE_TIME_FUNCTIONS"
            / project.current_iteration
            / "stf.h5"
        )
        information["source_info"] = source_info

        # this is a slow step, so let's skip it if we can
        if not os.path.exists(toml_filename) and not project.remote_data_processing:
            information["receiver_info"] = self.comm.lasif.get_receivers(
                event_name=event
            )

        side_sets = []
        if project.absorbing_boundaries:
            domain_side_sets = (
                self.comm.lasif.lasif_comm.project.domain.get_side_set_names()
            )
            if "inner_boundary" in domain_side_sets:
                side_sets = ["inner_boundary"]
            else:
                side_sets = ["r0", "t0", "t1", "p0", "p1"]

        information["simulation_info"] = {
            "end_time": project.end_time,
            "time_step": project.time_step,
            "start_time": project.start_time,
            "minimum_period": self.comm.lasif.lasif_comm.project.simulation_settings[
                "minimum_period_in_s"
            ],
            "simulation_time_step": project.simulation_time_step,
            "attenuation": project.attenuation,
            "absorbing_boundaries": project.absorbing_boundaries,
            "side_sets": side_sets,
            "absorbing_boundary_length": project.abs_bound_length * 1000.0,
        }

    with open(toml_filename, "w") as fh:
        toml.dump(information, fh)
    return toml_filename
def construct_remote_interpolation_job(self, event: str, gradient=False):
    """
    Assemble the custom Salvus job that is submitted to the interpolation
    site to interpolate either the model or the gradient of one event.

    For a model job with remote data processing enabled, extra wall time
    is requested when the processed data file is missing on the remote or
    the local simulation dict is missing. If both are present and the
    project does not use "multi-mesh", the prepare_forward job of the
    event is flagged as submitted and retrieved and no job is built.

    :param event: Name of event
    :type event: str
    :param gradient: Are we interpolating the gradient?, defaults to False
    :type gradient: bool, optional
    :return: The constructed job, or None when nothing has to be run
    """
    project = self.comm.project
    kind = "gradient" if gradient else "model"
    description = f"Interpolation of {kind} for event {event}"

    # Interpolation wall time is only budgeted in multi-mesh mode.
    wall_time = 0.0
    if project.meshes == "multi-mesh":
        wall_time += project.model_interp_wall_time

    if project.remote_data_processing and not gradient:
        hpc_cluster = get_site(project.site_name)
        processed_dir = os.path.join(
            project.remote_inversionson_dir, "PROCESSED_DATA"
        )
        period_band = (
            f"preprocessed_{int(project.min_period)}s"
            f"_to_{int(project.max_period)}s.h5"
        )
        remote_proc_path = os.path.join(
            processed_dir, f"{event}_{period_band}"
        )

        # The locally cached simulation dict of the forward run.
        forward_simulation_dict = (
            self.comm.lasif.lasif_comm.project.paths["salvus_files"]
            / "SIMULATION_DICTS"
            / event
            / "simulation_dict.toml"
        )

        # A job is needed as soon as either the processed data on the
        # remote or the local dict is missing.
        data_on_remote = hpc_cluster.remote_exists(remote_proc_path)
        if not (data_on_remote and os.path.exists(forward_simulation_dict)):
            wall_time += project.remote_data_proc_wall_time
        elif project.meshes != "multi-mesh":
            # Nothing to process and nothing to interpolate: mark the
            # job as done and skip its construction.
            for flag in ("submitted", "retrieved"):
                project.change_attribute(
                    attribute=f'prepare_forward_job["{event}"]["{flag}"]',
                    new_value=True,
                )
            return None

    # Gradient interpolation uses its own wall time budget.
    if gradient:
        wall_time = project.grad_interp_wall_time

    return job.Job(
        site=sapi.get_site(project.interpolation_site),
        commands=self.get_interp_commands(event=event, gradient=gradient),
        job_type="interpolation",
        job_description=description,
        job_info={},
        wall_time_in_seconds=wall_time,
        no_db=False,
    )
def get_adjoint_source_object(self, event_name: str) -> object:
    """
    Generate the adjoint source object for the respective event.

    The names of the receivers that have an adjoint source are read from
    the local adjoint source file or, when ``hpc_processing`` is enabled,
    from the misfit dict downloaded from the output of the processing job.
    Receiver locations and rotation matrices are taken from the
    ``meta.json`` of the forward run (local file, or downloaded from the
    forward job when ``hpc_processing`` is enabled).

    :param event_name: Name of event
    :type event_name: str
    :return: Adjoint source object for salvus: a list with one
        ``VectorPoint3D`` per receiver that has an adjoint source
    :rtype: object
    :raises KeyError: If a receiver with an adjoint source is not listed
        in the ``meta.json`` of the forward run
    """
    iteration = self.comm.project.current_iteration
    receivers = self.comm.lasif.get_receivers(event_name)
    hpc_processing = self.comm.project.hpc_processing

    if not hpc_processing:
        # h5py is only needed to inspect the local adjoint source file.
        import h5py

        adjoint_filename = self.comm.lasif.get_adjoint_source_file(
            event=event_name, iteration=iteration
        )
        # The context manager closes the file even if reading it fails.
        with h5py.File(adjoint_filename, "r") as adjoint_file:
            adjoint_recs = set(adjoint_file.keys())

        # meta.json of the local synthetics holds the receiver positions.
        meta_json_filename = os.path.join(
            self.comm.project.lasif_root,
            "SYNTHETICS",
            "EARTHQUAKES",
            f"ITERATION_{iteration}",
            event_name,
            "meta.json",
        )
    else:
        forward_job = self.get_job(event_name, sim_type="forward")
        hpc_cluster = get_site(self.comm.project.site_name)

        # Download the meta.json of the remote synthetics into a scratch
        # file in the working directory (removed again further down).
        meta_json_filename = "meta.json"
        if os.path.exists(meta_json_filename):
            os.remove(meta_json_filename)
        hpc_cluster.remote_get(
            forward_job.output_path / "meta.json", meta_json_filename
        )

        proc_job = self.get_job(event_name, sim_type="hpc_processing")
        proc_output = proc_job.stdout_path.parent / "output"
        adjoint_filename = "REMOTE:" + str(proc_output / "stf.h5")

        local_misfit_dict = "misfit_dict.toml"
        if os.path.exists(local_misfit_dict):
            os.remove(local_misfit_dict)
        hpc_cluster.remote_get(
            str(proc_output / "misfit_dict.toml"), local_misfit_dict
        )
        try:
            misfits = toml.load(local_misfit_dict)
        finally:
            # Never leave the scratch copy behind, also not on a parse
            # error.
            if os.path.exists(local_misfit_dict):
                os.remove(local_misfit_dict)
        adjoint_recs = set(misfits[event_name])

    def has_adjoint_source(network, station):
        # Both "NET_STA" and "NET.STA" naming schemes are accepted.
        return (
            network + "_" + station in adjoint_recs
            or network + "." + station in adjoint_recs
        )

    # Need to make sure we only take receivers with an adjoint source
    adjoint_sources = [
        rec
        for rec in receivers
        if has_adjoint_source(rec["network-code"], rec["station-code"])
    ]

    try:
        with open(meta_json_filename) as json_file:
            data = json.load(json_file)
    finally:
        # Only the downloaded scratch copy is removed, never the local
        # meta.json of the synthetics.
        if hpc_processing and os.path.exists(meta_json_filename):
            os.remove(meta_json_filename)

    # Build meta info dict
    meta_recs = data["forward_run_input"]["output"]["point_data"]["receiver"]
    meta_info_dict = {}
    for rec in meta_recs:
        if not has_adjoint_source(rec["network_code"], rec["station_code"]):
            continue
        rec_name = rec["network_code"] + "_" + rec["station_code"]
        meta_info_dict[rec_name] = {
            # this is the rotation from XYZ to ZNE,
            # we still need to transpose to get ZNE -> XYZ
            "rotation_on_input": {
                "matrix": np.array(
                    rec["rotation_on_output"]["matrix"]
                ).T.tolist()
            },
            "location": rec["location"],
        }

    adj_src = []
    for rec in adjoint_sources:
        rec_name = rec["network-code"] + "_" + rec["station-code"]
        rec_info = meta_info_dict[rec_name]
        location = rec_info["location"]
        adj_src.append(
            source.cartesian.VectorPoint3D(
                x=location[0],
                y=location[1],
                z=location[2],
                fx=1.0,
                fy=1.0,
                fz=1.0,
                source_time_function=stf.Custom(
                    filename=adjoint_filename,
                    dataset_name="/" + rec_name,
                ),
                rotation_on_input=rec_info["rotation_on_input"],
            )
        )
    return adj_src
def get_sims_for_smoothing_task(
    self,
    reference_model,
    model_to_smooth,
    smoothing_lengths,
    smoothing_parameters,
):
    """
    Write diffusion models based on a reference model and smoothing
    lengths, upload them to the remote cluster if they do not exist there
    yet, and return a list of diffusion simulations (one per smoothing
    parameter) that can then be submitted as usual.

    :param reference_model: Mesh file with the velocities on which
        smoothing lengths are based. This file should be locally present.
    :type reference_model: str
    :param model_to_smooth: Mesh file with the fields that require
        smoothing. This may either be a file that is located on the HPC
        already or a local file that still needs to be uploaded. If it is
        located on the remote already, pass a path that contains the
        "REMOTE:" prefix.
    :type model_to_smooth: str
    :param smoothing_lengths: List of floats that specify the smoothing
        lengths
    :type smoothing_lengths: list
    :param smoothing_parameters: List of strings that specify which
        parameters need smoothing
    :type smoothing_parameters: list
    :return: List of validated diffusion simulation objects
    :rtype: list
    :raises NotImplementedError: If a non-velocity parameter needs
        smoothing but neither VPV nor VP is an inversion parameter
    """
    import shlex

    import salvus.flow.simple_config as sc
    from salvus.opt import smoothing
    from salvus.flow.api import get_site

    project = self.comm.project
    ref_model_name = ".".join(reference_model.split("/")[-1].split(".")[:-1])
    freq = 1.0 / project.min_period
    hpc_cluster = get_site(project.site_name)
    remote_diff_dir = project.remote_diff_model_dir
    local_diff_model_dir = "DIFFUSION_MODELS"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(local_diff_model_dir, exist_ok=True)
    if not hpc_cluster.remote_exists(remote_diff_dir):
        hpc_cluster.remote_mkdir(remote_diff_dir)

    def upload_via_tmp(local_path, remote_path):
        # Upload under a temporary name and rename afterwards; presumably
        # so that an interrupted transfer is never found under the final
        # name. Paths are shell-quoted because they end up in a command.
        tmp_path = remote_path + "_tmp"
        hpc_cluster.remote_put(local_path, tmp_path)
        hpc_cluster.run_ssh_command(
            f"mv {shlex.quote(tmp_path)} {shlex.quote(remote_path)}"
        )

    if "REMOTE:" not in model_to_smooth:
        print(f"Uploading initial values from: {model_to_smooth} "
              f"for smoothing.")
        file_name = model_to_smooth.split("/")[-1]
        remote_file_path = os.path.join(remote_diff_dir, file_name)
        upload_via_tmp(model_to_smooth, remote_file_path)
        model_to_smooth = "REMOTE:" + remote_file_path

    # Loop invariants: the P velocity used for non-velocity parameters
    # and the identifier that encodes smoothing lengths and period.
    inversion_params = project.inversion_params
    if "VPV" in inversion_params:
        p_velocity = "VPV"
    elif "VP" in inversion_params:
        p_velocity = "VP"
    else:
        p_velocity = None
    unique_id = (
        "_".join(str(i).replace(".", "") for i in smoothing_lengths)
        + "_"
        + str(project.min_period)
    )

    sims = []
    for param in smoothing_parameters:
        if param.startswith("V"):
            reference_velocity = param
        elif p_velocity is not None:
            # If it is not some velocity, use P velocities
            reference_velocity = p_velocity
        else:
            raise NotImplementedError(
                "Inversionson always expects "
                "to get models with at least VP"
            )

        diff_model_name = (
            unique_id + f"diff_model_{ref_model_name}_{param}.h5"
        )
        remote_diff_model = os.path.join(remote_diff_dir, diff_model_name)
        diff_model_file = os.path.join(local_diff_model_dir, diff_model_name)

        # Diffusion models are cached locally and on the remote.
        if not os.path.exists(diff_model_file):
            smooth = smoothing.AnisotropicModelDependent(
                reference_frequency_in_hertz=freq,
                smoothing_lengths_in_wavelengths=smoothing_lengths,
                reference_model=reference_model,
                reference_velocity=reference_velocity,
            )
            diff_model = smooth.get_diffusion_model(reference_model)
            diff_model.write_h5(diff_model_file)
        if not hpc_cluster.remote_exists(remote_diff_model):
            upload_via_tmp(diff_model_file, remote_diff_model)

        sim = sc.simulation.Diffusion(mesh=diff_model_file)
        sim.domain.polynomial_order = project.smoothing_tensor_order
        # With "auto" the time step is left unset on the simulation.
        if project.smoothing_timestep != "auto":
            sim.physics.diffusion_equation.time_step_in_seconds = (
                project.smoothing_timestep
            )
        sim.physics.diffusion_equation.courant_number = 0.06
        sim.physics.diffusion_equation.initial_values.filename = (
            model_to_smooth
        )
        sim.physics.diffusion_equation.initial_values.format = "hdf5"
        sim.physics.diffusion_equation.initial_values.field = f"{param}"
        sim.physics.diffusion_equation.final_values.filename = f"{param}.h5"

        # The simulation itself runs against the remote copy of the
        # diffusion model.
        sim.domain.mesh.filename = "REMOTE:" + remote_diff_model
        sim.domain.model.filename = "REMOTE:" + remote_diff_model
        sim.domain.geometry.filename = "REMOTE:" + remote_diff_model
        sim.validate()

        sims.append(sim)
    return sims
def __run_forward_simulation(self, event: str, verbose=False):
    """
    Submit the forward simulation of an event, unless it was submitted
    before.

    The simulation object is either rebuilt from the dict of the
    prepare_forward job or constructed from source and receivers, gets
    the remote mesh assigned and is then handed to salvus_flow.

    :param event: Name of event
    :type event: str
    :param verbose: Print a header before the simulation is built,
        defaults to False
    :type verbose: bool, optional
    """
    project = self.comm.project
    salvus_flow = self.comm.salvus_flow

    # Nothing to do if the simulation is on its way already
    submitted, _ = self.__submitted_retrieved(event, sim_type="forward")
    if submitted:
        return

    # With a prepare_forward job the dict is assumed to be created remotely
    simulation_created_remotely = bool(project.prepare_forward)

    if verbose:
        self.print(
            "Run forward simulation",
            line_above=True,
            emoji_alias=":rocket:",
        )
        self.print(f"Event: {event}")

    if not simulation_created_remotely:
        receivers = salvus_flow.get_receivers(event)
        source = salvus_flow.get_source_object(event)
        w = salvus_flow.construct_simulation(event, source, receivers)
    else:
        w = salvus_flow.construct_simulation_from_dict(event)

    already_interpolated = project.meshes == "multi-mesh"

    # Validation events use the average model, except in an iteration
    # whose name contains "00000"
    validation = bool(
        project.is_validation_event(event)
        and project.use_model_averaging
        and "00000" not in project.current_iteration
    )

    remote_mesh = self.comm.lasif.find_remote_mesh(
        event=event,
        gradient=False,
        interpolate_to=False,
        hpc_cluster=get_site(project.interpolation_site),
        validation=validation,
        already_interpolated=already_interpolated,
    )
    # Point the simulation at the mesh on the remote
    w.set_mesh("REMOTE:" + str(remote_mesh))

    salvus_flow.submit_job(
        event=event,
        simulation=w,
        sim_type="forward",
        site=project.site_name,
        ranks=project.ranks,
    )
    self.print(f"Submitted forward job for event: {event}")
def submit_job(
    self,
    event: str,
    simulation: object,
    sim_type: str,
    site="daint",
    ranks=1024,
):
    """
    Submit a job with some information. Salvus flow returns an object
    which can be used to interact with the job; its name is stored in the
    project so that the job can be found again later.

    If the site the job is submitted to is a local site, this call blocks
    until the job has finished.

    :param event: Name of event
    :type event: str
    :param simulation: Simulation object constructed beforehand
    :type simulation: object
    :param sim_type: Type of simulation, forward or adjoint. Other values
        are submitted as well, but no job name is recorded for them.
    :type sim_type: str
    :param site: Name of site in salvus flow config file,
        defaults to "daint"
    :type site: str, optional
    :param ranks: How many cores to run on. (A multiple of 12 on daint),
        defaults to 1024
    :type ranks: int, optional
    """
    project = self.comm.project

    # Adjoint simulation takes longer and seems to be less predictable
    # we thus give it a longer wall time.
    wall_time = project.wall_time
    if sim_type == "adjoint":
        wall_time = wall_time * 1.5

    start_submit = time.time()
    # Not called "job": that name refers to a module elsewhere in this file.
    submitted_job = sapi.run_async(
        site_name=site,
        input_file=simulation,
        ranks=ranks,
        wall_time_in_seconds=wall_time,
    )
    end_submit = time.time()
    self.print(
        f"Submitting took {end_submit - start_submit:.3f} seconds",
        emoji_alias=":hourglass:",
        color="magenta",
    )

    if sim_type in ("forward", "adjoint"):
        project.change_attribute(
            f'{sim_type}_job["{event}"]["name"]', submitted_job.job_name
        )
        project.change_attribute(
            f'{sim_type}_job["{event}"]["submitted"]', True
        )
    project.update_iteration_toml()

    # Inspect the site the job was actually submitted to, so that the
    # decision to wait also holds when `site` differs from the project's
    # default site.
    hpc_cluster = sapi.get_site(site)
    if hpc_cluster.config["site_type"] == "local":
        self.print(f"Running {sim_type} simulation...")
        submitted_job.wait(
            poll_interval_in_seconds=project.sleep_time_in_s
        )
def run_remote_smoother(
    self,
    event: str,
):
    """
    Run the Smoother, the settings are specified in inversion toml. Make
    sure that the smoothing config has already been generated.

    One diffusion simulation per inversion parameter is built, using the
    gradient of the adjoint job of the event as initial values, and all of
    them are submitted as one job array to the smoothing site. The name of
    the job array is stored in the project.

    :param event: Name of event
    :type event: str
    :raises NotImplementedError: If a non-velocity parameter needs
        smoothing but neither VP nor VPV is an inversion parameter
    """
    from salvus.opt import smoothing
    import salvus.flow.simple_config as sc
    from salvus.flow.api import get_site
    from salvus.flow import api as sapi

    project = self.comm.project
    multi_mesh = project.meshes == "multi-mesh"
    if multi_mesh:
        mesh = self.comm.lasif.find_event_mesh(event)
    else:
        mesh = self.comm.lasif.get_simulation_mesh(event)
    freq = 1.0 / project.min_period
    smoothing_lengths = project.smoothing_lengths

    # get remote gradient filename
    adjoint_job = self.comm.salvus_flow.get_job(event, "adjoint")
    output_files = adjoint_job.get_output_files()
    remote_grad = str(
        output_files[0][("adjoint", "gradient", "output_filename")]
    )

    # make site stuff (hardcoded for now)
    daint = get_site(project.site_name)
    username = daint.config["ssh_settings"]["username"]
    remote_diff_dir = os.path.join(
        "/scratch/snx3000", username, "diff_models"
    )
    local_diff_model_dir = "DIFF_MODELS"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(local_diff_model_dir, exist_ok=True)
    if not daint.remote_exists(remote_diff_dir):
        daint.remote_mkdir(remote_diff_dir)

    # Loop invariants. Parameters that are not velocities are smoothed
    # with respect to a P velocity, as in get_sims_for_smoothing_task;
    # previously such a parameter could silently reuse the velocity of the
    # preceding parameter or hit an unbound variable.
    inversion_params = project.inversion_params
    if "VP" in inversion_params:
        p_velocity = "VP"
    elif "VPV" in inversion_params:
        p_velocity = "VPV"
    else:
        p_velocity = None
    unique_id = (
        "_".join(str(i).replace(".", "") for i in smoothing_lengths)
        + "_"
        + str(project.min_period)
    )
    tensor_order = 4 if multi_mesh else 2

    sims = []
    for param in inversion_params:
        if param.startswith("V"):
            reference_velocity = param
        elif p_velocity is not None:
            reference_velocity = p_velocity
        else:
            raise NotImplementedError(
                "Inversionson always expects "
                "to get models with at least VP"
            )

        diff_model_name = unique_id + f"diff_model_{param}.h5"
        if multi_mesh:
            # Every event has its own mesh, so also its own diffusion
            # model.
            diff_model_name = event + "_" + diff_model_name

        remote_diff_model = os.path.join(remote_diff_dir, diff_model_name)
        diff_model_file = os.path.join(local_diff_model_dir, diff_model_name)

        # Diffusion models are cached locally and on the remote.
        if not os.path.exists(diff_model_file):
            smooth = smoothing.AnisotropicModelDependent(
                reference_frequency_in_hertz=freq,
                smoothing_lengths_in_wavelengths=smoothing_lengths,
                reference_model=mesh,
                reference_velocity=reference_velocity,
            )
            diff_model = smooth.get_diffusion_model(mesh)
            diff_model.write_h5(diff_model_file)
        if not daint.remote_exists(remote_diff_model):
            daint.remote_put(diff_model_file, remote_diff_model)

        sim = sc.simulation.Diffusion(mesh=diff_model_file)
        sim.domain.polynomial_order = tensor_order
        # As in get_sims_for_smoothing_task: with "auto" the time step is
        # left unset on the simulation.
        if project.smoothing_timestep != "auto":
            sim.physics.diffusion_equation.time_step_in_seconds = (
                project.smoothing_timestep
            )
        sim.physics.diffusion_equation.courant_number = 0.06
        sim.physics.diffusion_equation.initial_values.filename = (
            "REMOTE:" + remote_grad
        )
        sim.physics.diffusion_equation.initial_values.format = "hdf5"
        sim.physics.diffusion_equation.initial_values.field = f"{param}"
        sim.physics.diffusion_equation.final_values.filename = f"{param}.h5"

        sim.domain.mesh.filename = "REMOTE:" + remote_diff_model
        sim.domain.model.filename = "REMOTE:" + remote_diff_model
        sim.domain.geometry.filename = "REMOTE:" + remote_diff_model
        sim.validate()

        sims.append(sim)

    smoothing_job = sapi.run_many_async(
        input_files=sims,
        site_name=project.smoothing_site_name,
        ranks_per_job=project.smoothing_ranks,
        wall_time_in_seconds_per_job=project.smoothing_wall_time,
    )

    # In mini-batch mode the smoothing job is tracked per event.
    if project.inversion_mode == "mini-batch":
        project.change_attribute(
            f'smoothing_job["{event}"]["name"]',
            smoothing_job.job_array_name,
        )
        project.change_attribute(
            f'smoothing_job["{event}"]["submitted"]', True
        )
    else:
        project.change_attribute(
            'smoothing_job["name"]', smoothing_job.job_array_name
        )
        project.change_attribute('smoothing_job["submitted"]', True)
def find_remote_mesh(
    self,
    event: str,
    gradient: bool = False,
    interpolate_to: bool = True,
    check_if_exists: bool = False,
    hpc_cluster=None,
    iteration: str = None,
    already_interpolated: bool = False,
    validation: bool = False,
) -> pathlib.Path:
    """
    Find the path to the relevant mesh on the hpc cluster

    :param event: Name of event
    :type event: str
    :param gradient: Is it a gradient? If not, it's a model,
        defaults to False
    :type gradient: bool, optional
    :param interpolate_to: Mesh to interpolate to?, defaults to True
    :type interpolate_to: bool, optional
    :param check_if_exists: Check if the file exists?, defaults to False
    :type check_if_exists: bool, optional
    :param hpc_cluster: you can pass the site object. Defaults to None,
        in which case the interpolation site of the project is used
    :type hpc_cluster: salvus.flow.Site, optional
    :param iteration: Name of an iteration, defaults to None, which
        selects the current iteration of the project
    :type iteration: str, optional
    :param already_interpolated: If mesh has been interpolated, we find it
        in the interpolation job folder, defaults to False
    :type already_interpolated: bool, optional
    :param validation: For a model that is neither interpolated nor
        interpolated to: return the mesh in AVERAGE_MODELS instead of the
        one in MODELS, defaults to False
    :type validation: bool, optional
    :raises InversionsonError: If check_if_exists is set and the mesh is
        not on the cluster (for the standard gradient mesh: still not
        there after trying to move it to the cluster)
    :return: The path to the correct mesh
    :rtype: pathlib.Path
    """
    if hpc_cluster is None:
        hpc_cluster = get_site(self.comm.project.interpolation_site)
    remote_mesh_dir = pathlib.Path(self.comm.project.remote_mesh_dir)
    fast_dir = pathlib.Path(self.comm.project.remote_inversionson_dir)
    if iteration is None:
        iteration = self.comm.project.current_iteration

    if gradient:
        if interpolate_to:
            # Built from fast_dir so that a string-valued
            # remote_inversionson_dir works as well.
            mesh = fast_dir / "MESHES" / "standard_gradient" / "mesh.h5"
        else:
            output = self.comm.salvus_flow.get_job_file_paths(
                event=event, sim_type="adjoint"
            )
            mesh = output[0][("adjoint", "gradient", "output_filename")]
    elif already_interpolated:
        job = self.comm.salvus_flow.get_job(
            event=event,
            sim_type="prepare_forward",
            iteration=iteration,
        )
        mesh = job.stdout_path.parent / "output" / "mesh.h5"
    elif interpolate_to:
        mesh = remote_mesh_dir / "meshes" / event / "mesh.h5"
    elif validation:
        mesh = fast_dir / "AVERAGE_MODELS" / iteration / "mesh.h5"
    else:
        mesh = fast_dir / "MODELS" / iteration / "mesh.h5"

    if check_if_exists and not hpc_cluster.remote_exists(mesh):
        now_available = False
        if gradient and interpolate_to:
            # The standard gradient mesh can be moved to the cluster on
            # demand; only give up if it is still missing afterwards.
            self._move_mesh_to_cluster(
                event=None, gradient=gradient, hpc_cluster=hpc_cluster
            )
            now_available = hpc_cluster.remote_exists(mesh)
        if not now_available:
            raise InversionsonError(
                f"Mesh for event {event} does not exist"
            )
    return mesh
def prepare_iteration(
    self,
    it_name: str,
    events: List[str] = None,
):
    """
    Prepare iteration: register it in the project and in lasif, upload the
    model of the optimizer to the cluster and, when it is time for
    validation with model averaging, also write and upload the average
    model. Finally the source time function is uploaded.

    :param it_name: Name of iteration
    :type it_name: "str", optional
    :param events: Pass a list of events if you want them to be
        predefined, defaults to None
    :type events: List[str], optional
    :raises InversionsonError: If lasif already has an iteration with
        that name
    """
    project = self.comm.project
    lasif = self.comm.lasif

    project.change_attribute("current_iteration", it_name)
    print("Preparing iteration for", it_name)
    if lasif.has_iteration(it_name):
        raise InversionsonError(f"Iteration {it_name} already exists")

    lasif.set_up_iteration(it_name, events)
    project.create_iteration_toml(it_name)
    project.get_iteration_attributes()
    model = project.get_optimizer().model_path

    # WIP no average models being uploaded yet.
    remote_mesh_file = (
        project.remote_inversionson_dir / "MODELS" / it_name / "mesh.h5"
    )
    hpc_cluster = get_site(project.site_name)
    model_dir = remote_mesh_file.parent
    if not hpc_cluster.remote_exists(model_dir):
        # NOTE(review): the two parent directories created here live
        # under remote_mesh_dir, while the model is uploaded below
        # remote_inversionson_dir - confirm that this is intended.
        if not hpc_cluster.remote_exists(project.remote_mesh_dir):
            hpc_cluster.remote_mkdir(project.remote_mesh_dir)
        if not hpc_cluster.remote_exists(project.remote_mesh_dir / "MODELS"):
            hpc_cluster.remote_mkdir(project.remote_mesh_dir / "MODELS")
        hpc_cluster.remote_mkdir(model_dir)
    self.print(
        f"Moving mesh to {project.interpolation_site}",
        emoji_alias=":package:",
    )
    hpc_cluster.remote_put(model, remote_mesh_file)

    needs_average_model = (
        self.time_for_validation()
        and project.use_model_averaging
        and self.iteration_number > 0
    )
    if needs_average_model:
        # NOTE(review): uploaded below remote_mesh_dir; verify against the
        # location where the average model is looked up later.
        remote_avg_mesh_file = (
            project.remote_mesh_dir / "AVERAGE_MODELS" / it_name / "mesh.h5"
        )
        # this enters when the iteration number is 4
        print("writing average validation model")
        # 4 - 5 + 1 = 0
        starting_it_number = (
            self.iteration_number - project.val_it_interval + 1
        )
        self.write_average_model(starting_it_number, self.iteration_number)
        self.print(
            f"Moving average_model to {project.interpolation_site}",
            emoji_alias=":package:",
        )
        avg_dir = remote_avg_mesh_file.parent
        if not hpc_cluster.remote_exists(avg_dir):
            hpc_cluster.remote_mkdir(avg_dir)
        average_model = self.get_average_model_name(
            starting_it_number, self.iteration_number
        )
        hpc_cluster.remote_put(average_model, remote_avg_mesh_file)

    lasif.upload_stf(iteration=it_name)
def sum_gradients(
    self,
    events,
    output_location,
    batch_average=True,
    sum_vpv_vph=True,
    store_norms=True,
):
    """
    Sum gradients on the HPC.

    The paths of the event gradients are collected, written to a toml
    file together with the summing options, and handed to the summing
    script that is run on the cluster through ssh. The summed gradient is
    downloaded afterwards.

    :param events: List of events to be summed.
    :type events: list
    :param output_location: local file path for the end result
    :type output_location: str
    :param batch_average: Average the summed gradients
    :type batch_average: bool
    :param sum_vpv_vph: sum vpv and vph
    :type sum_vpv_vph: bool
    :param store_norms: Store the gradient norms that are computed while
        summing.
    :type store_norms: bool
    """
    import shlex

    project = self.comm.project
    iteration = project.current_iteration
    multi_mesh = project.meshes == "multi-mesh"

    gradient_paths = []
    for event in events:
        if multi_mesh:
            # Interpolated gradients live in the output folder of the
            # interpolation job.
            interp_job = self.comm.salvus_flow.get_job(
                event, "gradient_interp"
            )
            gradient_path = os.path.join(
                str(interp_job.stderr_path.parent), "output/mesh.h5"
            )
        else:
            adjoint_job = self.comm.salvus_flow.get_job(event, "adjoint")
            output_files = adjoint_job.get_output_files()
            gradient_path = output_files[0][
                ("adjoint", "gradient", "output_filename")
            ]
        gradient_paths.append(str(gradient_path))

    # Connect to the cluster
    hpc_cluster = get_site(project.site_name)

    remote_summing_dir = os.path.join(
        project.remote_inversionson_dir, "SUMMING"
    )
    if not hpc_cluster.remote_exists(remote_summing_dir):
        hpc_cluster.remote_mkdir(remote_summing_dir)

    remote_output_path = os.path.join(
        remote_summing_dir, "summed_gradient.h5"
    )
    remote_norms_path = os.path.join(
        remote_summing_dir, f"{iteration}_gradient_norms.toml"
    )

    # copy summing script to hpc
    remote_script = os.path.join(remote_summing_dir, "gradient_summing.py")
    if not hpc_cluster.remote_exists(remote_script):
        hpc_cluster.remote_put(SUM_GRADIENTS_SCRIPT_PATH, remote_script)

    info = dict(
        filenames=gradient_paths,
        parameters=project.inversion_params,
        output_gradient=remote_output_path,
        events_list=events,
        gradient_norms_path=remote_norms_path,
        batch_average=batch_average,
    )

    # Write the toml locally, copy it to the HPC and always remove the
    # local copy again, also when writing or uploading fails.
    toml_filename = "gradient_sum.toml"
    remote_toml = os.path.join(remote_summing_dir, toml_filename)
    try:
        with open(toml_filename, "w") as fh:
            toml.dump(info, fh)
        hpc_cluster.remote_put(toml_filename, remote_toml)
    finally:
        if os.path.exists(toml_filename):
            os.remove(toml_filename)

    # Call script; paths are quoted because they end up in a shell command
    self.print("Remote summing of gradients started...")
    hpc_cluster.run_ssh_command(
        f"python {shlex.quote(remote_script)} {shlex.quote(remote_toml)}"
    )
    self.print("Remote summing completed...")

    if store_norms:
        norm_dict_toml = self.optimizer.gradient_norm_path
        hpc_cluster.remote_get(remote_norms_path, norm_dict_toml)
        all_norms_path = os.path.join(
            self.optimizer.gradient_norm_dir, "all_norms.toml"
        )
        # Merge the norms of this iteration into the collection of all
        # norms, overwriting entries of events that were seen before.
        if os.path.exists(all_norms_path):
            norm_dict = toml.load(all_norms_path)
        else:
            norm_dict = {}
        for event_name, norm in toml.load(norm_dict_toml).items():
            norm_dict[event_name] = float(norm)

        with open(all_norms_path, "w") as fh:
            toml.dump(norm_dict, fh)

    hpc_cluster.remote_get(remote_output_path, output_location)

    # Only sum the raw gradient in AdamOpt, not the update
    if sum_vpv_vph:
        sum_two_parameters_h5(output_location, ["VPV", "VPH"])
def move_files_to_cluster(self):
    """
    Set up the remote inversionson directory tree and upload what the
    remote jobs need: the bathymetry file (multi-mesh with ocean loading
    only), the topography file (if used), the interpolation script and,
    for multi-mesh, the gradient mesh.
    """
    project = self.comm.project
    hpc_cluster = get_site(project.site_name)

    def ensure_remote_dir(path):
        # Create the remote directory unless it is there already.
        if not hpc_cluster.remote_exists(path):
            hpc_cluster.remote_mkdir(path)

    def upload_unless_present(settings, already_there_message):
        # Upload settings["file"] to settings["remote_path"], or only
        # report that the remote file exists.
        if hpc_cluster.remote_exists(settings["remote_path"]):
            self.print(
                already_there_message,
                emoji_alias=":white_check_mark:",
            )
            return
        ensure_remote_dir(Path(settings["remote_path"]).parent)
        hpc_cluster.remote_put(settings["file"], settings["remote_path"])

    ensure_remote_dir(project.remote_inversionson_dir)
    sub_directories = (
        "DIFFUSION_MODELS",
        "SOURCE_TIME_FUNCTIONS",
        "INTERPOLATION_WEIGHTS",
        "MESHES",
        "ADJOINT_SOURCES",
        "PROCESSED_DATA",
        "SCRIPTS",
    )
    for directory in sub_directories:
        ensure_remote_dir(project.remote_inversionson_dir / directory)

    if project.ocean_loading["use"] and project.meshes == "multi-mesh":
        upload_unless_present(
            project.ocean_loading,
            "Remote Bathymetry file is already uploaded",
        )

    if project.topography["use"]:
        upload_unless_present(
            project.topography,
            "Remote Topography file is already uploaded",
        )

    # The interpolation script is uploaded on every call.
    remote_interp_path = self.comm.multi_mesh.find_interpolation_script()
    hpc_cluster.remote_put(INTERPOLATION_SCRIPT_PATH, remote_interp_path)

    if project.meshes == "multi-mesh":
        self.comm.lasif.move_gradient_to_cluster(
            hpc_cluster=hpc_cluster, overwrite=False
        )