def record_postexecute(self, execution_time=None):
    """
    Record runtime information after the tracked task has completed, e.g., after
    the `execute_analysis(...)` function of a standard analysis.

    The function may be overwritten in child classes to add recording of additional
    runtime information. When overwriting the function we should call
    super(..., self).runinfo_record_postexecute(execution_time) in the custom version
    to ensure that the execution and end_time are properly recorded.

    :param execution_time: The total time it took to execute the analysis. May be None,
        in which case the function will attempt to compute the execution time based on
        the start_time (if available) and the current time.
    """
    log_helper.debug(__name__, 'Recording post-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
    # Finalize recording of post execution provenance: end time first, so that the
    # derived execution_time below can use it
    self['end_time'] = unicode(datetime.datetime.now())
    if execution_time is not None:
        # Caller supplied the timing explicitly; store it verbatim
        self['execution_time'] = unicode(execution_time)
    elif 'start_time' in self:
        # Derive the duration from the recorded start/end timestamps
        start_time = run_info_dict.string_to_time(self['start_time'])
        stop_time = run_info_dict.string_to_time(self['end_time'])
        self['execution_time'] = unicode(stop_time - start_time)
        # TODO: This only gives execution time in full seconds right now
    else:
        # No explicit time and no start_time recorded; nothing we can compute
        self['execution_time'] = None
    # Attempt to record psutil data (best effort only; psutil is optional)
    try:
        import psutil
        process = psutil.Process()
        self['memory_info_after'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except:
        # NOTE(review): deliberately broad -- provenance recording must never abort the analysis
        warnings.warn("Recording of psutil-based runtime information failed: "+str(sys.exc_info()))
    # Record the time and use profiling data if possible
    if self.__time_and_use_profiler is not None:
        self.__time_and_use_profiler.disable()
        self.__time_and_use_profiler.create_stats()
        self['profile'] = unicode(self.__time_and_use_profiler.stats)
        # Save the summary statistics for the profiling data, sorted by cumulative time
        stats_io = StringIO.StringIO()
        profiler_stats = pstats.Stats(self.__time_and_use_profiler, stream=stats_io).sort_stats('cumulative')
        profiler_stats.print_stats()
        self['profile_stats'] = stats_io.getvalue()
    # Record the memory profiling data if possible
    if self.__memory_profiler is not None and self.get_profile_memory():
        log_helper.debug(__name__, 'Recording memory profiling data', root=self.mpi_root, comm=self.mpi_comm)
        mem_stats_io = StringIO.StringIO()
        memory_profiler.show_results(self.__memory_profiler, stream=mem_stats_io)
        self['profile_mem'] = unicode(self.__memory_profiler.code_map)
        self['profile_mem_stats'] = mem_stats_io.getvalue()
def get_additional_analysis_dependencies(self):
    """
    Compute the set of analyses that the tasks in this list depend on but that
    are not themselves part of the list of tasks.

    :return: analysis_task_list of all analysis dependencies
    """
    from omsi.dataformat.omsi_file.common import omsi_file_common
    from omsi.analysis.base import analysis_base

    extra_tasks = analysis_task_list()
    for task in self:
        for dep_param in task.get_all_dependency_data():
            dep_target = dep_param['data']['omsi_object']
            if isinstance(dep_target, analysis_base):
                # Analysis-type dependency: record it if it is not already scheduled
                if dep_target not in self:
                    extra_tasks.add(dep_target)
            elif isinstance(dep_target, omsi_file_common):
                # Dependencies on data files do not need to be executed
                continue
            else:
                log_helper.warning(__name__,
                                   'Unknown dependency object type that cannot be processed by workflow.' +
                                   str(dep_param))
    return extra_tasks
def enable_profile_time_and_usage(self, enable=True):
    """
    Enable/disable time and usage profiling

    :param enable: boolean to enable (True) or disable (False) time and usage profiling
    """
    if PROFILE_AVAILABLE:
        # Only log when the setting actually changes
        if enable != self.__profile_time_and_usage:
            message = "Enabled time and usage profiling. " if enable else "Disabled time and usage profiling. "
            log_helper.debug(__name__, message, root=self.mpi_root, comm=self.mpi_comm)
        self.__profile_time_and_usage = enable
    else:
        # Profiling support is missing; force the flag off and warn if it was requested
        self.__profile_time_and_usage = False
        if enable:
            log_helper.warning(__name__,
                               'Profiling of time and usage not available.' +
                               ' Missing profile and/or pstats package')
def enable_profile_memory(self, enable=True):
    """
    Enable/disable profiling of memory usage

    :param enable: boolean to enable (True) or disable (False) memory profiling
    """
    if PROFILE_MEMORY_AVAILABLE:
        # Only log when the setting actually changes
        if enable != self.__profile_memory:
            message = "Enabled memory profiling. " if enable else "Disabled memory profiling. "
            log_helper.debug(__name__, message, root=self.mpi_root, comm=self.mpi_comm)
        self.__profile_memory = enable
    else:
        # Memory profiling support is missing; force the flag off and warn if requested
        self.__profile_memory = False
        if enable:
            log_helper.warning(__name__,
                               'Profiling of memory usage not available.' +
                               ' Missing memory_profiler or StringIO package')
def main(self):
    """
    Execute the analysis workflow.

    Runs all analyses greedily: any analysis whose dependencies are satisfied is
    executed, repeating until the workflow is complete. Runtime information is
    recorded via ``self.run_info`` before and after execution.

    FIX: previously, when tasks remained but none were ready to execute
    (unresolved dependencies), the loop logged a warning and spun forever.
    We now break out of the loop in that case, matching the behavior of the
    newer executor implementation.
    """
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty")
        return
    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow")
    log_helper.info(__name__, "Adding all dependencies")
    self.add_analysis_dependencies()
    # Record the runtime information
    log_helper.debug(__name__, "Recording runtime information")
    self.run_info.clear()
    self.run_info.record_preexecute()
    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow")
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                analysis.execute()
        # Check if there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.")
            break
        if num_tasks > 0 and num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.")
            # BUGFIX: abort instead of looping forever -- no task can ever become ready
            break
        iterations += 1
        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')
    # Record the runtime information after we are done with the workflow
    self.run_info.record_postexecute()
    self.run_info.gather()
def main(self):
    """
    Execute the analysis workflow.

    Performs an optional MPI barrier, resolves dependencies, then greedily runs
    every analysis whose dependencies are satisfied until no tasks remain.

    FIX: previously, when tasks remained but none were ready to execute
    (unresolved dependencies), the loop logged a warning and spun forever.
    We now break out of the loop in that case.
    """
    # Do the optional MPI barrier
    if self['synchronize']:
        mpi_helper.barrier(comm=self.mpi_comm)
    # Check if we have anything to do at all
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
        return
    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
    log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
    self.add_analysis_dependencies()
    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                 root=self.mpi_root, comm=self.mpi_comm)
                analysis.execute()
                if self['reduce_memory_usage']:
                    analysis.clear_and_restore()
        # Check if there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
            break
        if num_tasks > 0 and num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.",
                               root=self.mpi_root, comm=self.mpi_comm)
            # BUGFIX: abort instead of looping forever -- no task can ever become ready
            break
        iterations += 1
        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
def get_files_from_dir(cls, dirname):
    """
    Get a list of all basenames of all img files in a given directory.
    Note: The basenames include the dirname.
    """
    matches = []
    for entry in os.listdir(dirname):
        full_path = os.path.join(dirname, entry)
        base_path, _ = os.path.splitext(full_path)
        # Only regular files with an .imzml extension (case-insensitive) qualify
        if not (os.path.isfile(full_path) and full_path.lower().endswith(".imzml")):
            continue
        if os.path.isfile(base_path + '.ibd'):
            matches.append(full_path)
        else:
            log_helper.warning(__name__,
                               'Could not find binary .ibd file for file %s . Skipping conversion of this file.'
                               % full_path)
    return matches
def main(self):
    """
    Execute the analysis workflow.

    Resolves dependencies, records runtime information, and then greedily runs
    every analysis whose dependencies are satisfied until no tasks remain.

    FIX: previously, when tasks remained but none were ready to execute
    (unresolved dependencies), the loop logged a warning and spun forever.
    We now break out of the loop in that case.
    """
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty")
        return
    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow")
    log_helper.info(__name__, "Adding all dependencies")
    self.add_analysis_dependencies()
    # Record the runtime information
    log_helper.debug(__name__, "Recording runtime information")
    self.run_info.clear()
    self.run_info.record_preexecute()
    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow")
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                analysis.execute()
        # Check if there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.")
            break
        if num_tasks > 0 and num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.")
            # BUGFIX: abort instead of looping forever -- no task can ever become ready
            break
        iterations += 1
        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')
    # Record the runtime information after we are done with the workflow
    self.run_info.record_postexecute()
    self.run_info.gather()
def read_from_omsi_file(self, analysis_object, load_data=True, load_parameters=True, load_runtime_data=True, dependencies_omsi_format=True, ignore_type_conflict=False): """ See `omsi.analysis.analysis_base.read_from_omsi_file(...)` for details. The function is overwritten here mainly to initialize the self.real_analysis_type instance variable but otherwise uses the default behavior. """ # Attempt to add all analysis parameters to avoid warnings when setting the parameters during # the data load process, when we would set parameters that are not defined yet try: parameter_list = analysis_object.get_all_parameter_data( load_data=False, exclude_dependencies=False) for param in parameter_list: # Ignore the profiling parameters as they are added by the analysis base class already if param['name'] in [ 'profile_time_and_usage', 'profile_memory' ]: continue self.add_parameter(name=param['name'], help=param['help'], dtype=param['dtype']) except: log_helper.warning(__name__, "Could not generate all parameters.") # Load the data as usual output_val = super(analysis_generic, self).read_from_omsi_file( analysis_object=analysis_object, load_data=load_data, load_parameters=load_parameters, load_runtime_data=load_runtime_data, dependencies_omsi_format=dependencies_omsi_format, ignore_type_conflict=ignore_type_conflict) # Fill in the list of output names self.data_names = [ dat['name'] for dat in self._analysis_base__data_list ] # Load the real data type. self.real_analysis_type = unicode( analysis_object.get_analysis_type()[:]) # Return the output data return output_val
def get_files_from_dir(cls, dirname):
    """
    Get a list of all basenames of all img files in a given directory.
    Note: The basenames include the dirname.

    FIX: the extension check is now case-insensitive (previously only the
    exact spelling ".imzML" matched, silently skipping files named e.g.
    ".imzml" or ".IMZML"), consistent with the sibling implementation that
    uses ``currname.lower().endswith(".imzml")``.
    """
    filelist = []
    for l in os.listdir(dirname):
        currname = os.path.join(dirname, l)
        filename_only, extension = os.path.splitext(currname)
        # Case-insensitive match so .imzML / .imzml / .IMZML are all picked up
        if os.path.isfile(currname) and currname.lower().endswith(".imzml"):
            # The imzML format stores spectra in a companion binary .ibd file;
            # without it the file cannot be converted
            if os.path.isfile(filename_only + '.ibd'):
                filelist.append(currname)
            else:
                log_helper.warning(
                    __name__,
                    'Could not find binary .ibd file for file %s . Skipping conversion of this file.'
                    % currname)
    return filelist
def enable_profile_time_and_usage(self, enable=True):
    """
    Enable/disable time and usage profiling

    :param enable: boolean to enable (True) or disable (False) time and usage profiling
    """
    if not PROFILE_AVAILABLE:
        # Profiling support is missing; force the flag off and warn if it was requested
        self.__profile_time_and_usage = False
        if enable:
            log_helper.warning(__name__,
                               'Profiling of time and usage not available.' +
                               ' Missing profile and/or pstats package')
        return
    previously_enabled = self.__profile_time_and_usage
    if previously_enabled and not enable:
        log_helper.debug(__name__, "Disabled time and usage profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    elif enable and not previously_enabled:
        log_helper.debug(__name__, "Enabled time and usage profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    self.__profile_time_and_usage = enable
def enable_profile_memory(self, enable=True):
    """
    Enable/disable profiling of memory usage

    :param enable: boolean to enable (True) or disable (False) memory profiling
    """
    if not PROFILE_MEMORY_AVAILABLE:
        # Memory profiling support is missing; force the flag off and warn if requested
        self.__profile_memory = False
        if enable:
            log_helper.warning(__name__,
                               'Profiling of memory usage not available.' +
                               ' Missing memory_profiler or StringIO package')
        return
    previously_enabled = self.__profile_memory
    if previously_enabled and not enable:
        log_helper.debug(__name__, "Disabled memory profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    elif enable and not previously_enabled:
        log_helper.debug(__name__, "Enabled memory profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    self.__profile_memory = enable
def read_from_omsi_file(self, analysis_object, load_data=True, load_parameters=True, load_runtime_data=True,
                        dependencies_omsi_format=True, ignore_type_conflict=False):
    """
    See `omsi.analysis.analysis_base.read_from_omsi_file(...)` for details. The function is
    overwritten here mainly to initialize the self.real_analysis_type instance variable but
    otherwise uses the default behavior.
    """
    # Pre-register all analysis parameters so that setting them during the load
    # does not trigger warnings about parameters that are not defined yet
    try:
        for param_spec in analysis_object.get_all_parameter_data(load_data=False,
                                                                 exclude_dependencies=False):
            # The profiling parameters are added by the analysis base class already
            if param_spec['name'] in ('profile_time_and_usage', 'profile_memory'):
                continue
            self.add_parameter(name=param_spec['name'],
                               help=param_spec['help'],
                               dtype=param_spec['dtype'])
    except:
        log_helper.warning(__name__, "Could not generate all parameters.")
    # Perform the standard load via the base-class implementation
    result = super(analysis_generic, self).read_from_omsi_file(
        analysis_object=analysis_object,
        load_data=load_data,
        load_parameters=load_parameters,
        load_runtime_data=load_runtime_data,
        dependencies_omsi_format=dependencies_omsi_format,
        ignore_type_conflict=ignore_type_conflict)
    # Fill in the list of output names from the base class's private data list
    output_names = []
    for data_entry in self._analysis_base__data_list:
        output_names.append(data_entry['name'])
    self.data_names = output_names
    # Remember the analysis type of the real underlying analysis
    self.real_analysis_type = unicode(analysis_object.get_analysis_type()[:])
    return result
def get_additional_analysis_dependencies(self):
    """
    Compute a list of all dependencies of the current list of analyses, excluding
    analyses that are already part of the list of tasks.

    :return: analysis_task_list of all analysis dependencies
    """
    from omsi.dataformat.omsi_file.common import omsi_file_common
    from omsi.analysis.base import analysis_base

    missing = analysis_task_list()
    for current_analysis in self:
        for dependency in current_analysis.get_all_dependency_data():
            source_object = dependency['data']['omsi_object']
            # File-based dependencies never need to be executed
            if isinstance(source_object, omsi_file_common):
                continue
            if not isinstance(source_object, analysis_base):
                log_helper.warning(__name__,
                                   'Unknown dependency object type that cannot be processed by workflow.' +
                                   str(dependency))
                continue
            # Analysis dependency that is not already scheduled
            if source_object not in self:
                missing.add(source_object)
    return missing
def main(self):
    """
    Execute the analysis workflow.

    Greedy execution loop: run every analysis whose dependencies are satisfied,
    repeat until nothing is waiting. If tasks remain but none are ready, the
    workflow stops and registers itself with the blocking tasks so that it is
    restarted once their outputs become available (see
    omsi.analysis.analysis_base.outputs_ready(...)).
    """
    # Do the optional MPI barrier
    if self['synchronize']:
        mpi_helper.barrier(comm=self.mpi_comm)
    # Check if we have anything to do at all
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
        return
    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
    log_helper.debug(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
    self.add_analysis_dependencies()
    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
    all_analyses = self.get_analyses()
    iterations = 0
    continue_running = True
    while continue_running:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                 root=self.mpi_root, comm=self.mpi_comm)
                analysis.execute()
                if self['reduce_memory_usage']:
                    # Free analysis outputs from memory; they will be restored on demand
                    analysis.clear_and_restore()
        # Check if there is any other tasks that we need to execute now
        num_tasks_completed, num_tasks_waiting, num_tasks_ready, num_tasks_blocked = \
            all_analyses.task_status_stats()
        if num_tasks_waiting == 0:
            log_helper.info(__name__, "Completed executing the workflow.",
                            root=self.mpi_root, comm=self.mpi_comm)
            continue_running = False
        if num_tasks_waiting > 0 and num_tasks_ready == 0:
            # The workflow is blocked: suspend it and ask the blocking tasks to resume it later
            blocking_tasks = all_analyses.get_blocking_tasks()
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks_waiting) +
                               " remain in the queue but cannot be completed due to unresolved dependencies." +
                               " The workflow will be restarted once the outputs of the blocking tasks are ready." +
                               " Blocking tasks are: " + str(blocking_tasks),
                               root=self.mpi_root, comm=self.mpi_comm)
            # Tell all blocking tasks that they should continue the workflow once they are ready
            # This happens in omsi.analysis.analysis_base.outputs_ready(...) function
            for block_task in blocking_tasks:
                block_task.continue_workflow_when_ready(self)
            # NOTE: if self['reduce_memory_usage'] is True then prior analyses were cleared, i.e.,
            # they will be re-executed when the workflow is restarted. It is, therefore, not recommended
            # to use the reduce_memory_usage option when performing interactive tasks.
            continue_running = False
        iterations += 1
        # All analyses are done, so we no longer need to continue any analyses when we are done
        if num_tasks_blocked == 0:
            for analysis in all_analyses:
                analysis.continue_analysis_when_ready = False
    log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
def __populate_analysis__(cls, analysis_group, analysis):
    """
    Populate the given h5py group with the analysis data.

    NOTE: This is a private helper function. Use the corresponding create_analysis function
    of omsi_file_experiment to create a completely new analysis.

    NOTE: At this point we assume that all in-memory dependencies have been resolved. If not, then
    the raw data associated with the given parameter will be saved instead.

    :param analysis_group: h5py group in which the analysis data should be stored.
    :param analysis: Instance of omsi.analysis.analysis_base defining the analysis
    :type analysis: omsi.analysis.analysis_base:

    :returns: The omsi_file_analysis object for the newly created analysis group. The analysis data is
              automatically written to file by this function so no additional work is required.
    """
    from omsi.datastructures.analysis_data import analysis_data
    from omsi.dataformat.omsi_file.dependencies import omsi_file_dependencies
    from omsi.analysis.base import analysis_base
    # 1. Write the analysis identifier (stored as a 1-element string dataset)
    analysis_identifier_data = analysis_group.require_dataset(
        name=unicode(omsi_format_analysis.analysis_identifier),
        shape=(1,),
        dtype=omsi_format_common.str_type)
    if omsi_format_common.str_type_unicode:
        analysis_identifier_data[0] = analysis.get_analysis_identifier()
    else:
        analysis_identifier_data[0] = str(analysis.get_analysis_identifier())
    # 2. Write the analysis type
    analysis_type_data = analysis_group.require_dataset(name=unicode(omsi_format_analysis.analysis_type),
                                                        shape=(1,),
                                                        dtype=omsi_format_common.str_type)
    if omsi_format_common.str_type_unicode:
        analysis_type_data[0] = analysis.get_analysis_type()
    else:
        analysis_type_data[0] = str(analysis.get_analysis_type())
    # 3. Write the analysis data. Prefer the analysis's own writer; fall back to
    #    the generic per-dataset writer if the analysis does not implement one.
    try:
        analysis.write_analysis_data(analysis_group=analysis_group)
    except NotImplementedError:
        for ana_data in analysis.get_all_analysis_data():
            cls.__write_omsi_analysis_data__(analysis_group, ana_data)
    # 4. Determine all dependencies and parameters that we need to write
    dependencies = []  # [dep['data'] for dep in analysis.get_all_dependency_data()]
    parameters = []
    # 4.1 Resolve in-memory dependencies if possible
    for dependent_parameter in analysis.get_all_dependency_data():
        # 4.1.1 We have an in-memory dependency
        if isinstance(dependent_parameter['data']['omsi_object'], analysis_base):
            # 4.1.1.1 We can resolve the dependency to an object in an HDF5 file
            if dependent_parameter['data']['omsi_object'].has_omsi_analysis_storage():
                # Create a new dependency that points to the appropriate file location
                # NOTE: We do not modify the dependency in the analysis object that we save
                # but we only change it for the purpose of storage
                new_dep = dependent_parameter['data'].copy()
                new_dep_omsi_object = None
                # Check if we can find an analysis data store within the same parent (or at least file)
                parent_filename = os.path.abspath(analysis_group.file.filename)
                for analysis_store in dependent_parameter['data']['omsi_object'].get_omsi_analysis_storage():
                    analysis_store_filename = os.path.abspath(analysis_store.managed_group.file.filename)
                    # Same parent group and same file is the best match -- stop searching
                    if analysis_store.name == analysis_group.parent.name and \
                            analysis_store_filename == parent_filename:
                        new_dep_omsi_object = analysis_store
                        break
                    # Same file (but different parent) is acceptable -- keep looking for a better match
                    elif analysis_store_filename == parent_filename:
                        new_dep_omsi_object = analysis_store
                # We could not find a prior data store within the same file so use one from another file
                if new_dep_omsi_object is None:
                    dep_object = dependent_parameter['data']['omsi_object']
                    new_dep['omsi_object'] = dep_object.get_omsi_analysis_storage()[0]
                else:
                    new_dep['omsi_object'] = new_dep_omsi_object
                # Append it to the list of dependencies
                dependencies.append(new_dep)
            # 4.1.1.2 We cannot resolve the dependency and need to store it as a parameter instead
            else:
                # Replace the dependency with the actual data and save it as a parameter instead
                new_param = dependent_parameter.copy()
                new_param['data'] = new_param['data'].get_data()
                parameters.append(new_param)
        # 4.1.2 We have a file-based dependency so keep it as is and add it to the list of dependencies
        else:
            dependencies.append(dependent_parameter['data'])
    # 4.2 Add all regular parameters to the list of parameters
    parameters += analysis.get_all_parameter_data(exclude_dependencies=True)
    # 5. Write all the parameters (only those that are required, explicitly set, or have a default)
    parameter_group = analysis_group.require_group(omsi_format_analysis.analysis_parameter_group)
    for param_data in parameters:
        if param_data['required'] or param_data.data_set() or param_data['default'] is not None:
            temp_data = param_data.get_data_or_default()
            if temp_data is not None:
                anadata = analysis_data(name=param_data['name'],
                                        data=param_data.get_data_or_default(),
                                        dtype=param_data['dtype'])
                cls.__write_omsi_analysis_data__(parameter_group, anadata)
                # Try to add the help string attribute
                try:
                    help_attr = omsi_format_analysis.analysis_parameter_help_attr
                    parameter_group[param_data['name']].attrs[help_attr] = param_data['help']
                except KeyError:
                    pass
    # 6. Write all the runtime execution information
    runinfo_group = analysis_group.require_group(omsi_format_analysis.analysis_runinfo_group)
    for run_info_key, run_info_value in analysis.get_all_run_info().items():
        # Generate an analysis_data object in order to use the
        # __write_omsi_analysis_data function to write the data
        if isinstance(run_info_value, unicode) or isinstance(run_info_value, str):
            anadata = analysis_data(name=unicode(run_info_key),
                                    data=run_info_value,
                                    dtype=omsi_format_common.str_type)
        else:
            dat = np.asarray(run_info_value)
            # Promote scalars to 1-element arrays so HDF5 can store them
            if len(dat.shape) == 0:
                dat = np.asarray([run_info_value])
            anadata = analysis_data(name=unicode(run_info_key),
                                    data=dat,
                                    dtype=dat.dtype)
        cls.__write_omsi_analysis_data__(runinfo_group, anadata)
    # 7. Write all dependencies
    omsi_file_dependencies.__create__(parent_group=analysis_group,
                                      dependencies_data_list=dependencies)
    # 8. Execute the custom data write for the analysis
    analysis.add_custom_data_to_omsi_file(analysis_group)
    # 9. Create the output object
    re = omsi_file_analysis(analysis_group)
    # 10. Save the output object in the list of omsi analysis data stores as part of the analysis object
    analysis.omsi_analysis_storage.append(re)
    # 11. Check if we need to pickle and save the analysis class in case this is a custom class
    #     that is not part of BASTet (analysis_name_to_class raises NameError for unknown types)
    try:
        from omsi.analysis.analysis_views import analysis_views
        _ = analysis_views.analysis_name_to_class(analysis.get_analysis_type())
    except NameError:
        class_pickle = cloudpickle.dumps(analysis.__class__)
        # Convert the pickle string to an uint8 array to avoid problems
        # with storing string with NULL characters in HDF5
        class_pickle_arr = np.fromstring(class_pickle,
                                         dtype=omsi_format_analysis.analysis_class_pickle_np_dtype)
        analysis_group[unicode(omsi_format_analysis.analysis_class)] = class_pickle_arr
    except:
        # Best effort only: failure to pickle the class must not invalidate the saved analysis
        log_helper.warning(__name__, "Could not save the analysis class.")
        pass
    # 12. Return the new omsi_file_analysis object
    return re
def record_preexecute(self):
    """
    Record basic runtime information in this dict before the execution is started.

    Function used to record runtime information prior to executing the process we want to
    track, e.g., the `execute_analysis(...)` of a standard analysis. The function may be
    overwritten in child classes to add recording of additional runtime information.

    All runtime data should be recorded in the main dict (i.e, self). This ensures in the
    case of standard analysis that the data is stored in the HDF5 file. Other data should
    be stored in separate variables that we may add to the object.

    When overwriting the function we should typically call
    super(..., self).runinfo_record_pretexecute() last in the custom version to ensure that
    the start_time is properly recorded right before the execution of the analysis.
    """
    log_helper.debug(__name__, 'Recording pre-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
    # Record basic runtime environment information using the platform module
    try:
        self['architecture'] = unicode(platform.architecture())
        self['java_ver'] = unicode(platform.java_ver())
        self['libc_ver'] = unicode(platform.libc_ver())
        self['linux_distribution'] = unicode(platform.linux_distribution())
        self['mac_ver'] = unicode(platform.mac_ver())
        self['machine'] = unicode(platform.machine())
        self['node'] = unicode(platform.node())
        self['platform'] = unicode(platform.platform())
        self['processor'] = unicode(platform.processor())
        self['python_branch'] = unicode(platform.python_branch())
        self['python_build'] = unicode(platform.python_build())
        self['python_compiler'] = unicode(platform.python_compiler())
        self['python_implementation'] = unicode(platform.python_implementation())
        self['python_revision'] = unicode(platform.python_revision())
        self['python_version'] = unicode(platform.python_version())
        self['release'] = unicode(platform.release())
        self['system'] = unicode(platform.system())
        self['uname'] = unicode(platform.uname())
        self['version'] = unicode(platform.version())
        self['win32_ver'] = unicode(platform.win32_ver())
    except:
        # NOTE(review): deliberately broad -- provenance recording must never abort the run
        warnings.warn("WARNING: Recording of platform provenance failed: " + str(sys.exc_info()))
    # Attempt to record the svn version information (best effort only)
    try:
        import subprocess
        self['svn_ver'] = subprocess.check_output('svnversion').rstrip('\n')
    except ImportError:
        log_helper.warning(__name__, 'Recording of svn version not possible. subprocess not installed',
                           root=self.mpi_root, comm=self.mpi_comm)
    except:
        warnings.warn("Recording of svn version information failed: "+str(sys.exc_info()))
    # Attempt to record software library version (best effort only)
    try:
        import numpy as np
        self['numpy_version_full_version'] = unicode(np.version.full_version)
        self['numpy_version_release'] = unicode(np.version.release)
        self['numpy_version_git_revision'] = unicode(np.version.git_revision)
    except ImportError:
        log_helper.warning(__name__, 'Recording of numpy version not possible.',
                           root=self.mpi_root, comm=self.mpi_comm)
    # Attempt to record psutil data (best effort only; psutil is optional)
    try:
        import psutil
        self['logical_cpu_count'] = unicode(psutil.cpu_count())
        self['cpu_count'] = unicode(psutil.cpu_count(logical=False))
        process = psutil.Process()
        self['open_files'] = unicode(process.open_files())
        self['memory_info_before'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except:
        warnings.warn("Recording of psutil-based runtime information failed: "+str(sys.exc_info()))
    # Record the start time for the analysis. This is done last so the timestamp
    # is as close as possible to the actual start of the execution.
    self['start_time'] = unicode(datetime.datetime.now())
    # Enable time and usage profiling if requested
    if self.__profile_time_and_usage:
        self.__time_and_use_profiler = Profile()
        self.__time_and_use_profiler.enable()
process = Popen('%s -c "from mpi4py import MPI as mpi"'%( \ sys.executable), shell=True, stderr=PIPE, stdout=PIPE) import_failed = process.wait() return not import_failed MPI_AVAILABLE = test_mpi_available() if MPI_AVAILABLE: from mpi4py import MPI except ImportError: MPI_AVAILABLE = False if not MPI_AVAILABLE: try: from omsi.shared.log import log_helper log_helper.warning(__name__, "MPI not available. Running in serial.") except: print "MPI not available. Running in serial." import numpy as np import itertools import warnings import time import os class parallel_over_axes(object): """ Helper class used to parallelize the execution of a function using MPI by splitting the input data into sub-blocks along a given set of axes.
process = Popen('%s -c "from mpi4py import MPI as mpi"'%( \ sys.executable), shell=True, stderr=PIPE, stdout=PIPE) import_failed = process.wait() return not import_failed MPI_AVAILABLE = test_mpi_available() if MPI_AVAILABLE: from mpi4py import MPI except ImportError: MPI_AVAILABLE = False if not MPI_AVAILABLE: try: from omsi.shared.log import log_helper log_helper.warning(__name__, "MPI not available. Running in serial.") except: print "MPI not available. Running in serial." import numpy as np import itertools import warnings import time class parallel_over_axes(object): """ Helper class used to parallelize the execution of a function using MPI by splitting the input data into sub-blocks along a given set of axes. :ivar task_function: The function we should run.
def add_and_parse_workflow_arguments(self):
    """
    Add all workflow-specific command line arguments to the parser, parse the
    command line, and distribute the parsed values to the workflow executor and
    its analysis tasks.

    The function assumes that the command line parser has been set up using
    initialize_argument_parser(..). Command-line arguments that are specific to the
    command line driver itself (profiling, output, log level, script) are removed,
    so that only arguments that can be consumed by the analyses are handed on.

    *Side effects:* The function sets ``self.analysis_arguments`` and
    ``self.workflow_executor_arguments`` and updates the analysis parameters of the
    analyses stored in ``self.workflow_executor.analysis_tasks``. It also registers
    new argument groups in ``self.custom_argument_groups`` and may rename the
    workflow executor identifier to avoid collisions with analysis identifiers.
    """
    # Ensure that we have a workflow executor instantiated. This should usually not happen.
    if self.workflow_executor is None:
        log_helper.warning(__name__,
                           "Late creation of the workflow executor in add_and_parse_workflow_arguments",
                           root=self.mpi_root, comm=self.mpi_comm)
        self.create_workflow_executor_object()
    # Ensure that all analysis identifiers are set to a defined value
    self.workflow_executor.analysis_tasks.set_undefined_analysis_identifiers()
    # Ensure that all analysis identifiers are unique (colliding ones are renamed)
    if not self.workflow_executor.analysis_tasks.analysis_identifiers_unique():
        log_helper.warning(__name__,
                           "The workflow contains multiple analyses with the same user-defined " +
                           "identifier. Colliding identifiers will be modified to ensure uniqueness",
                           root=self.mpi_root, comm=self.mpi_comm)
        self.workflow_executor.analysis_tasks.make_analysis_identifiers_unique(self)
    # Ensure that the prefix of the workflow executor does not interfere with the prefix of an analysis
    all_analysis_identifiers = self.workflow_executor.analysis_tasks.get_all_analysis_identifiers()
    if self.workflow_executor.workflow_identifier in all_analysis_identifiers:
        log_helper.warning(__name__,
                           "The identifier of the workflow executor collides with the identifier " +
                           "of an analysis. Updating the identifier of the workflow executor to be unique",
                           root=self.mpi_root, comm=self.mpi_comm)
        # Append underscores until the executor identifier no longer collides
        while self.workflow_executor.workflow_identifier in all_analysis_identifiers:
            self.workflow_executor.workflow_identifier += '_'
    # Add all arguments from our workflow executor
    target_seperator = self.identifier_argname_seperator
    if self.workflow_executor is not None:
        for analysis in self.workflow_executor.analysis_tasks:
            # Create the argument group for the analysis in general
            analysis_group = self.parser.add_argument_group(
                title=analysis.get_analysis_identifier() + " : " + analysis.get_analysis_type())
            arg_group_name = analysis.get_analysis_identifier() + target_seperator + analysis.get_analysis_type()
            self.custom_argument_groups[arg_group_name] = analysis_group
            # Create parser groups for all named argument groups of the analysis
            analysis_arg_groups = {}
            arg_group_dict = {arg_param.get_group_name(): arg_param.get_group_description()
                              for arg_param in analysis.get_all_parameter_data()
                              if arg_param.get_group_name() is not None}
            for group_name, group_description in arg_group_dict.iteritems():
                ana_arg_group_name = arg_group_name + ":" + group_name
                analysis_arg_groups[group_name] = self.parser.add_argument_group(
                    title=ana_arg_group_name, description=group_description)
                self.custom_argument_groups[ana_arg_group_name] = analysis_arg_groups[group_name]
            # Add all undefined parameters of the current analysis
            for arg_param in analysis.get_all_parameter_data():
                # Only expose parameters that have not already been set (e.g. by a script)
                if not arg_param.data_set():
                    # Add the parameter to the argument parser
                    arg_name = "--" + analysis.get_analysis_identifier() + \
                        self.identifier_argname_seperator + arg_param['name']
                    arg_action = 'store'
                    arg_default = arg_param['default']
                    # A parameter is only required on the command line if it has no default
                    arg_required = arg_param['required'] and (arg_default is None)
                    arg_type = arg_param['dtype']
                    arg_choices = arg_param['choices']
                    arg_help = arg_param['help']
                    arg_dest = analysis.get_analysis_identifier() + target_seperator + arg_param['name']
                    arg_group = arg_param.get_group_name()
                    # Determine the parser group the argument belongs to: required args
                    # go to the shared required group, grouped args to their own group,
                    # everything else to the per-analysis group
                    argument_group = self.required_argument_group if arg_required else analysis_group
                    if arg_group in analysis_arg_groups:
                        argument_group = analysis_arg_groups[arg_group]
                    # Add the argument to the proper group.
                    # nargs / const / metavar are deliberately left at their defaults.
                    argument_group.add_argument(arg_name,            # <-- Required, user specified arg name
                                                action=arg_action,   # Constant. We define this, not the user.
                                                default=arg_default, # <-- Optional default value of the argument
                                                type=arg_type,       # <-- Optional dtype of the argument
                                                choices=arg_choices, # <-- Optional. Key may be missing.
                                                required=arg_required,
                                                help=arg_help,
                                                dest=arg_dest)       # Destination determined by the name
    # Add the arguments of the workflow executor
    if 'workflow_executor' not in self.custom_argument_groups:
        workflow_executor_group = self.parser.add_argument_group(
            title='optional workflow executor options',
            description='Additional, optional settings for the workflow execution controls')
        self.custom_argument_groups['workflow_executor'] = workflow_executor_group
    else:
        log_helper.warning(__name__,
                           'The workflow exectutor parser group already exists. ' +
                           'Workflow options are added to the main parser instead',
                           root=self.mpi_root, comm=self.mpi_comm)
        workflow_executor_group = self.parser
    for arg_param in self.workflow_executor.get_all_parameter_data():
        # Add the executor parameter to the argument parser (same pattern as above)
        arg_name = "--" + self.workflow_executor.workflow_identifier + target_seperator + arg_param['name']
        arg_action = 'store'
        arg_default = arg_param['default']
        arg_required = arg_param['required'] and (arg_default is None)
        arg_type = arg_param['dtype']
        arg_choices = arg_param['choices']
        arg_help = arg_param['help']
        arg_dest = self.workflow_executor.workflow_identifier + target_seperator + arg_param['name']
        argument_group = self.required_argument_group if arg_required else workflow_executor_group
        # Add the argument to the proper group
        argument_group.add_argument(arg_name,
                                    action=arg_action,
                                    default=arg_default,
                                    type=arg_type,
                                    choices=arg_choices,
                                    required=arg_required,
                                    help=arg_help,
                                    dest=arg_dest)
    # Add the help argument
    self.optional_argument_group.add_argument('-h', '--help',
                                              action='help',
                                              default=argparse.SUPPRESS,
                                              help='show this help message and exit')
    # Remove the arguments from this driver that cannot be understood by the analysis
    parsed_arguments = vars(self.parser.parse_args())
    parsed_arguments.pop(self.profile_arg_name, None)
    parsed_arguments.pop(self.output_save_arg_name, None)
    parsed_arguments.pop(self.profile_mem_arg_name, None)
    parsed_arguments.pop(self.log_level_arg_name, None)
    parsed_arguments.pop(self.script_arg_name, None)
    # Consume the command line arguments for the workflow executor
    self.workflow_executor_arguments = {}
    for arg_param in self.workflow_executor.get_all_parameter_data():
        arg_dest = self.workflow_executor.workflow_identifier + target_seperator + arg_param['name']
        if arg_dest in parsed_arguments:
            param_value = parsed_arguments.pop(arg_dest)
            self.workflow_executor[arg_param['name']] = param_value
            self.workflow_executor_arguments[arg_param['name']] = param_value
    # Consume the arguments for the different analyses. Remaining keys are of the
    # form "<analysis_identifier><seperator><param_name>"
    self.analysis_arguments = parsed_arguments
    for arg_key, arg_value in self.analysis_arguments.iteritems():
        ana_identifier, arg_key = arg_key.split(target_seperator)
        self.workflow_executor.analysis_tasks[ana_identifier][arg_key] = arg_value
    # Make sure we use the user-specified log level, even if it is set differently in the scripts
    if self.user_log_level is not None:
        log_helper.set_log_level(level=log_helper.log_levels[self.user_log_level])
def main(self):
    """
    Default main function for running a workflow from the command line.

    The default implementation exposes all specified analysis parameters as command
    line options to the user. The default implementation also provides means to
    print a help text for the function.

    :raises: ValueError is raised in case that the workflow executor is missing
        (e.g., no ``--script`` was provided).
    """
    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()
    try:
        # Parse the command line arguments to determine the command line driver settings
        self.parse_cl_arguments()
    except:
        # Best-effort cleanup of any partially-created output before re-raising
        self.remove_output_target()
        raise
    if self.workflow_executor is None:
        self.remove_output_target()
        # NOTE(review): 'worfklow_executor' / 'initalized' are typos in these
        # runtime strings; left unchanged here to keep behavior identical.
        log_helper.error(__name__, 'Missing --script parameter or worfklow_executor object')
        raise ValueError('Workflow not initalized')
    # Add and parse the command line arguments specific to the analyses
    try:
        self.add_and_parse_workflow_arguments()
    except:
        self.remove_output_target()
        raise
    # Print the analysis settings (only once, on the root rank)
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()
    # Enable time and usage profiling
    try:
        # Enable time and usage profiling if requested
        if self.profile_analyses:
            try:
                self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(self.profile_analyses)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of time and usage not available due to missing packages.")
                log_helper.warning(__name__, e.message)
        # Enable memory profiling if requested
        if self.profile_analyses_mem:
            try:
                self.workflow_executor.analysis_tasks.enable_memory_profiling(self.profile_analyses_mem)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of memory usage not available due to missing packages")
                log_helper.warning(__name__, e.message)
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise
    # Execute the analysis workflow
    try:
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                         root=self.mpi_root, comm=self.mpi_comm)
        self.workflow_executor.execute()
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise
    # Finalize the saving of results on rank our mpi root rank. NOTE: When running in serial
    # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
    # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() == self.mpi_root:
        # Print usage profiles if available
        try:
            self.print_time_and_usage_profiles()
        except:
            log_helper.error(__name__, "An error occured while trying to print time and usage profiles",
                             root=self.mpi_root, comm=self.mpi_comm)
        # Print memory profile data if available
        try:
            self.print_memory_profiles()
        except:
            log_helper.error(__name__, "An error occured while trying to print memory profiles",
                             root=self.mpi_root, comm=self.mpi_comm)
        # Print the time it took to run the analysis
        try:
            # Parallel case: execution_time is a list with one entry per process
            if isinstance(self.workflow_executor.run_info['execution_time'], list):
                # Time for each task to execute
                log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                str(self.workflow_executor.run_info['execution_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Start times of each task
                log_helper.info(__name__, "Time when each of the processes started: " +
                                str(self.workflow_executor.run_info['start_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Stop times for each task
                log_helper.info(__name__, "Time when each of the processes finished: " +
                                str(self.workflow_executor.run_info['end_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Compile the time-to-execute summary (max with min/mean in parentheses)
                exec_time_array = np.asarray(self.workflow_executor.run_info['execution_time'], dtype=float)
                max_exec_time = str(exec_time_array.max())
                min_exec_time = str(exec_time_array.min())
                mean_exec_time = str(exec_time_array.mean())
                exec_time_string = max_exec_time + " s " + \
                    " ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
            # Serial case: We only have a single time to worry about
            else:
                exec_time_string = str(self.workflow_executor.run_info['execution_time']) + " s"
            log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                            root=self.mpi_root, comm=self.mpi_comm)
        except:
            raise
    # Save the analyses to file
    if self.output_target is not None:
        from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
        for analysis in self.workflow_executor.analysis_tasks:
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=analysis)
from omsi.analysis.base import analysis_base from omsi.datastructures.analysis_data import data_dtypes from omsi.datastructures.dependency_data import dependency_dict from omsi.datastructures.analysis_data import parameter_data from omsi.shared.log import log_helper try: import cloudpickle # Use the version of cloud-pickle installed on the system log_helper.debug(__name__, "Using system cloudpickle module") except ImportError: try: import omsi.shared.third_party.cloudpickle as cloudpickle log_helper.debug(__name__, "Using fallback cloudpickle version") except ImportError: log_helper.warning(__name__, "cloudpickle could not be imported. Using standard pickle instead. " + " Some features may not be available.") import pickle as cloudpickle import numpy as np def bastet_analysis(output_names=None, parameter_specs=None, name_key="undefined"): """ Decorator used to wrap a function and replace it with an analysis_generic object that behaves like a function but adds the ability for saving the analysis to file and tracking provenance This is essentially the same as analysis_generic.from_function(....). :param func: The function to be wrapped :param output_names: Optional list of strings with the names of the outputs :param parameter_specs: Optional list of omsi.datastructures.analysis_data.parameter_data with
def main(self):
    """
    Default main function for running an analysis from the command line.

    The default implementation exposes all specified analysis parameters as command
    line options to the user. The default implementation also provides means to
    print a help text for the function.

    :raises: ValueError is raised in case that the analysis class is unknown
        or not a subclass of analysis_base.
    """
    # Get the analysis class from the command line if needed
    if self.add_analysis_class_arg:
        try:
            self.get_analysis_class_from_cl()
        except (ImportError, AttributeError, ValueError):
            # Failure is handled below via the self.analysis_class checks
            pass
    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()
    # Check if we have a valid analysis class
    if self.analysis_class is None:
        # NOTE(review): parser.print_help() prints and returns None, so this
        # `print` statement additionally emits a stray "None" line — consider
        # calling self.parser.print_help() without `print`.
        print self.parser.print_help()
        raise ValueError('Could not determine the analysis class.')
    if not issubclass(self.analysis_class, analysis_base):
        print self.parser.print_help()
        raise ValueError('Analysis class is not a subclass of analysis_base.')
    try:
        # Parse the command line arguments to determine the command line driver settings
        self.parse_cl_arguments()
        # Add and parse the command line arguments specific to the analysis
        self.add_and_parse_analysis_arguments()
    except:
        # Best-effort cleanup of any partially-created output before re-raising
        self.remove_output_target()
        raise
    # Print the analysis settings (only once, on the root rank)
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()
    # Call the execute function of the analysis
    try:
        # Create the analysis object
        analysis_object = self.analysis_class()
        # Enable time and usage profiling if requested
        if self.profile_analysis:
            try:
                analysis_object.enable_time_and_usage_profiling(self.profile_analysis)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of time and usage not available due to missing packages.")
                log_helper.warning(__name__, e.message)
        # Enable memory profiling if requested
        if self.profile_analysis_mem:
            try:
                analysis_object.enable_memory_profiling(self.profile_analysis_mem)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of memory usage not available due to missing packages")
                log_helper.warning(__name__, e.message)
        # Execute the analysis
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments))
        analysis_object.execute(**self.analysis_arguments)
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise
    # Finalize the saving of results on rank our mpi root rank. NOTE: When running in serial
    # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
    # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() == self.mpi_root:
        # Print the profiling results of time and usage
        if self.profile_analysis:
            print ""
            print "PROFILING DATA: TIME AND USAGE"
            print ""
            analysis_object.get_profile_stats_object(consolidate=True).print_stats()
        # Print the profiling results for memory usage
        if self.profile_analysis_mem:
            print ""
            print "PROFILING DATA: MEMORY"
            print ""
            print analysis_object.get_memory_profile_info()
        # Print the time it took to run the analysis
        try:
            # Parallel case: execution_time is a list with one entry per process
            if isinstance(analysis_object.run_info['execution_time'], list):
                # Time for each task to execute
                log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                str(analysis_object.run_info['execution_time']))
                # Start times of each task
                log_helper.info(__name__, "Time when each of the processes started: " +
                                str(analysis_object.run_info['start_time']))
                # Stop times for each task
                log_helper.info(__name__, "Time when each of the processes finished: " +
                                str(analysis_object.run_info['end_time']))
                # Compile the time-to-execute summary (max with min/mean in parentheses)
                exec_time_array = np.asarray(analysis_object.run_info['execution_time'], dtype=float)
                max_exec_time = str(exec_time_array.max())
                min_exec_time = str(exec_time_array.min())
                mean_exec_time = str(exec_time_array.mean())
                exec_time_string = max_exec_time + " s " + \
                    " ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
            # Serial case: We only have a single time to worry about
            else:
                exec_time_string = str(analysis_object.run_info['execution_time']) + " s"
            log_helper.info(__name__, "Time to execute analysis: " + exec_time_string)
        except:
            raise
    # Save the analysis to file
    if self.output_target is not None:
        from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
        omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                     analysis=analysis_object)
def __compute_file_info(cls, filename, resolution):
    # TODO completely refactor this to make it smartly handle profile or centroid datasets
    # TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    # TODO: profile datasets should work as is
    # TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    """
    Internal helper function used to compute the mz axis, data type for the
    intensities, format type, and metadata for the given imzML file.

    :param filename: Name (and path) of the imzML file to inspect.
    :param resolution: Requested m/z resolution used to construct the log-spaced
        m/z axis when the file is in processed mode (per-spectrum m/z axes).

    :return: Tuple of:
        * numpy array with the pixel coordinates
        * numpy array with the (possibly re-interpolated) m/z axis
        * string with the numpy dtype of the intensity values
        * imzml file type (one of cls.available_imzml_types)
        * dataset_metadata, instrument_metadata, method_metadata metadata_dict objects
    """
    reader = ImzMLParser(filename)
    # Read the first spectrum; its m/z axis serves as the reference axis
    mz_axes, intens = reader.getspectrum(0)   # NOTE: mz_axes is a tuple
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)
    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str
    # Assume a continuous (shared m/z axis) file until a spectrum disagrees
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):
        mz, intens = reader.getspectrum(ind)
        if mz != mz_axes:
            # At least one spectrum has its own m/z axis -> processed mode.
            # Track the global m/z range across all spectra.
            file_type = cls.available_imzml_types['processed']
            if min_mz > np.amin(mz):
                min_mz = np.amin(mz)
            if max_mz < np.amax(mz):
                max_mz = np.amax(mz)
    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        # Number of bins so that neighboring bins differ by roughly `resolution`
        # in relative (1e6 * d ln m) units.
        # BUGFIX: np.logspace requires an integer sample count; the original code
        # passed the float returned by np.ceil directly, which modern numpy rejects.
        num_samples = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_samples)
        log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")
    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    for k, v in reader.imzmldict.iteritems():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)
    # Delete the parser before re-reading the file header
    del reader
    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run > group in the XML file to avoid going throught the whole file again
    # while extracting the majority of the relevant metadata
    try:
        with open(filename, 'r') as ins:
            metdata_header = ''
            for line in ins:
                if '<run' in line:
                    break
                else:
                    metdata_header += line
            metdata_header += '</mzML>'
        metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
        for k, v in metdata_header_dict.iteritems():
            store_value = metadata_value(name=k,
                                         value=v,
                                         unit=None,
                                         description=str(k) + " extracted from imzML XML header.",
                                         ontology=None)
            # Route each header section to the appropriate metadata collection
            if k == 'instrumentConfigurationList':
                instrument_metadata[k] = store_value
            elif k in ('dataProcessingList', 'softwareList', 'sampleList'):
                method_metadata[k] = store_value
            else:
                # 'scanSettingsList' and any unrecognized sections go to the dataset
                dataset_metadata[k] = store_value
        dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
            name='imzml_xml_metadata_header',
            value=metdata_header,
            unit=None,
            description='XML imzML header',
            ontology=None)
    except:
        # Metadata extraction is best-effort; the core information is returned regardless
        log_helper.warning(__name__, "Extraction of additional imzML metadata failed")
    return coordinates, np.asarray(mz_axes), dtype, file_type, \
        dataset_metadata, instrument_metadata, method_metadata
def s_read_acqu(filename):
    """Parse a Bruker acqu file into a metadata dictionary.

    :param filename: String with the name+path for the acqu file.
    :type filename: string

    :returns: Dictionary with the parsed metadata information. Keys are the
        "##..."-prefixed variable names; values are ints, floats, strings, or
        lists of numbers/strings for vector-valued entries.
    """
    # Read the complete acqu file (use a context manager so the file is always closed)
    with open(filename, 'r') as acqu:
        lines = acqu.readlines()
    #
    # Parse the acqu file and store all data in a python dictonary
    #
    acqu_dict = {}
    curr_line = 0
    while curr_line < len(lines):
        # Skip lines with no data
        if len(lines[curr_line].rstrip("\n").rstrip("\r").rstrip(" ")) == 0:
            curr_line += 1
            continue
        # All variables should start with ## in the acqu file
        if not lines[curr_line].startswith("##"):
            log_helper.warning(__name__, "WARNING: Error while reading line" + str(curr_line) +
                               " of the acqu file. The error may have occured on the previous line")
            if curr_line > 0:
                log_helper.debug(__name__, str(curr_line - 1) + ": " + lines[curr_line - 1])
            # BUGFIX: pass __name__ as the module argument, consistent with every
            # other log_helper call (the original omitted it, so the message text
            # was passed where the module name is expected).
            log_helper.debug(__name__, str(curr_line) + ": " + lines[curr_line])
            curr_line += 1
            continue
        sl = lines[curr_line].split("=")
        key = sl[0]
        # Remove beginning spaces and endline and tabs at the end from the value
        value = sl[1].lstrip(' ').rstrip("\n").rstrip("\r").rstrip(" ")
        # Try to convert the value to a number (int first, then float)
        is_number = False
        try:
            value = int(value)
            is_number = True
        except ValueError:
            try:
                value = float(value)
                is_number = True
            except ValueError:
                pass
        # Check whether the entry defines a vector of numbers, e.g. "(0..7)",
        # in which case the values follow on subsequent lines
        unicode_value = unicode(value)
        if not is_number and unicode_value.startswith("(") and unicode_value.endswith(")"):
            # How many values and lines do we need to read? "(low..high)" gives
            # high+1 values, written 8 per line.
            sv = unicode_value.lstrip("(").rstrip(")").split("..")
            high = int(sv[1])
            num_vals = high + 1
            vals_per_line = 8
            num_lines = int(math.ceil(num_vals / float(vals_per_line)))
            # Read all the values into a list and convert the numbers if possible
            value = []
            curr_line += 1
            for _ in range(0, num_lines):
                sl = lines[curr_line].rstrip("\n").rstrip("\r").rstrip(" ").split(" ")
                try:
                    sconv = [int(ix) for ix in sl]
                except ValueError:
                    try:
                        sconv = [float(ix) for ix in sl]
                    except ValueError:
                        # Keep the raw strings if conversion fails
                        sconv = sl
                value = value + sconv
                curr_line += 1
            acqu_dict[key] = value
        else:
            acqu_dict[key] = value
            curr_line += 1
    return acqu_dict
from omsi.analysis.base import analysis_base from omsi.datastructures.analysis_data import data_dtypes from omsi.datastructures.dependency_data import dependency_dict from omsi.datastructures.analysis_data import parameter_data from omsi.shared.log import log_helper try: import cloudpickle # Use the version of cloud-pickle installed on the system log_helper.debug(__name__, "Using system cloudpickle module") except ImportError: try: import omsi.shared.third_party.cloudpickle as cloudpickle log_helper.debug(__name__, "Using fallback cloudpickle version") except ImportError: log_helper.warning( __name__, "cloudpickle could not be imported. Using standard pickle instead. " + " Some features may not be available.") import pickle as cloudpickle import numpy as np def bastet_analysis(output_names=None, parameter_specs=None, name_key="undefined"): """ Decorator used to wrap a function and replace it with an analysis_generic object that behaves like a function but adds the ability for saving the analysis to file and tracking provenance This is essentially the same as analysis_generic.from_function(....).
def __init__(self, hdr_filename=None, t2m_filename=None, img_filename=None, basename=None, requires_slicing=True):
    """Open an img file for data reading.

    :param hdr_filename: The name of the hdr header file
    :type hdr_filename: string
    :param t2m_filename: The name of the t2m_filename
    :type t2m_filename: string
    :param img_filename: The name of the img data file
    :type img_filename: string
    :param basename: Instead of img_filename, t2m_filename, and hdr_filename one may also
        supply just a single basename. The basename is completed with the .img, .t2m, .hdr
        extension to load the data. A directory may also be given, in which case the first
        valid img file found in it is used.
    :type basename: string
    :param requires_slicing: Unused here. Slicing is always supported by this reader.
    :type requires_slicing: Boolean

    :raises ValueError: In case that basename and hdr_filename, t2m_filename, and
        img_filename are specified, or no valid file set can be located.
    """
    super(img_file, self).__init__(basename, requires_slicing)
    self.data_type = 'uint16'
    self.shape = [0, 0, 0]  # Number of pixels in x,y, and z. NOTE: Type changed to tuple later on.
    self.mz = 0             # A numpy vector with the m/z values of the instrument

    if basename and hdr_filename and t2m_filename and img_filename:
        raise ValueError("Conflicting input. Provide either basename or the " +
                         "hdr_filename,t2m_filename,img_filename parameters but not both.")
    if basename:
        basefile = basename
        if os.path.isdir(basename):
            # Pick the first valid img file found in the directory
            filelist = self.get_files_from_dir(basename)
            log_helper.log_var(__name__, filelist=filelist)
            if len(filelist) > 0:
                basefile = filelist[0]
            else:
                raise ValueError("No valid img file found in the given directory.")
        # BUGFIX: The original code removed the extension with str.rstrip(".img") etc.
        # rstrip strips a *character set* ({'.','i','m','g'}, ...), so basenames ending
        # in any of those letters were corrupted (e.g. "slim.img" -> "sl"). Use slicing
        # to remove exactly the 4-character extension.
        elif basefile.endswith(".img") and os.path.exists(basefile):
            basefile = basefile[:-4]
        elif basefile.endswith(".hdr") and os.path.exists(basefile):
            basefile = basefile[:-4]
        elif basefile.endswith(".t2m") and os.path.exists(basefile):
            basefile = basefile[:-4]
        log_helper.log_var(__name__, basefile=basefile)
        # All three companion files must exist for the basename to be valid
        if os.path.exists(basefile + ".hdr") and \
                os.path.exists(basefile + ".t2m") and \
                os.path.exists(basefile + ".img"):
            hdr_filename = basefile + ".hdr"
            t2m_filename = basefile + ".t2m"
            img_filename = basefile + ".img"
        else:
            raise ValueError("No valid img file found for the given basename.")
    elif hdr_filename and t2m_filename and img_filename:
        pass  # Nothing to be done
    else:
        raise ValueError("Missing input parameter. Either provide: " +
                         " i) basename or ii) hdr_filename, t2m_filename, img_filename")

    # Initialize the x and y length from the int16 header data.
    # (The original code also open()ed the hdr file without ever reading from the
    # handle; np.fromfile reads from the filename directly, so that was removed.)
    hdrdata = np.fromfile(file=hdr_filename, dtype='int16', count=-1)
    self.shape[0] = int(hdrdata[23])
    self.shape[1] = int(hdrdata[22])

    # Initialize the z length from the float32 m/z axis stored in the t2m file
    t2m = open(t2m_filename, 'rb')
    self.mz = np.fromfile(file=t2m, dtype='float32', count=-1)
    self.shape[2] = self.mz.shape[0]
    t2m.close()

    # Convert the shape variable to the expected tuple
    self.shape = tuple(self.shape)

    # Open the img file with the spectrum data
    self.img_filename = img_filename
    self.file_opened = False
    try:
        self.m_img_file = np.memmap(filename=self.img_filename,
                                    dtype=self.data_type,
                                    shape=self.shape,
                                    mode='r',
                                    order='C')
        self.file_opened = True
    except ValueError:
        # The memmap failed, most likely because the file is smaller than expected.
        # Check if the size of the file matches what we expect.
        imgsize = os.stat(self.img_filename).st_size
        itemsize = np.dtype(self.data_type).itemsize
        expectednumvalues = int(self.shape[0]) * int(self.shape[1]) * int(self.shape[2])
        expectedsize = expectednumvalues * int(itemsize)
        sizedifference = expectedsize - imgsize
        log_helper.warning(__name__, "IMG size: " + str(imgsize) + " Expected size: " +
                           str(expectedsize) + " (difference=" + str(sizedifference) + ")")
        if imgsize < expectedsize:
            # Check whether the missing data aligns with images or spectra
            slicesize = int(self.shape[0]) * int(self.shape[1]) * itemsize
            spectrumsize = int(self.shape[2]) * itemsize
            percentmissing = float(sizedifference) / float(expectedsize)
            valuesmissing = float(sizedifference) / itemsize
            warnings.warn("WARNING: Missing " + str(sizedifference) +
                          " bytes in img file (missing " + str(valuesmissing) +
                          " intensity values; " + str(percentmissing) + "%)." +
                          " Expected shape: " + str(self.shape))
            # Define how we should deal with the error
            expandslice = (sizedifference % slicesize) == 0
            expandspectra = (sizedifference % spectrumsize) == 0
            if not expandslice:
                expandspectra = True
            # Complete missing spectra by zero-padding the data to the expected size
            if expandspectra:
                warnings.warn("Dealing with missing data in img file by completing last spectra with 0's.")
                # TODO np.require create an in-memory copy of the full data. Allow usage of memmap'ed tempfile.
                tempmap = np.require(np.memmap(filename=self.img_filename,
                                               dtype=self.data_type,
                                               mode='r',
                                               order='C'),
                                     requirements=['O', 'C'])
                # Extend the memmap to the expected size
                tempmap.resize((expectednumvalues, ))
                # Reshape the memmap to the expected shape
                self.m_img_file = tempmap.reshape(self.shape, order='C')
                self.file_opened = True
            # Complete missing slices by shortening the m/z axis instead
            elif expandslice:
                # Integer division (// behaves identically for ints in Py2 and Py3)
                slicesmissing = sizedifference // slicesize
                self.mz = self.mz[:(-slicesmissing)]
                warnings.warn("Dealing with missing data in img file by updating he m/z axis.." +
                              " It looks like the m/z axis data may be inconsistent" +
                              " with the binary data. Removing " + str(slicesmissing) +
                              " bins from the m/z axis.")
                self.shape = list(self.shape)
                self.shape[2] = self.mz.shape[0]
                self.shape = tuple(self.shape)
                self.m_img_file = np.memmap(filename=self.img_filename,
                                            dtype=self.data_type,
                                            shape=self.shape,
                                            mode='r',
                                            order='C')
                self.file_opened = True
            else:
                raise
        else:
            raise
    except:
        log_helper.error(__name__, "Error while opening the img file: " + img_filename)
        raise
def record_postexecute(self, execution_time=None):
    """
    Record runtime information after the task we want to track has completed,
    e.g., the `execute_analysis(...)` function of a standard analysis.

    The function may be overwritten in child classes to add recording of
    additional runtime information. When overwriting the function we should call
    super(...,self).record_postexecute(execution_time) in the custom version to
    ensure that the execution_time and end_time are properly recorded.

    :param execution_time: The total time it took to execute the analysis.
        May be None, in which case the function will attempt to compute the
        execution time based on the start_time (if available) and the
        current time.
    """
    log_helper.debug(__name__, 'Recording post-execution runtime data',
                     root=self.mpi_root, comm=self.mpi_comm)
    # Finalize recording of post-execution provenance
    self['end_time'] = unicode(datetime.datetime.now())
    if execution_time is not None:
        self['execution_time'] = unicode(execution_time)
    elif 'start_time' in self:
        # Derive the execution time from the recorded start/end timestamps
        start_time = run_info_dict.string_to_time(self['start_time'])
        stop_time = run_info_dict.string_to_time(self['end_time'])
        self['execution_time'] = unicode(stop_time - start_time)
        # TODO: This only gives execution time in full seconds right now
    else:
        self['execution_time'] = None
    # Attempt to record psutil data (best-effort; psutil is optional)
    try:
        import psutil
        process = psutil.Process()
        self['memory_info_after'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__,
                           'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        # FIX: was a bare "except:", which also swallowed SystemExit/KeyboardInterrupt
        warnings.warn("Recording of psutil-based runtime information failed: " + str(sys.exc_info()))
    # Record the time-and-usage profiling data if the profiler was enabled
    if self.__time_and_use_profiler is not None:
        self.__time_and_use_profiler.disable()
        self.__time_and_use_profiler.create_stats()
        self['profile'] = unicode(self.__time_and_use_profiler.stats)
        # Save the summary statistics for the profiling data
        stats_io = StringIO.StringIO()
        profiler_stats = pstats.Stats(self.__time_and_use_profiler,
                                      stream=stats_io).sort_stats('cumulative')
        profiler_stats.print_stats()
        self['profile_stats'] = stats_io.getvalue()
    # Record the memory profiling data if possible
    if self.__memory_profiler is not None and self.get_profile_memory():
        log_helper.debug(__name__, 'Recording memory profiling data',
                         root=self.mpi_root, comm=self.mpi_comm)
        mem_stats_io = StringIO.StringIO()
        memory_profiler.show_results(self.__memory_profiler, stream=mem_stats_io)
        self['profile_mem'] = unicode(self.__memory_profiler.code_map)
        self['profile_mem_stats'] = mem_stats_io.getvalue()
def s_read_acqu(filename):
    """
    Parse the given acqu file into a metadata dictionary.

    :param filename: String with the name+path for the acqu file.
    :type filename: string

    :returns: Dictionary with the parsed metadata information. Keys are the
        ``##``-prefixed variable names from the file; values are ints, floats,
        strings, or lists of numbers for vector-valued entries of the form
        ``(low..high)``.
    """
    # Read the complete acqu file
    acqu = open(filename, 'r')
    lines = acqu.readlines()  # read all lines of the file into a list
    acqu.close()
    #
    # Parse the acqu file and store all data in a python dictionary
    #
    acqu_dict = {}
    curr_line = 0
    while curr_line < len(lines):
        # Skip lines with no data
        if len(lines[curr_line].rstrip("\n").rstrip("\r").rstrip(" ")) == 0:
            curr_line += 1
            continue
        # All variables should start with ## in the acqu file
        if not lines[curr_line].startswith("##"):
            log_helper.warning(__name__, "WARNING: Error while reading line" + str(curr_line) +
                               " of the acqu file. The error may have occured on the previous line")
            if curr_line > 0:
                log_helper.debug(__name__, str(curr_line - 1) + ": " + lines[curr_line - 1])
            # FIX: pass __name__ as the module argument, consistent with every
            # other log_helper call (it was missing here)
            log_helper.debug(__name__, str(curr_line) + ": " + lines[curr_line])
            curr_line += 1
            continue
        sl = lines[curr_line].split("=")
        key = sl[0]
        # Remove beginning spaces and endline and tabs at the end from the value
        value = sl[1].lstrip(' ').rstrip("\n").rstrip("\r").rstrip(" ")
        # Try to convert the value to a number
        is_number = False
        try:
            value = int(value)
            is_number = True
        except ValueError:
            try:
                value = float(value)
                is_number = True
            except ValueError:
                pass
        # Check whether the entry defines a vector of numbers, e.g. "(0..63)"
        unicode_value = unicode(value)
        if not is_number and unicode_value.startswith("(") and unicode_value.endswith(")"):
            # How many values and lines do we need to read?
            sv = unicode_value.lstrip("(").rstrip(")").split("..")
            high = int(sv[1])
            num_vals = high + 1
            vals_per_line = 8  # the acqu format writes 8 values per line
            num_lines = int(math.ceil(num_vals / float(vals_per_line)))
            # Read all the values into a list and convert the numbers if possible
            value = []
            curr_line += 1
            for _ in range(0, num_lines):
                sl = lines[curr_line].rstrip("\n").rstrip("\r").rstrip(" ").split(" ")
                try:
                    sconv = [int(ix) for ix in sl]
                except ValueError:
                    try:
                        sconv = [float(ix) for ix in sl]
                    except ValueError:
                        # Keep the raw strings if the entries are not numeric
                        sconv = sl
                value = value + sconv
                curr_line += 1
            acqu_dict[key] = value
        else:
            acqu_dict[key] = value
            curr_line += 1
    return acqu_dict
def v_qspectrum(cls, analysis_object, x, y, viewer_option=0):
    """
    Implement support for qspectrum URL requests for the viewer.

    :param analysis_object: The analysis object with the local peak-finding
        results ('peak_mz', 'peak_value', 'peak_arrayindex', 'indata_mz').
    :param x: x selection string.
    :param y: y selection string.
    :param viewer_option: 0 to serve spectra from this analysis; larger values
        are forwarded to the parent class with viewer_option reduced by 1.

    :returns: Tuple of (data, None) with the requested spectra (no custom m/z
        axis is needed), or (None, None) if the request cannot be fulfilled.
    """
    # Retrieve the h5py objects for the required datasets from the local peak finding
    if viewer_option == 0:
        from omsi.shared.data_selection import check_selection_string, selection_type, selection_to_indexlist
        import numpy as np
        peak_mz = analysis_object['peak_mz']
        peak_values = analysis_object['peak_value']
        array_indices = analysis_object['peak_arrayindex'][:]
        indata_mz = analysis_object['indata_mz']
        # Determine the shape of the original raw data
        if (indata_mz is None) or (array_indices is None):
            return None, None
        num_x = array_indices[:, 0].max()
        num_y = array_indices[:, 1].max()
        num_mz = indata_mz.shape[0]
        num_spectra = array_indices.shape[0]
        # Determine the size of the selection and the set of selected items
        x_list = selection_to_indexlist(x, num_x)
        y_list = selection_to_indexlist(y, num_y)
        if (check_selection_string(x) == selection_type['indexlist']) and \
                (check_selection_string(y) == selection_type['indexlist']):
            # Paired index lists select individual (x, y) pixels
            if len(x_list) == len(y_list):
                items = [(x_list[i], y_list[i]) for i in xrange(0, len(x_list))]
            else:
                return None, None
        else:
            # Cartesian product of the x and y selections
            items = [0] * (len(x_list) * len(y_list))
            index = 0
            for xi in x_list:
                for yi in y_list:
                    items[index] = (xi, yi)
                    index += 1
        shape_x = len(items)
        shape_y = 1
        shape_z = num_mz
        # Initialize the data cube to be returned
        data = np.zeros((shape_x, shape_y, shape_z), dtype=peak_values.dtype)
        # Fill the non-zero locations for the data cube with data
        for ni, ci in enumerate(items):
            try:
                # Pixel indices may be out of order (e.g, when we use MPI) so we look up the pixel location.
                # FIX: compare the x/y COLUMNS of the pixel-index table
                # (array_indices[:, 0] / [:, 1]); the original compared the first
                # two ROWS (array_indices[0] / [1]) and looked up wrong pixels.
                current_index = np.nonzero(np.logical_and(array_indices[:, 0] == ci[0],
                                                          array_indices[:, 1] == ci[1]))[0][0]
            except Exception:
                # No matching pixel (IndexError from [0][0]); was a bare except
                log_helper.warning(__name__, "Requested pixel not found: " + str(items[ni]))
                continue
            current_dx = ni
            current_dy = 0
            start_index = array_indices[current_index][2]
            # FIX: off-by-one — current_index < num_spectra is always true
            # (current_index indexes into the num_spectra rows), so the last
            # spectrum used to index past the end of array_indices.
            if current_index + 1 < num_spectra:
                end_index = array_indices[(current_index + 1)][2]
            else:
                end_index = peak_values.size
            if start_index != end_index:
                temp_values = peak_values[start_index:end_index]
                temp_mz = peak_mz[start_index:end_index]
                data[current_dx, current_dy, temp_mz] = temp_values
            else:
                # The start and end index may be the same in case that
                # no peaks were found for the given spectrum.
                # The data is already initialized to 0 so there is nothing to do here
                pass
        if len(items) == 1:
            data = data.reshape((shape_x, shape_z))
        # Return the spectra and indicate that no customMZ data values (i.e. None) are needed
        return data, None
    elif viewer_option > 0:
        return super(omsi_findpeaks_local, cls).v_qspectrum(analysis_object, x, y, viewer_option - 1)
    else:
        return None, None
def record_preexecute(self):
    """
    Record basic runtime information in this dict before the execution is started.

    Function used to record runtime information prior to executing the process
    we want to track, e.g., the `execute_analysis(...)` of a standard analysis.

    The function may be overwritten in child classes to add recording of
    additional runtime information. All runtime data should be recorded in the
    main dict (i.e, self). This ensures in the case of standard analysis that
    the data is stored in the HDF5 file. Other data should be stored in separate
    variables that we may add to the object. When overwriting the function we
    should typically call super(...,self).record_preexecute() last in the
    custom version to ensure that the start_time is properly recorded right
    before the execution of the analysis.
    """
    log_helper.debug(__name__, 'Recording pre-execution runtime data',
                     root=self.mpi_root, comm=self.mpi_comm)
    # Record basic runtime environment information using the platform module
    try:
        self['architecture'] = unicode(platform.architecture())
        self['java_ver'] = unicode(platform.java_ver())
        self['libc_ver'] = unicode(platform.libc_ver())
        self['linux_distribution'] = unicode(platform.linux_distribution())
        self['mac_ver'] = unicode(platform.mac_ver())
        self['machine'] = unicode(platform.machine())
        self['node'] = unicode(platform.node())
        self['platform'] = unicode(platform.platform())
        self['processor'] = unicode(platform.processor())
        self['python_branch'] = unicode(platform.python_branch())
        self['python_build'] = unicode(platform.python_build())
        self['python_compiler'] = unicode(platform.python_compiler())
        self['python_implementation'] = unicode(platform.python_implementation())
        self['python_revision'] = unicode(platform.python_revision())
        self['python_version'] = unicode(platform.python_version())
        self['release'] = unicode(platform.release())
        self['system'] = unicode(platform.system())
        self['uname'] = unicode(platform.uname())
        self['version'] = unicode(platform.version())
        self['win32_ver'] = unicode(platform.win32_ver())
    except Exception:
        # FIX: was a bare "except:"; provenance recording stays best-effort
        warnings.warn("WARNING: Recording of platform provenance failed: " + str(sys.exc_info()))
    # Attempt to record the svn version information
    try:
        import subprocess
        self['svn_ver'] = subprocess.check_output('svnversion').rstrip('\n')
    except ImportError:
        log_helper.warning(__name__, 'Recording of svn version not possible. subprocess not installed',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        # FIX: was a bare "except:" (e.g., svnversion binary missing)
        warnings.warn("Recording of svn version information failed: " + str(sys.exc_info()))
    # Attempt to record software library version
    try:
        import numpy as np
        self['numpy_version_full_version'] = unicode(np.version.full_version)
        self['numpy_version_release'] = unicode(np.version.release)
        self['numpy_version_git_revision'] = unicode(np.version.git_revision)
    except ImportError:
        log_helper.warning(__name__, 'Recording of numpy version not possible.',
                           root=self.mpi_root, comm=self.mpi_comm)
    # Attempt to record psutil data (psutil is an optional dependency)
    try:
        import psutil
        self['logical_cpu_count'] = unicode(psutil.cpu_count())
        self['cpu_count'] = unicode(psutil.cpu_count(logical=False))
        process = psutil.Process()
        self['open_files'] = unicode(process.open_files())
        self['memory_info_before'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__,
                           'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        # FIX: was a bare "except:"
        warnings.warn("Recording of psutil-based runtime information failed: " + str(sys.exc_info()))
    # Record the start time for the analysis (do this last, right before execution)
    self['start_time'] = unicode(datetime.datetime.now())
    # Enable time and usage profiling if requested
    if self.__profile_time_and_usage:
        self.__time_and_use_profiler = Profile()
        self.__time_and_use_profiler.enable()
def __init__(self, hdr_filename=None, t2m_filename=None, img_filename=None, basename=None, requires_slicing=True):
    """Open an img file for data reading.

    :param hdr_filename: The name of the hdr header file
    :type hdr_filename: string
    :param t2m_filename: The name of the t2m_filename
    :type t2m_filename: string
    :param img_filename: The name of the img data file
    :type img_filename: string
    :param basename: Instead of img_filename, t2m_filename, and hdr_filename one may also supply just
                     a single basename. The basename is completed with the .img, .t2m, .hdr extension
                     to load the data.
    :type basename: string
    :param requires_slicing: Unused here. Slicing is always supported by this reader.
    :type requires_slicing: Boolean

    :raises ValueError: In case that both basename and the hdr_filename, t2m_filename,
                        img_filename parameters are specified, or neither, or no valid
                        img files are found.
    """
    super(img_file, self).__init__(basename, requires_slicing)
    self.data_type = 'uint16'
    self.shape = [0, 0, 0]  # Number of pixels in x,y, and z. NOTE: Converted to a tuple later on.
    self.mz = 0  # A numpy vector with the m/z values of the instrument
    if basename and hdr_filename and t2m_filename and img_filename:
        raise ValueError("Conflicting input. Provide either basename or the " +
                         "hdr_filename,t2m_filename,img_filename parameters but not both.")
    if basename:
        basefile = basename
        if os.path.isdir(basename):
            # Use the first img file found in the given directory
            filelist = self.get_files_from_dir(basename)
            log_helper.log_var(__name__, filelist=filelist)
            if len(filelist) > 0:
                basefile = filelist[0]
            else:
                raise ValueError("No valid img file found in the given directory.")
        elif basefile.endswith(".img") and os.path.exists(basefile):
            # FIX: rstrip(".img") strips any trailing '.', 'i', 'm', 'g' CHARACTERS
            # (e.g. "grim.img" -> "gr"), not the extension. Slice off the 4-char
            # suffix instead.
            basefile = basefile[:-4]
        elif basefile.endswith(".hdr") and os.path.exists(basefile):
            basefile = basefile[:-4]
        elif basefile.endswith(".t2m") and os.path.exists(basefile):
            basefile = basefile[:-4]
        log_helper.log_var(__name__, basefile=basefile)
        if os.path.exists(basefile + ".hdr") and \
                os.path.exists(basefile + ".t2m") and \
                os.path.exists(basefile + ".img"):
            hdr_filename = basefile + ".hdr"
            t2m_filename = basefile + ".t2m"
            img_filename = basefile + ".img"
        else:
            raise ValueError("No valid img file found for the given basename.")
    elif hdr_filename and t2m_filename and img_filename:
        pass  # Nothing to be done
    else:
        raise ValueError("Missing input parameter. Either provide: " +
                         " i) basename or ii) hdr_filename, t2m_filename, img_filename")
    # Initialize the x and y length from the header
    # (np.fromfile opens hdr_filename itself; the previous redundant open() of
    # the same file was removed)
    hdrdata = np.fromfile(file=hdr_filename, dtype='int16', count=-1)
    self.shape[0] = int(hdrdata[23])
    self.shape[1] = int(hdrdata[22])
    # Initialize the z length from the t2m file holding the m/z axis
    t2m = open(t2m_filename, 'rb')
    self.mz = np.fromfile(file=t2m, dtype='float32', count=-1)
    self.shape[2] = self.mz.shape[0]
    t2m.close()
    # Convert the shape variable to the expected tuple
    self.shape = tuple(self.shape)
    # Open the img file with the spectrum data
    self.img_filename = img_filename
    self.file_opened = False
    try:
        self.m_img_file = np.memmap(filename=self.img_filename,
                                    dtype=self.data_type,
                                    shape=self.shape,
                                    mode='r',
                                    order='C')
        self.file_opened = True
    except ValueError:
        # np.memmap raises ValueError when the file size does not match the shape.
        # Check if the size of the file matches what we expect
        imgsize = os.stat(self.img_filename).st_size
        itemsize = np.dtype(self.data_type).itemsize
        expectednumvalues = int(self.shape[0]) * int(self.shape[1]) * int(self.shape[2])
        expectedsize = expectednumvalues * int(itemsize)
        sizedifference = expectedsize - imgsize
        log_helper.warning(__name__, "IMG size: " + str(imgsize) + " Expected size: " +
                           str(expectedsize) + " (difference=" + str(sizedifference) + ")")
        if imgsize < expectedsize:
            # Check whether the missing data aligns with images or spectra
            slicesize = int(self.shape[0]) * int(self.shape[1]) * itemsize
            spectrumsize = int(self.shape[2]) * itemsize
            percentmissing = float(sizedifference) / float(expectedsize)
            valuesmissing = float(sizedifference) / itemsize
            warnings.warn("WARNING: Missing " + str(sizedifference) +
                          " bytes in img file (missing " + str(valuesmissing) +
                          " intensity values; " + str(percentmissing) + "%)." +
                          " Expected shape: " + str(self.shape))
            # Define how we should deal with the error
            expandslice = (sizedifference % slicesize) == 0
            expandspectra = (sizedifference % spectrumsize) == 0
            if not expandslice:
                expandspectra = True
            # Complete missing spectra
            if expandspectra:
                warnings.warn("Dealing with missing data in img file by completing last spectra with 0's.")
                # TODO np.require creates an in-memory copy of the full data. Allow usage of memmap'ed tempfile.
                tempmap = np.require(np.memmap(filename=self.img_filename,
                                               dtype=self.data_type,
                                               mode='r',
                                               order='C'),
                                     requirements=['O', 'C'])
                # Extend the in-memory copy to the expected size (pads with 0's)
                tempmap.resize((expectednumvalues, ))
                # Reshape to the expected 3D shape
                self.m_img_file = tempmap.reshape(self.shape, order='C')
                self.file_opened = True
            # Complete missing slices
            elif expandslice:
                # FIX: use integer floor division; plain '/' yields a float under
                # Python 3 (or __future__ division), which is not a valid slice index.
                slicesmissing = sizedifference // slicesize
                self.mz = self.mz[:(-slicesmissing)]
                # FIX: message typo "updating he m/z axis.."
                warnings.warn("Dealing with missing data in img file by updating the m/z axis." +
                              " It looks like the m/z axis data may be inconsistent" +
                              " with the binary data. Removing " + str(slicesmissing) +
                              " bins from the m/z axis.")
                self.shape = list(self.shape)
                self.shape[2] = self.mz.shape[0]
                self.shape = tuple(self.shape)
                self.m_img_file = np.memmap(filename=self.img_filename,
                                            dtype=self.data_type,
                                            shape=self.shape,
                                            mode='r',
                                            order='C')
                self.file_opened = True
            else:
                raise
        else:
            raise
    except Exception:
        # FIX: was a bare "except:" — log and re-raise any other failure
        log_helper.error(__name__, "Error while opening the img file: " + img_filename)
        raise
def __populate_analysis__(cls, analysis_group, analysis):
    """
    Populate the given h5py group with the analysis data.

    NOTE: This is a private helper function. Use the corresponding
    create_analysis function of omsi_file_experiment to create a completely
    new analysis.

    NOTE: At this point we assume that all in-memory dependencies have been
    resolved. If not, then the raw data associated with the given parameter
    will be saved instead.

    :param analysis_group: h5py group in which the analysis data should be stored.
    :param analysis: Instance of omsi.analysis.analysis_base defining the analysis
    :type analysis: omsi.analysis.analysis_base:

    :returns: The omsi_file_analysis object for the newly created analysis group.
        The analysis data is automatically written to file by this function so
        no additional work is required.
    """
    from omsi.datastructures.analysis_data import analysis_data
    from omsi.dataformat.omsi_file.dependencies import omsi_file_dependencies
    from omsi.analysis.base import analysis_base
    # 1. Write the analysis name
    analysis_identifier_data = analysis_group.require_dataset(
        name=unicode(omsi_format_analysis.analysis_identifier),
        shape=(1, ),
        dtype=omsi_format_common.str_type)
    if omsi_format_common.str_type_unicode:
        analysis_identifier_data[0] = analysis.get_analysis_identifier()
    else:
        analysis_identifier_data[0] = str(analysis.get_analysis_identifier())
    # 2. Write the analysis type
    analysis_type_data = analysis_group.require_dataset(
        name=unicode(omsi_format_analysis.analysis_type),
        shape=(1, ),
        dtype=omsi_format_common.str_type)
    if omsi_format_common.str_type_unicode:
        analysis_type_data[0] = analysis.get_analysis_type()
    else:
        analysis_type_data[0] = str(analysis.get_analysis_type())
    # 3. Write the analysis data. Analyses without a custom writer raise
    #    NotImplementedError, in which case we fall back to the generic writer.
    try:
        analysis.write_analysis_data(analysis_group=analysis_group)
    except NotImplementedError:
        for ana_data in analysis.get_all_analysis_data():
            cls.__write_omsi_analysis_data__(analysis_group, ana_data)
    # 4. Determine all dependencies and parameters that we need to write
    dependencies = []  # [dep['data'] for dep in analysis.get_all_dependency_data()]
    parameters = []
    # 4.1 Resolve in-memory dependencies if possible
    for dependent_parameter in analysis.get_all_dependency_data():
        # 4.1.1 We have an in-memory dependency
        if isinstance(dependent_parameter['data']['omsi_object'], analysis_base):
            # 4.1.1.1 We can resolve the dependency to an object in an HDF5 file
            if dependent_parameter['data']['omsi_object'].has_omsi_analysis_storage():
                # Create a new dependency that points to the appropriate file location
                # NOTE: We do not modify the dependency in the analysis object that we save
                # but we only change it for the purpose of storage
                new_dep = dependent_parameter['data'].copy()
                new_dep_omsi_object = None
                # Check if we can find an analysis data store within the same parent (or at least file).
                # An exact parent match wins (break); a same-file match is kept as fallback.
                parent_filename = os.path.abspath(analysis_group.file.filename)
                for analysis_store in dependent_parameter['data']['omsi_object'].get_omsi_analysis_storage():
                    analysis_store_filename = os.path.abspath(analysis_store.managed_group.file.filename)
                    if analysis_store.name == analysis_group.parent.name and \
                            analysis_store_filename == parent_filename:
                        new_dep_omsi_object = analysis_store
                        break
                    elif analysis_store_filename == parent_filename:
                        new_dep_omsi_object = analysis_store
                # We could not find a prior data store within the same file so use one from another file
                if new_dep_omsi_object is None:
                    dep_object = dependent_parameter['data']['omsi_object']
                    new_dep['omsi_object'] = dep_object.get_omsi_analysis_storage()[0]
                else:
                    new_dep['omsi_object'] = new_dep_omsi_object
                # Append it to the list of dependencies
                dependencies.append(new_dep)
            # 4.1.1.2 We cannot resolve the dependency and need to store it as a parameter instead
            else:
                # Replace the dependency with the actual data and save it as a parameter instead
                new_param = dependent_parameter.copy()
                new_param['data'] = new_param['data'].get_data()
                parameters.append(new_param)
        # 4.1.2 We have a file-based dependency so keep it as is and add it to the list of dependencies
        else:
            dependencies.append(dependent_parameter['data'])
    # 4.2 Add all regular parameters to the list of parameters
    parameters += analysis.get_all_parameter_data(exclude_dependencies=True)
    # 5. Write all the parameters
    parameter_group = analysis_group.require_group(omsi_format_analysis.analysis_parameter_group)
    for param_data in parameters:
        # Only persist parameters that are required, explicitly set, or have a default
        if param_data['required'] or param_data.data_set() or param_data['default'] is not None:
            temp_data = param_data.get_data_or_default()
            if temp_data is not None:
                anadata = analysis_data(name=param_data['name'],
                                        data=param_data.get_data_or_default(),
                                        dtype=param_data['dtype'])
                cls.__write_omsi_analysis_data__(parameter_group, anadata)
                # Try to add the help string attribute
                try:
                    help_attr = omsi_format_analysis.analysis_parameter_help_attr
                    parameter_group[param_data['name']].attrs[help_attr] = param_data['help']
                except KeyError:
                    pass
    # 6. Write all the runtime execution information
    runinfo_group = analysis_group.require_group(omsi_format_analysis.analysis_runinfo_group)
    for run_info_key, run_info_value in analysis.get_all_run_info().items():
        # Generate an analysis_data object in order to use the
        # __write_omsi_analysis_data function to write the data
        if isinstance(run_info_value, unicode) or isinstance(run_info_value, str):
            anadata = analysis_data(name=unicode(run_info_key),
                                    data=run_info_value,
                                    dtype=omsi_format_common.str_type)
        else:
            # Non-string values are stored as numpy arrays; scalars are wrapped
            # into a 1-element array first
            dat = np.asarray(run_info_value)
            if len(dat.shape) == 0:
                dat = np.asarray([run_info_value])
            anadata = analysis_data(name=unicode(run_info_key),
                                    data=dat,
                                    dtype=dat.dtype)
        cls.__write_omsi_analysis_data__(runinfo_group, anadata)
    # 7. Write all dependencies
    omsi_file_dependencies.__create__(parent_group=analysis_group,
                                      dependencies_data_list=dependencies)
    # 8. Execute the custom data write for the analysis
    analysis.add_custom_data_to_omsi_file(analysis_group)
    # 9. Create the output object
    re = omsi_file_analysis(analysis_group)
    # 10. Save the output object in the list of omsi analysis data stores as part of the analysis object
    analysis.omsi_analysis_storage.append(re)
    # 11. Check if we need to pickle and save the analysis class in case this is
    #     a custom class that is not part of BASTet (the name lookup raises
    #     NameError for unknown classes)
    try:
        from omsi.analysis.analysis_views import analysis_views
        _ = analysis_views.analysis_name_to_class(analysis.get_analysis_type())
    except NameError:
        class_pickle = cloudpickle.dumps(analysis.__class__)
        # Convert the pickle string to an uint8 array to avoid problems
        # with storing string with NULL characters in HDF5
        class_pickle_arr = np.fromstring(class_pickle,
                                         dtype=omsi_format_analysis.analysis_class_pickle_np_dtype)
        analysis_group[unicode(omsi_format_analysis.analysis_class)] = class_pickle_arr
    except:
        # Best-effort: saving the class is optional; failures are only logged
        log_helper.warning(__name__, "Could not save the analysis class.")
        pass
    # 12. Return the new omsi_file_analysis object
    return re
def __compute_file_info(cls, filename, resolution):
    ## TODO completely refactor this to make it smartly handle profile or centroid datasets
    ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    ## TODO: profile datasets should work as is
    ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    """
    Internal helper function used to compute the mz axis, the data type for the
    intensities, and the imzml file type, and to collect the file metadata.

    :param filename: Name of the imzML file to inspect.
    :param resolution: m/z resolution used to re-interpolate the m/z axis for
        processed-mode files.

    :return: coordinates: Numpy array with the spectrum coordinates
    :return: mz_axes: Numpy array with the mz axis
    :return: dtype: string with the intensity data type
    :return: file_type: the imzml file type (continuous or processed)
    :return: dataset_metadata, instrument_metadata, method_metadata: metadata_dict objects
    """
    reader = ImzMLParser(filename)
    # Read the first spectrum
    mz_axes, intens = reader.getspectrum(0)  # NOTE: mz_axes is a tuple
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)
    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str
    # Compute the mz axis and file type
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):
        mz, intens = reader.getspectrum(ind)
        # NOTE(review): this equality test assumes getspectrum returns
        # tuples/lists (element-wise comparable); if it returned numpy arrays
        # the comparison would be ambiguous — confirm against the parser version.
        if mz == mz_axes:
            pass
        else:
            # m/z axes differ between spectra -> processed (per-spectrum) mode
            file_type = cls.available_imzml_types['processed']
            if min_mz > np.amin(mz):
                min_mz = np.amin(mz)
            if max_mz < np.amax(mz):
                max_mz = np.amax(mz)
    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        # FIX: np.logspace requires an integer sample count; np.ceil returns a
        # float, which newer numpy versions reject as the num argument
        num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)
        log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")
    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    for k, v in reader.imzmldict.iteritems():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)
    # Delete the parser and read the metadata
    del reader
    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run > group in the XML file to avoid going through the whole file again
    # while extracting the majority of the relevant metadata
    try:
        with open(filename, 'r') as ins:
            metdata_header = ''
            for line in ins:
                if '<run' in line:
                    break
                else:
                    metdata_header += line
            metdata_header += '</mzML>'
        metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
        # Route each top-level header section to the matching metadata dict
        for k, v in metdata_header_dict.iteritems():
            store_value = metadata_value(name=k,
                                         value=v,
                                         unit=None,
                                         description=str(k) + " extracted from imzML XML header.",
                                         ontology=None)
            if k == 'instrumentConfigurationList':
                instrument_metadata[k] = store_value
            elif k == 'dataProcessingList':
                method_metadata[k] = store_value
            elif k == 'scanSettingsList':
                dataset_metadata[k] = store_value
            elif k == 'softwareList':
                method_metadata[k] = store_value
            elif k == 'sampleList':
                method_metadata[k] = store_value
            else:
                dataset_metadata[k] = store_value
        dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                      value=metdata_header,
                                                                      unit=None,
                                                                      description='XML imzML header',
                                                                      ontology=None)
    except Exception:
        # FIX: was a bare "except:" — metadata extraction stays best-effort
        log_helper.warning(__name__, "Extraction of additional imzML metadata failed")
    return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata