def execute(self, **kwargs):
    """
    Run the analysis after refreshing the parameter types from the given inputs.

    This overwrites the default implementation of execute so that the dtype of
    every parameter that is passed in reflects the value actually supplied. This is
    needed when wrapping functions where the types are not known a priori.

    :param kwargs: Custom analysis parameters

    :return: The result returned by the execute(...) function of the parent class
    """
    log_helper.debug(__name__, "Setting parameters based on the given inputs")
    ana_dtypes = data_dtypes.get_dtypes()
    # Refresh the dtypes so that the parameters are saved correctly to file
    for arg_name, arg_value in kwargs.iteritems():
        for param in self.parameters:
            if param['name'] != arg_name:
                continue
            # Anything exposing a dtype attribute is recorded as an ndarray
            if hasattr(arg_value, 'dtype'):
                param['dtype'] = ana_dtypes['ndarray']
            else:
                param['dtype'] = type(arg_value)
    # All keyword arguments are forwarded unchanged as the custom parameters
    return super(analysis_generic, self).execute(**kwargs)
def __read_all(self, filename):
    """
    Internal helper that loads all spectra of the imzML file into memory.

    The function directly replaces self.data with a single zero-initialized
    array of shape self.shape and fills it one spectrum at a time. In processed
    mode every spectrum is re-interpolated onto self.mz; values outside of the
    measured m/z range are set to 0.

    :param filename: Name of the imzML file passed to ImzMLParser
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')
    # The bin edges are only defined in processed mode. Below they solely act
    # as the flag that switches the reinterpolation on.
    bin_edges = None
    if self.imzml_type == self.available_imzml_types['processed']:
        mean_spacing = np.diff(self.mz).mean()
        bin_edges = np.append(self.mz, self.mz[-1] + mean_spacing)
    for spectrum_index, position in enumerate(reader.coordinates):
        raw_x, raw_y = position
        # Coordinates may start at arbitrary locations, so recenter them at (0,0)
        xidx = raw_x - self.x_pos_min
        yidx = raw_y - self.y_pos_min
        # Read the spectrum
        mz, intens = reader.getspectrum(spectrum_index)
        # Reinterpolate the intensities if we are in processed mode
        if bin_edges is not None:
            resample = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = resample(self.mz)
        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
def __init__(self, analysis_objects=None):
    """
    Set up the workflow executor and register its profiling parameters.

    :param analysis_objects: A list (or set) of analysis objects to be executed.
        A single object that is neither a list nor a set is wrapped in a list.
        If None, the executor starts with an empty task list.
    """
    super(workflow_executor_base, self).__init__()
    log_helper.debug(__name__, "Creating workflow executor")
    has_objects = analysis_objects is not None
    if has_objects and not isinstance(analysis_objects, (list, set)):
        analysis_objects = [analysis_objects, ]
    log_helper.log_var(__name__, analysis_objects=analysis_objects, level='DEBUG')
    self.run_info = run_info_dict()
    if has_objects:
        self.analysis_tasks = analysis_task_list(analysis_objects)
    else:
        self.analysis_tasks = analysis_task_list()
    self.mpi_comm = mpi_helper.get_comm_world()
    self.mpi_root = 0
    self.workflow_identifier = "we"
    # self.parameters is inherited from parameter_manager and set in the parent class
    dtypes = data_dtypes.get_dtypes()
    bool_dtype = dtypes['bool']
    self.add_parameter(name='profile_time_and_usage',
                       help='Enable/disable profiling of time and usage of the whole workflow',
                       required=False,
                       default=False,
                       dtype=bool_dtype)
    self.add_parameter(name='profile_memory',
                       help='Enable/disable profiling of memory usage of the whole workflow',
                       required=False,
                       default=False,
                       dtype=bool_dtype)
def enable_profile_memory(self, enable=True):
    """
    Switch the profiling of memory usage on or off.

    If memory profiling is not available, profiling is always switched off and a
    warning is logged when the caller tried to enable it.

    :param enable: boolean to enable (True) or disable (False) memory profiling
    """
    if not PROFILE_MEMORY_AVAILABLE:
        self.__profile_memory = False
        if enable:
            log_helper.warning(__name__, 'Profiling of memory usage not available.' +
                               ' Missing memory_profiler or StringIO package')
        return
    # Only log when the setting actually changes
    currently_enabled = self.__profile_memory
    if currently_enabled and not enable:
        log_helper.debug(__name__, "Disabled memory profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    elif enable and not currently_enabled:
        log_helper.debug(__name__, "Enabled memory profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    self.__profile_memory = enable
def record_postexecute(self, execution_time=None):
    """
    Record runtime information after the task we want to track is completed, e.g.
    the `execute_analysis(...)` function of a standard analysis.

    The function may be overwritten in child classes to add recording of additional
    runtime information. When overwriting the function, the custom version should call
    super(..., self).record_postexecute(execution_time) to ensure that the execution
    time and end_time are properly recorded.

    :param execution_time: The total time it took to execute the analysis. May be None,
        in which case the function will attempt to compute the execution time based on
        the start_time (if available) and the current time.
    """
    log_helper.debug(__name__, 'Recording post-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
    # Finalize recording of post execution provenance
    self['end_time'] = unicode(datetime.datetime.now())
    if execution_time is not None:
        self['execution_time'] = unicode(execution_time)
    elif 'start_time' in self:
        start_time = run_info_dict.string_to_time(self['start_time'])
        stop_time = run_info_dict.string_to_time(self['end_time'])
        # TODO: This only gives execution time in full seconds right now
        self['execution_time'] = unicode(stop_time - start_time)
    else:
        self['execution_time'] = None

    # Attempt to record psutil data. This is best effort: a missing or failing psutil
    # must not abort the recording of the remaining runtime information.
    try:
        import psutil
        process = psutil.Process()
        self['memory_info_after'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt and
        # SystemExit are not swallowed here
        warnings.warn("Recording of psutil-based runtime information failed: "+str(sys.exc_info()))

    # Record the time and use profiling data if possible
    if self.__time_and_use_profiler is not None:
        self.__time_and_use_profiler.disable()
        self.__time_and_use_profiler.create_stats()
        self['profile'] = unicode(self.__time_and_use_profiler.stats)
        # Save the summary statistics for the profiling data
        stats_io = StringIO.StringIO()
        profiler_stats = pstats.Stats(self.__time_and_use_profiler, stream=stats_io).sort_stats('cumulative')
        profiler_stats.print_stats()
        self['profile_stats'] = stats_io.getvalue()

    # Record the memory profiling data if possible
    if self.__memory_profiler is not None and self.get_profile_memory():
        log_helper.debug(__name__, 'Recording memory profiling data', root=self.mpi_root, comm=self.mpi_comm)
        mem_stats_io = StringIO.StringIO()
        memory_profiler.show_results(self.__memory_profiler, stream=mem_stats_io)
        self['profile_mem'] = unicode(self.__memory_profiler.code_map)
        self['profile_mem_stats'] = mem_stats_io.getvalue()
def enable_profile_time_and_usage(self, enable=True):
    """
    Switch the profiling of time and usage on or off.

    If profiling is not available, profiling is always switched off and a warning
    is logged when the caller tried to enable it.

    :param enable: boolean to enable (True) or disable (False) time and usage profiling
    """
    if not PROFILE_AVAILABLE:
        self.__profile_time_and_usage = False
        if enable:
            log_helper.warning(__name__, 'Profiling of time and usage not available.' +
                               ' Missing profile and/or pstats package')
        return
    # Only log when the setting actually changes
    currently_enabled = self.__profile_time_and_usage
    if currently_enabled and not enable:
        log_helper.debug(__name__, "Disabled time and usage profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    elif enable and not currently_enabled:
        log_helper.debug(__name__, "Enabled time and usage profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    self.__profile_time_and_usage = enable
def append(self, analysis_object):
    """
    Append an analysis to the tasks to be executed by the workflow.

    Only analysis_base objects are accepted. An analysis that is already
    contained in the task list is not added a second time.

    :param analysis_object: Analysis object to be added to the execution. All dependencies
        of the analysis will also be executed as part of the execution.
    :type analysis_object: omsi.analysis.base.analysis_base

    :raises: ValueError is raised if the given analysis_object is invalid
    """
    from omsi.analysis.base import analysis_base
    if not isinstance(analysis_object, analysis_base):
        raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
    if analysis_object in self:
        log_helper.debug(__name__, "Analysis already in the list of tasks")
        return
    message = "Adding analysis object to the workflow set. " + str(analysis_object)
    log_helper.info(__name__, message)
    super(analysis_task_list, self).append(analysis_object)
def gather(self):
    """
    Collect the runtime information recorded by all MPI processes on the root process.

    :return: When running with more than one MPI process, the root process returns a
        dictionary with the same keys as the run_info but where each value is a list
        with one entry per MPI process. In all other cases (MPI not available, a single
        process, or a non-root process) the run_info object itself is returned.

        NOTE: Similar to MPI gather, the information is only collected on the root.
        All other processes return just their own private runtime information.
    """
    if not mpi_helper.MPI_AVAILABLE or self.mpi_comm.Get_size() <= 1:
        return self
    log_helper.debug(__name__, 'Gather runtime data from parallel tasks',
                     root=self.mpi_root, comm=self.mpi_comm)
    self['mpi_rank'] = self.mpi_comm.Get_rank()
    run_data = self.mpi_comm.gather(self, self.mpi_root)
    if self.mpi_comm.Get_rank() != self.mpi_root:
        return self
    # Merge the per-process dictionaries into one dictionary of lists
    merged_run_data = {}
    for rank_info in run_data:
        for key in rank_info:
            merged_run_data.setdefault(key, []).append(rank_info[key])
    return merged_run_data
def __read_all(self, filename):
    """
    Internal helper that loads all spectra of the imzML file into memory.

    The function directly replaces self.data with a single zero-initialized
    array of shape self.shape and fills it one spectrum at a time. In processed
    mode every spectrum is re-histogrammed onto bins derived from self.mz.

    :param filename: Name of the imzML file passed to ImzMLParser
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')
    # Compute the bin edges for the re-binning if needed. The last edge is
    # placed one mean m/z step beyond the final m/z value.
    bin_edges = None
    if self.imzml_type == self.available_imzml_types['processed']:
        mean_spacing = np.diff(self.mz).mean()
        bin_edges = np.append(self.mz, self.mz[-1] + mean_spacing)
    for spectrum_index, position in enumerate(reader.coordinates):
        raw_x, raw_y = position
        # Coordinates may start at arbitrary locations, so recenter them at (0,0)
        xidx = raw_x - self.x_pos_min
        yidx = raw_y - self.y_pos_min
        # Read the spectrum
        mz, intens = reader.getspectrum(spectrum_index)
        # Re-bin the intensities if we are in processed mode
        if bin_edges is not None:
            intens, _ = np.histogram(mz, bins=bin_edges, weights=intens)
        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
def clear(self):
    """
    Empty the workflow by removing all analyses from self.analysis_tasks.
    """
    log_helper.debug(__name__,
                     "Clearing the workflow",
                     root=self.mpi_root,
                     comm=self.mpi_comm)
    tasks = self.analysis_tasks
    tasks.clear()
def __read_all(self):
    """
    Internal helper function used to read all data into memory.

    The function directly replaces self.data with a list of datacubes, one per
    entry in self.scan_types. The file self.basename is read once per scan type.
    Scan types flagged in self.scan_profiled are re-interpolated onto the m/z axis
    of the scan type, all others are re-histogrammed.

    :raises KeyError: If a spectrum of the current scan type has no 'm/z array'
        or no 'intensity array' entry.
    """
    self.data = [np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type)
                 for scan_idx, scantype in enumerate(self.scan_types)]
    for scan_idx, scantype in enumerate(self.scan_types):
        reader = mzml.read(self.basename)
        spectrumid = 0
        mz_axis = self.mz_all[scan_idx]
        if not self.scan_profiled[scan_idx]:
            # The last bin edge is placed one mean m/z step beyond the final m/z value
            shift = np.diff(mz_axis).mean()
            bin_edges = np.append(mz_axis, mz_axis[-1] + shift)
        else:
            bin_edges = None
        for spectrum in reader:
            if spectrum['scanList']['scan'][0]['filter string'] != scantype:
                continue
            x = spectrum['m/z array']
            # A missing intensity array raises a KeyError that names the missing key
            y = spectrum['intensity array']
            if bin_edges is None:
                yi = np.interp(mz_axis, x, y, 0, 0)  # Re-interpolate the data in profiled mode
            else:
                yi, _ = np.histogram(x, bins=bin_edges, weights=y)  # Re-histogram the data in centroided mode
            xidx = np.nonzero(self.x_pos == self.coordinates[spectrumid, 0])[0]
            yidx = np.nonzero(self.y_pos == self.coordinates[spectrumid, 1])[0]
            try:
                self.data[scan_idx][xidx, yidx, :] = yi
            except Exception:
                # Best effort: skip spectra that cannot be stored. The details go into a
                # single message because every other log_helper call in this file passes
                # exactly one message and hands root/comm over as keywords.
                log_helper.debug(__name__,
                                 'Could not store spectrum %s for scan index %s (scan type %s). m/z shape: %s'
                                 % (spectrumid, scan_idx, scantype, mz_axis.shape))
            # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
            if spectrumid % 1000 == 0:
                log_helper.info(__name__,
                                'Processed data for %s spectra to datacube for scan type %s'
                                % (spectrumid, scantype))
            spectrumid += 1
def define_missing_parameters(self):
    """
    Assign the default value to every required parameter that has no data yet.

    Child classes may overwrite this function to customize the definition of
    default parameter values and to apply any modifications (or checks) of
    parameters before the analysis is executed. Any changes applied here will be
    recorded in the parameter of the analysis.
    """
    log_helper.debug(__name__, "Define missing parameters to default")
    for param in self.parameters:
        needs_default = param['required'] and not param.data_set()
        if needs_default:
            param['data'] = param['default']
def execute(self):
    """
    Run the workflow by calling main() wrapped by self.run_info.

    :return: The value returned by the wrapped main() function
    """
    log_helper.debug(__name__, "Execute", root=self.mpi_root, comm=self.mpi_comm)
    tracked_main = self.run_info(self.main)
    result = tracked_main()
    # Report the execution time if it was recorded
    try:
        time_message = 'Execution time: ' + str(self.run_info['execution_time']) + "s"
        log_helper.debug(__name__, time_message, root=self.mpi_root, comm=self.mpi_comm)
    except (KeyError, ValueError):
        pass
    # Return the result of the execution
    return result
def set_parameter_default_value(self, name, value):
    """
    Change the default value of the parameter with the given name.

    :param name: Name of the parameter
    :param value: New default value

    :raises: KeyError if parameter not found
    """
    message = "Setting default value of " + str(name) + " to " + str(value)
    log_helper.debug(__name__, message)
    param = self.get_parameter_data_by_name(dataname=name)
    if not isinstance(param, parameter_data):
        raise KeyError('Unknown parameter ' + str(name))
    param['default'] = value
def __setitem__(self, key, value):
    """
    Set workflow driver parameter options directly via slicing.

    The call is logged and then forwarded to the parent class. Overwrite this
    function in child classes to implement custom setting behavior, e.g., error
    checking for valid values before setting a non-standard parameter.

    :param key: name of the parameters
    :param value: new value

    :raise: ValueError if an invalid value is given
    :raise: KeyError if an invalid key is given
    """
    message = 'Setting parameter ' + key
    log_helper.debug(__name__, message, root=self.mpi_root, comm=self.mpi_comm)
    parent = super(workflow_executor_base, self)
    return parent.__setitem__(key, value)
def create_analysis_object(self):
    """
    Initialize the analysis object, i.e., set self.analysis_object.

    Without an analysis class the analysis object is reset to None. Otherwise a
    new instance of self.analysis_class is created unless the current analysis
    object already is an instance of that class.
    """
    if self.analysis_class is None:
        self.analysis_object = None
        return
    # Discard an existing object that does not match the analysis class
    if not isinstance(self.analysis_object, self.analysis_class):
        self.analysis_object = None
    if self.analysis_object is None:
        log_helper.debug(__name__, 'Initalizing analysis object', root=self.mpi_root, comm=self.mpi_comm)
        self.analysis_object = self.analysis_class()
        # The new analysis uses the same MPI settings as this object
        self.analysis_object.mpi_root = self.mpi_root
        self.analysis_object.mpi_comm = self.mpi_comm
def insert(self, index, analysis_object):
    """
    Insert an analysis object at the given position of the task list.

    Only analysis_base objects are accepted. An analysis that is already
    contained in the task list is not inserted a second time.

    :param index: Location where the object should be inserted
    :param analysis_object: The analysis object to be inserted

    :raises: ValueError is raised if the given analysis_object is invalid
    """
    from omsi.analysis.base import analysis_base
    if not isinstance(analysis_object, analysis_base):
        raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
    if analysis_object in self:
        log_helper.debug(__name__, "Analysis already in the list of tasks")
        return
    message = "Inserting analysis object in the workflow list. " + str(analysis_object)
    log_helper.info(__name__, message)
    super(analysis_task_list, self).insert(index, analysis_object)
def set_parameter_default_value(self, name, value):
    """
    Change the default value of the parameter with the given name.

    :param name: Name of the parameter
    :param value: New default value

    :raises: KeyError if parameter not found
    """
    message = "Setting default value of " + str(name) + " to " + str(value)
    log_helper.debug(__name__, message)
    param = self.get_parameter_data_by_name(dataname=name)
    if not isinstance(param, parameter_data):
        raise KeyError('Unknown parameter ' + str(name))
    param['default'] = value
def main(self):
    """
    Execute the analysis workflow.

    All dependencies are added to the workflow first. The analyses are then executed
    in a greedy fashion, i.e., in every iteration all analyses that still need to be
    updated and that are ready are executed. The loop ends when no analysis needs an
    update anymore, or when the remaining analyses cannot be executed because of
    unresolved dependencies (in which case a warning is logged).

    Runtime information is recorded in self.run_info before and after the execution.
    """
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty")
        return

    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow")
    log_helper.info(__name__, "Adding all dependencies")
    self.add_analysis_dependencies()

    # Record the runtime information
    log_helper.debug(__name__, "Recording runtime information")
    self.run_info.clear()
    self.run_info.record_preexecute()

    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not be run yet)
    log_helper.debug(__name__, "Running the analysis workflow")
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                analysis.execute()
        # Check if there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.")
            break
        if num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.")
            # Nothing can make progress anymore. Without leaving the loop here we
            # would repeat the same checks (and this warning) forever.
            break
        iterations += 1

    log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

    # Record the runtime information after we are done with the workflow
    self.run_info.record_postexecute()
    self.run_info.gather()
def make_analysis_identifiers_unique(self):
    """
    Update analysis identifiers to be unique.

    The first analysis using an identifier keeps it. Every later analysis with the
    same identifier is renamed to 'ana_<index>_<identifier>', where <index> is the
    position of the analysis in this collection.

    Side effects: This function updates the analysis tasks stored in this collection

    :return: self, i.e., the modified object with identifiers updated
    """
    identifiers = self.get_all_analysis_identifiers()
    num_update = len(identifiers) - len(set(identifiers))
    if num_update > 0:
        log_helper.debug(__name__, "%i analyses have non-unique identifiers and will be updated" % num_update)
        # Track the identifiers seen so far. Comparing against the set of all
        # identifiers would never find a duplicate, because every identifier is
        # by construction a member of that set.
        seen_identifiers = set()
        for ana_index, ana in enumerate(self):
            current_identifier = ana.get_analysis_identifier()
            if current_identifier in seen_identifiers:
                current_identifier = 'ana_' + str(ana_index) + "_" + unicode(current_identifier)
                ana.set_analysis_identifier(current_identifier)
            seen_identifiers.add(current_identifier)
    return self
def enable_profile_time_and_usage(self, enable=True):
    """
    Switch the profiling of time and usage on or off.

    If profiling is not available, profiling is always switched off and a warning
    is logged when the caller tried to enable it.

    :param enable: boolean to enable (True) or disable (False) time and usage profiling
    """
    if not PROFILE_AVAILABLE:
        self.__profile_time_and_usage = False
        if enable:
            log_helper.warning(__name__, 'Profiling of time and usage not available.' +
                               ' Missing profile and/or pstats package')
        return
    # Only log when the setting actually changes
    currently_enabled = self.__profile_time_and_usage
    if currently_enabled and not enable:
        log_helper.debug(__name__, "Disabled time and usage profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    elif enable and not currently_enabled:
        log_helper.debug(__name__, "Enabled time and usage profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    self.__profile_time_and_usage = enable
def enable_profile_memory(self, enable=True):
    """
    Switch the profiling of memory usage on or off.

    If memory profiling is not available, profiling is always switched off and a
    warning is logged when the caller tried to enable it.

    :param enable: boolean to enable (True) or disable (False) memory profiling
    """
    if not PROFILE_MEMORY_AVAILABLE:
        self.__profile_memory = False
        if enable:
            log_helper.warning(__name__, 'Profiling of memory usage not available.' +
                               ' Missing memory_profiler or StringIO package')
        return
    # Only log when the setting actually changes
    currently_enabled = self.__profile_memory
    if currently_enabled and not enable:
        log_helper.debug(__name__, "Disabled memory profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    elif enable and not currently_enabled:
        log_helper.debug(__name__, "Enabled memory profiling. ",
                         root=self.mpi_root, comm=self.mpi_comm)
    self.__profile_memory = enable
def __compute_scan_types_and_indices(self, filename=None):
    """
    Internal helper function used to compute the list of unique scan types in the mzML file.

    :param filename: Name of the file passed to mzml.read

    :return: Tuple of three lists:

        * scantypes: the unique 'filter string' values in order of first appearance
        * scan_indices: for every spectrum with a readable filter string, the index
          of its scan type in scantypes
        * scan_profiled: for every scan type, whether the first spectrum of that
          type contains the key 'profile spectrum'

    :raises AssertionError: If the number of indexed spectra differs from self.num_scans
    """
    reader = mzml.read(filename)
    scantypes = []
    scan_indices = []
    scan_profiled = []
    for idx, spectrum in enumerate(reader):
        # Only the lookup of the filter string is expected to fail. Keeping the
        # bookkeeping outside of the try ensures that scantypes and scan_profiled
        # always stay aligned.
        try:
            scanfilter = spectrum['scanList']['scan'][0]['filter string']
        except (KeyError, IndexError, TypeError):
            # Spectra without a usable filter string are skipped
            log_helper.debug(__name__, idx)
            continue
        if scanfilter not in scantypes:
            scantypes.append(scanfilter)
            scan_profiled.append('profile spectrum' in spectrum)
        scan_indices.append(scantypes.index(scanfilter))
    # Skipped spectra show up here as a mismatch with the expected number of scans
    assert len(scan_indices) == self.num_scans
    return scantypes, scan_indices, scan_profiled
def add_analysis_from_scripts(self, script_files):
    """
    Evaluate the given scripts and add all analyses they define (i.e., zero, one,
    or multiple) to this workflow.

    NOTE: This function executes scripts using exec(..), i.e., there are NO safeguards
    against malicious codes.

    :param script_files: List of strings with the paths to the script files. If only a
        single script is used, then a single string may be used as well.
    """
    new_analysis_objects = analysis_task_list.from_script_files(script_files)
    if new_analysis_objects is None or len(new_analysis_objects) == 0:
        log_helper.debug(__name__, "No analysis found in scripts",
                         root=self.mpi_root, comm=self.mpi_comm)
        return
    log_helper.debug(__name__,
                     "Adding %i new analyses to the workflow from scripts" % len(new_analysis_objects),
                     root=self.mpi_root,
                     comm=self.mpi_comm)
    self.analysis_tasks = self.analysis_tasks.union(new_analysis_objects)
def clean_up(self):
    """
    Clean up the runinfo object. In particular remove empty entries, i.e., entries
    whose value is None or has a length of 0 (e.g., an empty string).

    This function may be overwritten to also do clean-up needed due to additional
    custom runtime instrumentation. When overwriting this function, the custom version
    should call this implementation via super(...) at the end to ensure that the
    runinfo dictionary is clean, i.e., does not contain any empty entries.
    """
    log_helper.debug(__name__, 'Clean up runtime data', root=self.mpi_root, comm=self.mpi_comm)
    # Iterate over a snapshot of the items, because entries are removed while looping
    for ri_key, ri_value in list(self.items()):
        if ri_value is None:
            self.pop(ri_key)
            continue
        try:
            is_empty = len(ri_value) == 0
        except TypeError:
            # Values without a length (e.g., numbers) are never considered empty
            is_empty = False
        if is_empty:
            self.pop(ri_key)
def add_parameter(self,
                  name,
                  help,
                  dtype=unicode,
                  required=False,
                  default=None,
                  choices=None,
                  data=None,
                  group=None):
    """
    Register a new parameter. This function is typically used in the constructor
    of a derived analysis to specify the parameters of the analysis.

    :param name: The name of the parameter
    :param help: Help string describing the parameter
    :param dtype: Optional type. Default is string.
    :param required: Boolean indicating whether the parameter is required (True) or optional
        (False). Default False.
    :param default: Optional default value for the parameter. Default None.
    :param choices: Optional list of choices with allowed data values. Default None, indicating
        no choices set.
    :param data: The data assigned to the parameter. None by default.
    :param group: Optional group string used to organize parameters. Default None, indicating
        that parameters are automatically organized by driver class (e.g. in required and
        optional parameters)

    :raises: ValueError is raised if the parameter with the given name already exists.
    """
    log_helper.debug(__name__, "Add parameter " + str(name))
    existing_parameter = self.get_parameter_data_by_name(name)
    if existing_parameter is not None:
        raise ValueError('A parameter with the name ' + unicode(name) + " already exists.")
    parameter_settings = dict(name=name,
                              help=help,
                              dtype=dtype,
                              required=required,
                              default=default,
                              choices=choices,
                              data=data,
                              group=group)
    self.parameters.append(parameter_data(**parameter_settings))
def __read_all(self):
    """
    Internal helper function used to read all data into memory.

    The function directly replaces self.data with a list of datacubes, one per
    entry in self.scan_types. The file self.basename is read once per scan type.
    Scan types flagged in self.scan_profiled are re-interpolated onto the m/z axis
    of the scan type, all others are re-histogrammed.

    :raises KeyError: If a spectrum of the current scan type has no 'm/z array'
        or no 'intensity array' entry.
    """
    self.data = [np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type)
                 for scan_idx, scantype in enumerate(self.scan_types)]
    for scan_idx, scantype in enumerate(self.scan_types):
        reader = mzml.read(self.basename)
        spectrumid = 0
        mz_axis = self.mz_all[scan_idx]
        if not self.scan_profiled[scan_idx]:
            # The last bin edge is placed one mean m/z step beyond the final m/z value
            shift = np.diff(mz_axis).mean()
            bin_edges = np.append(mz_axis, mz_axis[-1] + shift)
        else:
            bin_edges = None
        for spectrum in reader:
            if spectrum['scanList']['scan'][0]['filter string'] != scantype:
                continue
            x = spectrum['m/z array']
            # A missing intensity array raises a KeyError that names the missing key
            y = spectrum['intensity array']
            if bin_edges is None:
                yi = np.interp(mz_axis, x, y, 0, 0)  # Re-interpolate the data in profiled mode
            else:
                yi, _ = np.histogram(x, bins=bin_edges, weights=y)  # Re-histogram the data in centroided mode
            xidx = np.nonzero(self.x_pos == self.coordinates[spectrumid, 0])[0]
            yidx = np.nonzero(self.y_pos == self.coordinates[spectrumid, 1])[0]
            try:
                self.data[scan_idx][xidx, yidx, :] = yi
            except Exception:
                # Best effort: skip spectra that cannot be stored. The details go into a
                # single message because every other log_helper call in this file passes
                # exactly one message and hands root/comm over as keywords.
                log_helper.debug(__name__,
                                 'Could not store spectrum %s for scan index %s (scan type %s). m/z shape: %s'
                                 % (spectrumid, scan_idx, scantype, mz_axis.shape))
            # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
            if spectrumid % 1000 == 0:
                log_helper.info(__name__,
                                'Processed data for %s spectra to datacube for scan type %s'
                                % (spectrumid, scantype))
            spectrumid += 1
def execute_analysis(self):
    """
    Unpickle the wrapped analysis function and call it with the current parameters.

    All parameters with data are passed to the function as keyword arguments, except
    for the internal parameters '__analysis_function', 'profile_time_and_usage' and
    'profile_memory'. Dependencies are resolved to their data first. After the call,
    self.data_names is completed with default names so that every output has a name.

    :return: The value returned by the analysis function

    :raises NotImplementedError: If no analysis function is set
    """
    if self['__analysis_function'] is None:
        raise NotImplementedError("We cannot run this analysis. Analysis_generic cannot run " +
                                  "an analysis unless an analysis function is set.")

    log_helper.debug(__name__, "Compiling the input dict for the analysis function.")
    internal_names = ['__analysis_function', 'profile_time_and_usage', 'profile_memory']
    input_dict = {}
    for arg in self.parameters:
        if arg['data'] is not None and arg['name'] not in internal_names:
            if isinstance(arg['data'], dependency_dict):
                input_dict[arg['name']] = arg['data'].get_data()
            else:
                input_dict[arg['name']] = arg['data']

    # When we restored the analysis we did not know that the parameter was supposed to be unicode
    log_helper.debug(__name__, "Unpickel the analysis function")
    # Convert to string as we stored the pickle string as uint8 array to avoid problems
    # with HDF5, NULL, and special chars
    analysis_function = self['__analysis_function'].tostring()
    # NOTE: Unpickling executes arbitrary code. Only run analyses from trusted sources.
    analysis_function = pickle.loads(analysis_function)

    log_helper.debug(__name__, "Executing the analysis function")
    result = analysis_function(**input_dict)

    log_helper.debug(__name__, "Creating output data names and returning results")
    if isinstance(result, tuple):
        num_outputs = len(result)
    elif result is None:
        num_outputs = 0
    else:
        num_outputs = 1
    if result is None:
        self.data_names = []
    elif len(self.data_names) < num_outputs:
        # Keep the names we already have and add default names for the remaining
        # outputs. Replacing the list would leave us with fewer names than outputs.
        missing_names = [(self.DEFAULT_OUTPUT_PREFIX + str(i))
                         for i in range(len(self.data_names), num_outputs)]
        self.data_names = list(self.data_names) + missing_names
    return result
def __setitem__(self, key, value):
    """
    Assign data to the parameter(s) with the given name via slicing.

    Overwrite this function in child classes to implement custom setting
    behavior, e.g., error checking for valid values before setting a
    non-standard parameter.

    :param key: name of the parameters
    :param value: new value

    :raise: ValueError if an invalid value is given
    :raise: KeyError if an invalid key is given
    """
    num_updated = 0
    # Only string keys can name a parameter
    if isinstance(key, basestring):
        for param in self.parameters:
            if param['name'] != key:
                continue
            log_helper.debug(__name__, "Setting parameter " + key)
            param['data'] = value
            num_updated += 1
    if num_updated == 0:
        raise KeyError('Invalid parameter key')
def create_workflow_executor_object(self):
    """
    Initialize the workflow executor object, i.e., set self.workflow_executor.

    Nothing is done if a workflow executor already exists. Otherwise the default
    executor class is used to create the executor, either empty or from
    self.script_files if any script files are given.

    *Side effects* This function potentially modifies self.workflow_executor
    """
    if self.workflow_executor is not None:
        return
    log_helper.debug(__name__, 'Initializing workflow executor', root=self.mpi_root, comm=self.mpi_comm)
    executor_class = workflow_executor_base.get_default_executor_class()
    has_scripts = self.script_files is not None and len(self.script_files) != 0
    if has_scripts:
        self.workflow_executor = executor_class.from_script_files(self.script_files)
    else:
        self.workflow_executor = executor_class()
    # The new executor uses the same MPI settings as this object
    self.workflow_executor.mpi_root = self.mpi_root
    self.workflow_executor.mpi_comm = self.mpi_comm
def append(self, analysis_object):
    """
    Append an analysis to the tasks to be executed by the workflow.

    Only analysis_base objects are accepted. An analysis that is already
    contained in the task list is not added a second time.

    :param analysis_object: Analysis object to be added to the execution. All dependencies
        of the analysis will also be executed as part of the execution.
    :type analysis_object: omsi.analysis.base.analysis_base

    :raises: ValueError is raised if the given analysis_object is invalid
    """
    from omsi.analysis.base import analysis_base
    if not isinstance(analysis_object, analysis_base):
        raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
    if analysis_object in self:
        log_helper.debug(__name__, "Analysis already in the list of tasks")
        return
    message = "Adding analysis object to the workflow set. " + str(analysis_object)
    log_helper.debug(__name__, message)
    super(analysis_task_list, self).append(analysis_object)
def main(self):
    """
    Execute the analysis workflow.

    All dependencies are added to the workflow first. The analyses are then executed
    in a greedy fashion, i.e., in every iteration all analyses that still need to be
    updated and that are ready are executed. The loop ends when no analysis needs an
    update anymore, or when the remaining analyses cannot be executed because of
    unresolved dependencies (in which case a warning is logged).

    Runtime information is recorded in self.run_info before and after the execution.
    """
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty")
        return

    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow")
    log_helper.info(__name__, "Adding all dependencies")
    self.add_analysis_dependencies()

    # Record the runtime information
    log_helper.debug(__name__, "Recording runtime information")
    self.run_info.clear()
    self.run_info.record_preexecute()

    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not be run yet)
    log_helper.debug(__name__, "Running the analysis workflow")
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                analysis.execute()
        # Check if there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.")
            break
        if num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.")
            # Nothing can make progress anymore. Without leaving the loop here we
            # would repeat the same checks (and this warning) forever.
            break
        iterations += 1

    log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

    # Record the runtime information after we are done with the workflow
    self.run_info.record_postexecute()
    self.run_info.gather()
def add_parameter(self,
                  name,
                  help,
                  dtype=unicode,
                  required=False,
                  default=None,
                  choices=None,
                  data=None,
                  group=None):
    """
    Register a new parameter. This function is typically used in the constructor
    of a derived analysis to specify the parameters of the analysis.

    :param name: The name of the parameter
    :param help: Help string describing the parameter
    :param dtype: Optional type. Default is string.
    :param required: Boolean indicating whether the parameter is required (True) or optional
        (False). Default False.
    :param default: Optional default value for the parameter. Default None.
    :param choices: Optional list of choices with allowed data values. Default None, indicating
        no choices set.
    :param data: The data assigned to the parameter. None by default.
    :param group: Optional group string used to organize parameters. Default None, indicating
        that parameters are automatically organized by driver class (e.g. in required and
        optional parameters)

    :raises: ValueError is raised if the parameter with the given name already exists.
    """
    log_helper.debug(__name__, "Add parameter " + str(name))
    existing_parameter = self.get_parameter_data_by_name(name)
    if existing_parameter is not None:
        raise ValueError('A parameter with the name ' + unicode(name) + " already exists.")
    parameter_settings = dict(name=name,
                              help=help,
                              dtype=dtype,
                              required=required,
                              default=default,
                              choices=choices,
                              data=data,
                              group=group)
    self.parameters.append(parameter_data(**parameter_settings))
def execute_analysis(self):
    """
    Execute the wrapped analysis function with the current parameter data.

    All parameters that have data assigned are passed to the function as
    keyword arguments, except the reserved parameters
    ``__analysis_function``, ``profile_time_and_usage`` and ``profile_memory``.
    Parameters given as ``dependency_dict`` are resolved via ``get_data()``.

    After the call, ``self.data_names`` is updated to match the result:

    * tuple result: names already defined are kept and missing names are
      appended as ``DEFAULT_OUTPUT_PREFIX + <index>``
    * ``None`` result: ``data_names`` is set to an empty list
    * any other result: a single default name is generated if none is defined

    :return: The value returned by the analysis function.

    :raises: NotImplementedError is raised if no analysis function is set.
    """
    if self['__analysis_function'] is None:
        raise NotImplementedError("We cannot run this analysis. Analysis_generic cannot run " +
                                  "an analysis unless an analysis function is set.")

    log_helper.debug(__name__, "Compiling the input dict for the analysis function.")
    reserved_names = ['__analysis_function', 'profile_time_and_usage', 'profile_memory']
    input_dict = {}
    for arg in self.parameters:
        if arg['data'] is not None and arg['name'] not in reserved_names:
            if isinstance(arg['data'], dependency_dict):
                input_dict[arg['name']] = arg['data'].get_data()
            else:
                input_dict[arg['name']] = arg['data']

    log_helper.debug(__name__, "Unpickel the analysis function")
    # Convert to string as we stored the pickle string as uint8 array to avoid problems
    # with HDF5, NULL, and special chars
    analysis_function = self['__analysis_function'].tostring()
    # NOTE(security): unpickling executes arbitrary code. Only run analyses
    # that were restored from trusted sources.
    analysis_function = pickle.loads(analysis_function)

    log_helper.debug(__name__, "Executing the analysis function")
    result = analysis_function(**input_dict)

    log_helper.debug(__name__, "Creating output data names and returning results")
    if isinstance(result, tuple):
        num_named = len(self.data_names)
        if num_named < len(result):
            # Keep the names we already have and only generate names for the
            # outputs that are still unnamed, so that there is one name per output
            generated_names = [(self.DEFAULT_OUTPUT_PREFIX + str(i)) for i in range(num_named, len(result))]
            self.data_names = list(self.data_names) + generated_names
    elif result is None:
        self.data_names = []
    elif len(self.data_names) < 1:
        self.data_names = [self.DEFAULT_OUTPUT_PREFIX + '0']
    return result
def main(self):
    """
    Execute the analysis workflow.

    If ``self['synchronize']`` is set, an MPI barrier on ``self.mpi_comm`` is
    performed first. The workflow is then executed greedily: on every pass,
    each analysis that still needs to be updated (``update_analysis``) and for
    which ``check_ready_to_execute()`` reports nothing outstanding is executed.
    If ``self['reduce_memory_usage']`` is set, ``clear_and_restore()`` is called
    on an analysis right after it was executed.

    Passes are repeated until no analysis needs to be updated anymore. If
    analyses remain but none of them is ready, a warning is logged and the
    loop is terminated, because repeating the pass could not make progress.
    If the workflow is empty, the function returns without executing anything.
    """
    # Do the optional MPI barrier
    if self['synchronize']:
        mpi_helper.barrier(comm=self.mpi_comm)

    # Check if we have anything to do at all
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
        return

    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
    log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
    self.add_analysis_dependencies()

    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                 root=self.mpi_root, comm=self.mpi_comm)
                analysis.execute()
                if self['reduce_memory_usage']:
                    analysis.clear_and_restore()

        # Check if there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1

        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
            break
        if num_tasks_ready == 0:
            # Tasks remain but none can run. Without leaving the loop here we
            # would spin forever, since nothing changes between passes.
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.",
                               root=self.mpi_root, comm=self.mpi_comm)
            break
        iterations += 1

    log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
def __init__(self, basename, requires_slicing=True, resolution=5):
    """
    Open an mzML file for data reading.

    :param basename: The name of the mzML file. If basename is a directory, then the first mzML
        file found in the directory will be used instead.
    :type basename: string

    :param requires_slicing: Should the complete data be read into memory
        (this makes slicing easier). (default is True)
    :type requires_slicing: bool

    :param resolution: For profile data only, the minimum m/z spacing to use for creating the
        "full" reprofiled data cube
    :type resolution: float

    :raises: ValueError is raised if basename is a directory without a valid mzML file.
    """
    # Resolve a directory to the first file reported for it
    if os.path.isdir(basename):
        candidate_files = self.get_files_from_dir(basename)
        if len(candidate_files) == 0:
            raise ValueError("No valid mzML file found in the given directory.")
        basename = candidate_files[0]

    # Let the base class store the basename and the slicing setting
    super(xmassmzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing)
    self.resolution = resolution
    self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?

    # Number of scans and their coordinates
    self.num_scans = self.__compute_num_scans(filename=self.basename)
    log_helper.info(__name__, 'Read %s scans from mzML file.' % self.num_scans)
    log_helper.debug(__name__, 'Compute coordinates')
    self.coordinates = self.__compute_coordinates(filename=self.basename, num_scans=self.num_scans)

    # Spatial layout: unique positions per axis and the smallest spacing between them
    self.x_pos = np.unique(self.coordinates[:, 0])
    self.y_pos = np.unique(self.coordinates[:, 1])
    smallest_x_step = min(np.diff(self.x_pos))
    smallest_y_step = min(np.diff(self.y_pos))
    self.step_size = min([smallest_x_step, smallest_y_step])

    # The m/z axis
    log_helper.debug(__name__, 'Compute mz axes')
    self.mz = self.__compute_mz_axis(filename=self.basename)
    log_helper.debug(__name__, 'mz axes computed')

    # Shape of the data cube: (#x positions, #y positions, #m/z values)
    log_helper.debug(__name__, 'Compute shape')
    num_x = self.x_pos.shape[0]
    num_y = self.y_pos.shape[0]
    self.shape = (num_x, num_y, len(self.mz))

    # Load the full data if slicing is required
    log_helper.debug(__name__, 'read all')
    if requires_slicing:
        self.data = self.__read_all()
    log_helper.debug(__name__, 'Finished with init')
def record_postexecute(self, execution_time=None):
    """
    Record runtime information after the task we want to track is completed,
    e.g., the `execute_analysis(...)` function of a standard analysis.

    The function records the end time and execution time, the memory info
    reported by psutil (if available), and the collected time-and-usage and
    memory profiling data (if the corresponding profilers are set).

    The function may be overwritten in child classes to add recording of
    additional runtime information. When overwriting the function we should
    call super(...,self).record_postexecute(execution_time) in the custom
    version to ensure that the execution and end_time are properly recorded.

    :param execution_time: The total time it took to execute the analysis. May be None, in which
        case the function will attempt to compute the execution time based on the start_time
        (if available) and the current time. If neither is available, the execution time
        is recorded as None.
    """
    log_helper.debug(__name__, 'Recording post-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
    # Finalize recording of post execution provenance
    self['end_time'] = unicode(datetime.datetime.now())
    if execution_time is not None:
        self['execution_time'] = unicode(execution_time)
    elif 'start_time' in self:
        start_time = run_info_dict.string_to_time(self['start_time'])
        stop_time = run_info_dict.string_to_time(self['end_time'])
        # TODO: This only gives execution time in full seconds right now
        self['execution_time'] = unicode(stop_time - start_time)
    else:
        self['execution_time'] = None

    # Attempt to record psutil data. This is best-effort: failures only result in a warning.
    try:
        import psutil
        process = psutil.Process()
        self['memory_info_after'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit are not swallowed here
        warnings.warn("Recording of psutil-based runtime information failed: " + str(sys.exc_info()))

    # Record the time and use profiling data if possible
    if self.__time_and_use_profiler is not None:
        self.__time_and_use_profiler.disable()
        self.__time_and_use_profiler.create_stats()
        self['profile'] = unicode(self.__time_and_use_profiler.stats)
        # Save the summary statistics for the profiling data
        stats_io = StringIO.StringIO()
        profiler_stats = pstats.Stats(self.__time_and_use_profiler, stream=stats_io).sort_stats('cumulative')
        profiler_stats.print_stats()
        self['profile_stats'] = stats_io.getvalue()

    # Record the memory profiling data if possible
    if self.__memory_profiler is not None and self.get_profile_memory():
        log_helper.debug(__name__, 'Recording memory profiling data', root=self.mpi_root, comm=self.mpi_comm)
        mem_stats_io = StringIO.StringIO()
        memory_profiler.show_results(self.__memory_profiler, stream=mem_stats_io)
        self['profile_mem'] = unicode(self.__memory_profiler.code_map)
        self['profile_mem_stats'] = mem_stats_io.getvalue()
def record_preexecute(self):
    """
    Record basic runtime information in this dict before the execution is started.

    Function used to record runtime information prior to executing the process
    we want to track, e.g., the `execute_analysis(...)` of a standard analysis.
    Recorded are: platform information, the svn version, numpy version
    information, psutil-based process information, and the start time.
    If time and usage profiling is requested, the profiler is enabled last.

    Recording of the environment information is best-effort. Items that cannot
    be determined are skipped with a warning, and each platform item is
    recorded independently so that one failing query does not prevent the
    remaining ones from being recorded.

    The function may be overwritten in child classes to add recording of
    additional runtime information. All runtime data should be recorded in the
    main dict (i.e, self). Other data should be stored in separate variables
    that we may add to the object.

    When overwriting the function we should typically call
    super(...,self).record_preexecute() last in the custom version to ensure
    that the start_time is properly recorded right before the execution of the analysis.
    """
    log_helper.debug(__name__, 'Recording pre-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)

    # Record basic runtime environment information using the platform module.
    # Each key is the name of the platform function used to compute its value.
    platform_fields = ('architecture', 'java_ver', 'libc_ver', 'linux_distribution', 'mac_ver',
                       'machine', 'node', 'platform', 'processor', 'python_branch',
                       'python_build', 'python_compiler', 'python_implementation',
                       'python_revision', 'python_version', 'release', 'system',
                       'uname', 'version', 'win32_ver')
    for field_name in platform_fields:
        try:
            self[field_name] = unicode(getattr(platform, field_name)())
        except Exception:
            warnings.warn("WARNING: Recording of platform provenance failed: " + str(sys.exc_info()))

    # Attempt to record the svn version information
    try:
        import subprocess
        self['svn_ver'] = subprocess.check_output('svnversion').rstrip('\n')
    except ImportError:
        log_helper.warning(__name__, 'Recording of svn version not possible. subprocess not installed',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        warnings.warn("Recording of svn version information failed: " + str(sys.exc_info()))

    # Attempt to record software library version
    try:
        import numpy as np
        self['numpy_version_full_version'] = unicode(np.version.full_version)
        self['numpy_version_release'] = unicode(np.version.release)
        self['numpy_version_git_revision'] = unicode(np.version.git_revision)
    except ImportError:
        log_helper.warning(__name__, 'Recording of numpy version not possible.',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        # Keep this best-effort like the other sections instead of aborting the analysis
        warnings.warn("Recording of numpy version information failed: " + str(sys.exc_info()))

    # Attempt to record psutil data
    try:
        import psutil
        self['logical_cpu_count'] = unicode(psutil.cpu_count())
        self['cpu_count'] = unicode(psutil.cpu_count(logical=False))
        process = psutil.Process()
        self['open_files'] = unicode(process.open_files())
        self['memory_info_before'] = unicode(process.memory_info())
    except ImportError:
        log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                           root=self.mpi_root, comm=self.mpi_comm)
    except Exception:
        warnings.warn("Recording of psutil-based runtime information failed: " + str(sys.exc_info()))

    # Record the start time for the analysis
    self['start_time'] = unicode(datetime.datetime.now())

    # Enable time and usage profiling if requested
    if self.__profile_time_and_usage:
        self.__time_and_use_profiler = Profile()
        self.__time_and_use_profiler.enable()
def from_function(cls, analysis_function, output_names=None, parameter_specs=None, name_key="undefined"):
    """
    Build a generic analysis object that wraps the given analysis function.

    One analysis parameter is created for every positional argument of the
    function. Default values are taken from the function signature and the
    dtype of a parameter is guessed from its default value, unless a
    user-supplied parameter specification overrides these settings. The
    pickled function itself is stored in the reserved parameter
    ``__analysis_function``.

    This functionality is useful to ease quick scripting on analyses but
    should not be used in production.

    NOTE: __analysis_function is a reserved parameter name used to store the analysis function and may
    not be used as an input parameter for the analysis function.

    :param analysis_function: The analysis function to be wrapped for provenance tracking and storage
    :param output_names: Optionally, define a list of the names of the outputs
    :param parameter_specs: Optional list of dicts or omsi.datastructures.analysis_data.parameter_data
        objects with additional information about the parameters of the function.
    :param name_key: The name for the analysis, i.e., the analysis identifier

    :return: A new generic analysis object

    :raises: ValueError is raised if parameter_specs contains an entry that is neither a dict
        nor a parameter_data object.
    """
    log_helper.debug(__name__, "Creating generic analysis from function")
    ana_dtypes = data_dtypes.get_dtypes()

    def _lookup_spec(param_name):
        # Return the last user specification matching param_name (None if there is none).
        # All specifications are visited so that invalid entries are always reported.
        matching_spec = None
        if parameter_specs is not None:
            for spec in parameter_specs:
                if not (isinstance(spec, dict) or isinstance(spec, parameter_data)):
                    raise ValueError("Invalid parameter specification. Spec is not a dict or parameter_data object")
                if spec['name'] == param_name:
                    matching_spec = spec
        return matching_spec

    def _guess_dtype(value):
        # Derive the dtype of a parameter from its default value (None if unknown)
        if value is None:
            return None
        if isinstance(value, list) or isinstance(value, np.ndarray):
            return ana_dtypes['ndarray']
        if isinstance(value, bool):
            return ana_dtypes['bool']
        if isinstance(value, basestring):
            return str
        for _, candidate in ana_dtypes.iteritems():
            try:
                if isinstance(value, candidate):
                    return candidate
            except:
                pass
        return None

    generic_analysis = cls(name_key=name_key)
    function_code = analysis_function.__code__
    generic_analysis.real_analysis_type = function_code.co_name

    # Names of the function arguments
    num_args = function_code.co_argcount
    arg_names = function_code.co_varnames[0:num_args]

    # Default values of the function arguments. These belong to the trailing arguments.
    default_values = ()
    if hasattr(analysis_function, 'func_defaults'):
        if analysis_function.func_defaults is not None:
            default_values = analysis_function.func_defaults
    first_default_index = num_args - len(default_values)

    # Create one analysis parameter per function argument
    for arg_index, arg_name in enumerate(arg_names):
        has_default = arg_index >= first_default_index
        if has_default:
            default = default_values[arg_index - first_default_index]
        else:
            default = None

        param_spec = _lookup_spec(arg_name)
        dtype = _guess_dtype(default)

        if param_spec is None:
            generic_analysis.add_parameter(name=arg_name,
                                           help=' ',
                                           dtype=dtype,
                                           default=default)
        else:
            # Settings from the user specification take precedence over the derived ones
            generic_analysis.add_parameter(
                name=arg_name,
                help=param_spec['help'] if 'help' in param_spec else ' ',
                dtype=param_spec['dtype'] if 'dtype' in param_spec else dtype,
                required=param_spec['required'] if 'required' in param_spec else (not has_default),
                default=param_spec['default'] if 'default' in param_spec else default,
                choices=param_spec['choices'] if 'choices' in param_spec else None,
                group=param_spec['group'] if 'group' in param_spec else None,
                data=param_spec['data'] if 'data' in param_spec else None)

    # Reserve the internal parameter used to store the analysis function
    generic_analysis.add_parameter(name='__analysis_function',
                                   help='The analysis function we want to execute',
                                   dtype=ana_dtypes['ndarray'])

    # Assign the names of the outputs
    if output_names is not None:
        generic_analysis.data_names = output_names

    # Pickle the analysis function and store it as an array
    pickled_function = cloudpickle.dumps(analysis_function)
    generic_analysis['__analysis_function'] = np.fromstring(pickled_function, cls.PICKLE_DTYPE)

    return generic_analysis
def main(self):
    """
    Default main function for running a workflow from the command line.

    The function parses the command line arguments, enables profiling if
    requested, executes the workflow, and finally, on the MPI root rank,
    prints the profiling and timing information and saves the analyses
    to the output target (if one is set).

    If parsing of the arguments, enabling of profiling, or the execution of
    the workflow fails, the output target is removed and the error is raised again.

    :raises: ValueError is raised in case that no workflow executor is available after
        parsing the command line arguments.
    """
    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()

    # Parse the command line arguments to determine the command line driver settings
    try:
        self.parse_cl_arguments()
    except:
        self.remove_output_target()
        raise

    # Without a workflow executor there is nothing we can run
    if self.workflow_executor is None:
        self.remove_output_target()
        log_helper.error(__name__, 'Missing --script parameter or worfklow_executor object')
        raise ValueError('Workflow not initalized')

    # Add and parse the command line arguments specific to the workflow
    try:
        self.add_and_parse_workflow_arguments()
    except:
        self.remove_output_target()
        raise

    # Print the analysis settings
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()

    # Enable profiling as requested. Missing profiling packages only result in a warning.
    try:
        if self.profile_analyses:
            try:
                self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(self.profile_analyses)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of time and usage not available due to missing packages.")
                log_helper.warning(__name__, e.message)
        if self.profile_analyses_mem:
            try:
                self.workflow_executor.analysis_tasks.enable_memory_profiling(self.profile_analyses_mem)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of memory usage not available due to missing packages")
                log_helper.warning(__name__, e.message)
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Execute the workflow
    try:
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                         root=self.mpi_root, comm=self.mpi_comm)
        self.workflow_executor.execute()
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Everything below is done on the MPI root rank only. NOTE: When running in serial
    # the rank check evaluates to True because our mpi_root is 0 and the mpi_helper
    # returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() != self.mpi_root:
        return

    # Print usage profiles if available
    try:
        self.print_time_and_usage_profiles()
    except:
        log_helper.error(__name__, "An error occured while trying to print time and usage profiles",
                         root=self.mpi_root, comm=self.mpi_comm)

    # Print memory profile data if available
    try:
        self.print_memory_profiles()
    except:
        log_helper.error(__name__, "An error occured while trying to print memory profiles",
                         root=self.mpi_root, comm=self.mpi_comm)

    # Print the time it took to run the analysis
    run_info = self.workflow_executor.run_info
    if isinstance(run_info['execution_time'], list):
        # Parallel case: We have timing data from all cores
        log_helper.info(__name__, "Time in seconds for each analysis process: " + str(run_info['execution_time']),
                        root=self.mpi_root, comm=self.mpi_comm)
        log_helper.info(__name__, "Time when each of the processes started: " + str(run_info['start_time']),
                        root=self.mpi_root, comm=self.mpi_comm)
        log_helper.info(__name__, "Time when each of the processes finished: " + str(run_info['end_time']),
                        root=self.mpi_root, comm=self.mpi_comm)
        # Summarize the timings as the maximum with the minimum and mean in brackets
        all_exec_times = np.asarray(run_info['execution_time'], dtype=float)
        slowest = str(all_exec_times.max())
        fastest = str(all_exec_times.min())
        average = str(all_exec_times.mean())
        exec_time_string = slowest + " s " + " ( min = " + fastest + " , mean = " + average + " )"
    else:
        # Serial case: We only have a single time to worry about
        exec_time_string = str(run_info['execution_time']) + " s"
    log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                    root=self.mpi_root, comm=self.mpi_comm)

    # Save the analyses to file
    if self.output_target is not None:
        from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
        for analysis in self.workflow_executor.analysis_tasks:
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=analysis)
omsi_format_common, \ omsi_format_analysis, \ omsi_format_dependencies from omsi.dataformat.omsi_file.dependencies import omsi_dependencies_manager from omsi.dataformat.omsi_file.common import omsi_file_common, omsi_file_object_manager from omsi.datastructures.run_info_data import run_info_dict import omsi.shared.mpi_helper as mpi_helper from omsi.shared.log import log_helper #try: # import cloudpickle # Use the version of cloud-pickle installed on the system # log_helper.debug(__name__, "Using system cloudpickle module") #except ImportError: # try: import omsi.shared.third_party.cloudpickle as cloudpickle log_helper.debug(__name__, "Using fallback cloudpickle version") # except ImportError: # log_helper.warning(__name__, "cloudpickle could not be imported. Using standard pickle instead. " + # " Some features may not be available.") # import pickle as cloudpickle import pickle # TODO create_analysis_static(...) and other create functions need to handle the case when a file is opened with the MPI I/O backend. Currently we assume a serial write from root class omsi_analysis_manager(omsi_file_object_manager): """ Analysis manager helper class used to define common functionality needed for analysis-related data. Usually, a class that defines a format that contains an omsi_file_analysis object will inherit from this class (in addition to omsi_file_common) to acquire the common features.
def __init__(self, basename, requires_slicing=True, resolution=5):
    """
    Open an mzML file for data reading.

    :param basename: The name of the mzML file. If basename is a directory, then the first mzML
        file found in the directory will be used instead.
    :type basename: string

    :param requires_slicing: Should the complete data be read into memory
        (this makes slicing easier). (default is True)
    :type requires_slicing: bool

    :param resolution: For profile data only, the minimum m/z spacing to use for creating the
        "full" reprofiled data cube
    :type resolution: float

    :raises: ValueError is raised if basename is a directory without a valid mzML file.
    """
    # Resolve a directory to the first file reported for it
    if os.path.isdir(basename):
        candidate_files = self.get_files_from_dir(basename)
        if len(candidate_files) == 0:
            raise ValueError("No valid mzML file found in the given directory.")
        basename = candidate_files[0]

    # Let the base class store the basename and the slicing setting
    super(xmassmzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing)
    self.resolution = resolution
    self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?

    # Number of scans and their coordinates
    self.num_scans = self.__compute_num_scans(filename=self.basename)
    log_helper.info(__name__, 'Read %s scans from mzML file.' % self.num_scans)
    log_helper.debug(__name__, 'Compute coordinates')
    self.coordinates = self.__compute_coordinates(filename=self.basename, num_scans=self.num_scans)

    # Spatial layout: unique positions per axis and the smallest spacing between them
    self.x_pos = np.unique(self.coordinates[:, 0])
    self.y_pos = np.unique(self.coordinates[:, 1])
    smallest_x_step = min(np.diff(self.x_pos))
    smallest_y_step = min(np.diff(self.y_pos))
    self.step_size = min([smallest_x_step, smallest_y_step])

    # The m/z axis
    log_helper.debug(__name__, 'Compute mz axes')
    self.mz = self.__compute_mz_axis(filename=self.basename)
    log_helper.debug(__name__, 'mz axes computed')

    # Shape of the data cube: (#x positions, #y positions, #m/z values)
    log_helper.debug(__name__, 'Compute shape')
    num_x = self.x_pos.shape[0]
    num_y = self.y_pos.shape[0]
    self.shape = (num_x, num_y, len(self.mz))

    # Load the full data if slicing is required
    log_helper.debug(__name__, 'read all')
    if requires_slicing:
        self.data = self.__read_all()
    log_helper.debug(__name__, 'Finished with init')
def __run_dynamic(self):
    """
    Run the task function using dynamic task scheduling.

    The root rank enumerates the sub-blocks of the data along the split axes
    and hands them out one at a time to whichever MPI rank requests work.
    The root rank itself does not execute the task function. Once all blocks
    are assigned, every other rank receives a ``(None, None)`` message
    telling it to stop.

    If fewer than 2 MPI ranks are available, static scheduling is used instead.

    :return: Tuple with the following elements:

        1) List with the results from the local execution of the task_function. Each
           entry is the result from one return of the task_function.
        2) List of block selections. Each entry is the tuple used to select the
           sub-block of the data that was processed.
    """
    try:
        from omsi.shared.log import log_helper
    except ImportError:
        from pactolus.third_party.log import log_helper
    import time

    rank = get_rank(comm=self.comm)
    size = get_size(comm=self.comm)

    # Dynamic scheduling needs one scheduling rank plus at least one rank that does the work
    if size < 2:
        warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.')
        return self.__run_static_1D()

    self.result = []
    self.blocks = []
    self.block_times = []
    rank_msg_tag = self.MPI_MESSAGE_TAGS['RANK_MSG']
    block_msg_tag = self.MPI_MESSAGE_TAGS['BLOCK_MSG']

    if rank == self.root:
        # We are the controlling rank: Determine the number of sub-blocks
        data_shape = self.main_data.shape
        axes_shapes = np.asarray(data_shape)[self.split_axes]
        total_num_subblocks = np.prod(axes_shapes)
        if total_num_subblocks < size and rank == self.root:
            warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")

        # Select everything along the axes we do not split and one index at a time along the split axes
        per_axis_selections = [[slice(None)] for _ in range(len(data_shape))]
        for axis_index in self.split_axes:
            per_axis_selections[axis_index] = range(data_shape[axis_index])
        block_tuples = itertools.product(*per_axis_selections)

        # Hand out the blocks to the ranks as they request them
        log_helper.info(__name__, "PROCESSING DATA BLOCKS")
        scheduling_start = time.time()
        for block_index, block_selection in enumerate(block_tuples):
            request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=rank_msg_tag)
            self.comm.send((block_index, block_selection), dest=request_rank, tag=block_msg_tag)
            num_assigned = block_index + 1
            if (num_assigned % 100) == 0:
                log_helper.debug(__name__, str((num_assigned, total_num_subblocks, request_rank)))
        run_time = time.time() - scheduling_start
        log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time))

        # Tell every other rank that there is no more work
        finalize_start = time.time()
        log_helper.info(__name__, "FINALIZING")
        rank_is_done = np.zeros(size, 'bool')
        rank_is_done[self.root] = True
        while not np.all(rank_is_done):
            request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=rank_msg_tag)
            self.comm.send((None, None), dest=request_rank, tag=block_msg_tag)
            rank_is_done[request_rank] = True
        run_time = time.time() - finalize_start
        log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time))
    else:
        # We are a rank that has to run tasks: Request blocks until we are told to stop
        while True:
            block_start = time.time()
            self.comm.send(rank, dest=self.root, tag=rank_msg_tag)
            block_index, block_selection = self.comm.recv(source=self.root, tag=block_msg_tag)
            if block_index is None:
                break
            # Execute the task_function on the given data block
            task_params = self.task_function_params
            task_params[self.main_data_param_name] = self.main_data[block_selection]
            self.result.append(self.task_function(**task_params))
            self.blocks.append(block_selection)
            # Record the time spent on requesting and processing the block
            self.block_times.append(time.time() - block_start)

    return self.result, self.blocks
Generic analysis class used to represent analyses of unknown type, e.g., when loading a custom user-defined analysis from file for which the indicate class may not be available with the local installation. In this case we want to at least be able to load and investigate the data. """ import pickle from omsi.analysis.base import analysis_base from omsi.datastructures.analysis_data import data_dtypes from omsi.datastructures.dependency_data import dependency_dict from omsi.datastructures.analysis_data import parameter_data from omsi.shared.log import log_helper try: import cloudpickle # Use the version of cloud-pickle installed on the system log_helper.debug(__name__, "Using system cloudpickle module") except ImportError: try: import omsi.shared.third_party.cloudpickle as cloudpickle log_helper.debug(__name__, "Using fallback cloudpickle version") except ImportError: log_helper.warning(__name__, "cloudpickle could not be imported. Using standard pickle instead. " + " Some features may not be available.") import pickle as cloudpickle import numpy as np def bastet_analysis(output_names=None, parameter_specs=None, name_key="undefined"): """ Decorator used to wrap a function and replace it with an analysis_generic object that behaves like a function but adds the ability for saving the
def clear_parameter_data(self):
    """
    Clear the data of all parameters.

    Calls ``clear_data()`` on every entry of ``self.parameters``. The
    parameters themselves remain in the list.
    """
    log_helper.debug(__name__, "Clearing parameter data")
    for parameter in self.parameters:
        parameter.clear_data()
def __run_dynamic(self):
    """
    Run the task function using dynamic task scheduling.

    The root rank enumerates the sub-blocks of the data along the split axes
    and hands them out one at a time to whichever MPI rank requests work.
    The root rank itself does not execute the task function. Once all blocks
    are assigned, every other rank receives a ``(None, None)`` message
    telling it to stop.

    If fewer than 2 MPI ranks are available, static scheduling is used instead.

    :return: Tuple with the following elements:

        1) List with the results from the local execution of the task_function. Each
           entry is the result from one return of the task_function.
        2) List of block selections. Each entry is the tuple used to select the
           sub-block of the data that was processed.
    """
    from omsi.shared.log import log_helper
    import time

    rank = get_rank(comm=self.comm)
    size = get_size(comm=self.comm)

    # Dynamic scheduling needs one scheduling rank plus at least one rank that does the work
    if size < 2:
        warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.')
        return self.__run_static_1D()

    self.result = []
    self.blocks = []
    self.block_times = []
    rank_msg_tag = self.MPI_MESSAGE_TAGS['RANK_MSG']
    block_msg_tag = self.MPI_MESSAGE_TAGS['BLOCK_MSG']

    if rank == self.root:
        # We are the controlling rank: Determine the number of sub-blocks
        data_shape = self.main_data.shape
        axes_shapes = np.asarray(data_shape)[self.split_axes]
        total_num_subblocks = np.prod(axes_shapes)
        if total_num_subblocks < size and rank == self.root:
            warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")

        # Select everything along the axes we do not split and one index at a time along the split axes
        per_axis_selections = [[slice(None)] for _ in range(len(data_shape))]
        for axis_index in self.split_axes:
            per_axis_selections[axis_index] = range(data_shape[axis_index])
        block_tuples = itertools.product(*per_axis_selections)

        # Hand out the blocks to the ranks as they request them
        log_helper.info(__name__, "PROCESSING DATA BLOCKS")
        scheduling_start = time.time()
        for block_index, block_selection in enumerate(block_tuples):
            request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=rank_msg_tag)
            self.comm.send((block_index, block_selection), dest=request_rank, tag=block_msg_tag)
            num_assigned = block_index + 1
            if (num_assigned % 100) == 0:
                log_helper.debug(__name__, str((num_assigned, total_num_subblocks, request_rank)))
        run_time = time.time() - scheduling_start
        log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time))

        # Tell every other rank that there is no more work
        finalize_start = time.time()
        log_helper.info(__name__, "FINALIZING")
        rank_is_done = np.zeros(size, 'bool')
        rank_is_done[self.root] = True
        while not np.all(rank_is_done):
            request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=rank_msg_tag)
            self.comm.send((None, None), dest=request_rank, tag=block_msg_tag)
            rank_is_done[request_rank] = True
        run_time = time.time() - finalize_start
        log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time))
    else:
        # We are a rank that has to run tasks: Request blocks until we are told to stop
        while True:
            block_start = time.time()
            self.comm.send(rank, dest=self.root, tag=rank_msg_tag)
            block_index, block_selection = self.comm.recv(source=self.root, tag=block_msg_tag)
            if block_index is None:
                break
            # Execute the task_function on the given data block
            task_params = self.task_function_params
            task_params[self.main_data_param_name] = self.main_data[block_selection]
            self.result.append(self.task_function(**task_params))
            self.blocks.append(block_selection)
            # Record the time spent on requesting and processing the block
            self.block_times.append(time.time() - block_start)

    return self.result, self.blocks
def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
    """
    Score the spectra stored in ``self['fpl_data']`` against the Pactolus fragmentation trees.

    When called without ``spectrum_indexes`` on more than one MPI rank (and with more than
    one spectrum), the function sets up ``mpi_helper.parallel_over_axes`` with itself as the
    task function and compiles the per-task outputs. Otherwise the requested spectra are
    scored one after another on the calling rank.

    :param spectrum_indexes: Integer index, or list/array of integer indices, of the subset of
        spectra that should be processed by this MPI task. If spectrum_indexes is set, then the
        given subblock will be processed in SERIAL instead of processing self['fpl_data'] in
        PARALLEL (if available). This parameter is strictly optional and intended for internal
        use only to facilitate the efficient parallel implementation.
    :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.
        If None, the table is built from ``self['trees']`` (a ``.npy`` file, a text file listing
        one tree file per line, or a directory).

    :returns: Tuple of numpy arrays
        ``(pixel_index, score, id, name, mass, n_peaks, n_match)``:

        * 'pixel_index', int, 2D array of pixel indices describing for each row
          the (x,y) pixel location in the image
        * 'score', float, MIDAS score of row
        * 'id', str, database ID e.g. 'MetaCyC_7884'
        * 'name', str, database name, e.g. 'glycine'
        * 'mass', float, mass in Da of IDed compound
        * 'n_peaks', int, number of peaks in data
        * 'n_match', int, number of peaks in data matched

        If ``self['metabolite_database']`` is empty, no hit table is generated: 'score' then
        holds one row of raw scores per scored spectrum, 'pixel_index' holds the matching pixel
        locations, and the remaining arrays are empty.
    """
    log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']
    # Calculate the parent_mass. A value of -1 means: use the precursor m/z stored with the data
    precursor_mz = self['precursor_mz']
    if precursor_mz == -1:
        precursor_mz = fpl_data['precursor_mz'][:]
    # A scalar or single-element precursor applies to every spectrum (len() is undefined for scalars)
    shared_precursor = np.ndim(precursor_mz) == 0 or len(precursor_mz) == 1

    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    ms1_mass_tol = self['ms1_mass_tolerance']
    ms2_mass_tol = self['ms2_mass_tolerance']
    neutralizations = self['neutralizations']
    max_depth = self['max_depth']

    # Make the numpy array with the list of tree files and their MS1 masses
    if file_lookup_table is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
        trees = self['trees']
        if os.path.isfile(trees):
            if trees.endswith('.npy'):
                file_lookup_table = np.load(trees)
            else:
                # Use a context manager so the file is closed even if reading fails
                with open(trees, 'r') as in_treefile:
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
        elif os.path.isdir(trees):
            file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=trees)
        # NOTE(review): if self['trees'] is neither a file nor a directory, file_lookup_table
        # stays None and is handed to the scoring functions as-is -- confirm this is intended.

    # Define the common pactolus parameters
    pactolus_parameters = {'file_lookup_table': file_lookup_table,
                           'ms1_mass_tol': ms1_mass_tol,
                           'ms2_mass_tol': ms2_mass_tol,
                           'neutralizations': neutralizations,
                           'max_depth': max_depth}

    # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
    # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
    # where we can find the spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        enable_parallel = True
    else:
        # Accept a single index (Python int or numpy integer scalar) as well as lists/arrays.
        # An isinstance(..., int) check misses numpy scalars, which then break len() below.
        spectrum_indexes = np.atleast_1d(spectrum_indexes)
        enable_parallel = False

    #############################################################
    #   Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                          # Execute this function
                task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                main_data=spectrum_indexes,                                   # Process the spectra independently
                split_axes=split_axis,                                        # Split along axes
                main_data_param_name='spectrum_indexes',                      # data input param
                root=self.mpi_root,                                           # The root MPI task
                schedule=self['schedule'],                                    # Parallel scheduling scheme
                comm=self.mpi_comm)                                           # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Empty defaults, in return order: pixel_index, score, id, name, mass, n_peaks, n_match.
            # 'S100' is the same fixed-width byte-string dtype as the legacy 'a100' code.
            compiled = [np.zeros((0, 2), dtype='int'),
                        np.zeros((0, ), dtype='f4'),
                        np.zeros((0, ), dtype='S100'),
                        np.zeros((0, ), dtype='S100'),
                        np.zeros((0, ), dtype='f4'),
                        np.zeros((0, ), dtype='i4'),
                        np.zeros((0, ), dtype='i4')]
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                # We did not process any data on the root process when using dynamic scheduling
                # and we did not collect the data to the root either
                pass
            else:
                log_helper.debug(__name__, 'Compiling output')
                task_results = result[0]
                if len(task_results) > 0:
                    # Every task returns the same 7-tuple; concatenate each output across tasks
                    for column in range(len(compiled)):
                        column_data = [task_result[column] for task_result in task_results]
                        compiled[column] = np.concatenate(tuple(column_data), axis=0)
                log_helper.log_var(__name__, score=compiled[1])
            # Return the compiled output
            return tuple(compiled)

    #############################################################
    #   Serial processing of the current data block
    #############################################################
    log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
    rank_label = "rank : " + str(mpi_helper.get_rank())
    # hit_matrix[i] holds the scores for spectrum scored_indexes[i]. Empty spectra are skipped,
    # so results must be paired with the index that produced them rather than with the
    # position in spectrum_indexes.
    hit_matrix = []
    scored_indexes = []
    # Iterate through all the pixel we were asked to process in serial
    for spectrum_index in spectrum_indexes:
        # Determine the start and stop index for the m/z and intensity data of the current spectrum
        start = int(fpl_peak_arrayindex[spectrum_index, 2])
        if spectrum_index < (num_spectra - 1):
            stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2])
        else:
            # The last spectrum extends to the end of the peak arrays
            stop = int(fpl_peak_value.size)
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = rank_label + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            continue
        # Load the m/z and intensity values for the current spectrum
        current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]

        # Get the parent mass
        current_parent_mass = precursor_mz if shared_precursor else precursor_mz[spectrum_index]

        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                    ms1_mz=[current_parent_mass, ],
                                                                    params=pactolus_parameters)
        execution_time = time.time() - start_time
        time_str = rank_label + " : pixel_index : " + \
                   str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str((current_hits > 0).sum())
        # Written to stdout directly (not via log_helper) so that every rank reports its progress
        sys.stdout.write(time_str + "\n")
        sys.stdout.flush()

        # Save the hits for the current pixel
        hit_matrix.append(current_hits[0, :])
        scored_indexes.append(spectrum_index)

    # Index the results based on the given metabolite database
    score = []
    id_data = []
    name = []
    mass = []
    n_peaks = []
    n_match = []
    pixel_index = []
    if len(metabolite_database) > 0:  # We don't have an empty string
        for spectrum_index, spectrum_hits in zip(scored_indexes, hit_matrix):
            # np.where(condition) returns a tuple with one index array per dimension
            non_zero_scores = np.where(spectrum_hits > 0)[0]
            if non_zero_scores.size > 0:
                current_hit_table = np.asarray(
                    score_frag_dag.make_pactolus_hit_table(pactolus_results=spectrum_hits,
                                                           table_file=file_lookup_table,
                                                           original_db=metabolite_database))
                # One output row per non-zero score, all sharing the pixel of this spectrum
                current_pixel = fpl_peak_arrayindex[spectrum_index, 0:2]
                pixel_index.extend([current_pixel] * non_zero_scores.size)
                score.extend(current_hit_table['score'][non_zero_scores])
                id_data.extend(current_hit_table['id'][non_zero_scores])
                name.extend(current_hit_table['name'][non_zero_scores])
                mass.extend(current_hit_table['mass'][non_zero_scores])
                n_peaks.extend(current_hit_table['n_peaks'][non_zero_scores])
                n_match.extend(current_hit_table['n_match'][non_zero_scores])
    else:
        # No database given: return the raw score rows together with the pixels that were scored
        pixel_index = fpl_peak_arrayindex[np.asarray(scored_indexes, dtype='int'), 0:2]
        score = np.asarray(hit_matrix)

    # Return the hit_table and the index of the pixel each hit_table applies to
    sys.stdout.write(rank_label + " : scores " + str(score) + "\n")
    sys.stdout.flush()
    return np.asarray(pixel_index), \
        np.asarray(score), \
        np.asarray(id_data), \
        np.asarray(name), \
        np.asarray(mass), \
        np.asarray(n_peaks), \
        np.asarray(n_match)