def write_analysis_data(self, analysis_group=None):
    """
    This function is used to write the actual analysis data to file. If not implemented,
    then the omsi_file_analysis API's default behavior is used instead.

    :param analysis_group: The h5py.Group object where the analysis is stored.
    """
    # Check if a user attempts to do parallel I/O with collect being disabled
    if mpi_helper.get_size() > 1 and not self['collect']:
        # Check if any of the other ranks have data
        num_elements = self['peak_arrayindex'].shape[0] if len(self['peak_arrayindex'].shape) == 2 else 0
        result_sizes = mpi_helper.gather(num_elements, comm=self.mpi_comm, root=self.mpi_root)
        if mpi_helper.get_rank() == self.mpi_root:
            for element_size in result_sizes[1:]:
                if element_size > 0:
                    raise ValueError('Parallel I/O with collect parameter set to false not supported')
    raise NotImplementedError
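# --- Illustration only (not part of the class above) -----------------------------------
# A minimal sketch of the size check performed in write_analysis_data, written against
# mpi4py directly instead of omsi's mpi_helper (using mpi4py here is an assumption made
# purely for illustration; the analysis code itself only relies on mpi_helper). Every
# rank reports how many result rows it holds, and the root refuses the write if any
# non-root rank still holds data while 'collect' is disabled.
from mpi4py import MPI

def check_root_only_results(num_elements, comm=MPI.COMM_WORLD, root=0):
    sizes = comm.gather(num_elements, root=root)   # list of per-rank sizes on root, None elsewhere
    if comm.Get_rank() == root and any(s > 0 for s in sizes[1:]):
        raise ValueError('Parallel I/O with collect parameter set to false not supported')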
def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
    """
    Execute the local peak finder for the given msidata.

    :param spectrum_indexes: List of integer indices of the subset of spectra that should be
        processed by this MPI task. If spectrum_indexes is set, then the given subblock will be
        processed in SERIAL instead of processing self['fpl_data'] in PARALLEL (if available).
        This parameter is strictly optional and intended for internal use only to facilitate the
        efficient parallel implementation.
    :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

    :returns: A series of numpy arrays with the score data for each pixel and a 2D array of
        pixel indices describing for each spectrum the (x,y) pixel location in the image.

        ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']

        * 'pixel_index', int, 2D array of pixel indices describing for each spectrum \
          the (x,y) pixel location in the image
        * 'score', float, MIDAS score of row
        * 'id', str, database ID e.g. 'MetaCyC_7884'
        * 'name', str, database name, e.g. 'glycine'
        * 'mass', float, mass in Da of IDed compound
        * 'n_peaks', int, number of peaks in data
        * 'n_match', int, number of peaks in data matched
    """
    log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']
    # Calculate the parent_mass
    precursor_mz = self['precursor_mz']
    if precursor_mz == -1:
        precursor_mz = self['fpl_data']['precursor_mz'][:]
    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    ms1_mass_tol = self['ms1_mass_tolerance']
    ms2_mass_tol = self['ms2_mass_tolerance']
    neutralizations = self['neutralizations']
    max_depth = self['max_depth']

    # Make the numpy array with the list of tree files and their MS1 masses
    if file_lookup_table is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
        if os.path.isfile(self['trees']):
            if self['trees'].endswith('.npy'):
                file_lookup_table = np.load(self['trees'])
            else:
                in_treefile = open(self['trees'], 'r')
                tree_files = [line.rstrip('\n') for line in in_treefile]
                in_treefile.close()
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
        elif os.path.isdir(self['trees']):
            file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=self['trees'])

    # Define the common pactolus parameters
    pactolus_parameters = {'file_lookup_table': file_lookup_table,
                           'ms1_mass_tol': ms1_mass_tol,
                           'ms2_mass_tol': ms2_mass_tol,
                           'neutralizations': neutralizations,
                           'max_depth': max_depth}

    # Get the peak_arrayindex with [[x, y, array_offset], ...] values describing the index of the
    # pixel in (x,y) and the offset in the peak_mz and peak_value array where we can find the
    # spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        enable_parallel = True
    else:
        if isinstance(spectrum_indexes, int):
            spectrum_indexes = np.asarray([spectrum_indexes, ])
        enable_parallel = False

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                             # Execute this function
                task_function_params={'file_lookup_table': file_lookup_table},   # Reuse the file_lookup_table
                main_data=spectrum_indexes,                                      # Process the spectra independently
                split_axes=split_axis,                                           # Split along axes
                main_data_param_name='spectrum_indexes',                         # data input param
                root=self.mpi_root,                                              # The root MPI task
                schedule=self['schedule'],                                       # Parallel scheduling scheme
                comm=self.mpi_comm)                                              # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Compile the data from the parallel execution
            pixel_index = np.zeros((0, 2), dtype='int')
            score = np.zeros((0,), dtype='f4')
            id_data = np.zeros((0,), dtype='a100')
            name = np.zeros((0,), dtype='a100')
            mass = np.zeros((0,), dtype='f4')
            n_peaks = np.zeros((0,), dtype='i4')
            n_match = np.zeros((0,), dtype='i4')
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                # We did not process any data on the root process when using dynamic scheduling
                # and we did not collect the data to the root either
                pass
            # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
            #     temp_data = [ri[0] for rt in result[0] for ri in rt]
            #     if len(temp_data) > 0:
            #         hit_table = np.concatenate(tuple(temp_data), axis=-1)
            #     temp_data = [ri[1] for rt in result[0] for ri in rt]
            #     if len(temp_data) > 0:
            #         pixel_index = np.concatenate(tuple(temp_data), axis=0)  # axis=-1
            else:
                log_helper.debug(__name__, 'Compiling output')
                # Compile pixel_index
                temp_data = [ri[0] for ri in result[0]]
                if len(temp_data) > 0:
                    pixel_index = np.concatenate(tuple(temp_data), axis=0)
                # Compile scores
                temp_data = [ri[1] for ri in result[0]]
                if len(temp_data) > 0:
                    score = np.concatenate(tuple(temp_data), axis=0)
                # Compile id
                temp_data = [ri[2] for ri in result[0]]
                if len(temp_data) > 0:
                    id_data = np.concatenate(tuple(temp_data), axis=0)
                # Compile name
                temp_data = [ri[3] for ri in result[0]]
                if len(temp_data) > 0:
                    name = np.concatenate(tuple(temp_data), axis=0)
                # Compile mass
                temp_data = [ri[4] for ri in result[0]]
                if len(temp_data) > 0:
                    mass = np.concatenate(tuple(temp_data), axis=0)
                # Compile n_peaks
                temp_data = [ri[5] for ri in result[0]]
                if len(temp_data) > 0:
                    n_peaks = np.concatenate(tuple(temp_data), axis=0)
                # Compile n_match
                temp_data = [ri[6] for ri in result[0]]
                if len(temp_data) > 0:
                    n_match = np.concatenate(tuple(temp_data), axis=0)
                log_helper.log_var(__name__, score=score)

            # Return the compiled output
            return pixel_index, score, id_data, name, mass, n_peaks, n_match

    #############################################################
    # Serial processing of the current data block
    #############################################################
    log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
    # Initialize the output data structures
    # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
    # if len(pixel_index.shape) == 1:
    #     pixel_index = pixel_index[np.newaxis, :]
    hit_matrix = []

    # Iterate through all the pixels we were asked to process in serial
    for current_index, spectrum_index in enumerate(spectrum_indexes):
        # Determine the start and stop index for the m/z and intensity data of the current spectrum
        start = int(fpl_peak_arrayindex[spectrum_index, 2])
        stop = int(fpl_peak_arrayindex[(spectrum_index+1), 2]
                   if spectrum_index < (num_spectra-1)
                   else fpl_peak_value.size)
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            continue
        # Load the m/z and intensity values for the current spectrum
        current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]
        # Get the parent mass
        current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]
        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                    ms1_mz=[current_parent_mass, ],
                                                                    params=pactolus_parameters)
        end_time = time.time()
        execution_time = end_time - start_time
        time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
            str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str((current_hits > 0).sum())
        # log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
        # sys.stdout.flush()
        print time_str
        sys.stdout.flush()
        # Save the hits for the current pixel
        hit_matrix.append(current_hits[0, :])

    # Index the results based on the given metabolite database
    score = []
    id_data = []
    name = []
    mass = []
    n_peaks = []
    n_match = []
    pixel_index = []
    if len(metabolite_database) > 0:   # We don't have an empty string
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # np.where returns a tuple of index arrays; use the first (and only) one
            non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
            if non_zero_scores.size > 0:
                current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                    pactolus_results=hit_matrix[current_index],
                    table_file=file_lookup_table,
                    original_db=metabolite_database))
                for score_index in non_zero_scores:
                    pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                    score.append(current_hit_table['score'][score_index])
                    id_data.append(current_hit_table['id'][score_index])
                    name.append(current_hit_table['name'][score_index])
                    mass.append(current_hit_table['mass'][score_index])
                    n_peaks.append(current_hit_table['n_peaks'][score_index])
                    n_match.append(current_hit_table['n_match'][score_index])
    else:
        pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        score = np.asarray(hit_matrix)

    # Return the hit_table and the index of the pixel each hit_table applies to
    print "rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score)
    sys.stdout.flush()
    return np.asarray(pixel_index), \
        np.asarray(score), \
        np.asarray(id_data), \
        np.asarray(name), \
        np.asarray(mass), \
        np.asarray(n_peaks), \
        np.asarray(n_match)
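# --- Illustration only (hypothetical data, not part of the analysis class) -------------
# How the fpl peak_arrayindex used above encodes per-pixel spectra: each row is
# [x, y, offset], where offset is the start of that pixel's peaks in the flat
# peak_mz / peak_value arrays and the end is the next row's offset (or the array length
# for the last pixel), exactly as computed in the serial loop above.
import numpy as np

peak_mz = np.array([100.1, 150.2, 200.3, 110.0, 120.5])    # all peaks, flattened
peak_value = np.array([10.0, 5.0, 7.0, 3.0, 9.0])
peak_arrayindex = np.array([[0, 0, 0],                      # pixel (0, 0) starts at offset 0
                            [0, 1, 3]])                     # pixel (0, 1) starts at offset 3
num_spectra = peak_arrayindex.shape[0]
for i in range(num_spectra):
    start = int(peak_arrayindex[i, 2])
    stop = int(peak_arrayindex[i + 1, 2]) if i < (num_spectra - 1) else peak_mz.size
    print("pixel %s : mz %s : intensity %s" %
          (peak_arrayindex[i, 0:2], peak_mz[start:stop], peak_value[start:stop]))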
def __init__(self, file_params, max_depth, isotope_dict=None):
    """
    :param file_params: Dictionary with the file settings, expected to contain the keys
        'input_inchi_file', 'output_hdf5_file_base', 'output_error_log', and 'output_directory'.
    :param max_depth: Maximum depth to which the fragmentation trees should be grown.
    :param isotope_dict: Optional isotope dictionary. If None, it is retrieved via get_isotope_dict().
    """
    self.input_inchi_file = file_params['input_inchi_file']
    self.output_hdf5_file_base = file_params['output_hdf5_file_base']
    self.output_error_log = file_params['output_error_log']
    self.output_directory = file_params['output_directory']

    # Make the output directory if it does not exist
    if not os.path.isdir(self.output_directory):
        try:
            os.mkdir(self.output_directory)
        except OSError:
            # When executed in parallel it is possible that another rank already created the
            # directory in the meantime. We can safely ignore this error.
            if os.path.isdir(self.output_directory):
                pass
            else:
                raise

    # Get the isotope dictionary (if none was provided)
    if isotope_dict is None:
        self.isotope_dict = get_isotope_dict()
    else:
        self.isotope_dict = isotope_dict

    self.max_depth = max_depth

    # Make the list of InChIs
    inchi_list = []
    with open(self.input_inchi_file, 'r') as inchi_file:
        for line in inchi_file:
            inchi_list.append(line.strip())
    assert inchi_list
    self.inchi_list = inchi_list

    # Ensure any pre-existing output logs are overwritten
    with open(self.output_error_log, 'w') as _:
        pass

    # Execute in parallel if possible
    if mpi_helper.MPI_AVAILABLE and mpi_helper.get_size() > 1:
        scheduler = mpi_helper.parallel_over_axes(
            task_function=self.grow_tree_from_inchi,
            task_function_params={},
            main_data=np.unique(np.asarray(inchi_list)),   # FIXME Why are there duplicates in the inchi list
            split_axes=[0, ],
            main_data_param_name='inchi',
            root=0,
            schedule=mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'],
            comm=mpi_helper.get_comm_world())
        _ = scheduler.run()
    else:
        for inchi in inchi_list:
            self.grow_tree_from_inchi(inchi)
    return
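# --- Illustration only (standalone sketch, hypothetical helper name) --------------------
# The try/except around os.mkdir above guards against a race between MPI ranks: several
# ranks may attempt to create the same output directory at once, and the OSError is only
# ignored when the directory actually exists afterwards. The same pattern in isolation:
import os

def make_dir_race_safe(path):
    """Create path, tolerating the case where another rank created it first."""
    if not os.path.isdir(path):
        try:
            os.mkdir(path)
        except OSError:
            if not os.path.isdir(path):   # a real failure, not a lost race
                raise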
def execute_analysis(self, msidata_subblock=None):
    """
    Execute the local peak finder for the given msidata.

    :param msidata_subblock: Optional input parameter used for parallel execution of the
        analysis only. If msidata_subblock is set, then the given subblock will be processed
        in SERIAL instead of processing self['msidata'] in PARALLEL (if available). This
        parameter is strictly optional and intended for internal use only.
    """
    # Make sure needed imports are available
    from omsi.analysis.findpeaks.third_party.findpeaks import findpeaks
    import numpy as np

    # Assign parameters to local variables for convenience
    msidata = self['msidata']
    if msidata_subblock is not None:
        msidata = msidata_subblock
    mzdata = self['mzdata']
    integration_width = self['integration_width']
    peakheight = self['peakheight']
    slwindow = self['slwindow']
    smoothwidth = self['smoothwidth']
    print_status = self['printStatus']
    if print_status:
        import sys

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(self['msidata'].shape) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if msidata_subblock is None:
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = range(len(self['msidata'].shape) - 1)   # The axes along which we can split the data
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,          # Execute this function
                task_function_params={},                      # No added parameters
                main_data=msidata,                            # Process the msidata
                split_axes=split_axis,                        # Split along axes
                main_data_param_name='msidata_subblock',      # data input param
                root=self.mpi_root,                           # The root MPI task
                schedule=self['schedule'],                    # Parallel schedule
                comm=self.mpi_comm)                           # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # TODO Record runtime information data from the scheduler in our provenance data
            # self.run_info['SCHEDULER_blocks'] = scheduler.blocks
            # self.run_info['SCHEDULER_block_times'] = scheduler.block_times
            # self.run_info['SCHEDULER_run_time'] = scheduler.run_time
            # self.run_info['SCHEDULER_schedule'] = scheduler.schedule

            # Compile the data from the parallel execution
            # Case table:
            #
            #    collect + worker    2
            #    worker              2
            #    collect + root      3
            #    root                1
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # Case 1: root rank with collect disabled
            if mpi_helper.get_rank() == self.mpi_root and not self['collect']:
                # We did not process any data on the root if DYNAMIC scheduling was used
                if use_dynamic_schedule:
                    return None, None, None, mzdata
                # We processed a data block when static scheduling was used
                else:
                    return result[0][0]
            # Case 2: Compile the data on the worker
            elif mpi_helper.get_rank() != self.mpi_root:   # and use_dynamic_schedule:
                # Compile the results from all processing tasks (on workers) or from all workers (on the root)
                peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
                peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
                if len(result[1]) > 1:
                    # Correct indices from the individual runs since they all start at 0
                    peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                    peak_arrayindex[:, 2] = np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]
                else:
                    peak_arrayindex = result[0][0][2]
                mzdata = result[0][0][3]
                return peak_mz, peak_values, peak_arrayindex, mzdata
            # Case 3: Compile collected data on the root
            elif mpi_helper.get_rank() == self.mpi_root:   # and use_dynamic_schedule:
                # Compile the results from all processing tasks (on workers) or from all workers (on the root)
                peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
                peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
                # Dynamic scheduling uses selections of (int, int, slice) while static scheduling
                # uses (slice, slice, slice), hence we need to compile the peak_arrayindex slightly
                # differently depending on the scheduler used
                if use_dynamic_schedule:
                    peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                    peak_arrayindex[:, 2] = np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]
                else:
                    peak_arrayindex = np.concatenate(tuple([ri[2] for ri in result[0]]), axis=0)
                    d = np.cumsum([0] + [len(ri[0]) for ri in result[0]])
                    d2 = np.cumsum([0] + [len(ri[2]) for ri in result[0]])
                    for di in range(len(d2) - 1):
                        peak_arrayindex[d2[di]:d2[di + 1], 2] += d[di]
                mzdata = result[0][0][3]
                return peak_mz, peak_values, peak_arrayindex, mzdata

    #############################################################
    # Serial processing of the current data block
    #############################################################
    # Ensure that our MSI dataset has a sufficient number of dimensions
    if len(msidata.shape) == 1:
        msidata = msidata[:][np.newaxis, np.newaxis, :]
    elif len(msidata.shape) == 2:
        msidata = msidata[:][np.newaxis, :]

    # Determine the data dimensions
    shape_x = msidata.shape[0]
    shape_y = msidata.shape[1]

    peak_mz = []       # The x values for all peaks, stored in a linear array
    peak_values = []   # The y values for all peaks, stored in a linear array
    # List describing for each pixel the start index where its peaks
    # are stored in the peaks_MZ and peaks_values array
    peak_arrayindex = np.zeros(shape=(shape_x * shape_y, 3), dtype='int64')
    current_index = long(0)
    pixel_index = 0
    for xi in xrange(0, shape_x):
        for yi in xrange(0, shape_y):
            if print_status:
                sys.stdout.write("[" + str(int(100. * float(pixel_index) / float(shape_x * shape_y))) + "%]" + "\r")
                sys.stdout.flush()
            # Load the spectrum
            y = msidata[xi, yi, :]
            # Smooth the spectrum
            peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow, peakheight)
            y = peak_finder.smoothListGaussian()
            # From the smoothed spectrum subtract a sliding minimum
            peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow, peakheight)
            slmin = [x for x in peak_finder.sliding_window_minimum()]
            y = y - slmin
            # Find peaks in the smoothed, background-subtracted spectrum
            peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow, peakheight)
            [pkmax, pkmin] = peak_finder.peakdet()
            xp = [x[0] for x in pkmax]
            yp = [x[1] for x in pkmax]
            peak_mz = peak_mz + xp
            peak_values = peak_values + yp
            peak_arrayindex[pixel_index, 0] = xi
            peak_arrayindex[pixel_index, 1] = yi
            peak_arrayindex[pixel_index, 2] = current_index
            pixel_index += 1
            current_index += len(yp)

    # Add the analysis results and parameters to the analysis data so that they can be accessed and
    # written to file. We convert the single scalars to 1D numpy arrays here to ensure consistency.
    # The data write function can also handle a large range of Python built-in types by converting
    # them to numpy for storage in HDF5, but to ensure consistent behavior we convert the values
    # directly here.

    # Save the analysis data to the __data_list so that the data can be
    # saved automatically by the omsi HDF5 file API
    return np.asarray(peak_mz), np.asarray(peak_values), peak_arrayindex, mzdata[:]
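# --- Illustration only (hypothetical numbers) -------------------------------------------
# The third column of peak_arrayindex built above is a running offset into the flat
# peak_mz / peak_values arrays, i.e. the cumulative sum of per-pixel peak counts. The same
# cumulative-sum trick is used when stitching per-block results back together in the
# parallel branch (np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]).
import numpy as np

peaks_per_pixel = [4, 0, 2, 3]                        # peaks found in each of four pixels
offsets = np.cumsum([0] + peaks_per_pixel)[:-1]       # -> array([0, 4, 4, 6])
print(offsets)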
def execute_analysis(self, spectrum_indexes=None, compound_list=None):
    """
    Execute the local peak finder for the given msidata.

    :param spectrum_indexes: List of integer indices of the subset of spectra that should be
        processed by this MPI task. If spectrum_indexes is set, then the given subblock will be
        processed in SERIAL instead of processing self['fpl_data'] in PARALLEL (if available).
        This parameter is strictly optional and intended for internal use only to facilitate the
        efficient parallel implementation.
    :param compound_list: List of the compounds from the database file. This parameter is used to
        avoid having to read the compound database on every compute task that calls this function
        when running in parallel. This parameter is strictly optional and intended for internal
        use only to facilitate the efficient parallel implementation.

    :returns: A tuple with an array of hit_tables with the scores for each pixel and a 2D array
        of pixel indices describing for each spectrum the (x,y) pixel location in the image.
        The hit_table is an array of (#spectra x #compounds). The hit_table is a structured
        numpy array with the following columns:

        * 'score', float, MIDAS score of row
        * 'id', str, database ID e.g. 'MetaCyC_7884'
        * 'name', str, database name, e.g. 'glycine'
        * 'mass', float, mass in Da of IDed compound
        * 'n_peaks', int, number of peaks in data
        * 'n_match', int, number of peaks in data matched
    """
    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    precursor_type = self['precursor_type']
    parent_mass_windows = self['parent_mass_windows']
    positive_ion_fragment_mass_windows = self['positive_ion_fragment_mass_windows']
    negative_ion_fragment_mass_windows = self['negative_ion_fragment_mass_windows']
    mass_tolerance_parent_ion = self['mass_tolerance_parent_ion']
    mass_tolerance_fragment_ions = self['mass_tolerance_fragment_ions']
    break_rings = self['break_rings']
    fragmentation_depth = self['fragmentation_depth']

    # Calculate the parent_mass
    precursor_mz = self['precursor_mz']
    # FIXME Get the precursor_mz from the MS2 data
    if precursor_mz == -1:
        precursor_mz = self['fpl_data']['precursor_mz'][:]
    default_charge = self['default_charge']
    # FIXME Is this an input or should we get this from file
    proton_mass = 1.00782503207 - 5.4857990946e-4
    parent_mass = precursor_mz - (default_charge * proton_mass)

    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']

    # Get the compound list if we have not read it previously.
    if compound_list is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        compound_list = MIDAS.ReadCompoundFile(metabolite_database)

    # Get the peak_arrayindex with [[x, y, array_offset], ...] values describing the index of the
    # pixel in (x,y) and the offset in the peak_mz and peak_value array where we can find the
    # spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        enable_parallel = True
    else:
        if isinstance(spectrum_indexes, int):
            spectrum_indexes = np.asarray([spectrum_indexes, ])
        enable_parallel = False

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                     # Execute this function
                task_function_params={'compound_list': compound_list},   # Reuse the compound_list
                main_data=spectrum_indexes,                              # Process the spectra independently
                split_axes=split_axis,                                   # Split along axes
                main_data_param_name='spectrum_indexes',                 # data input param
                root=self.mpi_root,                                      # The root MPI task
                schedule=self['schedule'],                               # Parallel scheduling scheme
                comm=self.mpi_comm)                                      # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Compile the data from the parallel execution
            hit_table = np.zeros((0, 0), dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)   # initialize hit_table as empty
            pixel_index = np.zeros((0, 2), dtype='int')
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                # We did not process any data on the root process when using dynamic scheduling
                # and we did not collect the data to the root either
                pass
            # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
            #     temp_data = [ri[0] for rt in result[0] for ri in rt]
            #     if len(temp_data) > 0:
            #         hit_table = np.concatenate(tuple(temp_data), axis=-1)
            #     temp_data = [ri[1] for rt in result[0] for ri in rt]
            #     if len(temp_data) > 0:
            #         pixel_index = np.concatenate(tuple(temp_data), axis=0)  # axis=-1
            else:
                temp_data = [ri[0] for ri in result[0]]
                if len(temp_data) > 0:
                    hit_table = np.concatenate(tuple(temp_data), axis=-1)
                temp_data = [ri[1] for ri in result[0]]
                if len(temp_data) > 0:
                    pixel_index = np.concatenate(tuple(temp_data), axis=0)
            return hit_table, pixel_index

    #############################################################
    # Serial processing of the current data block
    #############################################################
    # Initialize the output data structures
    pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
    if len(pixel_index.shape) == 1:
        pixel_index = pixel_index[np.newaxis, :]
    hit_table = None
    # FIXME The initialization of the hit_table is only valid if we assume that all spectra have
    #       the same precursor m/z, which may not be the case

    # Iterate through all the pixels we were asked to process in serial
    for current_index, spectrum_index in enumerate(spectrum_indexes):
        # Determine the start and stop index for the m/z and intensity data of the current spectrum
        start = fpl_peak_arrayindex[spectrum_index, 2]
        stop = fpl_peak_arrayindex[(spectrum_index+1), 2] \
            if spectrum_index < (num_spectra-1) \
            else fpl_peak_value.size
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            print time_str
            continue
        # Load the m/z and intensity values for the current spectrum
        current_peaks_list = np.zeros(shape=(spectrum_length, 3), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]
        # Get the parent mass
        current_parent_mass = parent_mass if len(parent_mass) == 1 else parent_mass[spectrum_index]
        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = MIDAS.scoring_C.score_main(
            Compound_list=compound_list,
            bBreakRing=break_rings,
            dCurrentPrecursor_type=precursor_type,
            dCurrentParentMass=current_parent_mass,
            current_peaks_list=current_peaks_list,
            iParentMassWindow_list=parent_mass_windows,
            dMass_Tolerance_Parent_Ion=mass_tolerance_parent_ion,
            dMass_Tolerance_Fragment_Ions=mass_tolerance_fragment_ions,
            iFragmentation_Depth=fragmentation_depth,
            iPositive_Ion_Fragment_Mass_Windows_list=positive_ion_fragment_mass_windows,
            iNegative_Ion_Fragment_Mass_Windows_list=negative_ion_fragment_mass_windows,
            top_n=None)
        end_time = time.time()
        execution_time = end_time - start_time
        time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
            str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str(current_hits.shape[0])
        print time_str
        sys.stdout.flush()

        # Initialize the hit_table if necessary
        if hit_table is None:
            # If our compound database does not contain any related compounds then just finish
            if current_hits.shape[0] == 0:
                # Initialize the results as empty and finish as there is nothing to do
                hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                                     dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)
                # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
                continue
            # If our compound database contains at least one relevant compound then check all spectra
            else:
                # Create the data structure to store all results
                hit_table = np.zeros(shape=(pixel_index.shape[0], current_hits.shape[0]),
                                     dtype=current_hits.dtype)
                # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
        # Save the hits for the current pixel
        hit_table[current_index] = current_hits

    if hit_table is None:
        hit_table = np.zeros(shape=(pixel_index.shape[0], 0), dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)

    # Return the hit_table and the index of the pixel each hit_table applies to
    return hit_table, pixel_index
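# --- Illustration only (assumed field layout) -------------------------------------------
# A structured array shaped like the hit_table described in the docstring above. The real
# dtype is MIDAS.scoring_C.HIT_TABLE_DTYPE; the field names and widths below are assumed
# from the documentation, not taken from the MIDAS sources.
import numpy as np

hit_dtype = np.dtype([('score', 'f4'), ('id', 'a100'), ('name', 'a100'),
                      ('mass', 'f4'), ('n_peaks', 'i4'), ('n_match', 'i4')])
hit_table = np.zeros(shape=(1, 2), dtype=hit_dtype)           # (#spectra x #compounds)
hit_table[0, 0] = (0.87, 'MetaCyC_7884', 'glycine', 75.03, 12, 9)
best_hit = hit_table[0][np.argmax(hit_table[0]['score'])]     # top-scoring compound for spectrum 0
print("%s : %s" % (best_hit['name'], best_hit['score']))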