Example No. 1
    def log(cls,
            module_name,
            message,
            root=0,
            comm=None,
            level=None,
            *args,
            **kwargs):
        """
        Convenience function used to select the log message level using an input parameter
        rather than by selecting the appropriate function.

        :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
        :param message: The message to be added to the log
        :param root: The root process to be used for output when running in parallel. If None, then all
                     calling ranks will perform logging. Default is 0.
        :param comm: The MPI communicator to be used to determine the MPI rank. None by default, in which case
                      mpi.comm_world is used.
        :param level: The logging level to which the message should be sent
        :param args: Additional positional arguments for the python logger.log function. See the python docs.
        :param kwargs: Additional keyword arguments for the python logger.log function. See the python docs.
        """
        if level is None:
            level = log_helper.log_levels['INFO']
        if level in log_helper.log_levels.keys():
            level = log_helper.log_levels[level]
        if root is None or root == mpi_helper.get_rank(comm=comm):
            cls.get_logger(module_name).log(level, message, *args, **kwargs)
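A minimal usage sketch of this convenience wrapper, under the assumption that log_helper is importable from the omsi.shared.log module (the import path is illustrative); the level may be given either as one of the string keys of log_helper.log_levels (e.g. 'INFO', as seen in the code above) or as an already resolved numeric level.

# Hypothetical usage sketch; the import path is an assumption.
from omsi.shared.log import log_helper

# Log on the root MPI rank only (the default root=0)
log_helper.log(module_name=__name__, message="Starting peak finding", level='INFO')

# Log on every calling rank by passing root=None
log_helper.log(module_name=__name__, message="Rank-local progress update", root=None)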
Example No. 2
def main():
    """
    Main function
    """
    sys.path.append(os.path.dirname(__file__))

    # Parse the command line arguments
    cl_params = command_line_params()
    if mpi_helper.get_rank() == 0:
        print_arguments(cl_params)

    # Create the file parameter for the FragTreeLibrary
    file_params = {
        'input_inchi_file': cl_params['inchi_file'],
        'output_directory': cl_params['output_dir'],
        'output_hdf5_file_base': cl_params['output_base_name'],
        'output_error_log': cl_params['error_log']
    }

    isotope_dict = get_isotope_dict(isostope_file=cl_params['isotope_file'])

    # Generate the fragmentation tree
    FragTreeLibrary(max_depth=cl_params['max_depth'],
                    isotope_dict=isotope_dict,
                    file_params=file_params)
    return
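The rank check above (mpi_helper.get_rank() == 0) ensures the settings are printed once rather than once per MPI process. A minimal sketch of the same root-only pattern written directly against mpi4py, which mpi_helper presumably wraps; the serial fallback mirrors the behavior of reporting rank 0 when MPI is unavailable.

# Root-only printing sketch using mpi4py directly; names are illustrative.
try:
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
except ImportError:
    rank = 0  # Serial fallback: behave like the root process

if rank == 0:
    print("Settings would be printed here by the root process only")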
Example No. 3
    def parse_cl_arguments(self):
        """
        The function assumes that the command line parser has been set up using initialize_argument_parser(...).

        This function parses all arguments that are specific to the command-line parser itself. Analysis
        arguments are added and parsed later by the add_and_parse_analysis_arguments(...) function.
        The reason for this is twofold: i) to separate the parsing of analysis arguments from the arguments of the
        command-line driver and ii) if the same HDF5 file is used as both input and output target, then we need to
        open it first here in append mode before it gets opened in read mode later when the analysis arguments are parsed.

        *Side effects:* The function sets ``self.output_target`` and ``self.profile_analysis``

        """
        # Parse the arguments and convert them to a dict using vars
        parsed_arguments = vars(self.parser.parse_known_args()[0])

        # Clean up the arguments to remove default arguments of the driver class
        # before we hand the arguments to the analysis class
        if self.analysis_class_arg_name in parsed_arguments:
            parsed_arguments.pop(self.analysis_class_arg_name)

        # Process the --save argument to determine where we should save the output
        if self.output_save_arg_name in parsed_arguments and mpi_helper.get_rank() == self.mpi_root:
            # Determine the filename and experiment group from the path
            self.output_target = parsed_arguments.pop(self.output_save_arg_name)
            if self.output_target is not None:
                output_filename, output_object_path = omsi_file_common.parse_path_string(self.output_target)
                # Create the output file
                if output_filename is None:
                    raise ValueError("ERROR: Invalid save parameter specification " + self.output_target)
                elif os.path.exists(output_filename) and not os.path.isfile(output_filename):
                    raise ValueError("ERROR: Save parameter not specify a file.")
                if not os.path.exists(output_filename):
                    out_file = omsi_file(output_filename, mode='a')
                    self.output_target = out_file.create_experiment()
                    self.__output_target_self = output_filename
                else:
                    out_file = omsi_file(output_filename, mode='r+')
                    if output_object_path is not None:
                        self.output_target = omsi_file_common.get_omsi_object(out_file[output_object_path])
                    else:
                        if out_file.get_num_experiments() > 0:
                            self.output_target = out_file.get_experiment(0)
                        else:
                            self.output_target = out_file.create_experiment()
        else:
            self.output_target = parsed_arguments.pop(self.output_save_arg_name)

        # The --loglevel argument
        if self.log_level_arg_name in parsed_arguments:
            user_log_level = parsed_arguments.pop(self.log_level_arg_name)
            if user_log_level in log_helper.log_levels.keys():
                log_helper.set_log_level(level=log_helper.log_levels[user_log_level])
            else:
                log_helper.error(module_name=__name__, message="Invalid log level specified")
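The first line of the function, vars(self.parser.parse_known_args()[0]), relies on standard argparse behavior: parse_known_args returns a (namespace, unknown_args) tuple, and vars() converts the namespace into a plain dict so that driver-specific entries can be popped before the remaining arguments are handed on. A short self-contained illustration with hypothetical option names:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--save', default=None)        # hypothetical driver option
parser.add_argument('--loglevel', default='INFO')  # hypothetical driver option

# parse_known_args tolerates analysis-specific options it does not know yet
namespace, unknown = parser.parse_known_args(['--save', 'out.h5', '--nmf-rank', '5'])
parsed_arguments = vars(namespace)                  # {'save': 'out.h5', 'loglevel': 'INFO'}

# Pop the driver arguments so only analysis arguments remain to be forwarded
output_target = parsed_arguments.pop('save')
print(output_target)                                # out.h5
print(unknown)                                      # ['--nmf-rank', '5']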
Example No. 4
def print_arguments(cl_params):
    """
    Print the settings from the dict with command line arguments

    :param cl_params: Dict with the command line arguments
    """
    print ""
    print "Settings:"
    print "---------"
    if mpi_helper.get_rank() == 0:
        for param_key, param_val in cl_params.iteritems():
            print str(param_key) + " = " + str(param_val)
    print ""
Example No. 5
    def exception(cls, module_name, message, root=0, comm=None, *args, **kwargs):
        """
        Create an exception log entry. This function is typically called as:

        log_helper.exception(module_name=__name__, message="your message")

        :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
        :param message: The message to be added to the log
        :param root: The root process to be used for output when running in parallel. If None, then all
                     calling ranks will perform logging. Default is 0.
        :param comm: The MPI communicator to be used to determine the MPI rank. None by default, in which case
                      mpi.comm_world is used.
        :param args: Additional positional arguments for the python logger.exception function. See the python docs.
        :param kwargs: Additional keyword arguments for the python logger.exception function. See the python docs.

        """
        if root is None or root == mpi_helper.get_rank(comm=comm):
            cls.get_logger(module_name).exception(message, *args, **kwargs)
Example No. 6
    def error(cls, module_name, message, root=0, comm=None, *args, **kwargs):
        """
        Create an error log entry. This function is typically called as:

        log_helper.error(module_name=__name__, message="your message")

        :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
        :param message: The message to be added to the log
        :param root: The root process to be used for output when running in parallel. If None, then all
                     calling ranks will perform logging. Default is 0.
        :param comm: The MPI communicator to be used to determine the MPI rank. None by default, in which case
                      mpi.comm_world is used.
        :param args: Additional positional arguments for the python logger.error function. See the python docs.
        :param kwargs: Additional keyword arguments for the python logger.error function. See the python docs.

        """
        if root is None or root == mpi_helper.get_rank(comm=comm):
            cls.get_logger(module_name).error(message, *args, **kwargs)
Example No. 7
    def create_analysis_static(analysis_parent,
                               analysis,
                               flush_io=True,
                               force_save=False,
                               save_unsaved_dependencies=True,
                               mpi_root=0,
                               mpi_comm=None):
        """
        Same as create_analysis(...), but instead of relying on object-level state, this function
        allows additional parameters (specifically the analysis_parent) to be provided as
        input rather than being determined from self.

        :param analysis_parent: The h5py.Group object or omsi.dataformat.omsi_file.common.omsi_file_common object
            where the analysis should be created
        :param kwargs: Additional keyword arguments for create_analysis(...). See create_analysis(...) for details.

        :return: The output of create_analysis
        """
        if mpi_helper.get_rank(comm=mpi_comm) == mpi_root:
            if isinstance(analysis_parent, h5py.Group):
                parent_group = analysis_parent
            elif isinstance(analysis_parent, omsi_file_common):
                parent_group = analysis_parent.managed_group
            else:
                log_helper.error(
                    __name__,
                    'Illegal analysis_parent type. Expected h5py.Group or omsi_file_common'
                )
                raise ValueError("Illegal value for analysis parent")
            return omsi_file_analysis.__create__(
                parent_group=parent_group,
                analysis=analysis,
                analysis_index=None,  # Same as self.get_num_analysis()
                flush_io=flush_io,
                force_save=force_save,
                save_unsaved_dependencies=save_unsaved_dependencies)
        else:
            try:
                analysis.write_analysis_data()
                return None
            except NotImplementedError:
                pass
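A hedged usage sketch of the static variant, mirroring how the command-line drivers in the later examples call it (see the omsi_analysis_manager.create_analysis_static call near the end of the main() snippets); analysis_object and output_target are assumed to already exist, e.g. an executed analysis and an experiment group in an open omsi output file.

# Usage sketch; assumes analysis_object has been executed and output_target is
# an omsi experiment group (or h5py.Group) in an open output file.
from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager

omsi_analysis_manager.create_analysis_static(analysis_parent=output_target,
                                             analysis=analysis_object)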
Example No. 8
    def log(cls, module_name, message, root=0, comm=None, level=None,  *args, **kwargs):
        """
        Convenience function used to select the log message level using an input parameter
        rather than by selecting the appropriate function.

        :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
        :param message: The message to be added to the log
        :param root: The root process to be used for output when running in parallel. If None, then all
                     calling ranks will perform logging. Default is 0.
        :param comm: The MPI communicator to be used to determine the MPI rank. None by default, in which case
                      mpi.comm_world is used.
        :param level: The logging level to which the message should be sent
        :param args: Additional positional arguments for the python logger.log function. See the python docs.
        :param kwargs: Additional keyword arguments for the python logger.log function. See the python docs.
        """
        if level is None:
            level = log_helper.log_levels['INFO']
        if level in log_helper.log_levels.keys():
            level = log_helper.log_levels[level]
        if root is None or root == mpi_helper.get_rank(comm=comm):
            cls.get_logger(module_name).log(level, message, *args, **kwargs)
Example No. 9
    def create_analysis_static(analysis_parent,
                               analysis,
                               flush_io=True,
                               force_save=False,
                               save_unsaved_dependencies=True,
                               mpi_root=0,
                               mpi_comm=None):
        """
        Same as create_analysis(...), but instead of relying on object-level state, this function
        allows additional parameters (specifically the analysis_parent) to be provided as
        input rather than being determined from self.

        :param analysis_parent: The h5py.Group object or omsi.dataformat.omsi_file.common.omsi_file_common object
            where the analysis should be created
        :param kwargs: Additional keyword arguments for create_analysis(...). See create_analysis(...) for details.

        :return: The output of create_analysis
        """
        if mpi_helper.get_rank(comm=mpi_comm) == mpi_root:
            if isinstance(analysis_parent, h5py.Group):
                parent_group = analysis_parent
            elif isinstance(analysis_parent, omsi_file_common):
                parent_group = analysis_parent.managed_group
            else:
                log_helper.error(__name__, 'Illegal analysis_parent type. Expected h5py.Group or omsi_file_common')
                raise ValueError("Illegal value for analysis parent")
            return omsi_file_analysis.__create__(parent_group=parent_group,
                                                 analysis=analysis,
                                                 analysis_index=None,  # Same as self.get_num_analysis()
                                                 flush_io=flush_io,
                                                 force_save=force_save,
                                                 save_unsaved_dependencies=save_unsaved_dependencies)
        else:
            try:
                analysis.write_analysis_data()
                return None
            except NotImplementedError:
                pass
Example No. 10
    def write_analysis_data(self, analysis_group=None):
        """
        This function is used to write the actual analysis data to file. If not implemented, then the
        omsi_file_analysis API's default behavior is used instead.

        :param analysis_group: The h5py.Group object where the analysis is stored.

        """
        # Check if a user attempts to do parallel I/O with collect being disabled
        if mpi_helper.get_size() > 1 and not self['collect']:
            # Check if any of the other ranks have data
            num_elements = self['peak_arrayindex'].shape[0] if len(
                self['peak_arrayindex'].shape) == 2 else 0
            result_sizes = mpi_helper.gather(num_elements,
                                             comm=self.mpi_comm,
                                             root=self.mpi_root)
            if mpi_helper.get_rank() == self.mpi_root:
                for element_size in result_sizes[1:]:
                    if element_size > 0:
                        raise ValueError(
                            'Parallel I/O with collect parameter set to false not supported'
                        )
        raise NotImplementedError
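The collect check above gathers the per-rank result sizes to the root rank and raises if any non-root rank still holds unwritten data. A minimal sketch of that gather step written directly against mpi4py, which mpi_helper.gather presumably wraps; the variable names are illustrative.

# Size-gather sketch using mpi4py directly; names are illustrative only.
from mpi4py import MPI

comm = MPI.COMM_WORLD
num_elements = 0  # e.g. the local peak_arrayindex.shape[0] on this rank

# gather returns the full list on the root rank and None on all other ranks
result_sizes = comm.gather(num_elements, root=0)
if comm.Get_rank() == 0 and any(size > 0 for size in result_sizes[1:]):
    raise ValueError('Parallel I/O with collect parameter set to false not supported')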
        """
Example No. 11
    def execute_analysis(self, spectrum_indexes=None, compound_list=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param compound_list: List of the compounds from the database file. This parameter is used
            to avoid having to read the compound database on every compute task that calls this function
            when running in parallel.  This  parameter is strictly optional and intended for internal
            use only to facilitate the efficient parallel implementation.

        :returns: A tuple with an array of hit_tables with the scores for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image. The
            hit_table is an array of (#spectra x #compounds). The hit_table is a structured numpy
            array with the following columns:

                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        precursor_type = self['precursor_type']
        parent_mass_windows = self['parent_mass_windows']
        positive_ion_fragment_mass_windows = self['positive_ion_fragment_mass_windows']
        negative_ion_fragment_mass_windows = self['negative_ion_fragment_mass_windows']
        mass_tolerance_parent_ion = self['mass_tolerance_parent_ion']
        mass_tolerance_fragment_ions = self['mass_tolerance_fragment_ions']
        break_rings = self['break_rings']
        fragmentation_depth = self['fragmentation_depth']

        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']              # FIXME  Get the precursor_mz from the MS2 data
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        default_charge = self['default_charge']          # FIXME  Is this an input or should we get this from file
        proton_mass = 1.00782503207 - 5.4857990946e-4
        parent_mass = precursor_mz - (default_charge * proton_mass)

        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']

        # Get the compound list if we have not read it previously.
        if compound_list is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            compound_list = MIDAS.ReadCompoundFile(metabolite_database)

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to processes
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'compound_list': compound_list},  # Reuse the compound_list
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                hit_table = np.zeros((0, 0), dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)  # initialize hit_table as empty
                pixel_index = np.zeros((0, 2), dtype='int')
                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                #elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                    temp_data = [ri[1] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                return hit_table, pixel_index

        #############################################################
        # Serial processing of the current data block
        #############################################################
        # Initialize the output data structures
        pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        if len(pixel_index.shape) == 1:
            pixel_index = pixel_index[np.newaxis, :]
        hit_table = None  # FIXME The initialization of the hit_table is only valid if we assume that all spectra have the same precursor m/z, which may not be the case
        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = fpl_peak_arrayindex[spectrum_index, 2]
            stop = fpl_peak_arrayindex[(spectrum_index+1), 2] \
                if spectrum_index < (num_spectra-1) \
                else fpl_peak_value.size
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str =  "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                print time_str
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 3), dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = parent_mass if len(parent_mass) == 1 else parent_mass[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = MIDAS.scoring_C.score_main(
                Compound_list=compound_list,
                bBreakRing=break_rings,
                dCurrentPrecursor_type=precursor_type,
                dCurrentParentMass=current_parent_mass,
                current_peaks_list=current_peaks_list,
                iParentMassWindow_list=parent_mass_windows,
                dMass_Tolerance_Parent_Ion=mass_tolerance_parent_ion,
                dMass_Tolerance_Fragment_Ions=mass_tolerance_fragment_ions,
                iFragmentation_Depth=fragmentation_depth,
                iPositive_Ion_Fragment_Mass_Windows_list=positive_ion_fragment_mass_windows,
                iNegative_Ion_Fragment_Mass_Windows_list=negative_ion_fragment_mass_windows,
                top_n=None)

            end_time = time.time()
            execution_time = end_time - start_time
            time_str =  "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str(current_hits.shape[0])
            print time_str
            sys.stdout.flush()

            # Initialize the hit_table if necessary
            if hit_table is None:
                # If our compound database does not contain any related compounds then just finish
                if current_hits.shape[0] == 0:
                    # Initialize the results as empty and finish as there is nothing to do
                    hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                                         dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)  # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
                    continue
                # If our compound database contains at least one relevant compound then check all spectra
                else:
                    # Create the data structure to store all results
                    hit_table = np.zeros(shape=(pixel_index.shape[0], current_hits.shape[0]),
                                         dtype=current_hits.dtype)  # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
            # Save the hits for the current pixel
            hit_table[current_index] = current_hits

        if hit_table is None:
            hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                                 dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)

        # Return the hit_table and the index of the pixel each hit_table applies to
        return hit_table, pixel_index
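The parent-mass step above converts the measured precursor m/z to a neutral parent mass by subtracting charge-many proton masses, where the proton mass is obtained as the hydrogen atom mass minus the electron mass. A small worked sketch for a singly charged ion, reusing the constants from the code; the precursor value is hypothetical.

# Worked sketch of the parent-mass calculation for a singly charged ion.
hydrogen_mass = 1.00782503207                 # mass of a hydrogen atom in Da
electron_mass = 5.4857990946e-4               # electron mass in Da
proton_mass = hydrogen_mass - electron_mass   # ~1.007276 Da

precursor_mz = 180.063                        # hypothetical measured precursor m/z
default_charge = 1
parent_mass = precursor_mz - default_charge * proton_mass
print(parent_mass)                            # ~179.0557 Da neutral parent mass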
Example No. 12
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised if the analysis class is unknown
        """
        # Get the analysis object if needed
        if self.add_analysis_class_arg:
            try:
                self.get_analysis_class_from_cl()
            except (ImportError, AttributeError, ValueError):
                pass

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        # Check if we have a valid analysis class
        if self.analysis_class is None:
            self.parser.print_help()
            raise ValueError('Could not determine the analysis class.')
        if not issubclass(self.analysis_class, analysis_base):
            self.parser.print_help()
            raise ValueError('Analysis class is not a subclass of analysis_base.')

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
            # Add and parse the command line arguments specific to the analysis to determine the analysis settings
            self.add_and_parse_analysis_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Call the execute function of the analysis
        try:
            # Create the analysis object
            if self.analysis_object is None:
                self.create_analysis_object()
            # Execute the analysis
            log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                             root=self.mpi_root, comm=self.mpi_comm)
            self.analysis_object.execute(**self.analysis_arguments)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our MPI root rank. NOTE: When running in serial,
        # the condition mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:
            # Print the profiling results of time and usage
            if self.analysis_object['profile_time_and_usage']:
                print ""
                print "PROFILING DATA: TIME AND USAGE"
                print ""
                self.analysis_object.get_profile_stats_object(consolidate=True).print_stats()

            # Print the profiling results for memory usage
            if self.analysis_object['profile_memory']:
                print ""
                print "PROFILING DATA: MEMORY"
                print ""
                print self.analysis_object.get_memory_profile_info()

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(self.analysis_object.run_info['execution_time'], list):
                    # Time for each task to execute
                    log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                    str(self.analysis_object.run_info['execution_time']),
                                    root=self.mpi_root, comm=self.mpi_comm)
                    # Start times of each task
                    log_helper.info(__name__, "Time when each of the processes started: " +
                                    str(self.analysis_object.run_info['start_time']),
                                    root=self.mpi_root, comm=self.mpi_comm)
                    # Stop times for each task

                    log_helper.info(__name__, "Time when each of the processes finished: " +
                                    str(self.analysis_object.run_info['end_time']),
                                    root=self.mpi_root, comm=self.mpi_comm)

                    # Compile the time to execute string
                    exec_time_array = np.asarray(self.analysis_object.run_info['execution_time'], dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                        "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(self.analysis_object.run_info['execution_time']) + " s"
                log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                                root=self.mpi_root, comm=self.mpi_comm)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=self.analysis_object)
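In the parallel branch above, the per-process execution times are converted to a numpy array so that the maximum, minimum, and mean can be reported in one string. A small sketch of that summary step, assuming run_info['execution_time'] holds a list of per-process times in seconds (the values here are made up):

import numpy as np

execution_times = ['12.4', '11.9', '13.1']   # hypothetical per-process times in seconds

exec_time_array = np.asarray(execution_times, dtype=float)
exec_time_string = str(exec_time_array.max()) + " s " + \
    "    ( min = " + str(exec_time_array.min()) + " , mean = " + str(exec_time_array.mean()) + " )"
print(exec_time_string)   # e.g. 13.1 s     ( min = 11.9 , mean = 12.466... )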
Example No. 13
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised if the analysis class is unknown

        """

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
        except:
            self.remove_output_target()
            raise

        if self.workflow_executor is None:
            self.remove_output_target()
            log_helper.error(
                __name__,
                'Missing --script parameter or workflow_executor object')
            raise ValueError('Workflow not initialized')

        # Add and parse the command line arguments specific to the analysis to determine the analysis settings
        try:
            self.add_and_parse_workflow_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Enable time and usage profiling
        try:
            # Enable time and usage profiling if requested
            if self.profile_analyses:
                try:
                    self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(
                        self.profile_analyses)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of time and usage not available due to missing packages."
                    )
                    log_helper.warning(__name__, e.message)
            # Enable memory profiling if requested
            if self.profile_analyses_mem:
                try:
                    self.workflow_executor.analysis_tasks.enable_memory_profiling(
                        self.profile_analyses_mem)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of memory usage not available due to missing packages"
                    )
                    log_helper.warning(__name__, e.message)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Execute the analysis
        try:
            log_helper.debug(__name__,
                             'Analysis arguments: ' +
                             str(self.analysis_arguments),
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            self.workflow_executor.execute()
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our MPI root rank. NOTE: When running in serial,
        # the condition mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:

            # Print usage profiles if available
            try:
                self.print_time_and_usage_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print time and usage profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print memory profile data if available
            try:
                self.print_memory_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print memory profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(
                        self.workflow_executor.run_info['execution_time'],
                        list):
                    # Time for each task to execute
                    log_helper.info(
                        __name__,
                        "Time in seconds for each analysis process: " +
                        str(self.workflow_executor.run_info['execution_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Start times of each task
                    log_helper.info(
                        __name__,
                        "Time when each of the processes started: " +
                        str(self.workflow_executor.run_info['start_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Stop times for each task

                    log_helper.info(
                        __name__,
                        "Time when each of the processes finished: " +
                        str(self.workflow_executor.run_info['end_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)

                    # Compile the time to execute string
                    exec_time_array = np.asarray(
                        self.workflow_executor.run_info['execution_time'],
                        dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                        "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(self.workflow_executor.
                                           run_info['execution_time']) + " s"
                log_helper.info(__name__,
                                "Time to execute analysis: " +
                                exec_time_string,
                                root=self.mpi_root,
                                comm=self.mpi_comm)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            for analysis in self.workflow_executor.analysis_tasks:
                omsi_analysis_manager.create_analysis_static(
                    analysis_parent=self.output_target, analysis=analysis)
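The profiling setup in this driver degrades gracefully when optional profiling packages are missing: the enable calls may raise ImportError, which is caught and turned into a warning instead of a failure. A minimal sketch of that pattern with a hypothetical optional dependency (whether the project actually relies on memory_profiler is an assumption):

import logging

def enable_memory_profiling():
    """Enable memory profiling if the optional dependency is available."""
    import memory_profiler  # hypothetical optional dependency
    return memory_profiler

try:
    profiler = enable_memory_profiling()
except ImportError as e:
    logging.getLogger(__name__).warning(
        "Profiling of memory usage not available due to missing packages")
    logging.getLogger(__name__).warning(str(e))
    profiler = None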
Example No. 14
    def parse_cl_arguments(self):
        """
        The function assumes that the command line parser has been set up using initialize_argument_parser(...).

        This function parses all arguments that are specific to the command-line parser itself. Analysis workflow
        arguments are added and parsed later by the add_and_parse_workflow_arguments(...) function.
        The reason for this is twofold: i) to separate the parsing of analysis arguments from the arguments of the
        command-line driver and ii) if the same HDF5 file is used as both input and output target, then we need to
        open it first here in append mode before it gets opened in read mode later when the analysis arguments are parsed.

        *Side effects:* The function sets:

            - ``self.output_target``
            - ``self.profile_analyses``

        """
        # Parse the arguments and convert them to a dict using vars
        parsed_arguments = vars(self.parser.parse_known_args()[0])

        # Process the --save argument to determine where we should save the output
        if self.output_save_arg_name in parsed_arguments and mpi_helper.get_rank() == self.mpi_root:
            # Determine the filename and experiment group from the path
            self.output_target = parsed_arguments.pop(
                self.output_save_arg_name)
            if self.output_target is not None:
                output_filename, output_object_path = omsi_file_common.parse_path_string(
                    self.output_target)
                # Create the output file
                if output_filename is None:
                    raise ValueError(
                        "ERROR: Invalid save parameter specification " +
                        self.output_target)
                elif os.path.exists(output_filename) and not os.path.isfile(output_filename):
                    raise ValueError("ERROR: Save parameter does not specify a file.")
                if not os.path.exists(output_filename):
                    out_file = omsi_file(output_filename, mode='a')
                    self.output_target = out_file.create_experiment()
                    self.__output_target_self = output_filename
                else:
                    out_file = omsi_file(output_filename, mode='r+')
                    if output_object_path is not None:
                        self.output_target = omsi_file_common.get_omsi_object(
                            out_file[output_object_path])
                    else:
                        if out_file.get_num_experiments() > 0:
                            self.output_target = out_file.get_experiment(0)
                        else:
                            self.output_target = out_file.create_experiment()
        else:
            self.output_target = parsed_arguments.pop(
                self.output_save_arg_name)

        # Process the --profile profiling argument
        if self.profile_arg_name in parsed_arguments:
            self.profile_analyses = parsed_arguments.pop(self.profile_arg_name)

        # Process the --memprofile argument
        if self.profile_mem_arg_name in parsed_arguments:
            self.profile_analyses_mem = parsed_arguments.pop(
                self.profile_mem_arg_name)

        # The --loglevel argument
        if self.log_level_arg_name in parsed_arguments:
            self.user_log_level = parsed_arguments.pop(self.log_level_arg_name)
            if self.user_log_level in log_helper.log_levels.keys():
                log_helper.set_log_level(
                    level=log_helper.log_levels[self.user_log_level])
            else:
                self.user_log_level = None
                log_helper.error(module_name=__name__,
                                 message="Invalid log level specified")

        # The --script arguments
        if self.script_arg_name in parsed_arguments:
            self.script_files = parsed_arguments.pop(self.script_arg_name)
            if self.workflow_executor is None:
                self.create_workflow_executor_object()
            else:
                self.workflow_executor.add_analysis_from_scripts(
                    script_files=self.script_files)
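The --loglevel handling near the end of the function maps a level name from the command line to a numeric value through log_helper.log_levels. A minimal sketch of the equivalent mapping using the standard logging module directly; log_helper presumably maintains a comparable name-to-level dictionary.

import logging

# Standard-library level names mapped to their numeric values
log_levels = {'DEBUG': logging.DEBUG,
              'INFO': logging.INFO,
              'WARNING': logging.WARNING,
              'ERROR': logging.ERROR,
              'CRITICAL': logging.CRITICAL}

user_log_level = 'WARNING'   # hypothetical value parsed from --loglevel
if user_log_level in log_levels:
    logging.getLogger(__name__).setLevel(log_levels[user_log_level])
else:
    logging.getLogger(__name__).error("Invalid log level specified")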
Example No. 15
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__,
                         'Reading inputs',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__,
                             'Preparing file lookup table',
                             comm=self.mpi_comm,
                             root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                        tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                    path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {
            'file_lookup_table': file_lookup_table,
            'ms1_mass_tol': ms1_mass_tol,
            'ms2_mass_tol': ms2_mass_tol,
            'neutralizations': neutralizations,
            'max_depth': max_depth
        }

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to processes
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__,
                                 'Preparing parallel execution',
                                 comm=self.mpi_comm,
                                 root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                             # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},   # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                                      # Process the spectra independently
                    split_axes=split_axis,                                           # Split along axes
                    main_data_param_name='spectrum_indexes',                         # data input param
                    root=self.mpi_root,                                              # The root MPI task
                    schedule=self['schedule'],                                       # Parallel scheduling scheme
                    comm=self.mpi_comm)                                              # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0, ), dtype='f4')
                id_data = np.zeros((0, ), dtype='a100')
                name = np.zeros((0, ), dtype='a100')
                mass = np.zeros((0, ), dtype='f4')
                n_peaks = np.zeros((0, ), dtype='i4')
                n_match = np.zeros((0, ), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    # Compile scores
                    temp_data = [ri[1] for ri in result[0]]
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__,
                         'Processing spectra',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                       if spectrum_index < (num_spectra - 1)
                       else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__,
                                time_str,
                                comm=self.mpi_comm,
                                root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2),
                                          dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(
                scan_list=[current_peaks_list, ],
                ms1_mz=[current_parent_mass, ],
                params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(
                        score_frag_dag.make_pactolus_hit_table(
                            pactolus_results=hit_matrix[current_index],
                            table_file=file_lookup_table,
                            original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index,
                                                               0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(
                            current_hit_table['n_peaks'][score_index])
                        n_match.append(
                            current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(
            mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
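
When a metabolite database is given, the seven arrays returned above are index-aligned: entry i of score, id, name, mass, n_peaks and n_match describes one hit, and pixel_index[i] gives its (x, y) location. Below is a minimal sketch of how a caller might bundle them into a single structured array for inspection; the bundle_hits helper, its dtype and the variable names are illustrative assumptions only and are not part of the original API or the Pactolus hit-table format.

import numpy as np

def bundle_hits(pixel_index, score, id_data, name, mass, n_peaks, n_match):
    """Combine the index-aligned output arrays into one structured array (illustrative only)."""
    hit_dtype = np.dtype([('x', 'i4'), ('y', 'i4'), ('score', 'f4'),
                          ('id', 'a100'), ('name', 'a100'), ('mass', 'f4'),
                          ('n_peaks', 'i4'), ('n_match', 'i4')])
    hits = np.zeros(len(score), dtype=hit_dtype)
    if len(score) > 0:
        hits['x'], hits['y'] = pixel_index[:, 0], pixel_index[:, 1]
        hits['score'] = score
        hits['id'], hits['name'] = id_data, name
        hits['mass'] = mass
        hits['n_peaks'], hits['n_match'] = n_peaks, n_match
    return hits
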
Exemplo n.º 16
0
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {'file_lookup_table': file_lookup_table,
                               'ms1_mass_tol': ms1_mass_tol,
                               'ms2_mass_tol': ms2_mass_tol,
                               'neutralizations': neutralizations,
                               'max_depth': max_depth}

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0,), dtype='f4')
                id_data = np.zeros((0,), dtype='a100')
                name = np.zeros((0,), dtype='a100')
                mass = np.zeros((0,), dtype='f4')
                n_peaks = np.zeros((0,), dtype='i4')
                n_match = np.zeros((0,), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    # Compile scores
                    temp_data = [ri[1] for ri in result[0]]
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index+1), 2]
                   if spectrum_index < (num_spectra-1)
                   else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                        pactolus_results=hit_matrix[current_index],
                        table_file=file_lookup_table,
                        original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(current_hit_table['n_peaks'][score_index])
                        n_match.append(current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
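
The start/stop arithmetic in the serial loop above relies on the flattened peak layout described in the comments: fpl_peak_arrayindex stores one [x, y, offset] row per pixel, and the peaks of spectrum i occupy the flat m/z and intensity arrays from that offset up to the next row's offset (or to the end of the arrays for the last spectrum). A small standalone sketch of that lookup, assuming the same layout and using hypothetical names, is:

import numpy as np

def get_spectrum(peak_arrayindex, peak_mz, peak_value, spectrum_index):
    """Return the (mz, intensity) arrays of one spectrum from the flattened peak storage (illustrative only)."""
    num_spectra = peak_arrayindex.shape[0]
    start = int(peak_arrayindex[spectrum_index, 2])
    stop = int(peak_arrayindex[spectrum_index + 1, 2]
               if spectrum_index < (num_spectra - 1)
               else peak_value.size)
    return peak_mz[start:stop], peak_value[start:stop]

# Example: two pixels with 3 and 2 peaks; spectrum 1 starts at flat offset 3
# peak_arrayindex = np.array([[0, 0, 0], [0, 1, 3]])
# get_spectrum(peak_arrayindex, mz, intensity, 1)  ->  (mz[3:5], intensity[3:5])
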
Exemplo n.º 17
0
    def execute_analysis(self, msidata_subblock=None):
        """
        Execute the local peak finder for the given msidata.

        :param msidata_subblock: Optional input parameter used for parallel execution of the
            analysis only. If msidata_subblock is set, then the given subblock will be processed
            in SERIAL instead of processing self['msidata'] in PARALLEL (if available). This
            parameter is strictly optional and intended for internal use only.

        """
        # Make sure needed imports are available
        from omsi.analysis.findpeaks.third_party.findpeaks import findpeaks
        import numpy as np

        # Assign parameters to local variables for convenience
        msidata = self['msidata']
        if msidata_subblock is not None:
            msidata = msidata_subblock
        mzdata = self['mzdata']
        integration_width = self['integration_width']
        peakheight = self['peakheight']
        slwindow = self['slwindow']
        smoothwidth = self['smoothwidth']
        print_status = self['printStatus']
        if print_status:
            import sys

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(self['msidata'].shape) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if msidata_subblock is None:
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                # The axes along which we can split the data
                split_axis = range(len(self['msidata'].shape) - 1)
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,        # Execute this function
                    task_function_params={},                    # No added parameters
                    main_data=msidata,                          # Process the msidata
                    split_axes=split_axis,                      # Split along axes
                    main_data_param_name='msidata_subblock',    # data input param
                    root=self.mpi_root,                         # The root MPI task
                    schedule=self['schedule'],                  # Parallel schedule
                    comm=self.mpi_comm)                         # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # TODO Record runtime information data from the scheduler in our provenance data
                # self.run_info['SCHEDULER_blocks'] = scheduler.blocks
                # self.run_info['SCHEDULER_block_times'] = scheduler.block_times
                # self.run_info['SCHEDULER_run_time'] = scheduler.run_time
                # self.run_info['SCHEDULER_schedule'] = scheduler.schedule

                # Compile the data from the parallel execution
                # Case Table:
                #
                # collect + worker       2
                #           worker       2
                # collect + root         3
                #           root         1
                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])
                # Case 1: root rank with data collection disabled
                if mpi_helper.get_rank() == self.mpi_root and not self['collect']:
                    # We did not process any data on the root if DYNAMIC scheduling was used
                    if use_dynamic_schedule:
                        return None, None, None, mzdata
                    # With static scheduling the root processed its own data block
                    else:
                        return result[0][0]
                # Case 2: Compile the data on the worker
                elif mpi_helper.get_rank() != self.mpi_root:  # and use_dynamic_schedule:
                    # Compile the results from all processing tasks (on workers) or from all workers (on the root)
                    peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
                    peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
                    # Correct indices from the individual runs since they all start at 0
                    if len(result[1]) > 1:
                        peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                        peak_arrayindex[:, 2] = np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]
                    else:
                        peak_arrayindex = result[0][0][2]
                    mzdata = result[0][0][3]
                    return peak_mz, peak_values, peak_arrayindex, mzdata
                # Case 3: Compile collected data on the root
                elif mpi_helper.get_rank() == self.mpi_root:  # and use_dynamic_schedule:
                    # Compile the results from all processing tasks (on workers) or from all workers (on the root)
                    peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
                    peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
                    # Dynamic scheduling uses selections of (int,int,slice) while the static
                    # scheduling uses (slice, slice, slice), hence we need to compile the peak_arrayindex
                    # slightly differently depending on the scheduler used
                    if use_dynamic_schedule:
                        peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                        peak_arrayindex[:, 2] = np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]
                    else:
                        peak_arrayindex = np.concatenate(tuple([ri[2] for ri in result[0]]), axis=0)
                        d = np.cumsum([0] + [len(ri[0]) for ri in result[0]])
                        d2 = np.cumsum([0] + [len(ri[2]) for ri in result[0]])
                        for di in range(len(d2) - 1):
                            peak_arrayindex[d2[di]:d2[di + 1], 2] += d[di]
                    mzdata = result[0][0][3]
                    return peak_mz, peak_values, peak_arrayindex, mzdata

        #############################################################
        # Serial processing of the current data block
        #############################################################
        # Ensure that our MSI dataset has a sufficient number of dimensions
        if len(msidata.shape) == 1:
            msidata = msidata[:][np.newaxis, np.newaxis, :]
        elif len(msidata.shape) == 2:
            msidata = msidata[:][np.newaxis, :]

        # Determine the data dimensions
        shape_x = msidata.shape[0]
        shape_y = msidata.shape[1]

        peak_mz = []  # The x values for all peaks, stored in a linear array
        peak_values = []  # The y values for all peaks, stored in a linear array
        # List describing for each pixel the start index where its peaks
        # are stored in the peak_mz and peak_values arrays
        peak_arrayindex = np.zeros(shape=(shape_x * shape_y, 3), dtype='int64')
        current_index = long(0)
        pixel_index = 0
        for xi in xrange(0, shape_x):
            for yi in xrange(0, shape_y):
                if print_status:
                    progress = int(100. * float(pixel_index) / float(shape_x * shape_y))
                    sys.stdout.write("[" + str(progress) + "%]" + "\r")
                    sys.stdout.flush()

                # Load the spectrum
                y = msidata[xi, yi, :]
                # Find peaks in the spectrum
                peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow,
                                        peakheight)
                y = peak_finder.smoothListGaussian()
                # from the smoothed spectra subtract a sliding minima
                peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow,
                                        peakheight)
                slmin = [x for x in peak_finder.sliding_window_minimum()]
                y = y - slmin
                # find peaks in the smoothed, background subtracted spectra
                peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow,
                                        peakheight)
                [pkmax, pkmin] = peak_finder.peakdet()
                xp = [x[0] for x in pkmax]
                yp = [x[1] for x in pkmax]
                peak_mz = peak_mz + xp
                peak_values = peak_values + yp
                peak_arrayindex[pixel_index, 0] = xi
                peak_arrayindex[pixel_index, 1] = yi
                peak_arrayindex[pixel_index, 2] = current_index
                pixel_index += 1
                current_index += len(yp)

        # Add the analysis results and parameters to the analysis data so that they can be accessed and written to file
        # We here convert the single scalars to 1D numpy arrays to ensure consistency. The data write function can
        # also handle a large range of Python built-in types by converting them to numpy for storage in HDF5, but
        # to ensure a consistent behavior we convert the values directly here

        # Save the analysis data to the __data_list so that the data can be
        # saved automatically by the omsi HDF5 file API
        return np.asarray(peak_mz), np.asarray(peak_values), peak_arrayindex, mzdata[:]
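
Both parallel branches above stitch per-worker results back together by concatenating the flat peak arrays and then shifting each block's peak_arrayindex offsets by the cumulative number of peaks contributed by earlier blocks (the np.cumsum([0] + [...]) pattern). The following is a minimal, self-contained sketch of that bookkeeping, using made-up block data rather than the scheduler's actual result structure.

import numpy as np

# Two hypothetical result blocks, each a (peak_mz, peak_values, peak_arrayindex) triple
# whose offsets start at 0 within their own block.
block_mz = [np.array([100.1, 200.2, 300.3]), np.array([150.5, 250.7])]
block_values = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0])]
block_index = [np.array([[0, 0, 0], [0, 1, 2]]), np.array([[1, 0, 0], [1, 1, 1]])]

# Concatenate the flat peak arrays
peak_mz = np.concatenate(block_mz, axis=-1)
peak_values = np.concatenate(block_values, axis=-1)

# Concatenate the per-pixel index rows and shift each block's offsets by the
# number of peaks contributed by all earlier blocks
peak_arrayindex = np.concatenate(block_index, axis=0)
block_peak_counts = np.cumsum([0] + [mz.size for mz in block_mz])           # [0, 3, 5]
block_row_counts = np.cumsum([0] + [idx.shape[0] for idx in block_index])   # [0, 2, 4]
for bi in range(len(block_mz)):
    peak_arrayindex[block_row_counts[bi]:block_row_counts[bi + 1], 2] += block_peak_counts[bi]

# peak_arrayindex[:, 2] is now [0, 2, 3, 4]: global offsets into peak_mz/peak_values
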