Example #1
    def __read_all(self, filename):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry with the full datacube.
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

        # Compute the bin edges for reinterpolation if needed
        if self.imzml_type == self.available_imzml_types['processed']:
            shift = np.diff(self.mz).mean()
            bin_edges = np.append(self.mz, self.mz[-1] + shift)
        else:
            bin_edges = None
        for ind in xrange(0, len(reader.coordinates)):
            xidx, yidx = reader.coordinates[ind]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if bin_edges is not None:
                intens, bin_edges_new = np.histogram(mz,
                                                     bins=bin_edges,
                                                     weights=intens)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
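
The core of Example #1 is the re-binning of a "processed" (centroided) spectrum onto the shared m/z axis with np.histogram and precomputed bin edges. The following is a minimal standalone sketch of that step, using NumPy only and synthetic arrays (the names mz_axis, spec_mz, spec_intens are illustrative and not part of the OpenMSI API):

    import numpy as np

    # Target (common) m/z axis and the bin edges derived from it,
    # mirroring the bin_edges computation in __read_all above
    mz_axis = np.linspace(100.0, 110.0, 11)
    shift = np.diff(mz_axis).mean()
    bin_edges = np.append(mz_axis, mz_axis[-1] + shift)

    # A synthetic centroided spectrum with its own irregular m/z values
    spec_mz = np.array([100.4, 103.2, 103.9, 107.5])
    spec_intens = np.array([10.0, 5.0, 7.0, 2.0])

    # Sum the intensities that fall into each bin of the common axis
    rebinned, _ = np.histogram(spec_mz, bins=bin_edges, weights=spec_intens)
    print(rebinned.shape)  # one intensity value per entry of mz_axis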
Example #2
    def remove_output_target(self):
        """
        This function is used to delete any output target files created by the
        command line driver. This is done in case an error occurred and
        we do not want to leave garbage files behind.

        *Side effects* The function modifies ``self.output_target``

        :return: Boolean indicating whether we successfully cleaned up the output

        """
        success = False
        if self.__output_target_self is not None:
            try:
                os.remove(self.__output_target_self)
                log_helper.info(
                    __name__, "Successfully removed output target: " +
                    unicode(self.__output_target_self))
                success = True
            except:
                log_helper.error(
                    __name__,
                    "Clean-up of output failed. File may be left on system: " +
                    unicode(self.__output_target_self))
        elif self.output_target is not None:
            log_helper.info(
                __name__,
                "Output target not removed because it was not created " +
                "by the analysis but potentially modified by it")
        else:
            success = True
        return success
Example #3
    def remove_output_target(self):
        """
        This function is used to delete any output target files created by the
        command line driver. This is done in case an error occurred and
        we do not want to leave garbage files behind.

        *Side effects* The function modifies ``self.output_target``

        :return: Boolean indicating whether we successfully cleaned up the output
        """
        success = False
        if self.__output_target_self is not None:
            try:
                os.remove(self.__output_target_self)
                log_helper.info(__name__, "Successfully removed output target: " + unicode(self.__output_target_self))
                success = True
            except:
                log_helper.error(__name__, "Clean-up of output failed. File may be left on system: "
                                 + unicode(self.__output_target_self))
        elif self.output_target is not None:
            log_helper.info(__name__, "Output target not removed because it was not created " +
                                      "by the analysis but potentially modified by it")
        else:
            success = True
        return success
Example #4
    def __read_all(self, filename):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry with the full datacube.
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

        # Compute the bin edges for reinterpolation if needed
        if self.imzml_type == self.available_imzml_types['processed']:
            shift = np.diff(self.mz).mean()
            bin_edges = np.append(self.mz, self.mz[-1] + shift)
        else:
            bin_edges = None
        for ind in xrange(0, len(reader.coordinates)):
            xidx, yidx = reader.coordinates[ind]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if bin_edges is not None:
                f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
                intens = f(self.mz)
                #intens, bin_edges_new = np.histogram(mz, bins=bin_edges, weights=intens)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
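
Example #4 swaps the histogram re-binning for linear interpolation onto the common m/z axis. A small sketch of just that resampling step, assuming SciPy is available and using synthetic arrays (names are illustrative):

    import numpy as np
    from scipy import interpolate

    mz_axis = np.linspace(100.0, 110.0, 11)            # target axis
    spec_mz = np.array([100.4, 103.2, 103.9, 107.5])   # measured m/z values
    spec_intens = np.array([10.0, 5.0, 7.0, 2.0])      # measured intensities

    # Linear interpolation; m/z values outside the measured range become 0
    f = interpolate.interp1d(spec_mz, spec_intens, fill_value=0, bounds_error=False)
    resampled = f(mz_axis)
    print(resampled.shape)  # matches mz_axis.shape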
Example #5
    def append(self, analysis_object):
        """
        Add a given analysis to the set of objects to be executed by the workflow

        This is the same as set.add() but we ensure that only analysis_base objects
        are added.

        :param analysis_object: Analysis object to be added to the execution.
            All dependencies of the analysis will also be executed as part of the
            execution.
        :type analysis_object: omsi.analysis.base.analysis_base

        :raises: ValueError is raised if the given analysis_object is invalid
        """
        from omsi.analysis.base import analysis_base
        if isinstance(analysis_object, analysis_base):
            if analysis_object in self:
                log_helper.debug(__name__,
                                 "Analysis already in the list of tasks")
                return
            log_helper.info(
                __name__, "Adding analysis object to the workflow set. " +
                str(analysis_object))
            super(analysis_task_list, self).append(analysis_object)
        else:
            raise ValueError(
                'Analysis is not of type omsi.analysis.base.analysis_base')
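
Example #5 is essentially a type-checked list: only analysis_base instances are accepted and duplicates are silently skipped. A generic, self-contained sketch of the same pattern (the Task and typed_task_list names are made up for illustration and are not omsi classes):

    class Task(object):
        """Stand-in for omsi.analysis.base.analysis_base."""
        pass

    class typed_task_list(list):
        """List that only accepts Task instances and ignores duplicates."""
        def append(self, task):
            if not isinstance(task, Task):
                raise ValueError('Task is not of type Task')
            if task in self:
                return  # already scheduled, nothing to do
            super(typed_task_list, self).append(task)

    tasks = typed_task_list()
    task = Task()
    tasks.append(task)
    tasks.append(task)   # duplicate, silently ignored
    print(len(tasks))    # 1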
Example #6
    def __init__(self, basename, requires_slicing=True, resolution=5):
        """
        Open an img file for data reading.

        :param basename: The name of the mzml file. If basename is a directory, then the first mzML file found
                             in the directory will be used instead.
        :type basename: string

        :param requires_slicing: Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
        :type requires_slicing: bool

        :param resolution: For profile data only, the minimum m/z spacing to use for creating the "full" reprofiled
                            data cube
        :type resolution: float
        """
        # Determine the correct base
        if os.path.isdir(basename):
            filelist = self.get_files_from_dir(basename)
            if len(filelist) > 0:
                basename = filelist[0]
            else:
                raise ValueError("No valid mzML file found in the given directory.")
        # self.basename = basename
        # self.requires_slicing = requires_slicing
        
        # Call super constructor. This sets self.basename and self.readall
        super(xmassmzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing)
        self.resolution = resolution
        self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?
        self.num_scans = self.__compute_num_scans(filename=self.basename)
        log_helper.info(__name__, 'Read %s scans from mzML file.' % self.num_scans)
        log_helper.debug(__name__, 'Compute coordinates')
        self.coordinates = self.__compute_coordinates(filename=self.basename, num_scans=self.num_scans)
        # Compute the spatial configuration of the matrix
        self.x_pos = np.unique(self.coordinates[:, 0])
        self.y_pos = np.unique(self.coordinates[:, 1])
        self.step_size = min([min(np.diff(self.x_pos)), min(np.diff(self.y_pos))])

        # Compute the mz axis
        log_helper.debug(__name__, 'Compute mz axes')
        self.mz = self.__compute_mz_axis(filename=self.basename)
        log_helper.debug(__name__, 'mz axes computed')

        # Determine the shape of the dataset, result is a list of shapes for each datacube
        # self.shape_all_data = [(self.x_pos.shape[0], self.y_pos.shape[0], mz.shape[0]) for mz in self.mz_all

        log_helper.debug(__name__, 'Compute shape')
        self.shape = (self.x_pos.shape[0], self.y_pos.shape[0], len(self.mz))#self.shape[0])
        # self.shape = None
        # self.mz = None

        # Read the data into memory
        # self.data = None
        log_helper.debug(__name__, 'read all')
        if requires_slicing:
            self.data = self.__read_all()
        log_helper.debug(__name__, 'Finished with init')
Example #7
    def __read_all(self):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry.  Data is now a list of datacubes
        """

        self.data = [
            np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type)
            for scan_idx, scantype in enumerate(self.scan_types)
        ]

        for scan_idx, scantype in enumerate(self.scan_types):
            reader = mzml.read(self.basename)
            spectrumid = 0
            if not self.scan_profiled[scan_idx]:
                shift = np.diff(self.mz_all[scan_idx]).mean()
                bin_edges = np.append(self.mz_all[scan_idx],
                                      self.mz_all[scan_idx][-1] + shift)
            else:
                bin_edges = None

            for spectrum in reader:
                if spectrum['scanList']['scan'][0][
                        'filter string'] == scantype:
                    x = spectrum['m/z array']
                    try:
                        y = spectrum['intensity array']
                    except KeyError:
                        raise
                    if bin_edges is None:
                        yi = np.interp(
                            self.mz_all[scan_idx], x, y, 0,
                            0)  # Re-interpolate the data in profiled mode
                    else:
                        yi, _ = np.histogram(
                            x, bins=bin_edges, weights=y
                        )  # Re-histogram the data in centroided mode
                    xidx = np.nonzero(
                        self.x_pos == self.coordinates[spectrumid, 0])[0]
                    yidx = np.nonzero(
                        self.y_pos == self.coordinates[spectrumid, 1])[0]
                    try:
                        self.data[scan_idx][xidx, yidx, :] = yi
                    except:
                        log_helper.debug(__name__, spectrumid, scan_idx,
                                         scantype, self.mz_all[scan_idx].shape)
                # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
                if spectrumid % 1000 == 0:
                    log_helper.info(
                        __name__,
                        'Processed data for %s spectra to datacube for scan type %s'
                        % (spectrumid, scantype))
                spectrumid += 1
Example #8
    def collect_data(self, force_collect=False):
        """
        Collect the results from the parallel execution to the self.root rank.

        NOTE: On the root the self.result, self.blocks, and self.block_times variables are
              updated with the collected data as well and self.__data_collected will be set

        NOTE: If the data has already been collected previously (i.e., collect_data has been called
            before), then the collection will not be performed again, unless force_collect is set.

        :param force_collect: Set this parameter to force the data collection to be performed again.
            By default the collection is performed only once for each time the run(..) function
            is called and the results are reused to ensure consistent data structures. Setting
            force_collect forces the collection to be re-executed anyway.

        :return: On worker ranks (i.e., MPI_RANK!=self.root) this is simply the
            self.result and self.blocks containing the result created by the run function.
            On the root rank (i.e., MPI_RANK==self.root) this is a tuple of two lists
            containing the combined self.result and self.blocks data from all ranks, respectively.

        """
        try:
            from omsi.shared.log import log_helper
        except ImportError:
            from pactolus.third_party.log import log_helper
        # If we have collected the data already then we don't need to do it again
        if self.__data_collected and not force_collect:
            return self.result, self.blocks

        # Collect the output
        rank = get_rank(comm=self.comm)
        start_time = time.time()
        if rank == self.root:
            log_helper.info(__name__, "COLLECTING RESULTS")
        # Collect the data, blocks, and block_times from all ranks
        collected_data = self.comm.gather(self.result, root=self.root)
        collected_blocks = self.comm.gather(self.blocks, root=self.root)
        # Save the data to self.result, self.block, self.block_times if we are the root
        if rank == self.root:
            # Merge the results from all the processes into a single result and blocks list
            # rather than having a list of lists of results
            self.result = list(itertools.chain.from_iterable(collected_data))
            self.blocks = list(itertools.chain.from_iterable(collected_blocks))

        # Record the time we used to collect the data
        end_time = time.time()
        run_time = end_time - start_time
        if rank == self.root:
            log_helper.info(__name__, "TIME FOR COLLECTING DATA FROM ALL TASKS: " + str(run_time))
        # Return the result
        self.__data_collected = True
        return self.result, self.blocks
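
On the root rank, collect_data flattens the per-rank lists returned by comm.gather into one combined list with itertools.chain.from_iterable. The flattening step in isolation, with a plain nested list standing in for the gather output (no MPI required):

    import itertools

    # What comm.gather(self.result, root=self.root) yields on the root:
    # one result list per MPI rank
    collected_data = [[1, 2], [3], [4, 5, 6]]

    # Merge into a single flat list instead of a list of lists
    result = list(itertools.chain.from_iterable(collected_data))
    print(result)  # [1, 2, 3, 4, 5, 6]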
Example #9
    def update_job_status(filepath, db_server, jobid, status='complete'):
        """
        Function used to update the status of the job on the server

        :param filepath: Path of the file to be added to the database (only needed to update file permissions)
        :param db_server: The database server url
        :param jobid: The id of the current job.
        :param status: One of 'running', 'complete' or 'error'
        """
        import urllib2
        import urllib

        # If we are at NERSC then set the NERSC Apache permissions
        if 'nersc.gov' in db_server:
            WebHelper.set_apache_acl(filepath)

        # Construct the db add-file url
        update_status_url = os.path.join(db_server,
                                         "openmsi/processing/update")
        query_params = {'jobid': jobid, 'status': status}
        update_status_url += "?"
        update_status_url += urllib.urlencode(query_params)

        # Make the url request
        try:
            log_helper.info(__name__,
                            "Updating job status: " + update_status_url)
            url_response = urllib2.urlopen(url=update_status_url)
            if url_response.code == 200:
                return True
        except urllib2.HTTPError as request_error:
            raise ValueError("ERROR: job status could not be updated: \n" +
                             "      Error-code:" + str(request_error.code) +
                             "\n" + "      Error info:" +
                             str(request_error.read()))
        except urllib2.URLError as request_error:
            if sys.version_info >= (2, 7, 9):
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                url_response = urllib2.urlopen(url=update_status_url,
                                               context=ssl_context)
                if url_response.code == 200:
                    return True
            else:
                raise ValueError("ERROR: job status could not be updated: \n" +
                                 "      Error-code:" +
                                 str(request_error.code) + "\n" +
                                 "      Error info:" +
                                 str(request_error.read()))
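
update_job_status builds the request URL by appending an urlencoded query string to the server's update endpoint. A sketch of the URL construction alone, written against the Python 2 urllib used in the example; the server URL and job id are placeholders:

    import os
    import urllib

    db_server = "https://example.org"   # placeholder server URL
    update_status_url = os.path.join(db_server, "openmsi/processing/update")
    query_params = {'jobid': '12345', 'status': 'running'}  # illustrative values
    update_status_url += "?" + urllib.urlencode(query_params)
    print(update_status_url)
    # e.g. https://example.org/openmsi/processing/update?status=running&jobid=12345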
Example #10
    def collect_data(self, force_collect=False):
        """
        Collect the results from the parallel execution to the self.root rank.

        NOTE: On the root the self.result, self.blocks, and self.block_times variables are
              updated with the collected data as well and self.__data_collected will be set

        NOTE: If the data has already been collected previously (i.e., collect_data has been called
            before), then the collection will not be performed again, unless force_collect is set.

        :param force_collect: Set this parameter to force the data collection to be performed again.
            By default the collection is performed only once for each time the run(..) function
            is called and the results are reused to ensure consistent data structures. Setting
            force_collect forces the collection to be re-executed anyway.

        :return: On worker ranks (i.e., MPI_RANK!=self.root) this is simply the
            self.result and self.blocks containing the result created by the run function.
            On the root rank (i.e., MPI_RANK==self.root) this is a tuple of two lists
            containing the combined self.result and self.blocks data from all ranks, respectively.

        """
        from omsi.shared.log import log_helper
        # If we have collected the data already then we don't need to do it again
        if self.__data_collected and not force_collect:
            return self.result, self.blocks

        # Collect the output
        rank = get_rank(comm=self.comm)
        start_time = time.time()
        if rank == self.root:
            log_helper.info(__name__, "COLLECTING RESULTS")
        # Collect the data, blocks, and block_times from all ranks
        collected_data = self.comm.gather(self.result, root=self.root)
        collected_blocks = self.comm.gather(self.blocks, root=self.root)
        # Save the data to self.result, self.block, self.block_times if we are the root
        if rank == self.root:
            # Merge the results from all the processes into a single result and blocks list
            # rather than having a list of lists of results
            self.result = list(itertools.chain.from_iterable(collected_data))
            self.blocks = list(itertools.chain.from_iterable(collected_blocks))

        # Record the time we used to collect the data
        end_time = time.time()
        run_time = end_time - start_time
        if rank == self.root:
            log_helper.info(__name__, "TIME FOR COLLECTING DATA FROM ALL TASKS: " + str(run_time))
        # Return the result
        self.__data_collected = True
        return self.result, self.blocks
Example #11
    def set_apache_acl(filepath):
        """
        Helper function used to set ACL permissions to make the given file accessible
        to Apache at NERSC. This is necessary to make the file readable for adding it to the
        database.

        :param filepath: String with the path to the file for which ACL permission should be set

        """
        log_helper.info(__name__, "Setting NERSC ACL permissions for Apache")
        # Note u:48 is a replacement for u:apache to ensure that
        # the command works properly on edison.nersc.gov, which
        # does not have the apache user. However u:48 is equivalent.
        command = "setfacl -R -m u:48:rwx " + '"' + filepath + '"'
        os.system(command)
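
set_apache_acl shells out via os.system and has to quote the path by hand. As an alternative, a hedged sketch using subprocess with an argument list, which sidesteps the quoting entirely (same setfacl invocation; this is not the code OpenMSI ships):

    import subprocess

    def set_apache_acl(filepath):
        # u:48 is the numeric uid standing in for u:apache (see the note above)
        command = ["setfacl", "-R", "-m", "u:48:rwx", filepath]
        return_code = subprocess.call(command)
        if return_code != 0:
            raise RuntimeError("setfacl failed with exit code %d" % return_code)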
Example #13
    def insert(self, index, analysis_object):
        """
        Insert a given analysis object at the given location

        :param index: Location where the object should be inserted
        :param analysis_object: The analysis object to be inserted

        """
        from omsi.analysis.base import analysis_base
        if isinstance(analysis_object, analysis_base):
            if analysis_object in self:
                log_helper.debug(__name__, "Analysis already in the list of tasks")
                return
            log_helper.info(__name__, "Inserting analysis object in the workflow list. " + str(analysis_object))
            super(analysis_task_list, self).insert(index, analysis_object)
        else:
            raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
Example #14
    def update_job_status(filepath, db_server, jobid, status='complete'):
        """
        Function used to update the status of the job on the server

        :param filepath: Path of the file to be added to the database (only needed to update file permissions)
        :param db_server: The database server url
        :param jobid: The id of the current job.
        :param status: One of 'running', 'complete' or 'error'
        """
        import urllib2
        import urllib

        # If we are at NERSC then set the NERSC Apache permissions
        if 'nersc.gov' in db_server:
            WebHelper.set_apache_acl(filepath)

        # Construct the db add-file url
        update_status_url = os.path.join(db_server, "openmsi/processing/update")
        query_params = {'jobid': jobid, 'status': status}
        update_status_url += "?"
        update_status_url += urllib.urlencode(query_params)

        # Make the url request
        try:
            log_helper.info(__name__, "Updating job status: " + update_status_url)
            url_response = urllib2.urlopen(url=update_status_url)
            if url_response.code == 200:
                return True
        except urllib2.HTTPError as request_error:
            raise ValueError("ERROR: job status could not be updated: \n" +
                             "      Error-code:" + str(request_error.code) + "\n" +
                             "      Error info:" + str(request_error.read()))
        except urllib2.URLError as request_error:
            if sys.version_info >= (2, 7, 9):
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                url_response = urllib2.urlopen(url=update_status_url, context=ssl_context)
                if url_response.code == 200:
                    return True
            else:
                raise ValueError("ERROR: job status could not be updated: \n" +
                                 "      Error-code:" + str(request_error.code) + "\n" +
                                 "      Error info:" + str(request_error.read()))
Example #15
 def print_settings(self):
     """
     Print the analysis settings.
     """
     log_helper.info(__name__, "Inputs:")
     for key, value in self.analysis_arguments.iteritems():
         log_helper.info(__name__, "   " + unicode(key) + " = " + unicode(value))
     if self.output_target is not None:
         if isinstance(self.output_target, omsi_file_common):
             h5py_object = omsi_file_common.get_h5py_object(self.output_target)
             log_helper.info(__name__, "Save to: " + unicode(h5py_object.file.filename) + u":" + unicode(h5py_object.name))
         else:
             log_helper.info(__name__, "Save to: " + unicode(self.output_target))
Example #16
    def __init__(self, basename, requires_slicing=True, resolution=15):
        """
        Open an imzml file for data reading.

        :param  basename:   The name of the mzml file. If basename is a directory, then the first mzML file found
                             in the directory will be used instead.
        :type   basename:   string

        :param  requires_slicing:   Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
        :type   requires_slicing:   bool

        :param resolution: For processed data only, the minimum m/z spacing to use for creating the "full" reprofiled
                            data cube
        :type resolution: float
        """
        # Determine the correct base
        if os.path.isdir(basename):
            filelist = self.get_files_from_dir(basename)
            if len(filelist) > 0:
                basename = filelist[0]
            else:
                raise ValueError(
                    "No valid imzML file found in the given directory.")

        # Call super constructor. This sets self.basename and self.readall
        super(imzml_file, self).__init__(basename=basename,
                                         requires_slicing=requires_slicing)
        self.resolution = resolution

        # Compute the mz axis, pixel coordinates data type etc.
        self.coordinates, self.mz, self.data_type, self.imzml_type, self.dataset_metadata, self.instrument_metadata, \
        self.method_metadata = self.__compute_file_info(filename=self.basename, resolution=self.resolution)

        self.num_scans = self.coordinates.size
        log_helper.info(__name__,
                        'Read %s scans from imzML file.' % self.num_scans)

        # Compute step size
        self.x_pos = np.unique(self.coordinates[:, 0])
        self.y_pos = np.unique(self.coordinates[:, 1])
        self.x_pos_min = self.x_pos.min()
        self.y_pos_min = self.y_pos.min()

        # self.step_size = min([min(np.diff(self.x_pos)), min(np.diff(self.y_pos))])

        # Determine the shape of the dataset ## TODO: after solving imzML generation prob, fix this for multicube data
        num_x = self.x_pos.max() - self.x_pos.min() + 1
        num_y = self.y_pos.max() - self.y_pos.min() + 1

        self.shape = (num_x, num_y, self.mz.size)

        # Read the data into memory
        self.data = None
        if requires_slicing:
            self.__read_all(filename=basename)

        log_helper.info(__name__, "IMZML file type: " + str(self.imzml_type))
        log_helper.info(__name__, "IMZML data type: " + str(self.data_type))
Example #17
    def __read_all(self):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry.  Data is now a list of datacubes
        """

        self.data = [np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type) for scan_idx, scantype in enumerate(self.scan_types)]

        for scan_idx, scantype in enumerate(self.scan_types):
            reader = mzml.read(self.basename)
            spectrumid = 0
            if not self.scan_profiled[scan_idx]:
                shift = np.diff(self.mz_all[scan_idx]).mean()
                bin_edges = np.append(self.mz_all[scan_idx], self.mz_all[scan_idx][-1] + shift)
            else:
                bin_edges = None

            for spectrum in reader:
                if spectrum['scanList']['scan'][0]['filter string'] == scantype:
                    x = spectrum['m/z array']
                    try:
                        y = spectrum['intensity array']
                    except KeyError:
                        raise
                    if bin_edges is None:
                        yi = np.interp(self.mz_all[scan_idx], x, y, 0, 0)  # Re-interpolate the data in profiled mode
                    else:
                        yi, _ = np.histogram(x, bins=bin_edges, weights=y)  # Re-histogram the data in centroided mode
                    xidx = np.nonzero(self.x_pos == self.coordinates[spectrumid, 0])[0]
                    yidx = np.nonzero(self.y_pos == self.coordinates[spectrumid, 1])[0]
                    try:
                        self.data[scan_idx][xidx, yidx, :] = yi
                    except:
                        log_helper.debug(__name__, spectrumid, scan_idx, scantype, self.mz_all[scan_idx].shape)
                # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
                if spectrumid % 1000 == 0:
                    log_helper.info(__name__, 'Processed data for %s spectra to datacube for scan type %s' % (spectrumid, scantype))
                spectrumid += 1
Example #18
    def __init__(self, basename, requires_slicing=True, resolution=15):
        """
        Open an imzml file for data reading.

        :param  basename:   The name of the mzml file. If basename is a directory, then the first mzML file found
                             in the directory will be used instead.
        :type   basename:   string

        :param  requires_slicing:   Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
        :type   requires_slicing:   bool

        :param resolution: For processed data only, the minimum m/z spacing to use for creating the "full" reprofiled
                            data cube
        :type resolution: float
        """
        # Determine the correct base
        if os.path.isdir(basename):
            filelist = self.get_files_from_dir(basename)
            if len(filelist) > 0:
                basename = filelist[0]
            else:
                raise ValueError("No valid imzML file found in the given directory.")

        # Call super constructor. This sets self.basename and self.readall
        super(imzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing)
        self.resolution = resolution

        # Compute the mz axis, pixel coordinates data type etc.
        self.coordinates, self.mz, self.data_type, self.imzml_type, self.dataset_metadata, self.instrument_metadata, \
        self.method_metadata = self.__compute_file_info(filename=self.basename, resolution=self.resolution)

        self.num_scans = self.coordinates.size
        log_helper.info(__name__, 'Read %s scans from imzML file.' % self.num_scans)

        # Compute step size
        self.x_pos = np.unique(self.coordinates[:, 0])
        self.y_pos = np.unique(self.coordinates[:, 1])
        self.x_pos_min = self.x_pos.min()
        self.y_pos_min = self.y_pos.min()

        # self.step_size = min([min(np.diff(self.x_pos)), min(np.diff(self.y_pos))])

        # Determine the shape of the dataset ## TODO: after solving imzML generation prob, fix this for multicube data
        num_x = self.x_pos.max() - self.x_pos.min() + 1
        num_y = self.y_pos.max() - self.y_pos.min() + 1

        self.shape = (num_x, num_y, self.mz.size)

        # Read the data into memory
        self.data = None
        if requires_slicing:
            self.__read_all(filename=basename)

        log_helper.info(__name__, "IMZML file type: " + str(self.imzml_type))
        log_helper.info(__name__, "IMZML data type: " + str(self.data_type))
Example #19
 def print_settings(self):
     """
     Print the analysis settings.
     """
     log_helper.info(__name__, "Inputs:")
     for key, value in sorted(self.analysis_arguments.iteritems()):
         log_helper.info(__name__,
                         "   " + unicode(key) + " = " + unicode(value))
     if self.output_target is not None:
         if isinstance(self.output_target, omsi_file_common):
             h5py_object = omsi_file_common.get_h5py_object(
                 self.output_target)
             log_helper.info(
                 __name__,
                 "Save to: " + unicode(h5py_object.file.filename) + u":" +
                 unicode(h5py_object.name))
         else:
             log_helper.info(__name__,
                             "Save to: " + unicode(self.output_target))
Example #20
    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(
                        analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__,
                                     "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(
                    __name__,
                    "Workflow could not be fully executed. " + str(num_tasks) +
                    " remain in the queue but cannot be completed due to unresolved dependencies."
                )
                break
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
Example #21
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.",
                                   root=self.mpi_root, comm=self.mpi_comm)
                break
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
Example #22
    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.")
                break
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
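
All three main() variants run the same greedy loop: execute every analysis whose dependencies are satisfied, stop when nothing is pending, and warn instead of spinning when the remaining tasks can never become ready. A toy sketch of that control flow with a minimal stand-in task class (ToyTask and run_workflow are illustrative names, not omsi API):

    class ToyTask(object):
        def __init__(self, name, depends_on=None):
            self.name = name
            self.depends_on = depends_on or []
            self.done = False

        def check_ready_to_execute(self):
            # List of unsatisfied dependencies; empty means ready to run
            return [t for t in self.depends_on if not t.done]

        def execute(self):
            self.done = True

    def run_workflow(tasks):
        while True:
            # Run everything that is ready and has not been run yet
            for task in tasks:
                if not task.done and len(task.check_ready_to_execute()) == 0:
                    task.execute()
            pending = [t for t in tasks if not t.done]
            if not pending:
                break  # workflow complete
            if all(len(t.check_ready_to_execute()) > 0 for t in pending):
                # Unresolved dependencies: warn and stop instead of looping forever
                print("%d tasks remain in the queue but cannot be completed" % len(pending))
                break

    a = ToyTask('a')
    b = ToyTask('b', depends_on=[a])
    run_workflow([b, a])     # 'a' runs in the first pass, 'b' in the second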
Example #23
    def register_file_with_db(filepath,
                              db_server,
                              file_user_name,
                              jobid=None,
                              check_add_nersc=True):
        """ Function used to register a given file with the database

            :param filepath: Path of the file to be added to the database
            :param db_server: The database server url
            :param file_user_name: The user to be used, or None if the user should
                                    be determined based on the file URL.
            :param jobid: Optional input parameter defining the jobid to be updated.
                          If the jobid is given then the job will be updated with the
                          database instead of adding the file explicitly. I.e.,
                          instead of register_file_with_db the update_job_status call
                          is executed.
            :param check_add_nersc: Boolean; if set to True, performs additional actions to add the
                         file to the public OpenMSI gateway hosted at NERSC.

            :returns: Boolean indicating whether the operation was successful

        """
        import urllib2
        import urllib

        if jobid is not None:
            return WebHelper.update_job_status(filepath=filepath,
                                               db_server=db_server,
                                               jobid=jobid,
                                               status='complete')
        # Check if the file is in one of the allowed NERSC locations for the default database server
        if db_server == WebHelper.default_db_server_url and check_add_nersc:
            is_allowed_path = False
            for allowed_nersc_location in WebHelper.allowed_nersc_locations:
                if filepath.startswith(allowed_nersc_location):
                    is_allowed_path = True
                    break
            if not is_allowed_path and file_user_name in WebHelper.super_users:
                print "WARNING: Attempt to add a file to openmsi.nersc.gov that is not in a default location."
                print "Do you want to add the file? (Y/N):"
                num_trys = 3
                timeout = 5 * 60  # Timeout after 5 minutes
                for i in range(num_trys):
                    # user_input = raw_input()
                    user_input = UserInput.userinput_with_timeout(
                        timeout=timeout, default=None)
                    if user_input is None:
                        warnings.warn(
                            "WARNING: Attempt to add a file to openmsi.nersc.gov that" +
                            " is not in a default location. Timeout occurred before" +
                            " user confirmed. Aborted adding the file to the DB.")
                        return False
                    if user_input == "Y" or user_input == "y" or user_input == "Yes" or \
                            user_input == "yes" or user_input == "YES":
                        break
                    elif user_input == "N" or user_input == "n" or user_input == "No" or \
                            user_input == "no" or user_input == "NO":
                        return False
                    else:
                        if i == (num_trys - 1):
                            warnings.warn(
                                "WARNING: Attempt to add a file to openmsi.nersc.gov that" +
                                " is not in a default location. User input unrecognized." +
                                " Aborted adding the file to the DB.")
                            return False
                        print "Unrecognized response. Do you want to add the file? (Y/N): "
            elif not is_allowed_path:
                warnings.warn(
                    "Adding file to the OpenMSI database in unconventional location not permitted for user."
                )
                return False
            else:
                pass  # Adding the file to the db is allowed

        # If we are at NERSC then set the NERSC Apache permissions
        if 'nersc.gov' in db_server and check_add_nersc:
            WebHelper.set_apache_acl(filepath)

        # Determine the user
        curr_user = file_user_name
        if not curr_user:
            curr_user = os.path.dirname(filepath).split("/")[-1]
        if not curr_user:
            raise ValueError(
                "ERROR: File could not be added to DB. Owner could not be determined."
            )

        # Construct the db add-file url
        add_file_url = os.path.join(db_server, "openmsi/resources/addfile")
        addfilepath = filepath
        # Correct the filepath if we are on openmsi.nersc.gov, as /global is not mounted but only /project.
        if db_server == WebHelper.default_db_server_url and addfilepath.startswith(
                "/global/project/projectdirs"):
            addfilepath = filepath[len("/global"):]  # remove the "/global" prefix but keep the leading "/"
        query_params = {
            'file': os.path.abspath(addfilepath),
            'owner': curr_user
        }
        add_file_url += "?"
        add_file_url += urllib.urlencode(query_params)
        # add_file_url = add_file_url + "?file=" + \
        #    os.path.abspath(filepath) + "&user=" + curr_user
        try:
            log_helper.info(__name__, "Registering file with DB: " + add_file_url)
            url_response = urllib2.urlopen(url=add_file_url)
            if url_response.code == 200:
                return True
        except urllib2.HTTPError as request_error:
            raise ValueError("ERROR: File could not be added to DB: \n" +
                             "      Error-code:" + str(request_error.code) +
                             "\n" + "      Error info:" +
                             str(request_error.read()))

        return False
Example #24
    def __init__(self, basename, requires_slicing=True, resolution=5):
        """
        Open an img file for data reading.

        :param basename: The name of the mzml file. If basename is a directory, then the first mzML file found
                             in the directory will be used instead.
        :type basename: string

        :param requires_slicing: Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
        :type requires_slicing: bool

        :param resolution: For profile data only, the minimum m/z spacing to use for creating the "full" reprofiled
                            data cube
        :type resolution: float
        """
        # Determine the correct base
        if os.path.isdir(basename):
            filelist = self.get_files_from_dir(basename)
            if len(filelist) > 0:
                basename = filelist[0]
            else:
                raise ValueError(
                    "No valid mzML file found in the given directory.")
        # self.basename = basename
        # self.requires_slicing = requires_slicing

        # Call super constructor. This sets self.basename and self.readall
        super(xmassmzml_file, self).__init__(basename=basename,
                                             requires_slicing=requires_slicing)
        self.resolution = resolution
        self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?
        self.num_scans = self.__compute_num_scans(filename=self.basename)
        log_helper.info(__name__,
                        'Read %s scans from mzML file.' % self.num_scans)
        log_helper.debug(__name__, 'Compute coordinates')
        self.coordinates = self.__compute_coordinates(filename=self.basename,
                                                      num_scans=self.num_scans)
        # Compute the spatial configuration of the matrix
        self.x_pos = np.unique(self.coordinates[:, 0])
        self.y_pos = np.unique(self.coordinates[:, 1])
        self.step_size = min(
            [min(np.diff(self.x_pos)),
             min(np.diff(self.y_pos))])

        # Compute the mz axis
        log_helper.debug(__name__, 'Compute mz axes')
        self.mz = self.__compute_mz_axis(filename=self.basename)
        log_helper.debug(__name__, 'mz axes computed')

        # Determine the shape of the dataset, result is a list of shapes for each datacube
        # self.shape_all_data = [(self.x_pos.shape[0], self.y_pos.shape[0], mz.shape[0]) for mz in self.mz_all

        log_helper.debug(__name__, 'Compute shape')
        self.shape = (self.x_pos.shape[0], self.y_pos.shape[0], len(self.mz)
                      )  #self.shape[0])
        # self.shape = None
        # self.mz = None

        # Read the data into memory
        # self.data = None
        log_helper.debug(__name__, 'read all')
        if requires_slicing:
            self.data = self.__read_all()
        log_helper.debug(__name__, 'Finished with init')
Example #25
    def register_file_with_db(filepath, db_server, file_user_name, jobid=None, check_add_nersc=True):
        """ Function used to register a given file with the database

            :param filepath: Path of the file to be added to the database
            :param db_server: The database server url
            :param file_user_name: The user to be used, or None if the user should
                                    be determined based on the file URL.
            :param jobid: Optional input parameter defining the jobid to be updated.
                          If the jobid is given then the job will be updated with the
                          database instead of adding the file explicitly. I.e.,
                          instead of register_file_with_db the update_job_status call
                          is executed.
            :param check_add_nersc: Boolean; if set to True, performs additional actions to add the
                         file to the public OpenMSI gateway hosted at NERSC.

            :returns: Boolean indicating whether the operation was successful

        """
        import urllib2
        import urllib

        if jobid is not None:
            return WebHelper.update_job_status(filepath=filepath,
                                               db_server=db_server,
                                               jobid=jobid,
                                               status='complete')
        # Check if the file is in one of the allowed NERSC locations for the default database server
        if db_server == WebHelper.default_db_server_url and check_add_nersc:
            is_allowed_path = False
            for allowed_nersc_location in WebHelper.allowed_nersc_locations:
                if filepath.startswith(allowed_nersc_location):
                    is_allowed_path = True
                    break
            if not is_allowed_path and file_user_name in WebHelper.super_users:
                print "WARNING: Attempt to add a file to openmsi.nersc.gov that is not in a default location."
                print "Do you want to add the file? (Y/N):"
                num_trys = 3
                timeout = 5*60  # Timeout after 5 minutes
                for i in range(num_trys):
                    # user_input = raw_input()
                    user_input = UserInput.userinput_with_timeout(timeout=timeout, default=None)
                    if user_input is None:
                        warnings.warn("WARNING: Attempt to add a file to openmsi.nersc.gov that," +
                                      " is not in a default location. Timeout occurred before" +
                                      " user confirmed. Aborted adding the file to the DB.")
                        return False
                    if user_input == "Y" or user_input == "y" or user_input == "Yes" or \
                            user_input == "yes" or user_input == "YES":
                        break
                    elif user_input == "N" or user_input == "n" or user_input == "No" or \
                            user_input == "no" or user_input == "NO":
                        return False
                    else:
                        if i == (num_trys - 1):
                            warnings.warn("WARNING: Attempt to add a file to openmsi.nersc.gov that," +
                                          " is not in a default location. User input unrecognized." +
                                          " Aborted adding the file to the DB.")
                            return False
                        print "Unrecognized response. Do you want to add the file? (Y/N): "
            elif not is_allowed_path:
                warnings.warn("Adding file to the OpenMSI database in unconventional location not permitted for user.")
                return False
            else:
                pass  # Adding the file to the db is allowed

        # If we are at NERSC then set the NERSC Apache permissions
        if 'nersc.gov' in db_server and check_add_nersc:
            WebHelper.set_apache_acl(filepath)

        # Determine the user
        curr_user = file_user_name
        if not curr_user:
            curr_user = os.path.dirname(filepath).split("/")[-1]
        if not curr_user:
            raise ValueError("ERROR: File could not be added to DB. Owner could not be determined.")

        # Construct the db add-file url
        add_file_url = os.path.join(db_server, "openmsi/resources/addfile")
        addfilepath = filepath
        # Correct the filepath if we are on openmsi.nersc.gov, as /global is not mounted but only /project.
        if db_server == WebHelper.default_db_server_url and addfilepath.startswith("/global/project/projectdirs"):
            addfilepath = filepath[len("/global"):]  # remove the "/global" prefix but keep the leading "/"
        query_params = {'file': os.path.abspath(addfilepath), 'owner': curr_user}
        add_file_url += "?"
        add_file_url += urllib.urlencode(query_params)
        # add_file_url = add_file_url + "?file=" + \
        #    os.path.abspath(filepath) + "&user=" + curr_user
        try:
            log_helper.info(__name__, "Registering file with DB: " + add_file_url)
            url_response = urllib2.urlopen(url=add_file_url)
            if url_response.code == 200:
                return True
        except urllib2.HTTPError as request_error:
            raise ValueError("ERROR: File could not be added to DB: \n" +
                             "      Error-code:" + str(request_error.code) + "\n" +
                             "      Error info:" + str(request_error.read()))

        return False
Example #26
    def __run_static_1D(self):
        """
        Run the task function using a static task decomposition schema.

        The data is divided into sub-blocks along the largest split_axis

        :return: Tuple with the following elements:

            1) List with the results from the local execution of the task_function. Each
               entry is the result from one return of the task_function. In the case of static
               execution, this is always a list of length 1.
            2) List of block_indexes. Each block_index is a tuple with the selection used to
               divide the data into sub-blocks. In the case of static decomposition we have
               a range slice object along the axes used for decomposition.

        """
        try:
            from omsi.shared.log import log_helper
        except ImportError:
            from pactolus.third_party.log import log_helper
        start_time = time.time()
        # Get MPI parameters
        rank = get_rank(comm=self.comm)
        size = get_size(comm=self.comm)

        # Get data shape parameters and compute the data blocks
        # Determine the longest axis along which we can split the data
        axes_shapes = np.asarray(self.main_data.shape)[self.split_axes]
        total_num_subblocks = np.prod(axes_shapes)
        if total_num_subblocks < size:
            size = total_num_subblocks
            if rank == self.root:
                log_helper.info(__name__,
                                "Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")
        axes_sort_index = np.argsort(axes_shapes)[::-1]
        split_axis = self.split_axes[axes_sort_index[0]]
        split_axis_size = axes_shapes[split_axis]
        if split_axis_size < size:
            raise NotImplementedError("STATIC scheduling currently parallelizes only over one axis, " +
                                      "and the largest axis is too small to fill all MPI tasks")
        # Determine the size of 1D block
        block_size = int(split_axis_size / float(size) + 0.5)
        if block_size * size > split_axis_size and block_size > 1:
            block_size -= 1

        # Compute a block for every rank
        self.blocks = [slice(None)] * len(self.main_data.shape)
        start_index = rank * block_size
        stop_index = start_index + block_size
        if rank == (size-1):
            if stop_index != split_axis_size:
                stop_index = split_axis_size
        self.blocks[axes_sort_index[0]] = slice(start_index, stop_index)
        self.blocks = tuple(self.blocks)
        log_helper.info(__name__, "Rank: " + str(rank) + " Block: " + str(self.blocks))

        # Execute the task_function on the given data block
        task_params = self.task_function_params
        task_params[self.main_data_param_name] = self.main_data[self.blocks]
        self.result = self.task_function(**task_params)

        end_time = time.time()
        run_time = end_time - start_time
        self.block_times = [run_time, ]
        log_helper.info(__name__, "TIME FOR PROCESSING THE DATA BLOCK: " + str(run_time))

        # Return the output
        self.result = [self.result, ]
        self.blocks = [self.blocks, ]
        return self.result, self.blocks
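
The heart of the static decomposition above is the per-rank slice arithmetic; a minimal standalone sketch of just that step (the function name is ours, MPI omitted):

def static_1d_blocks(axis_size, num_ranks):
    """Return one slice per rank along a single axis, using the same rounding
    scheme as above: the last rank absorbs any remainder."""
    block_size = int(axis_size / float(num_ranks) + 0.5)
    if block_size * num_ranks > axis_size and block_size > 1:
        block_size -= 1
    blocks = []
    for rank in range(num_ranks):
        start = rank * block_size
        stop = axis_size if rank == num_ranks - 1 else start + block_size
        blocks.append(slice(start, stop))
    return blocks

# static_1d_blocks(10, 4) -> [slice(0, 2), slice(2, 4), slice(4, 6), slice(6, 10)]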
Example #27
    def __run_dynamic(self):
        """
        Run the task function using dynamic task scheduling.

        The root rank divides the data into sub-tasks and sends the tasks to available MPI
        processes on request.

        :return: Tuple with the following elements:

            1) List with the results from the local execution of the task_function. Each
               entry is the result from one return of the task_function.
            2) List of block_indexes. Each block_index is a tuple with the selection used to
               divide the data into sub-blocks. In the case of static decomposition we have
               a range slice object along the axes used for decomposition.

        """
        try:
            from omsi.shared.log import log_helper
        except ImportError:
            from pactolus.third_party.log import log_helper
        import time
        rank = get_rank(comm=self.comm)
        size = get_size(comm=self.comm)

        if size < 2:
            warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.')
            return self.__run_static_1D()

        # We are the controlling rank
        if rank == self.root:
            self.result = []
            self.blocks = []
            self.block_times = []
            # Get data shape parameters and compute the data blocks
            axes_shapes = np.asarray(self.main_data.shape)[self.split_axes]
            total_num_subblocks = np.prod(axes_shapes)
            if total_num_subblocks < size:
                if rank == self.root:
                    warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")

            # Compute the list of all possible blocks
            base_blocks = [[slice(None)]] * len(self.main_data.shape)
            for axis_index in self.split_axes:
                base_blocks[axis_index] = range(self.main_data.shape[axis_index])
            block_tuples = itertools.product(*base_blocks)

            # Communicate blocks with task ranks
            log_helper.info(__name__, "PROCESSING DATA BLOCKS")
            start_time = time.time()
            block_index = 0
            for block_selection in block_tuples:
                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((block_index, block_selection),
                               dest=request_rank,
                               tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                block_index += 1
                if (block_index % 100) == 0:
                    log_helper.debug(__name__, str((block_index, total_num_subblocks, request_rank)))
            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time))
            start_time = time.time()
            log_helper.info(__name__, "FINALIZING")
            # Terminate all ranks and receive all data from the different ranks if requested
            all_ranks_status = np.zeros(size, 'bool')
            all_ranks_status[self.root] = True
            while not np.all(all_ranks_status):

                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((None, None), dest=request_rank, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                all_ranks_status[request_rank] = True

            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time))

        # We are a rank that has to run tasks
        else:
            # Request a new data block
            self.result = []
            self.blocks = []
            self.block_times = []
            while True:
                start_time = time.time()
                self.comm.send(rank, dest=self.root, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                block_index, block_selection = self.comm.recv(source=self.root, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                if block_index is None:
                    break
                # Execute the task_function on the given data block
                task_params = self.task_function_params
                task_params[self.main_data_param_name] = self.main_data[block_selection]
                self.result.append(self.task_function(**task_params))
                self.blocks.append(block_selection)
                # Record the timings
                end_time = time.time()
                run_time = end_time - start_time
                self.block_times.append(run_time)

        # Return the result
        return self.result, self.blocks
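
A stripped-down, MPI-free sketch of the request/assign loop the root rank runs above (all names here are ours; handing blocks to "whoever asks first" is simulated round-robin):

import itertools

def dynamic_block_assignments(shape, split_axes, worker_ranks):
    """Enumerate every sub-block selection (one index per split axis, full
    slices elsewhere) and hand them out one at a time, as the root does above."""
    base_blocks = [[slice(None)]] * len(shape)
    for axis in split_axes:
        base_blocks[axis] = range(shape[axis])
    assignments = dict((w, []) for w in worker_ranks)
    for i, block in enumerate(itertools.product(*base_blocks)):
        assignments[worker_ranks[i % len(worker_ranks)]].append(block)
    return assignments

# dynamic_block_assignments((4, 5, 1000), split_axes=[0, 1], worker_ranks=[1, 2, 3])
# distributes the 20 (x, y, slice(None)) selections over ranks 1-3.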
Example #28
    def __compute_file_info(cls, filename, resolution):
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the mz axis, data type for the intensities, format type

        :return: Numpy array with mz axis
        :return: string with data type
        :return: imzml file type
        :return:
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)  # NOTE: mz_axes is a tuple
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)
        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
        for ind in range(coordinates.shape[0]):  # for ind, loc in enumerate(reader.coordinates):
            mz, intens = reader.getspectrum(ind)
            if mz != mz_axes:
                file_type = cls.available_imzml_types['processed']
                if min_mz > np.amin(mz):
                    min_mz = np.amin(mz)
                if max_mz < np.amax(mz):
                    max_mz = np.amax(mz)
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
            num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)
            log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        for k, v in reader.imzmldict.iteritems():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser before re-reading the XML header for the remaining metadata
        del reader

        # Parse the metadata for the file. We try to parse only the header and ignore the
        # <run > group in the XML file to avoid going through the whole file again
        # while extracting the majority of the relevant metadata
        try:
            with open(filename, 'r') as ins:
                metdata_header = ''
                for line in ins:
                    if '<run' in line:
                        break
                    else:
                        metdata_header += line
                metdata_header += '</mzML>'
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.iteritems():
                    store_value = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=str(k) + " extracted from imzML XML header.",
                                                 ontology=None)
                    if k == 'instrumentConfigurationList':
                        instrument_metadata[k] = store_value
                    elif k == 'dataProcessingList':
                        method_metadata[k] = store_value
                    elif k == 'scanSettingsList':
                        dataset_metadata[k] = store_value
                    elif k == 'softwareList':
                        method_metadata[k] = store_value
                    elif k == 'sampleList':
                        method_metadata[k] = store_value
                    else:
                        dataset_metadata[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
                    name='imzml_xml_metadata_header',
                    value=metdata_header,
                    unit=None,
                    description='XML imzML header',
                    ontology=None)
        except:
            log_helper.warning(
                __name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(mz_axes), dtype, file_type, \
            dataset_metadata, instrument_metadata, method_metadata
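
In isolation, the m/z re-gridding rule used above for processed-mode files (bin count from a ppm-style resolution, then logarithmic spacing); a minimal sketch with our own function name:

import numpy as np

def logspaced_mz_axis(min_mz, max_mz, resolution):
    """Build a logarithmically spaced m/z axis; the number of bins follows
    the ceil(1e6 * ln(max/min) / resolution) rule used above."""
    num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
    return np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)

# logspaced_mz_axis(100.0, 1000.0, resolution=5000.0).size -> 461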
Example #29
    def __run_dynamic(self):
        """
        Run the task function using dynamic task scheduling.

        The root rank divides the data into sub-tasks and sends the tasks to available MPI
        processes on request.

        :return: Tuple with the following elements:

            1) List with the results from the local execution of the task_function. Each
               entry is the result from one return of the task_function.
            2) List of block_indexes. Each block_index is a tuple with the selection used to
               divide the data into sub-blocks. In the case of static decomposition we have
               a range slice object along the axes used for decomposition.

        """
        from omsi.shared.log import log_helper
        import time
        rank = get_rank(comm=self.comm)
        size = get_size(comm=self.comm)

        if size < 2:
            warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.')
            return self.__run_static_1D()

        # We are the controlling rank
        if rank == self.root:
            self.result = []
            self.blocks = []
            self.block_times = []
            # Get data shape parameters and compute the data blocks
            axes_shapes = np.asarray(self.main_data.shape)[self.split_axes]
            total_num_subblocks = np.prod(axes_shapes)
            if total_num_subblocks < size:
                if rank == self.root:
                    warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")

            # Compute the list of all possible blocks
            base_blocks = [[slice(None)]] * len(self.main_data.shape)
            for axis_index in self.split_axes:
                base_blocks[axis_index] = range(self.main_data.shape[axis_index])
            block_tuples = itertools.product(*base_blocks)

            # Communicate blocks with task ranks
            log_helper.info(__name__, "PROCESSING DATA BLOCKS")
            start_time = time.time()
            block_index = 0
            for block_selection in block_tuples:
                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((block_index, block_selection),
                               dest=request_rank,
                               tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                block_index += 1
                if (block_index % 100) == 0:
                    log_helper.debug(__name__, str((block_index, total_num_subblocks, request_rank)))
            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time))
            start_time = time.time()
            log_helper.info(__name__, "FINALIZING")
            # Terminate all ranks and receive all data from the different ranks if requested
            all_ranks_status = np.zeros(size, 'bool')
            all_ranks_status[self.root] = True
            while not np.all(all_ranks_status):

                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((None, None), dest=request_rank, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                all_ranks_status[request_rank] = True

            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time))

        # We are a rank that has to run tasks
        else:
            # Request a new data block
            self.result = []
            self.blocks = []
            self.block_times = []
            while True:
                start_time = time.time()
                self.comm.send(rank, dest=self.root, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                block_index, block_selection = self.comm.recv(source=self.root, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                if block_index is None:
                    break
                # Execute the task_function on the given data block
                task_params = self.task_function_params
                task_params[self.main_data_param_name] = self.main_data[block_selection]
                self.result.append(self.task_function(**task_params))
                self.blocks.append(block_selection)
                # Record the timings
                end_time = time.time()
                run_time = end_time - start_time
                self.block_times.append(run_time)

        # Return the result
        return self.result, self.blocks
Example #30
    def __run_static_1D(self):
        """
        Run the task function using a static task decomposition schema.

        The data is divided into sub-blocks along the largest split_axis

        :return: Tuple with the following elements:

            1) List with the results from the local execution of the task_function. Each
               entry is the result from one return of the task_function. In the case of static
               execution, this is always a list of length 1.
            2) List of block_indexes. Each block_index is a tuple with the selection used to
               divide the data into sub-blocks. In the case of static decomposition we have
               a range slice object along the axes used for decomposition.

        """
        from omsi.shared.log import log_helper
        start_time = time.time()
        # Get MPI parameters
        rank = get_rank(comm=self.comm)
        size = get_size(comm=self.comm)

        # Get data shape parameters and compute the data blocks
        # Determine the longest axis along which we can split the data
        axes_shapes = np.asarray(self.main_data.shape)[self.split_axes]
        total_num_subblocks = np.prod(axes_shapes)
        if total_num_subblocks < size:
            size = total_num_subblocks
            if rank == self.root:
                log_helper.info(__name__,
                                "Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")
        axes_sort_index = np.argsort(axes_shapes)[::-1]
        split_axis = self.split_axes[axes_sort_index[0]]
        split_axis_size = axes_shapes[split_axis]
        if split_axis_size < size:
            raise NotImplementedError("STATIC scheduling currently parallelizes only over one axis, " +
                                      "and the largest axis is too small to fill all MPI tasks")
        # Determine the size of 1D block
        block_size = int(split_axis_size / float(size) + 0.5)
        if block_size * size > split_axis_size and block_size > 1:
            block_size -= 1

        # Compute a block for every rank
        self.blocks = [slice(None)] * len(self.main_data.shape)
        start_index = rank * block_size
        stop_index = start_index + block_size
        if rank == (size-1):
            if stop_index != split_axis_size:
                stop_index = split_axis_size
        self.blocks[axes_sort_index[0]] = slice(start_index, stop_index)
        self.blocks = tuple(self.blocks)
        log_helper.info(__name__, "Rank: " + str(rank) + " Block: " + str(self.blocks))

        # Execute the task_function on the given data block
        task_params = self.task_function_params
        task_params[self.main_data_param_name] = self.main_data[self.blocks]
        self.result = self.task_function(**task_params)

        end_time = time.time()
        run_time = end_time - start_time
        self.block_times = [run_time, ]
        log_helper.info(__name__, "TIME FOR PROCESSING THE DATA BLOCK: " + str(run_time))

        # Return the output
        self.result = [self.result, ]
        self.blocks = [self.blocks, ]
        return self.result, self.blocks
Example #31
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.debug(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        continue_running = True
        while continue_running:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks_completed, num_tasks_waiting, num_tasks_ready, num_tasks_blocked = \
                all_analyses.task_status_stats()
            if num_tasks_waiting == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                continue_running = False
            if num_tasks_waiting > 0 and num_tasks_ready == 0:
                blocking_tasks = all_analyses.get_blocking_tasks()
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks_waiting) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies." +
                                   " The workflow will be restarted once the outputs of the blocking tasks are ready." +
                                   " Blocking tasks are: " + str(blocking_tasks),
                                   root=self.mpi_root, comm=self.mpi_comm)
                # Tell all blocking tasks that they should continue the workflow once they are ready
                # This happens in omsi.analysis.analysis_base.outputs_ready(...) function
                for block_task in blocking_tasks:
                    block_task.continue_workflow_when_ready(self)
                #  NOTE: if self['reduce_memory_usage'] is True then prior analyses were cleared, i.e.,
                #        they will be re-executed when the workflow is restarted. It is, therefore, not
                #        recommended to use the reduce_memory_usage option when performing interactive tasks.

                continue_running = False
            iterations += 1
        # All analyses are done, so we no longer need to continue any analyses when we are done
        if num_tasks_blocked == 0:
            for analysis in all_analyses:
                analysis.continue_analysis_when_ready = False

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
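
A stripped-down sketch of the greedy "execute whatever is ready" idea used above (Task and run_greedy are our own stand-ins, not the omsi workflow API):

class Task(object):
    """Minimal stand-in for an analysis task with named dependencies."""
    def __init__(self, name, deps=(), func=lambda: None):
        self.name, self.deps, self.func = name, tuple(deps), func

def run_greedy(tasks):
    """Repeatedly run tasks whose dependencies have completed; return the
    tasks that remain blocked by unresolved dependencies."""
    done = set()
    made_progress = True
    while made_progress:
        made_progress = False
        for task in tasks:
            if task.name not in done and all(d in done for d in task.deps):
                task.func()
                done.add(task.name)
                made_progress = True
    return [t for t in tasks if t.name not in done]

# run_greedy([Task('a'), Task('b', deps=('a',)), Task('c', deps=('missing',))])
# executes 'a' then 'b' and reports 'c' as blocked.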
Example #32
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised in case that the analysis class is unknown

        """

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
        except:
            self.remove_output_target()
            raise

        if self.workflow_executor is None:
            self.remove_output_target()
            log_helper.error(
                __name__,
                'Missing --script parameter or workflow_executor object')
            raise ValueError('Workflow not initialized')

        # Add and parse the command line arguments specific to the analysis to determine the analysis settings
        try:
            self.add_and_parse_workflow_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Enable time and usage profiling
        try:
            # Enable time and usage profiling if requested
            if self.profile_analyses:
                try:
                    self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(
                        self.profile_analyses)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of time and usage not available due to missing packages."
                    )
                    log_helper.warning(__name__, e.message)
            # Enable memory profiling if requested
            if self.profile_analyses_mem:
                try:
                    self.workflow_executor.analysis_tasks.enable_memory_profiling(
                        self.profile_analyses_mem)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of memory usage not available due to missing packages"
                    )
                    log_helper.warning(__name__, e.message)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Execute the analysis
        try:
            log_helper.debug(__name__,
                             'Analysis arguments: ' +
                             str(self.analysis_arguments),
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            self.workflow_executor.execute()
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our mpi root rank. NOTE: When running in serial
        # the condition mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:

            # Print usage profiles if available
            try:
                self.print_time_and_usage_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print time and usage profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print memory profile data if available
            try:
                self.print_memory_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print memory profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(
                        self.workflow_executor.run_info['execution_time'],
                        list):
                    # Time for each task to execute
                    log_helper.info(
                        __name__,
                        "Time in seconds for each analysis process: " +
                        str(self.workflow_executor.run_info['execution_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Start times of each task
                    log_helper.info(
                        __name__,
                        "Time when each of the processes started: " +
                        str(self.workflow_executor.run_info['start_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Stop times for each task

                    log_helper.info(
                        __name__,
                        "Time when each of the processes finished: " +
                        str(self.workflow_executor.run_info['end_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)

                    # Compile the time to execute string
                    exec_time_array = np.asarray(
                        self.workflow_executor.run_info['execution_time'],
                        dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                        "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(self.workflow_executor.run_info['execution_time']) + " s"
                log_helper.info(__name__,
                                "Time to execute analysis: " +
                                exec_time_string,
                                root=self.mpi_root,
                                comm=self.mpi_comm)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            for analysis in self.workflow_executor.analysis_tasks:
                omsi_analysis_manager.create_analysis_static(
                    analysis_parent=self.output_target, analysis=analysis)
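
The per-rank timing report near the end reduces to a small aggregation; a standalone sketch of the same "max ( min, mean )" summary (the function name is ours):

import numpy as np

def summarize_execution_times(times_in_seconds):
    """Build the same style of summary string as above from per-rank timings."""
    times = np.asarray(times_in_seconds, dtype=float)
    return str(times.max()) + " s " + \
        "    ( min = " + str(times.min()) + " , mean = " + str(times.mean()) + " )"

# summarize_execution_times([1.2, 0.8, 1.0])
#   -> '1.2 s     ( min = 0.8 , mean = 1.0 )'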
Example #33
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised in case that the analysis class is unknown
        """
        # Get the analysis object if needed
        if self.add_analysis_class_arg:
            try:
                self.get_analysis_class_from_cl()
            except (ImportError, AttributeError, ValueError):
                pass

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        # Check if we have a valid analysis class
        if self.analysis_class is None:
            self.parser.print_help()
            raise ValueError('Could not determine the analysis class.')
        if not issubclass(self.analysis_class, analysis_base):
            self.parser.print_help()
            raise ValueError('Analysis class is not a subclass of analysis_base.')

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
            # Add and parse the command line arguments specific to the analysis to determine the analysis settings
            self.add_and_parse_analysis_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Call the execute function of the analysis
        try:
            # Create the analysis object
            if self.analysis_object is None:
                self.create_analysis_object()
            # Execute the analysis
            log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                             root=self.mpi_root, comm=self.mpi_comm)
            self.analysis_object.execute(**self.analysis_arguments)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our mpi root rank. NOTE: When running in serial
        # the condition mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:
            # Print the profiling results of time and usage
            if self.analysis_object['profile_time_and_usage']:
                print ""
                print "PROFILING DATA: TIME AND USAGE"
                print ""
                self.analysis_object.get_profile_stats_object(consolidate=True).print_stats()

            # Print the profiling results for memory usage
            if self.analysis_object['profile_memory']:
                print ""
                print "PROFILING DATA: MEMORY"
                print ""
                print self.analysis_object.get_memory_profile_info()

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(self.analysis_object.run_info['execution_time'], list):
                    # Time for each task to execute
                    log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                    str(self.analysis_object.run_info['execution_time']),
                                    root=self.mpi_root, comm=self.mpi_comm)
                    # Start times of each task
                    log_helper.info(__name__, "Time when each of the processes started: " +
                                    str(self.analysis_object.run_info['start_time']),
                                    root=self.mpi_root, comm=self.mpi_comm)
                    # Stop times for each task

                    log_helper.info(__name__, "Time when each of the processes finished: " +
                                    str(self.analysis_object.run_info['end_time']),
                                    root=self.mpi_root, comm=self.mpi_comm)

                    # Compile the time to execute string
                    exec_time_array = np.asarray(self.analysis_object.run_info['execution_time'], dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                        "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(self.analysis_object.run_info['execution_time']) + " s"
                log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                                root=self.mpi_root, comm=self.mpi_comm)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=self.analysis_object)
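
The time-and-usage profiling printed above is cProfile/pstats-style output; a minimal, generic sketch of producing a comparable report for any callable (this is not the omsi profiling API):

import cProfile
import pstats
try:
    from StringIO import StringIO      # Python 2
except ImportError:
    from io import StringIO           # Python 3

def profile_call(func, *args, **kwargs):
    """Run func under cProfile and return (result, text report of the top 10
    entries sorted by cumulative time)."""
    profiler = cProfile.Profile()
    result = profiler.runcall(func, *args, **kwargs)
    stream = StringIO()
    pstats.Stats(profiler, stream=stream).sort_stats('cumulative').print_stats(10)
    return result, stream.getvalue()

# result, report = profile_call(sorted, range(100000))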
Example #34
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List with a list of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {'file_lookup_table': file_lookup_table,
                               'ms1_mass_tol': ms1_mass_tol,
                               'ms2_mass_tol': ms2_mass_tol,
                               'neutralizations': neutralizations,
                               'max_depth': max_depth}

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0,), dtype='f4')
                id_data = np.zeros((0,), dtype='a100')
                name = np.zeros((0,), dtype='a100')
                mass = np.zeros((0,), dtype='f4')
                n_peaks = np.zeros((0,), dtype='i4')
                n_match = np.zeros((0,), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    temp_data = [ri[1] for ri in result[0]]
                    # Compile scores
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index+1), 2]
                   if spectrum_index < (num_spectra-1)
                   else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                        pactolus_results=hit_matrix[current_index],
                        table_file=file_lookup_table,
                        original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(current_hit_table['n_peaks'][score_index])
                        n_match.append(current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
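
Pulling a single spectrum out of the flat peak arrays via the peak_arrayindex offsets, shown in isolation (a minimal sketch; the function name is ours):

import numpy as np

def spectrum_from_flat_arrays(peak_arrayindex, peak_mz, peak_value, spectrum_index):
    """Return (mz, intensity) for one spectrum, given the flat peak arrays and
    the [[x, y, array_offset], ...] layout of peak_arrayindex described above."""
    num_spectra = peak_arrayindex.shape[0]
    start = int(peak_arrayindex[spectrum_index, 2])
    stop = int(peak_arrayindex[spectrum_index + 1, 2]) \
        if spectrum_index < (num_spectra - 1) else peak_value.size
    return peak_mz[start:stop], peak_value[start:stop]

# peak_arrayindex = np.array([[0, 0, 0], [0, 1, 3], [1, 0, 5]])
# peak_mz = np.arange(8.0); peak_value = np.ones(8)
# spectrum_from_flat_arrays(peak_arrayindex, peak_mz, peak_value, 1)
#   -> (array of [3., 4.], array of [1., 1.])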
Example #35
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List with a list of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__,
                         'Reading inputs',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__,
                             'Preparing file lookup table',
                             comm=self.mpi_comm,
                             root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                        tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                    path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {
            'file_lookup_table': file_lookup_table,
            'ms1_mass_tol': ms1_mass_tol,
            'ms2_mass_tol': ms2_mass_tol,
            'neutralizations': neutralizations,
            'max_depth': max_depth
        }

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__,
                                 'Preparing parallel execution',
                                 comm=self.mpi_comm,
                                 root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0, ), dtype='f4')
                id_data = np.zeros((0, ), dtype='a100')
                name = np.zeros((0, ), dtype='a100')
                mass = np.zeros((0, ), dtype='f4')
                n_peaks = np.zeros((0, ), dtype='i4')
                n_match = np.zeros((0, ), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    # Compile scores
                    temp_data = [ri[1] for ri in result[0]]
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__,
                         'Processing spectra',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []
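        # hit_matrix collects, for every processed spectrum, the vector of Pactolus scores
        # against all compounds (trees) in the database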

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
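            # Note: the last spectrum has no next array_offset entry, so it runs to the end of the flat peak arrays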
            stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                       if spectrum_index < (num_spectra - 1)
                       else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__,
                                time_str,
                                comm=self.mpi_comm,
                                root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2),
                                          dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
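            # (current_hits has one row per scan in scan_list; we scored a single scan, hence row 0)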
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
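        # With a metabolite database we expand every non-zero score into a per-hit record;
        # without one we return the raw score matrix with one row per spectrum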
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                # np.where returns a tuple of index arrays; take the first (and only) axis
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(
                        score_frag_dag.make_pactolus_hit_table(
                            pactolus_results=hit_matrix[current_index],
                            table_file=file_lookup_table,
                            original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index,
                                                               0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(
                            current_hit_table['n_peaks'][score_index])
                        n_match.append(
                            current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(
            mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
Пример #36
0
    def __compute_file_info(cls, filename, resolution):
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the mz axis, data type for the intensities, format type

        :return: Numpy array with mz axis
        :return: string with data type
        :return: imzml file type
        :return:
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)   # NOTE: mz_axes is a tuple
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)

        # #Start the data at [0,0,0]
        # coordinates[:,0] = coordinates[:,0] - np.amin(coordinates,axis=0)[0]
        # coordinates[:,1] = coordinates[:,1] - np.amin(coordinates,axis=0)[1]
        # coordinates[:,2] = coordinates[:,2] - np.amin(coordinates,axis=0)[2]

        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
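        # 'continuous' imzML files share a single m/z axis for all spectra; if any spectrum below
        # deviates from the first one, the file is treated as 'processed' instead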
        for ind in range(coordinates.shape[0]):      # for ind, loc in enumerate(reader.coordinates):
            mz, intens = reader.getspectrum(ind)
            if not np.array_equal(mz, mz_axes):
                file_type = cls.available_imzml_types['processed']
                if min_mz > np.amin(mz):
                    min_mz = np.amin(mz)
                if max_mz < np.amax(mz):
                    max_mz = np.amax(mz)
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
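            # Choose enough log-spaced points that adjacent bins differ by roughly 'resolution'
            # parts-per-million, i.e., num_points = ceil(1e6 * ln(max_mz/min_mz) / resolution)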
            f = int(np.ceil(1e6 * np.log(max_mz/min_mz)/resolution))
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), f)
            log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        for k, v in reader.imzmldict.iteritems():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser and read the metadata
        del reader

        # Parse the metadata for the file. We try to parse only the header and ignore the
        # <run> group in the XML file to avoid going through the whole file again
        # while extracting the majority of the relevant metadata
        try:
            with open(filename, 'r') as ins:
                metdata_header = ''
                for line in ins:
                    if '<run' in line:
                        break
                    else:
                        metdata_header += line
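                # The loop stopped before the <run> group, so close the still-open root tag
                # to turn the truncated header into valid XML for xmltodict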
                metdata_header += '</mzML>'
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.iteritems():
                    store_value = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=str(k) + " extracted from imzML XML header.",
                                                 ontology=None)
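                    # Route known imzML header sections to the instrument, method, or dataset metadata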
                    if k == 'instrumentConfigurationList':
                        instrument_metadata[k] = store_value
                    elif k == 'dataProcessingList':
                        method_metadata[k] = store_value
                    elif k == 'scanSettingsList':
                        dataset_metadata[k] = store_value
                    elif k == 'softwareList':
                        method_metadata[k] = store_value
                    elif k == 'sampleList':
                        method_metadata[k] = store_value
                    else:
                        dataset_metadata[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                               value=metdata_header,
                                                                               unit=None,
                                                                               description='XML imzML header',
                                                                               ontology=None)
        except:
            log_helper.warning(__name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
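
The m/z axis construction above derives the number of log-spaced bins from the requested
resolution. A minimal standalone sketch (not part of the example above; min_mz, max_mz, and
resolution are made-up values) illustrating that the formula yields bins spaced roughly
'resolution' parts-per-million apart:

    import numpy as np

    min_mz, max_mz = 100.0, 1000.0   # hypothetical m/z range of a processed-mode file
    resolution = 5.0                 # hypothetical target bin spacing in ppm

    # Same formula as in the example: N = ceil(1e6 * ln(max_mz / min_mz) / resolution)
    num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
    mz_axis = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)

    # Relative spacing between neighboring bins, expressed in ppm
    ppm_spacing = 1e6 * np.diff(mz_axis) / mz_axis[:-1]
    print("bins: %d, mean spacing: %.3f ppm" % (num_bins, ppm_spacing.mean()))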