Example #1
    def record_postexecute(self, execution_time=None):
        """
        Function used to record runtime information after the task we want to track is completed, e.g.,
        the `execute_analysis(...)` function of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information.

        When overwriting the function we should call super(...,self).record_postexecute(execution_time)
        in the custom version to ensure that the execution_time and end_time are properly
        recorded.

        :param execution_time: The total time it took to execute the analysis. May be None, in which
            case the function will attempt to compute the execution time based on the start_time
            (if available) and the current time.

        """
        log_helper.debug(__name__, 'Recording post-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
        # Finalize recording of post execution provenance
        self['end_time'] = unicode(datetime.datetime.now())
        if execution_time is not None:
            self['execution_time'] = unicode(execution_time)
        elif 'start_time' in self:
            start_time = run_info_dict.string_to_time(self['start_time'])
            stop_time = run_info_dict.string_to_time(self['end_time'])
            self['execution_time'] = unicode(stop_time - start_time)    # TODO: This only gives execution time in full seconds right now
        else:
            self['execution_time'] = None
        # Attempt to record psutil data
        try:
            import psutil
            process = psutil.Process()
            self['memory_info_after'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                               root=self.mpi_root, comm=self.mpi_comm)
        except:
            warnings.warn("Recording of psutil-based runtime information failed: "+str(sys.exc_info()))

        # Record the time and use profiling data if possible
        if self.__time_and_use_profiler is not None:
            self.__time_and_use_profiler.disable()
            self.__time_and_use_profiler.create_stats()
            self['profile'] = unicode(self.__time_and_use_profiler.stats)
            # Save the summary statistics for the profiling data
            stats_io = StringIO.StringIO()
            profiler_stats = pstats.Stats(self.__time_and_use_profiler, stream=stats_io).sort_stats('cumulative')
            profiler_stats.print_stats()
            self['profile_stats'] = stats_io.getvalue()

        # Record the memory profiling data if possible
        if self.__memory_profiler is not None and self.get_profile_memory():
            log_helper.debug(__name__, 'Recording memory profiling data', root=self.mpi_root, comm=self.mpi_comm)
            mem_stats_io = StringIO.StringIO()
            memory_profiler.show_results(self.__memory_profiler, stream=mem_stats_io)
            self['profile_mem'] = unicode(self.__memory_profiler.code_map)
            self['profile_mem_stats'] = mem_stats_io.getvalue()
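
The start_time and end_time above are stored as plain strings of the form produced by datetime.datetime.now(). The following minimal sketch shows how such timestamps can be parsed back and differenced; string_to_time_sketch is a hypothetical stand-in, since the real run_info_dict.string_to_time helper is not shown here.

import datetime

def string_to_time_sketch(time_string):
    """Parse a 'YYYY-MM-DD HH:MM:SS.ffffff' timestamp back into a datetime object."""
    return datetime.datetime.strptime(time_string, '%Y-%m-%d %H:%M:%S.%f')

start_time = string_to_time_sketch('2016-01-01 12:00:00.000000')
end_time = string_to_time_sketch('2016-01-01 12:00:03.500000')
print(end_time - start_time)                    # 0:00:03.500000 (a datetime.timedelta)
print((end_time - start_time).total_seconds())  # 3.5, i.e., sub-second resolution is available
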
Example #2
    def get_additional_analysis_dependencies(self):
        """
        Compute a list of all dependencies of the current list of analyses (excluding analyses that
        are already in the list of tasks).

        :return: analysis_task_list of all analysis dependencies
        """
        from omsi.dataformat.omsi_file.common import omsi_file_common
        from omsi.analysis.base import analysis_base

        missing_dependencies = analysis_task_list()
        for analysis_obj in self:
            for dependency_param_obj in analysis_obj.get_all_dependency_data():
                dependency_analysis = dependency_param_obj['data'][
                    'omsi_object']
                if isinstance(dependency_analysis, analysis_base):
                    if dependency_analysis not in self:
                        missing_dependencies.add(dependency_analysis)
                elif isinstance(dependency_analysis, omsi_file_common):
                    pass  # Ignore dependencies on data files. We do not need to execute those
                else:
                    log_helper.warning(
                        __name__,
                        'Unknown dependency object type that cannot be processed by workflow.'
                        + str(dependency_param_obj))
        return missing_dependencies
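
The same collect-missing-dependencies pattern, reduced to a self-contained sketch with hypothetical Task objects; the real code works on analysis_base instances, skips file-based omsi_file_common dependencies, and returns an analysis_task_list.

class Task(object):
    """Hypothetical stand-in for an analysis task with explicit dependencies."""
    def __init__(self, name, dependencies=None):
        self.name = name
        self.dependencies = dependencies or []

def missing_dependencies(tasks):
    """Return all dependencies of the given tasks that are not themselves in the task list."""
    missing = []
    for task in tasks:
        for dependency in task.dependencies:
            if dependency not in tasks and dependency not in missing:
                missing.append(dependency)
    return missing

raw = Task('raw_data')
peaks = Task('peak_finding', dependencies=[raw])
print([t.name for t in missing_dependencies([peaks])])   # ['raw_data']
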
Example #3
    def enable_profile_time_and_usage(self, enable=True):
        """
        Enable/disable time and usage profiling

        :param enable: boolean to enable (True) or disable (False) time and usage profiling

        """
        if PROFILE_AVAILABLE:
            if not enable and self.__profile_time_and_usage:
                log_helper.debug(__name__,
                                 "Disabled time and usage profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            if enable and not self.__profile_time_and_usage:
                log_helper.debug(__name__,
                                 "Enabled time and usage profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            self.__profile_time_and_usage = enable
        else:
            self.__profile_time_and_usage = False
            if enable:
                log_helper.warning(
                    __name__, 'Profiling of time and usage not available.' +
                    ' Missing profile and/or pstats package')
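
PROFILE_AVAILABLE is presumably set by a guarded import when the module is loaded. A minimal sketch of that availability-check pattern, assuming the guarded imports are cProfile.Profile and pstats (consistent with the warning text above, but the actual module-level code is not shown):

try:
    from cProfile import Profile   # used to create the time-and-usage profiler
    import pstats                  # used to summarize the collected statistics
    PROFILE_AVAILABLE = True
except ImportError:
    PROFILE_AVAILABLE = False
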
Example #4
    def enable_profile_memory(self, enable=True):
        """
        Enable/disable profiling of memory usage

        :param enable: boolean to enable (True) or disable (False) memory profiling

        """
        if PROFILE_MEMORY_AVAILABLE:
            if not enable and self.__profile_memory:
                log_helper.debug(__name__,
                                 "Disabled memory profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            if enable and not self.__profile_memory:
                log_helper.debug(__name__,
                                 "Enabled memory profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            self.__profile_memory = enable
        else:
            self.__profile_memory = False
            if enable:
                log_helper.warning(
                    __name__, 'Profiling of memory usage not available.' +
                    ' Missing memory_profiler or StringIO package')
Example #5
    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(
                        analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__,
                                     "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(
                    __name__,
                    "Workflow could not be fully executed. " + str(num_tasks) +
                    " remain in the queue but cannot be completed due to unresolved dependencies."
                )
                break  # Stop here; otherwise the loop would spin forever on the unresolved dependencies
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
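
The loop above is a greedy scheduler: each pass executes every analysis whose dependencies are satisfied and stops once nothing is pending or nothing pending is ready. A self-contained sketch of that pattern, using a hypothetical _Task class with a done flag and a ready() check in place of the real update_analysis / check_ready_to_execute() API:

class _Task(object):
    """Hypothetical task: done flag plus a ready() check over explicit dependencies."""
    def __init__(self, name, deps=()):
        self.name, self.deps, self.done = name, list(deps), False
    def ready(self):
        return all(dep.done for dep in self.deps)
    def execute(self):
        self.done = True

def run_greedy(tasks):
    """Execute every ready task on each pass until nothing is pending or nothing is ready."""
    while True:
        for task in tasks:
            if not task.done and task.ready():
                task.execute()
        pending = [t for t in tasks if not t.done]
        if not pending:
            return True        # workflow completed
        if not any(t.ready() for t in pending):
            return False       # stuck: unresolved dependencies remain

convert = _Task('convert')
analyze = _Task('analyze', deps=[convert])
print(run_greedy([analyze, convert]))   # True -- list order does not matter
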
Example #6
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.",
                                   root=self.mpi_root, comm=self.mpi_comm)
                break  # Stop here; otherwise the loop would spin forever on the unresolved dependencies
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
Example #7
    def get_files_from_dir(cls, dirname):
        """
        Get a list of the basenames of all imzML files in a given directory.
        Note: The basenames include the dirname.
        """
        filelist = []
        for l in os.listdir(dirname):
            currname = os.path.join(dirname, l)
            filename_only, extension = os.path.splitext(currname)
            if os.path.isfile(currname) and currname.lower().endswith(".imzml"):
                if os.path.isfile(filename_only + '.ibd'):
                    filelist.append(currname)
                else:
                    log_helper.warning(__name__, 'Could not find binary .ibd file for file %s. Skipping conversion of this file.' % currname)

        return filelist
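
A standalone sketch of the same pairing rule, i.e., an .imzML file is only listed when a binary .ibd file with the same basename exists next to it; complete_imzml_files is a hypothetical helper that re-implements the check above using only the standard library.

import os

def complete_imzml_files(dirname):
    """Return the paths of all .imzML files in dirname that have a matching .ibd file."""
    result = []
    for entry in os.listdir(dirname):
        path = os.path.join(dirname, entry)
        base, _ = os.path.splitext(path)
        if os.path.isfile(path) and path.lower().endswith('.imzml') and os.path.isfile(base + '.ibd'):
            result.append(path)
    return result
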
Example #8
    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.")
                break  # Stop here; otherwise the loop would spin forever on the unresolved dependencies
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
Example #9
File: generic.py  Project: wholtz/BASTet
    def read_from_omsi_file(self,
                            analysis_object,
                            load_data=True,
                            load_parameters=True,
                            load_runtime_data=True,
                            dependencies_omsi_format=True,
                            ignore_type_conflict=False):
        """
        See `omsi.analysis.analysis_base.read_from_omsi_file(...)` for details.
        The function is overwritten here mainly to initialize the self.real_analysis_type
        instance variable but otherwise uses the default behavior.

        """
        # Attempt to add all analysis parameters to avoid warnings when setting the parameters during
        # the data load process, where we would otherwise set parameters that are not yet defined
        try:
            parameter_list = analysis_object.get_all_parameter_data(
                load_data=False, exclude_dependencies=False)
            for param in parameter_list:
                # Ignore the profiling parameters as they are added by the analysis base class already
                if param['name'] in [
                        'profile_time_and_usage', 'profile_memory'
                ]:
                    continue
                self.add_parameter(name=param['name'],
                                   help=param['help'],
                                   dtype=param['dtype'])
        except:
            log_helper.warning(__name__, "Could not generate all parameters.")
        # Load the data as usual
        output_val = super(analysis_generic, self).read_from_omsi_file(
            analysis_object=analysis_object,
            load_data=load_data,
            load_parameters=load_parameters,
            load_runtime_data=load_runtime_data,
            dependencies_omsi_format=dependencies_omsi_format,
            ignore_type_conflict=ignore_type_conflict)
        # Fill in the list of output names
        self.data_names = [
            dat['name'] for dat in self._analysis_base__data_list
        ]
        # Load the real data type.
        self.real_analysis_type = unicode(
            analysis_object.get_analysis_type()[:])
        # Return the output data
        return output_val
Example #10
    def get_files_from_dir(cls, dirname):
        """
        Get a list of the basenames of all imzML files in a given directory.
        Note: The basenames include the dirname.
        """
        filelist = []
        for l in os.listdir(dirname):
            currname = os.path.join(dirname, l)
            filename_only, extension = os.path.splitext(currname)
            if os.path.isfile(currname) and currname.endswith(".imzML"):
                if os.path.isfile(filename_only + '.ibd'):
                    filelist.append(currname)
                else:
                    log_helper.warning(
                        __name__,
                        'Could not find binary .ibd file for file %s. Skipping conversion of this file.'
                        % currname)

        return filelist
Example #11
    def enable_profile_time_and_usage(self, enable=True):
        """
        Enable/disable time and usage profiling

        :param enable: boolean to enable (True) or disable (False) time and usage profiling

        """
        if PROFILE_AVAILABLE:
            if not enable and self.__profile_time_and_usage:
                log_helper.debug(__name__, "Disabled time and usage profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            if enable and not self.__profile_time_and_usage:
                log_helper.debug(__name__, "Enabled time and usage profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            self.__profile_time_and_usage = enable
        else:
            self.__profile_time_and_usage = False
            if enable:
                log_helper.warning(__name__, 'Profiling of time and usage not available.' +
                                   ' Missing profile and/or pstats package')
Example #12
    def enable_profile_memory(self, enable=True):
        """
        Enable/disable profiling of memory usage

        :param enable: boolean to enable (True) or disable (False) memory profiling

        """
        if PROFILE_MEMORY_AVAILABLE:
            if not enable and self.__profile_memory:
                log_helper.debug(__name__, "Disabled memory profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            if enable and not self.__profile_memory:
                log_helper.debug(__name__, "Enabled memory profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            self.__profile_memory = enable
        else:
            self.__profile_memory = False
            if enable:
                log_helper.warning(__name__, 'Profiling of memory usage not available.' +
                                   ' Missing memory_profiler or StringIO package')
Example #13
File: generic.py  Project: biorack/BASTet
    def read_from_omsi_file(self,
                            analysis_object,
                            load_data=True,
                            load_parameters=True,
                            load_runtime_data=True,
                            dependencies_omsi_format=True,
                            ignore_type_conflict=False):
        """
        See `omsi.analysis.analysis_base.read_from_omsi_file(...)` for details.
        The function is overwritten here mainly to initialize the self.real_analysis_type
        instance variable but otherwise uses the default behavior.

        """
        # Attempt to add all analysis parameters to avoid warnings when setting the parameters during
        # the data load process, where we would otherwise set parameters that are not yet defined
        try:
            parameter_list = analysis_object.get_all_parameter_data(load_data=False,
                                                                    exclude_dependencies=False)
            for param in parameter_list:
                # Ignore the profiling parameters as they are added by the analysis base class already
                if param['name'] in ['profile_time_and_usage', 'profile_memory']:
                    continue
                self.add_parameter(name=param['name'],
                                   help=param['help'],
                                   dtype=param['dtype'])
        except:
            log_helper.warning(__name__, "Could not generate all parameters.")
        # Load the data as usual
        output_val = super(analysis_generic, self).read_from_omsi_file(
            analysis_object=analysis_object,
            load_data=load_data,
            load_parameters=load_parameters,
            load_runtime_data=load_runtime_data,
            dependencies_omsi_format=dependencies_omsi_format,
            ignore_type_conflict=ignore_type_conflict)
        # Fill in the list of output names
        self.data_names = [dat['name'] for dat in self._analysis_base__data_list]
        # Load the real data type.
        self.real_analysis_type = unicode(analysis_object.get_analysis_type()[:])
        # Return the output data
        return output_val
Example #14
File: common.py  Project: biorack/BASTet
    def get_additional_analysis_dependencies(self):
        """
        Compute a list of all dependencies of the current list of analyses (excluding analyses that
        are already in the list of tasks).

        :return: analysis_task_list of all analysis dependencies
        """
        from omsi.dataformat.omsi_file.common import omsi_file_common
        from omsi.analysis.base import analysis_base

        missing_dependencies = analysis_task_list()
        for analysis_obj in self:
            for dependency_param_obj in analysis_obj.get_all_dependency_data():
                dependency_analysis = dependency_param_obj['data']['omsi_object']
                if isinstance(dependency_analysis, analysis_base):
                    if dependency_analysis not in self:
                        missing_dependencies.add(dependency_analysis)
                elif isinstance(dependency_analysis, omsi_file_common):
                    pass  # Ignore dependencies on data files. We do not need to execute those
                else:
                    log_helper.warning(__name__, 'Unknown dependency object type that cannot be processed by workflow.'
                                       + str(dependency_param_obj))
        return missing_dependencies
Example #15
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.debug(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        continue_running = True
        while continue_running:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks_completed, num_tasks_waiting, num_tasks_ready, num_tasks_blocked = \
                all_analyses.task_status_stats()
            if num_tasks_waiting == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                continue_running = False
            if num_tasks_waiting > 0 and num_tasks_ready == 0:
                blocking_tasks = all_analyses.get_blocking_tasks()
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks_waiting) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies." +
                                   " The workflow will be restarted once the outputs of the blocking tasks are ready." +
                                   " Blocking tasks are: " + str(blocking_tasks),
                                   root=self.mpi_root, comm=self.mpi_comm)
                # Tell all blocking tasks that they should continue the workflow once they are ready
                # This happens in omsi.analysis.analysis_base.outputs_ready(...) function
                for block_task in blocking_tasks:
                    block_task.continue_workflow_when_ready(self)
                #  NOTE: if self['reduce_memory_usage'] is True then prior analyses were cleared, i.e.,
                #        they will be re-executed when the workflow is restarted. It is, therefore, not recommended
                #        to use the reduce_memory_usage option when performing interactive tasks.

                continue_running = False
            iterations += 1
        # All analyses are done, so we no longer need to continue any analyses when we are done
        if num_tasks_blocked == 0:
            for analysis in all_analyses:
                analysis.continue_analysis_when_ready = False

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
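
task_status_stats() is not shown in this excerpt. The sketch below is an assumption about how its four counters could be derived from the update_analysis flag and check_ready_to_execute() used elsewhere in this class; the real analysis_task_list implementation may differ.

def task_status_stats_sketch(analyses):
    """Count completed, waiting, ready, and blocked tasks for a list of analysis objects."""
    num_completed = num_waiting = num_ready = num_blocked = 0
    for analysis in analyses:
        if not analysis.update_analysis:          # nothing left to do for this task
            num_completed += 1
        else:
            num_waiting += 1
            if len(analysis.check_ready_to_execute()) == 0:
                num_ready += 1                    # all dependencies resolved
            else:
                num_blocked += 1                  # still waiting on other tasks
    return num_completed, num_waiting, num_ready, num_blocked
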
Example #16
File: analysis.py  Project: biorack/BASTet
    def __populate_analysis__(cls,
                              analysis_group,
                              analysis):
        """
        Populate the given h5py group with the analysis data.

        NOTE: This is a private helper function. Use the corresponding create_analysis function
        of omsi_file_experiment to create a completely new analysis.

        NOTE: At this point we assume that all in-memory dependencies have been resolved. If not,
        then the raw data associated with the given parameter will be saved instead.

        :param analysis_group: h5py group in which the analysis data should be stored.
        :param analysis: Instance of omsi.analysis.analysis_base defining the analysis
        :type analysis: omsi.analysis.analysis_base:

        :returns: The omsi_file_analysis object for the newly created analysis group. The analysis data is
                  automatically written to file by this function so no additional work is required.

        """
        from omsi.datastructures.analysis_data import analysis_data
        from omsi.dataformat.omsi_file.dependencies import omsi_file_dependencies
        from omsi.analysis.base import analysis_base

        # 1. Write the analysis name
        analysis_identifier_data = analysis_group.require_dataset(
            name=unicode(omsi_format_analysis.analysis_identifier),
            shape=(1,),
            dtype=omsi_format_common.str_type)
        if omsi_format_common.str_type_unicode:
            analysis_identifier_data[0] = analysis.get_analysis_identifier()
        else:
            analysis_identifier_data[0] = str(analysis.get_analysis_identifier())

        # 2. Write the analysis type
        analysis_type_data = analysis_group.require_dataset(name=unicode(omsi_format_analysis.analysis_type),
                                                            shape=(1,),
                                                            dtype=omsi_format_common.str_type)
        if omsi_format_common.str_type_unicode:
            analysis_type_data[0] = analysis.get_analysis_type()
        else:
            analysis_type_data[0] = str(analysis.get_analysis_type())

        # 3. Write the analysis data
        try:
            analysis.write_analysis_data(analysis_group=analysis_group)
        except NotImplementedError:
            for ana_data in analysis.get_all_analysis_data():
                cls.__write_omsi_analysis_data__(analysis_group, ana_data)

        # 4. Determine all dependencies and parameters that we need to write
        dependencies = []  # [dep['data'] for dep in analysis.get_all_dependency_data()]
        parameters = []
        # 4.1 Resolve in-memory dependencies if possible
        for dependent_parameter in analysis.get_all_dependency_data():
            # 4.1.1 We have an in-memory dependency
            if isinstance(dependent_parameter['data']['omsi_object'], analysis_base):
                # 4.1.1.1 We can resolve the dependency to an object in an HDF5 file
                if dependent_parameter['data']['omsi_object'].has_omsi_analysis_storage():
                    # Create a new dependency that points to the appropriate file location
                    # NOTE: We do not modify the dependency in the analysis object that we save
                    #       but we only change it for the purpose of storage
                    new_dep = dependent_parameter['data'].copy()
                    new_dep_omsi_object = None
                    # Check if we can find an analysis data store within the same parent (or at least file)
                    parent_filename = os.path.abspath(analysis_group.file.filename)
                    for analysis_store in dependent_parameter['data']['omsi_object'].get_omsi_analysis_storage():
                        analysis_store_filename = os.path.abspath(analysis_store.managed_group.file.filename)
                        if analysis_store.name == analysis_group.parent.name and \
                                analysis_store_filename == parent_filename:
                            new_dep_omsi_object = analysis_store
                            break
                        elif analysis_store_filename == parent_filename:
                            new_dep_omsi_object = analysis_store

                    # We could not find a prior data store within the same file so use one from another file
                    if new_dep_omsi_object is None:
                        dep_object = dependent_parameter['data']['omsi_object']
                        new_dep['omsi_object'] = dep_object.get_omsi_analysis_storage()[0]
                    else:
                        new_dep['omsi_object'] = new_dep_omsi_object
                    # Append it to the list of dependencies
                    dependencies.append(new_dep)
                # 4.1.1.2  We cannot resolve the dependency and need to store it as a parameter instead
                else:
                    # Replace the dependency with the actual data and save it as a parameter instead
                    new_param = dependent_parameter.copy()
                    new_param['data'] = new_param['data'].get_data()
                    parameters.append(new_param)

            # 4.1.2 We have a file-based dependency, so keep it as is and add it to the list of dependencies
            else:
                dependencies.append(dependent_parameter['data'])

        # 4.2 Add all regular parameters to the list of parameters
        parameters += analysis.get_all_parameter_data(exclude_dependencies=True)

        # 5. Write all the parameters
        parameter_group = analysis_group.require_group(omsi_format_analysis.analysis_parameter_group)
        for param_data in parameters:
            if param_data['required'] or param_data.data_set() or param_data['default'] is not None:
                temp_data = param_data.get_data_or_default()
                if temp_data is not None:
                    anadata = analysis_data(name=param_data['name'],
                                            data=param_data.get_data_or_default(),
                                            dtype=param_data['dtype'])
                    cls.__write_omsi_analysis_data__(parameter_group, anadata)
                    # Try to add the help string attribute
                    try:
                        help_attr = omsi_format_analysis.analysis_parameter_help_attr
                        parameter_group[param_data['name']].attrs[help_attr] = param_data['help']
                    except KeyError:
                        pass

        # 6. Write all the runtime execution information
        runinfo_group = analysis_group.require_group(omsi_format_analysis.analysis_runinfo_group)
        for run_info_key, run_info_value in analysis.get_all_run_info().items():
            # Generate an analysis_data object in order to use the
            # __write_omsi_analysis_data__ function to write the data
            if isinstance(run_info_value, unicode) or isinstance(run_info_value, str):
                anadata = analysis_data(name=unicode(run_info_key),
                                        data=run_info_value,
                                        dtype=omsi_format_common.str_type)
            else:
                dat = np.asarray(run_info_value)
                if len(dat.shape) == 0:
                    dat = np.asarray([run_info_value])
                anadata = analysis_data(name=unicode(run_info_key),
                                        data=dat,
                                        dtype=dat.dtype)
            cls.__write_omsi_analysis_data__(runinfo_group, anadata)

        # 7. Write all dependencies
        omsi_file_dependencies.__create__(parent_group=analysis_group,
                                          dependencies_data_list=dependencies)

        # 8. Execute the custom data write for the analysis
        analysis.add_custom_data_to_omsi_file(analysis_group)

        # 9. Create the output object
        re = omsi_file_analysis(analysis_group)

        # 10. Save the output object in the list of omsi analysis data stores as part of the analysis object
        analysis.omsi_analysis_storage.append(re)

        # 11. Check if we need to pickle and save the analysis class in case this is a custom class that is not part of BASTet
        try:
            from omsi.analysis.analysis_views import analysis_views
            _ = analysis_views.analysis_name_to_class(analysis.get_analysis_type())
        except NameError:
            class_pickle = cloudpickle.dumps(analysis.__class__)
            # Convert the pickle string to an uint8 array to avoid problems
            # with storing string with NULL characters in HDF5
            class_pickle_arr = np.fromstring(class_pickle,
                                            dtype=omsi_format_analysis.analysis_class_pickle_np_dtype)
            analysis_group[unicode(omsi_format_analysis.analysis_class)] = class_pickle_arr
        except:
            log_helper.warning(__name__, "Could not save the analysis class.")
            pass

        # 12. Return the new omsi_file_analysis object
        return re
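
Steps 1 and 2 above write one-element string datasets via require_dataset. A standalone h5py sketch of that pattern, using h5py's variable-length string dtype in place of the project's omsi_format_common.str_type and a hypothetical file/group name:

import h5py

with h5py.File('example_analysis.h5', 'w') as hdf_file:
    analysis_group = hdf_file.require_group('analysis_0')
    str_type = h5py.special_dtype(vlen=str)
    identifier = analysis_group.require_dataset(name='analysis_identifier',
                                                shape=(1,),
                                                dtype=str_type)
    identifier[0] = 'my_analysis'        # e.g., the value of analysis.get_analysis_identifier()
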
Example #17
    def record_preexecute(self):
        """
        Record basic runtime information in this dict before the execution is started.


        Function used to record runtime information prior to executing the process we want to track, e.g.,
        the `execute_analysis(...)` of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information. All runtime data should be recorded in the
        main dict (i.e., self). This ensures in the case of a standard analysis that
        the data is stored in the HDF5 file. Other data should be stored in separate
        variables that we may add to the object.

        When overwriting the function we should typically call super(...,self).record_preexecute()
        last in the custom version to ensure that the start_time is properly recorded right before
        the execution of the analysis.

        """
        log_helper.debug(__name__, 'Recording pre-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
        # Record basic runtime environment information using the platform module
        try:
            self['architecture'] = unicode(platform.architecture())
            self['java_ver'] = unicode(platform.java_ver())
            self['libc_ver'] = unicode(platform.libc_ver())
            self['linux_distribution'] = unicode(platform.linux_distribution())
            self['mac_ver'] = unicode(platform.mac_ver())
            self['machine'] = unicode(platform.machine())
            self['node'] = unicode(platform.node())
            self['platform'] = unicode(platform.platform())
            self['processor'] = unicode(platform.processor())
            self['python_branch'] = unicode(platform.python_branch())
            self['python_build'] = unicode(platform.python_build())
            self['python_compiler'] = unicode(platform.python_compiler())
            self['python_implementation'] = unicode(platform.python_implementation())
            self['python_revision'] = unicode(platform.python_revision())
            self['python_version'] = unicode(platform.python_version())
            self['release'] = unicode(platform.release())
            self['system'] = unicode(platform.system())
            self['uname'] = unicode(platform.uname())
            self['version'] = unicode(platform.version())
            self['win32_ver'] = unicode(platform.win32_ver())
        except:
            warnings.warn("WARNING: Recording of platform provenance failed: " + str(sys.exc_info()))

        # Attempt to record the svn version information
        try:
            import subprocess
            self['svn_ver'] = subprocess.check_output('svnversion').rstrip('\n')
        except ImportError:
            log_helper.warning(__name__, 'Recording of svn version not possible. subprocess not installed',
                               root=self.mpi_root, comm=self.mpi_comm)
        except:
            warnings.warn("Recording of svn version information failed: "+str(sys.exc_info()))

        # Attempt to record software library version
        try:
            import numpy as np
            self['numpy_version_full_version'] = unicode(np.version.full_version)
            self['numpy_version_release'] = unicode(np.version.release)
            self['numpy_version_git_revision'] = unicode(np.version.git_revision)
        except ImportError:
            log_helper.warning(__name__, 'Recording of numpy version not possible.',
                               root=self.mpi_root, comm=self.mpi_comm)

        # Attempt to record psutil data
        try:
            import psutil
            self['logical_cpu_count'] = unicode(psutil.cpu_count())
            self['cpu_count'] = unicode(psutil.cpu_count(logical=False))
            process = psutil.Process()
            self['open_files'] = unicode(process.open_files())
            self['memory_info_before'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                               root=self.mpi_root, comm=self.mpi_comm)
        except:
            warnings.warn("Recording of psutil-based runtime information failed: "+str(sys.exc_info()))

        # Record the start time for the analysis
        self['start_time'] = unicode(datetime.datetime.now())

        # Enable time and usage profiling if requested
        if self.__profile_time_and_usage:
            self.__time_and_use_profiler = Profile()
            self.__time_and_use_profiler.enable()
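
For reference, the platform calls recorded above are plain standard-library lookups, and each value is cast to a string before it is stored in the dict. A quick illustration with a few of them (the values naturally depend on the machine):

import platform

runtime_info = {
    'machine': str(platform.machine()),                  # e.g. 'x86_64'
    'system': str(platform.system()),                    # e.g. 'Linux'
    'python_version': str(platform.python_version()),    # e.g. '2.7.18'
    'uname': str(platform.uname()),
}
print(runtime_info)
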
Example #18
        process = Popen('%s -c "from mpi4py import MPI as mpi"'%( \
                         sys.executable), shell=True, stderr=PIPE, stdout=PIPE)
        import_failed = process.wait()
        return not import_failed

    MPI_AVAILABLE = test_mpi_available()

    if MPI_AVAILABLE:
        from mpi4py import MPI
except ImportError:
    MPI_AVAILABLE = False

if not MPI_AVAILABLE:
    try:
        from omsi.shared.log import log_helper
        log_helper.warning(__name__, "MPI not available. Running in serial.")
    except:
        print "MPI not available. Running in serial."

import numpy as np
import itertools
import warnings
import time
import os


class parallel_over_axes(object):
    """
    Helper class used to parallelize the execution of a function using MPI by splitting the
    input data into sub-blocks along a given set of axes.
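
The excerpt above starts mid-function. The sketch below reconstructs the complete availability check it appears to belong to, under the assumption that the surrounding code imports sys and subprocess.Popen/PIPE; probing the import in a subprocess keeps a broken MPI installation from crashing the current process.

import sys
from subprocess import Popen, PIPE

def test_mpi_available():
    """Return True if 'from mpi4py import MPI' succeeds in a separate Python process."""
    process = Popen('%s -c "from mpi4py import MPI as mpi"' % sys.executable,
                    shell=True, stderr=PIPE, stdout=PIPE)
    import_failed = process.wait()   # non-zero exit status means the import failed
    return not import_failed

MPI_AVAILABLE = test_mpi_available()
if MPI_AVAILABLE:
    from mpi4py import MPI
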
Example #19
        process = Popen('%s -c "from mpi4py import MPI as mpi"'%( \
                         sys.executable), shell=True, stderr=PIPE, stdout=PIPE)
        import_failed = process.wait()
        return not import_failed

    MPI_AVAILABLE = test_mpi_available()

    if MPI_AVAILABLE:
        from mpi4py import MPI
except ImportError:
    MPI_AVAILABLE = False

if not MPI_AVAILABLE:
    try:
        from omsi.shared.log import log_helper
        log_helper.warning(__name__, "MPI not available. Running in serial.")
    except:
        print "MPI not available. Running in serial."

import numpy as np
import itertools
import warnings
import time


class parallel_over_axes(object):
    """
    Helper class used to parallelize the execution of a function using MPI by splitting the
    input data into sub-blocks along a given set of axes.

    :ivar task_function: The function we should run.
Example #20
    def add_and_parse_workflow_arguments(self):
        """
        The function assumes that the command line parser has been set up using the initialize_argument_parser(...) function.

        This function is responsible for adding all command line arguments that are specific to the workflow and
        to then parse those arguments and save the relevant data in the self.analysis_arguments dictionary.
        Command-line arguments that are specific to the command line driver are removed, so that only
        arguments that can be consumed by the analysis are handed to the analysis.

        *Side effects:* The function sets ``self.analysis_arguments`` and updates the analysis parameters of the
                        analyses stored in ``self.workflow_executor.analysis_tasks``

        """
        # Ensure that we have a workflow executor instantiated. This should usually not happen.
        if self.workflow_executor is None:
            log_helper.warning(
                __name__,
                "Late creation of the workflow executor in add_and_parse_workflow_arguments",
                root=self.mpi_root,
                comm=self.mpi_comm)
            self.create_workflow_executor_object()

        # Ensure that all analysis identifiers are set to a defined value
        self.workflow_executor.analysis_tasks.set_undefined_analysis_identifiers(
        )

        # Ensure that all analysis identifiers are unique
        if not self.workflow_executor.analysis_tasks.analysis_identifiers_unique(
        ):
            log_helper.warning(
                __name__,
                "The workflow contains multiple analyses with the same user-defined "
                +
                "identifier. Colliding identifiers will be modified to ensure uniqueness",
                root=self.mpi_root,
                comm=self.mpi_comm)
            self.workflow_executor.analysis_tasks.make_analysis_identifiers_unique(
                self)

        # Ensure that the prefix of the workflow executor does not interfere with the prefix of an analysis
        all_analysis_identifiers = self.workflow_executor.analysis_tasks.get_all_analysis_identifiers(
        )
        if self.workflow_executor.workflow_identifier in all_analysis_identifiers:
            log_helper.warning(
                __name__,
                "The identifier of the workflow executor collides with the identifier "
                +
                "of an analysis. Updating the identifier of the workflow executor to be unique",
                root=self.mpi_root,
                comm=self.mpi_comm)
            while self.workflow_executor.workflow_identifier in all_analysis_identifiers:
                self.workflow_executor.workflow_identifier += '_'

        # Add all arguments from our workflow executor
        target_seperator = self.identifier_argname_seperator
        if self.workflow_executor is not None:
            for analysis in self.workflow_executor.analysis_tasks:
                # Create the group for the analysis in general
                analysis_group = self.parser.add_argument_group(
                    title=analysis.get_analysis_identifier() + " : " +
                    analysis.get_analysis_type())
                arg_group_name = analysis.get_analysis_identifier(
                ) + target_seperator + analysis.get_analysis_type()
                self.custom_argument_groups[arg_group_name] = analysis_group

                # Create groups for all argument groups of the analysis
                analysis_arg_groups = {}
                arg_group_dict = {
                    arg_param.get_group_name():
                    arg_param.get_group_description()
                    for arg_param in analysis.get_all_parameter_data()
                    if arg_param.get_group_name() is not None
                }
                for group_name, group_description in arg_group_dict.iteritems(
                ):
                    ana_arg_group_name = arg_group_name + ":" + group_name
                    analysis_arg_groups[
                        group_name] = self.parser.add_argument_group(
                            title=ana_arg_group_name,
                            description=group_description)
                    self.custom_argument_groups[
                        ana_arg_group_name] = analysis_arg_groups[group_name]

                # Add all undefined parameters of the current analysis
                for arg_param in analysis.get_all_parameter_data():
                    # If the parameter is not set
                    if not arg_param.data_set():
                        # Add the parameter to the argument parser
                        arg_name = "--" + analysis.get_analysis_identifier() + \
                                   self.identifier_argname_seperator + arg_param['name']
                        arg_action = 'store'
                        arg_default = arg_param['default']
                        arg_required = arg_param['required'] and (arg_default
                                                                  is None)
                        arg_type = arg_param['dtype']
                        arg_choices = arg_param['choices']
                        arg_help = arg_param['help']
                        arg_dest = analysis.get_analysis_identifier(
                        ) + target_seperator + arg_param['name']
                        arg_group = arg_param.get_group_name()

                        # Determine the group the argument belongs to
                        argument_group = self.required_argument_group if arg_required else analysis_group
                        if arg_group in analysis_arg_groups:
                            argument_group = analysis_arg_groups[arg_group]

                        # Add the argument to the proper group
                        argument_group.add_argument(
                            arg_name,  # <-- Required, user specified arg name
                            action=
                            arg_action,  #     Constant. We define this not the user.
                            # nargs=1,                    Don't use. Leave as default
                            # const=None,                 Don't use this type of action
                            default=
                            arg_default,  # <-- Optional default value of the argument
                            type=arg_type,  # <-- Optional dtype of the argument
                            choices=
                            arg_choices,  # <-- Optional Key may be missing.
                            required=arg_required,  # <-- Optional
                            help=arg_help,  # <-- Required
                            # metavar               #     Don't use. Positional analysis
                            #                       #     arguments are not allowed
                            dest=arg_dest
                        )  #     Automatically determined by the name

        # Add the arguments of the workflow executor
        if 'workflow_executor' not in self.custom_argument_groups:
            workflow_executor_group = self.parser.add_argument_group(
                title='optional workflow executor options',
                description=
                'Additional, optional settings for the workflow execution controls'
            )
            self.custom_argument_groups[
                'workflow_executor'] = workflow_executor_group
        else:
            log_helper.warning(
                __name__,
                'The workflow executor parser group already exists. ' +
                'Workflow options are added to the main parser instead',
                root=self.mpi_root,
                comm=self.mpi_comm)
            workflow_executor_group = self.parser
        for arg_param in self.workflow_executor.get_all_parameter_data():
            # Add the parameter to the argument parser
            arg_name = "--" + self.workflow_executor.workflow_identifier + target_seperator + arg_param[
                'name']
            arg_action = 'store'
            arg_default = arg_param['default']
            arg_required = arg_param['required'] and (arg_default is None)
            arg_type = arg_param['dtype']
            arg_choices = arg_param['choices']
            arg_help = arg_param['help']
            arg_dest = self.workflow_executor.workflow_identifier + target_seperator + arg_param[
                'name']
            argument_group = self.required_argument_group if arg_required else workflow_executor_group
            # Add the argument to the proper group
            argument_group.add_argument(
                arg_name,  # <-- Required, user specified arg name
                action=arg_action,  #     Constant. We define this not the user.
                # nargs=1,                    Don't use. Leave as default
                # const=None,                 Don't use. We don't use this type of action
                default=
                arg_default,  # <-- Optional default value for the argument
                type=arg_type,  # <-- Optional dtype of the argument
                choices=arg_choices,  # <-- Optional Key may be missing.
                required=arg_required,  # <-- Optional
                help=arg_help,  # <-- Required
                # metavar               #     Don't use. Positional analysis arguments
                #                       #     are not allowed
                dest=arg_dest)  #     Automatically determined by the name

        # Add the help argument
        self.optional_argument_group.add_argument(
            '-h',
            '--help',
            action='help',
            default=argparse.SUPPRESS,
            help='show this help message and exit')

        # Remove the arguments from this driver that cannot be understood by the analysis
        parsed_arguments = vars(self.parser.parse_args())
        parsed_arguments.pop(self.profile_arg_name, None)
        parsed_arguments.pop(self.output_save_arg_name, None)
        parsed_arguments.pop(self.profile_mem_arg_name, None)
        parsed_arguments.pop(self.log_level_arg_name, None)
        parsed_arguments.pop(self.script_arg_name, None)

        # Consume the command line arguments for the workflow executor and the analysis
        self.workflow_executor_arguments = {}
        for arg_param in self.workflow_executor.get_all_parameter_data():
            arg_dest = self.workflow_executor.workflow_identifier + target_seperator + arg_param[
                'name']
            if arg_dest in parsed_arguments:
                param_value = parsed_arguments.pop(arg_dest)
                self.workflow_executor[arg_param['name']] = param_value
                self.workflow_executor_arguments[
                    arg_param['name']] = param_value

        # Consume the arguments for the different analyses
        self.analysis_arguments = parsed_arguments
        for arg_key, arg_value in self.analysis_arguments.iteritems():
            ana_identifier, arg_key = arg_key.split(target_seperator)
            self.workflow_executor.analysis_tasks[ana_identifier][
                arg_key] = arg_value
            # Make sure we use the user-specified log level, even if it is set differently in the scripts
            if self.user_log_level is not None:
                log_helper.set_log_level(
                    level=log_helper.log_levels[self.user_log_level])
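
Each analysis parameter becomes one command line option named '--<analysis identifier><separator><parameter name>' and is placed in an argument group per analysis. A minimal argparse sketch of that scheme with hypothetical names; the ':' separator is assumed purely for illustration, since the actual identifier_argname_seperator value is not shown here.

import argparse

parser = argparse.ArgumentParser(add_help=False)
analysis_group = parser.add_argument_group(title='ana_0 : example_analysis')
analysis_group.add_argument('--ana_0:msidata',            # hypothetical analysis parameter
                            dest='ana_0:msidata',
                            required=True,
                            help='Hypothetical input dataset of analysis ana_0')

parsed = vars(parser.parse_args(['--ana_0:msidata', '/path/to/data.h5']))
print(parsed)   # {'ana_0:msidata': '/path/to/data.h5'}
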
Example #21
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised if the analysis class is unknown

        """

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
        except:
            self.remove_output_target()
            raise

        if self.workflow_executor is None:
            self.remove_output_target()
            log_helper.error(
                __name__,
                'Missing --script parameter or workflow_executor object')
            raise ValueError('Workflow not initialized')

        # Add and parse the command line arguments specific to the analysis to determine the analysis settings
        try:
            self.add_and_parse_workflow_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Enable time and usage profiling
        try:
            # Enable time and usage profiling if requested
            if self.profile_analyses:
                try:
                    self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(
                        self.profile_analyses)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of time and usage not available due to missing packages."
                    )
                    log_helper.warning(__name__, e.message)
            # Enable memory profiling if requested
            if self.profile_analyses_mem:
                try:
                    self.workflow_executor.analysis_tasks.enable_memory_profiling(
                        self.profile_analyses_mem)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of memory usage not available due to missing packages"
                    )
                    log_helper.warning(__name__, e.message)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Execute the analysis
        try:
            log_helper.debug(__name__,
                             'Analysis arguments: ' +
                             str(self.analysis_arguments),
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            self.workflow_executor.execute()
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our mpi root rank. NOTE: When running in serial
        # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:

            # Print usage profiles if available
            try:
                self.print_time_and_usage_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print time and usage profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print memory profile data if available
            try:
                self.print_memory_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print memory profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(
                        self.workflow_executor.run_info['execution_time'],
                        list):
                    # Time for each task to execute
                    log_helper.info(
                        __name__,
                        "Time in seconds for each analysis process: " +
                        str(self.workflow_executor.run_info['execution_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Start times of each task
                    log_helper.info(
                        __name__,
                        "Time when each of the processes started: " +
                        str(self.workflow_executor.run_info['start_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Stop times for each task

                    log_helper.info(
                        __name__,
                        "Time when each of the processes finished: " +
                        str(self.workflow_executor.run_info['end_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)

                    # Compile the time to execute string
                    exec_time_array = np.asarray(
                        self.workflow_executor.run_info['execution_time'],
                        dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                        "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(self.workflow_executor.
                                           run_info['execution_time']) + " s"
                log_helper.info(__name__,
                                "Time to execute analysis: " +
                                exec_time_string,
                                root=self.mpi_root,
                                comm=self.mpi_comm)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            for analysis in self.workflow_executor.analysis_tasks:
                omsi_analysis_manager.create_analysis_static(
                    analysis_parent=self.output_target, analysis=analysis)
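
The timing report near the end of main() above reduces one execution time per MPI rank to a single summary line. The following is a small, self-contained sketch of that reduction with made-up per-rank values; it does not depend on BASTet.

# Summarize per-rank execution times as "max s (min, mean)", mirroring the report above.
import numpy as np

execution_times = ['12.4', '11.9', '13.1']   # made-up time in seconds reported by each MPI rank
exec_time_array = np.asarray(execution_times, dtype=float)
exec_time_string = str(exec_time_array.max()) + " s " + \
    "    ( min = " + str(exec_time_array.min()) + " , mean = " + str(exec_time_array.mean()) + " )"
print("Time to execute analysis: " + exec_time_string)
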
Example #22
0
File: generic.py Project: biorack/BASTet
from omsi.analysis.base import analysis_base
from omsi.datastructures.analysis_data import data_dtypes
from omsi.datastructures.dependency_data import dependency_dict
from omsi.datastructures.analysis_data import parameter_data
from omsi.shared.log import log_helper

try:
    import cloudpickle   # Use the version of cloud-pickle installed on the system
    log_helper.debug(__name__, "Using system cloudpickle module")
except ImportError:
    try:
        import omsi.shared.third_party.cloudpickle as cloudpickle
        log_helper.debug(__name__, "Using fallback cloudpickle version")
    except ImportError:
        log_helper.warning(__name__, "cloudpickle could not be imported. Using standard pickle instead. " +
                           " Some features may not be available.")
        import pickle as cloudpickle
import numpy as np


def bastet_analysis(output_names=None, parameter_specs=None, name_key="undefined"):
    """
    Decorator used to wrap a function and replace it with an analysis_generic object
    that behaves like a function but adds the ability to save the
    analysis to file and track provenance.

    This is essentially the same as analysis_generic.from_function(....).

    :param func: The function to be wrapped
    :param output_names: Optional list of strings with the names of the outputs
    :param parameter_specs: Optional list of omsi.datastructures.analysis_data.parameter_data with
Example #23
0
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised in case that the analysis class is unknown
        """
        # Get the analysis object if needed
        if self.add_analysis_class_arg:
            try:
                self.get_analysis_class_from_cl()
            except (ImportError, AttributeError, ValueError):
                pass

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        # Check if we have a valid analysis class
        if self.analysis_class is None:
            print self.parser.print_help()
            raise ValueError('Could not determine the analysis class.')
        if not issubclass(self.analysis_class, analysis_base):
            print self.parser.print_help()
            raise ValueError('Analysis class is not a subclass of analysis_base.')

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
            # Add and parse the command line arguments specific to the analysis to determine the analysis settings
            self.add_and_parse_analysis_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Call the execute function of the analysis
        try:
            # Create the analysis object
            analysis_object = self.analysis_class()
            # Enable time and usage profiling if requested
            if self.profile_analysis:
                try:
                    analysis_object.enable_time_and_usage_profiling(self.profile_analysis)
                except ImportError as e:
                    log_helper.warning(__name__, "Profiling of time and usage not available due to missing packages.")
                    log_helper.warning(__name__, e.message)
            # Enable memory profiling if requested
            if self.profile_analysis_mem:
                try:
                    analysis_object.enable_memory_profiling(self.profile_analysis_mem)
                except ImportError as e:
                    log_helper.warning(__name__, "Profiling of memory usage not available due to missing packages")
                    log_helper.warning(__name__, e.message)
            # Execute the analysis
            log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments))
            analysis_object.execute(**self.analysis_arguments)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our mpi root rank. NOTE: When running in serial
        # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:
            # Print the profiling results of time and usage
            if self.profile_analysis:
                print ""
                print "PROFILING DATA: TIME AND USAGE"
                print ""
                analysis_object.get_profile_stats_object(consolidate=True).print_stats()

            # Print the profiling results for memory usage
            if self.profile_analysis_mem:
                print ""
                print "PROFILING DATA: MEMORY"
                print ""
                print analysis_object.get_memory_profile_info()

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(analysis_object.run_info['execution_time'], list):
                    # Time for each task to execute
                    log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                     str(analysis_object.run_info['execution_time']))
                    # Start times of each task
                    log_helper.info(__name__, "Time when each of the processes started: " +
                                              str(analysis_object.run_info['start_time']))
                    # Stop times for each task

                    log_helper.info(__name__, "Time when each of the processes finished: " +
                                              str(analysis_object.run_info['end_time']))

                    # Compile the time to execute string
                    exec_time_array = np.asarray(analysis_object.run_info['execution_time'], dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                                       "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(analysis_object.run_info['execution_time']) + " s"
                log_helper.info(__name__, "Time to execute analysis: " + exec_time_string)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=analysis_object)
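
The profiling hooks used above (enable_time_and_usage_profiling and the later get_profile_stats_object call) appear to wrap the standard cProfile/pstats machinery, as the record_pre/postexecute examples further below suggest. The following is a minimal standalone sketch of that pattern, with a dummy workload standing in for analysis_object.execute(...); it is an illustration, not the BASTet implementation.

# Profile a stand-in workload and print the five most expensive calls by cumulative time.
import cProfile
import pstats

profiler = cProfile.Profile()
profiler.enable()
sum(i * i for i in range(100000))   # stand-in for analysis_object.execute(...)
profiler.disable()
pstats.Stats(profiler).sort_stats('cumulative').print_stats(5)
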
Example #24
0
    def __compute_file_info(cls, filename, resolution):
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the mz axis, data type for the intensities, format type

        :return: Numpy array with mz axis
        :return: string with data type
        :return: imzml file type
        :return:
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)  # NOTE: mz_axes is a tuple
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)
        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
        for ind in range(coordinates.shape[0]):  # for ind, loc in enumerate(reader.coordinates):
            mz, intens = reader.getspectrum(ind)
            if mz == mz_axes:
                pass
            else:
                file_type = cls.available_imzml_types['processed']
                if min_mz > np.amin(mz):
                    min_mz = np.amin(mz)
                if max_mz < np.amax(mz):
                    max_mz = np.amax(mz)
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
            f = np.ceil(1e6 * np.log(max_mz / min_mz) / resolution)
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), f)
            log_helper.info(
                __name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        for k, v in reader.imzmldict.iteritems():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser and read the metadata
        del reader

        # Parse the metadata for the file. We try to parse only the header and ignore the
        # <run> group in the XML file to avoid going through the whole file again
        # while extracting the majority of the relevant metadata
        try:
            with open(filename, 'r') as ins:
                metdata_header = ''
                for line in ins:
                    if '<run' in line:
                        break
                    else:
                        metdata_header += line
                metdata_header += '</mzML>'
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.iteritems():
                    store_value = metadata_value(
                        name=k,
                        value=v,
                        unit=None,
                        description=str(k) +
                        " extracted from imzML XML header.",
                        ontology=None)
                    if k == 'instrumentConfigurationList':
                        instrument_metadata[k] = store_value
                    elif k == 'dataProcessingList':
                        method_metadata[k] = store_value
                    elif k == 'scanSettingsList':
                        dataset_metadata[k] = store_value
                    elif k == 'softwareList':
                        method_metadata[k] = store_value
                    elif k == 'sampleList':
                        method_metadata[k] = store_value
                    else:
                        dataset_metadata[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
                    name='imzml_xml_metadata_header',
                    value=metdata_header,
                    unit=None,
                    description='XML imzML header',
                    ontology=None)
        except:
            log_helper.warning(
                __name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(
            mz_axes
        ), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
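
The re-interpolation step above picks the number of log-spaced bins so that neighbouring m/z bins are separated by approximately the requested resolution in parts-per-million. A short self-contained check of that formula with illustrative numbers (not taken from any real imzML file):

# Rebuild the m/z axis for a processed-mode file and verify the resulting bin spacing in ppm.
import numpy as np

min_mz, max_mz, resolution = 100.0, 1000.0, 5.0     # illustrative values only
num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)
print("number of bins: %d" % num_bins)
print("first bin spacing: %.3f ppm" % (1e6 * (mz_axes[1] - mz_axes[0]) / mz_axes[0]))
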
Example #25
0
    def s_read_acqu(filename):
        """Construct an m/z axis for the given acqu file.

           :param filename: String with the name+path for the acqu file.
           :type filename: string

           :returns: Dictionary with the parsed metadata information

        """
        # Read the complete acqu file
        acqu = open(filename, 'r')
        lines = acqu.readlines()  # read all lines of the file into a list
        acqu.close()
        #
        # Parse the acqu file and store all data in a python dictionary
        #
        acqu_dict = {}
        curr_line = 0
        while curr_line < len(lines):
            # Skip lines with no data
            if len(lines[curr_line].rstrip("\n").rstrip("\r").rstrip(
                    " ")) == 0:
                curr_line += 1
                continue
            # All variables should start with ## in the acqu file
            if not lines[curr_line].startswith("##"):
                log_helper.warning(__name__, "WARNING: Error while reading line " + str(curr_line) +
                                   " of the acqu file. The error may have occurred on the previous line")
                if curr_line > 0:
                    log_helper.debug(
                        __name__,
                        str(curr_line - 1) + ": " + lines[curr_line - 1])
                log_helper.debug(__name__, str(curr_line) + ": " + lines[curr_line])
                curr_line += 1
                continue

            sl = lines[curr_line].split("=")
            key = sl[0]
            # Remove beginning spaces and endline and tabs at the end from the
            # value
            value = sl[1].lstrip(' ').rstrip("\n").rstrip("\r").rstrip(" ")

            # Try to convert the value to a number
            is_number = False
            try:
                value = int(value)
                is_number = True
            except ValueError:
                try:
                    value = float(value)
                    is_number = True
                except ValueError:
                    pass

            # Check whether the entry defines a vector of numbers
            unicode_value = unicode(value)
            if not is_number and unicode_value.startswith(
                    "(") and unicode_value.endswith(")"):
                # How many values and lines do we need to read?
                sv = unicode_value.lstrip("(").rstrip(")").split("..")
                # low = int(sv[0])
                high = int(sv[1])
                num_vals = high + 1
                vals_per_line = 8
                num_lines = int(math.ceil(num_vals / float(vals_per_line)))
                # Read all the values into a list and convert the numbers if
                # possible
                value = []
                curr_line += 1
                # print str(num_lines) + " : " + sv[0] + "..." + sv[1]
                for _ in range(0, num_lines):

                    sl = lines[curr_line].rstrip("\n").rstrip("\r").rstrip(
                        " ").split(" ")
                    try:
                        sconv = [int(ix) for ix in sl]
                    except ValueError:
                        try:
                            sconv = [float(ix) for ix in sl]
                        except ValueError:
                            sconv = sl
                    value = value + sconv
                    curr_line += 1

                acqu_dict[key] = value
            else:
                acqu_dict[key] = value
                curr_line += 1

        return acqu_dict
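
The branch above that handles "(low..high)" entries expects the following lines to carry high+1 numbers, eight per line. A compact, self-contained sketch of that bookkeeping on a made-up acqu-style entry:

# Expand a "(0..9)" vector entry spread over two lines into a single list of numbers.
import math

lines = ["##$CALIB= (0..9)",          # made-up acqu-style entry
         "1 2 3 4 5 6 7 8",
         "9 10"]
key, value = lines[0].split("=")
high = int(value.strip().lstrip("(").rstrip(")").split("..")[1])
num_lines = int(math.ceil((high + 1) / 8.0))          # eight values per line
values = []
for entry in lines[1:1 + num_lines]:
    values += [int(v) for v in entry.split()]
print("%s -> %s" % (key, values))
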
Example #26
0
File: generic.py Project: wholtz/BASTet
from omsi.analysis.base import analysis_base
from omsi.datastructures.analysis_data import data_dtypes
from omsi.datastructures.dependency_data import dependency_dict
from omsi.datastructures.analysis_data import parameter_data
from omsi.shared.log import log_helper

try:
    import cloudpickle  # Use the version of cloud-pickle installed on the system
    log_helper.debug(__name__, "Using system cloudpickle module")
except ImportError:
    try:
        import omsi.shared.third_party.cloudpickle as cloudpickle
        log_helper.debug(__name__, "Using fallback cloudpickle version")
    except ImportError:
        log_helper.warning(
            __name__,
            "cloudpickle could not be imported. Using standard pickle instead. "
            + " Some features may not be available.")
        import pickle as cloudpickle
import numpy as np


def bastet_analysis(output_names=None,
                    parameter_specs=None,
                    name_key="undefined"):
    """
    Decorator used to wrap a function and replace it with an analysis_generic object
    that behaves like a function but adds the ability to save the
    analysis to file and track provenance.

    This is essentially the same as analysis_generic.from_function(....).
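
A hedged usage sketch for the decorator above: the decorated function, its argument, and the chosen keyword values are illustrative assumptions rather than code from the BASTet sources, and the exact behaviour of the returned object is inferred from the docstring.

# Hypothetical example: wrap a plain function so it behaves like an analysis_generic object.
@bastet_analysis(output_names=['total_intensity'], name_key='sum_intensities')
def sum_intensities(msidata):
    return np.sum(msidata)
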
Example #27
0
File: img_file.py Project: biorack/BASTet
    def __init__(self, hdr_filename=None, t2m_filename=None, img_filename=None, basename=None, requires_slicing=True):
        """Open an img file for data reading.

            :param hdr_filename: The name of the hdr header file
            :type hdr_filename: string

            :param t2m_filename: The name of the t2m_filename
            :type t2m_filename: string

            :param img_filename: The name of the img data file
            :type img_filename: string

            :param basename: Instead of img_filename, t2m_filename, and hdr_filename one may also supply just
                             a single basename. The basename is completed with the .img, .t2m, .hdr extension
                             to load the data.
            :type basename: string

            :param requires_slicing: Unused here. Slicing is always supported by this reader.
            :type requires_slicing: Boolean

            :raises ValueError: Raised if both basename and the hdr_filename, t2m_filename, img_filename
                                parameters are specified.
        """
        super(img_file, self).__init__(basename, requires_slicing)
        self.data_type = 'uint16'
        self.shape = [0, 0, 0]  # Number of pixels in x,y, and z. NOTE: Type changed to tuple later on.
        self.mz = 0  # A numpy vector with the m/z values of the instrument

        if basename and hdr_filename and t2m_filename and img_filename:
            raise ValueError(
                "Conflicting input. Provide either basename or the " +
                "hdr_filename,t2m_filename,img_filename parameters but not both.")
        if basename:
            basefile = basename
            if os.path.isdir(basename):
                filelist = self.get_files_from_dir(basename)
                log_helper.log_var(__name__, filelist=filelist)
                if len(filelist) > 0:
                    basefile = filelist[0]
                else:
                    raise ValueError("No valid img file found in the given directory.")
            elif basefile.endswith(".img") and os.path.exists(basefile):
                basefile = basefile.rstrip(".img")
            elif basefile.endswith(".hdr") and os.path.exists(basefile):
                basefile = basefile.rstrip(".hdr")
            elif basefile.endswith(".t2m") and os.path.exists(basefile):
                basefile = basefile.rstrip(".t2m")

            log_helper.log_var(__name__, basefile=basefile)
            if os.path.exists(basefile + ".hdr") and \
                    os.path.exists(basefile + ".t2m") and \
                    os.path.exists(basefile + ".img"):
                hdr_filename = basefile + ".hdr"
                t2m_filename = basefile + ".t2m"
                img_filename = basefile + ".img"
            else:
                raise ValueError("No valid img file found for the given basename.")
        elif hdr_filename and t2m_filename and img_filename:
            pass  # Nothing to be done
        else:
            raise ValueError("Missing input parameter. Either provide: " +
                             " i) basename or ii) hdr_filename, t2m_filename, img_filename")

        # Initialize the x and y length
        hdr = open(hdr_filename, 'rb')
        hdrdata = np.fromfile(file=hdr_filename, dtype='int16', count=-1)
        self.shape[0] = int(hdrdata[23])
        self.shape[1] = int(hdrdata[22])
        hdr.close()

        # Initialize the z length
        t2m = open(t2m_filename, 'rb')
        self.mz = np.fromfile(file=t2m, dtype='float32', count=-1)
        self.shape[2] = self.mz.shape[0]
        t2m.close()

        # Convert the shape variable to the expected tuple
        self.shape = tuple(self.shape)

        # Open the img file with the spectrum data
        self.img_filename = img_filename
        self.file_opened = False
        try:
            self.m_img_file = np.memmap(filename=self.img_filename,
                                        dtype=self.data_type,
                                        shape=self.shape,
                                        mode='r',
                                        order='C')
            self.file_opened = True
        except ValueError:
            # Check if the size of the file matches what we expect
            imgsize = os.stat(self.img_filename).st_size
            itemsize = np.dtype(self.data_type).itemsize
            expectednumvalues = int(self.shape[0]) * int(self.shape[1]) * int(self.shape[2])
            expectedsize = expectednumvalues * int(itemsize)
            sizedifference = expectedsize - imgsize
            log_helper.warning(__name__ , "IMG size: " + str(imgsize) + " Expected size: " + \
                                          str(expectedsize) + "  (difference="+str(sizedifference) + ")")
            if imgsize < expectedsize:
                # Check whether the missing data aligns with images or spectra
                slicesize = int(self.shape[0]) * int(self.shape[1]) * itemsize
                spectrumsize = int(self.shape[2]) * itemsize
                percentmissing = float(sizedifference)/float(expectedsize)
                valuesmissing = float(sizedifference) / itemsize
                warnings.warn("WARNING: Missing "+str(sizedifference) +
                              " bytes in img file (missing " + str(valuesmissing) +
                              " intensity values; "+str(percentmissing)+"%)." +
                              " Expected shape: "+str(self.shape))
                # Define how we should deal with the error
                expandslice = (sizedifference % slicesize) == 0
                expandspectra = (sizedifference % spectrumsize) == 0
                if not expandslice:
                    expandspectra = True
                # Complete missing spectra
                if expandspectra:
                    warnings.warn("Dealing with missing data in img file by completing last spectra with 0's.")
                    # TODO np.require create an in-memory copy of the full data. Allow usage of memmap'ed tempfile.
                    tempmap = np.require(np.memmap(filename=self.img_filename,
                                                   dtype=self.data_type,
                                                   mode='r',
                                                   order='C'),
                                         requirements=['O', 'C'])
                    # Extend the memmap to the expected size
                    tempmap.resize((expectednumvalues, ))
                    # Reshape the memmap to the expected shape
                    self.m_img_file = tempmap.reshape(self.shape, order='C')
                    self.file_opened = True
                # Complete missing slices
                elif expandslice:
                    slicesmissing = sizedifference / slicesize
                    self.mz = self.mz[:(-slicesmissing)]
                    warnings.warn("Dealing with missing data in img file by updating he m/z axis.." +
                                  " It looks like the m/z axis data may be inconsistent" +
                                  " with the binary data. Removing "+str(slicesmissing) +
                                  " bins from the m/z axis.")
                    self.shape = list(self.shape)
                    self.shape[2] = self.mz.shape[0]
                    self.shape = tuple(self.shape)
                    self.m_img_file = np.memmap(filename=self.img_filename,
                                                dtype=self.data_type,
                                                shape=self.shape,
                                                mode='r',
                                                order='C')
                    self.file_opened = True
                else:
                    raise
            else:
                raise
        except:
            log_helper.error(__name__, "Error while opening the img file: " + img_filename)
            raise
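
When the memmap cannot be created with the expected shape, the except ValueError branch above decides whether the .img file is short by whole image slices or by whole spectra. A self-contained sketch of that arithmetic with made-up numbers:

# Check whether the missing bytes align with whole (x, y) slices or with whole spectra.
import numpy as np

shape = (10, 20, 1000)                                    # made-up (x, y, m/z) shape
itemsize = np.dtype('uint16').itemsize
expectedsize = shape[0] * shape[1] * shape[2] * itemsize
imgsize = expectedsize - shape[0] * shape[1] * itemsize   # pretend one m/z slice is missing
sizedifference = expectedsize - imgsize
slicesize = shape[0] * shape[1] * itemsize
spectrumsize = shape[2] * itemsize
print("missing %d bytes (%d values)" % (sizedifference, sizedifference // itemsize))
print("aligns with whole slices:  %s" % ((sizedifference % slicesize) == 0))
print("aligns with whole spectra: %s" % ((sizedifference % spectrumsize) == 0))
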
Example #28
0
    def record_postexecute(self, execution_time=None):
        """
        Function used to record runtime information after the task we want to track is completed, e.g.
        the `execute_analysis(...)` function of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information.

        When overwriting the function we should call super(...,self).runinfo_record_postexecute(execution_time)
        in the custom version to ensure that the execution and end_time are properly
        recorded.

        :param execution_time: The total time it took to execute the analysis. May be None, in which
            case the function will attempt to compute the execution time based on the start_time
            (if available) and the current time.

        :param comm: Used for logging only. The MPI communicator to be used. Default value is None,
            in which case MPI.COMM_WORLD is used.

        """
        log_helper.debug(__name__,
                         'Recording post-execution runtime data',
                         root=self.mpi_root,
                         comm=self.mpi_comm)
        # Finalize recording of post execution provenance
        self['end_time'] = unicode(datetime.datetime.now())
        if execution_time is not None:
            self['execution_time'] = unicode(execution_time)
        elif 'start_time' in self:
            start_time = run_info_dict.string_to_time(self['start_time'])
            stop_time = run_info_dict.string_to_time(self['end_time'])
            self['execution_time'] = unicode(
                stop_time - start_time
            )  # TODO: This only gives execution time in full seconds right now
        else:
            self['execution_time'] = None
        # Attempt to record psutil data
        try:
            import psutil
            process = psutil.Process()
            self['memory_info_after'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(
                __name__,
                'psutil not installed. Recording of part of runtime information not possible',
                root=self.mpi_root,
                comm=self.mpi_comm)
        except:
            warnings.warn(
                "Recording of psutil-based runtime information failed: " +
                str(sys.exc_info()))

        # Record the time and use profiling data if possible
        if self.__time_and_use_profiler is not None:
            self.__time_and_use_profiler.disable()
            self.__time_and_use_profiler.create_stats()
            self['profile'] = unicode(self.__time_and_use_profiler.stats)
            # Save the summary statistics for the profiling data
            stats_io = StringIO.StringIO()
            profiler_stats = pstats.Stats(
                self.__time_and_use_profiler,
                stream=stats_io).sort_stats('cumulative')
            profiler_stats.print_stats()
            self['profile_stats'] = stats_io.getvalue()

        # Record the memory profiling data if possible
        if self.__memory_profiler is not None and self.get_profile_memory():
            log_helper.debug(__name__,
                             'Recording memory profiling data',
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            mem_stats_io = StringIO.StringIO()
            memory_profiler.show_results(self.__memory_profiler,
                                         stream=mem_stats_io)
            self['profile_mem'] = unicode(self.__memory_profiler.code_map)
            self['profile_mem_stats'] = mem_stats_io.getvalue()
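
The psutil block above is deliberately optional: the import sits inside the try so that a missing package only reduces the amount of recorded runtime information instead of failing the task. A standalone sketch of the same pattern:

# Record process memory usage if psutil is available, otherwise degrade gracefully.
try:
    import psutil
    process = psutil.Process()                       # the current process
    print("memory_info_after: %s" % str(process.memory_info()))
except ImportError:
    print("psutil not installed; skipping memory recording")
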
Example #29
0
    def s_read_acqu(filename):
        """Construct an m/z axis for the given acqu file.

           :param filename: String with the name+path for the acqu file.
           :type filename: string

           :returns: Dictionary with the parsed metadata information

        """
        # Read the complete acqu file
        acqu = open(filename, 'r')
        lines = acqu.readlines()  # read all lines of the file into a list
        acqu.close()
        #
        # Parse the acqu file and store all data in a python dictionary
        #
        acqu_dict = {}
        curr_line = 0
        while curr_line < len(lines):
            # Skip lines with no data
            if len(lines[curr_line].rstrip("\n").rstrip("\r").rstrip(" ")) == 0:
                curr_line += 1
                continue
            # All variables should start with ## in the acqu file
            if not lines[curr_line].startswith("##"):
                log_helper.warning(__name__, "WARNING: Error while reading line " + str(curr_line) +
                                   " of the acqu file. The error may have occurred on the previous line")
                if curr_line > 0:
                    log_helper.debug(__name__, str(curr_line - 1) + ": " + lines[curr_line - 1])
                log_helper.debug(__name__, str(curr_line) + ": " + lines[curr_line])
                curr_line += 1
                continue

            sl = lines[curr_line].split("=")
            key = sl[0]
            # Remove beginning spaces and endline and tabs at the end from the
            # value
            value = sl[1].lstrip(' ').rstrip("\n").rstrip("\r").rstrip(" ")

            # Try to convert the value to a number
            is_number = False
            try:
                value = int(value)
                is_number = True
            except ValueError:
                try:
                    value = float(value)
                    is_number = True
                except ValueError:
                    pass

            # Check whether the entry defines a vector of numbers
            unicode_value = unicode(value)
            if not is_number and unicode_value.startswith("(") and unicode_value.endswith(")"):
                # How many values and lines do we need to read?
                sv = unicode_value.lstrip("(").rstrip(")").split("..")
                # low = int(sv[0])
                high = int(sv[1])
                num_vals = high + 1
                vals_per_line = 8
                num_lines = int(math.ceil(num_vals / float(vals_per_line)))
                # Read all the values into a list and convert the numbers if
                # possible
                value = []
                curr_line += 1
                # print str(num_lines) + " : " + sv[0] + "..." + sv[1]
                for _ in range(0, num_lines):

                    sl = lines[curr_line].rstrip("\n").rstrip("\r").rstrip(" ").split(" ")
                    try:
                        sconv = [int(ix) for ix in sl]
                    except ValueError:
                        try:
                            sconv = [float(ix) for ix in sl]
                        except ValueError:
                            sconv = sl
                    value = value + sconv
                    curr_line += 1

                acqu_dict[key] = value
            else:
                acqu_dict[key] = value
                curr_line += 1

        return acqu_dict
Example #30
0
    def v_qspectrum(cls, analysis_object, x, y, viewer_option=0):
        """Implement support for qspectrum URL requests for the viewer"""
        # Retrieve the h5py objects for the required datasets from the local peak finding
        if viewer_option == 0:
            from omsi.shared.data_selection import check_selection_string, selection_type, selection_to_indexlist
            import numpy as np
            peak_mz = analysis_object['peak_mz']
            peak_values = analysis_object['peak_value']
            array_indices = analysis_object['peak_arrayindex'][:]
            indata_mz = analysis_object['indata_mz']
            # Determine the shape of the original raw data
            if (indata_mz is None) or (array_indices is None):
                return None, None
            num_x = array_indices[:, 0].max()
            num_y = array_indices[:, 1].max()
            num_mz = indata_mz.shape[0]
            num_spectra = array_indices.shape[0]
            # Determine the size of the selection and the set of selected items
            x_list = selection_to_indexlist(x, num_x)
            y_list = selection_to_indexlist(y, num_y)
            if (check_selection_string(x) == selection_type['indexlist']) and \
                    (check_selection_string(y) == selection_type['indexlist']):
                if len(x_list) == len(y_list):
                    items = [(x_list[i], y_list[i])
                             for i in xrange(0, len(x_list))]
                else:
                    return None, None
            else:
                items = [0] * (len(x_list) * len(y_list))
                index = 0
                for xi in x_list:
                    for yi in y_list:
                        items[index] = (xi, yi)
                        index += 1

            shape_x = len(items)
            shape_y = 1
            shape_z = num_mz
            # Initialize the data cube to be returned
            data = np.zeros((shape_x, shape_y, shape_z),
                            dtype=peak_values.dtype)
            # Fill the non-zero locations for the data cube with data
            for ni, ci in enumerate(items):
                try:
                    # Pixel indices may be out of order (e.g., when we use MPI) so we look up the pixel location
                    current_index = np.nonzero(
                        np.logical_and(array_indices[0] == ci[0],
                                       array_indices[1] == ci[1]))[0][0]
                except:
                    log_helper.warning(
                        __name__,
                        "Requested pixel not found: " + str(items[ni]))
                    continue
                current_dx = ni
                current_dy = 0
                start_index = array_indices[current_index][2]
                if current_index < num_spectra:
                    end_index = array_indices[(current_index + 1)][2]
                else:
                    end_index = peak_values.size
                if start_index != end_index:
                    temp_values = peak_values[start_index:end_index]
                    temp_mz = peak_mz[start_index:end_index]
                    data[current_dx, current_dy, temp_mz] = temp_values
                else:
                    # The start and end index may be the same in case that
                    # no peaks were found for the given spectrum
                    # The data is already initialized to 0 so there is nothing to do here
                    pass

            if len(items) == 1:
                data = data.reshape((shape_x, shape_z))

            # Return the spectra and indicate that no customMZ data values (i.e. None) are needed
            return data, None

        elif viewer_option > 0:
            return super(omsi_findpeaks_local,
                         cls).v_qspectrum(analysis_object, x, y,
                                          viewer_option - 1)
        else:
            return None, None
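
The lookup above relies on a flat peak layout: peak_arrayindex stores one (x, y, start_offset) row per spectrum, and the peaks of spectrum i occupy the half-open range from its offset to the next row's offset. A toy, self-contained model of that layout with made-up data:

# Reassemble the full spectrum of one pixel from flat peak_mz/peak_values arrays.
import numpy as np

peak_mz = np.array([5, 7, 2, 9])                   # m/z bin index of each stored peak
peak_values = np.array([1.0, 2.0, 3.0, 4.0])       # intensity of each stored peak
peak_arrayindex = np.array([[0, 0, 0],             # pixel (0, 0) starts at offset 0
                            [0, 1, 2]])            # pixel (0, 1) starts at offset 2
num_mz = 10

def spectrum_for_pixel(xi, yi):
    # Rows may be out of order (e.g., with MPI), so locate the row for this pixel explicitly
    idx = np.nonzero(np.logical_and(peak_arrayindex[:, 0] == xi,
                                    peak_arrayindex[:, 1] == yi))[0][0]
    start = peak_arrayindex[idx, 2]
    end = peak_arrayindex[idx + 1, 2] if idx + 1 < peak_arrayindex.shape[0] else peak_values.size
    out = np.zeros(num_mz, dtype=peak_values.dtype)
    out[peak_mz[start:end]] = peak_values[start:end]
    return out

print(spectrum_for_pixel(0, 1))   # non-zero only at bins 2 and 9
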
Example #31
0
    def record_preexecute(self):
        """
        Record basic runtime information in this dict before the execution is started.


        Function used to record runtime information prior to executing the process we want to track, e.g.,
        the `execute_analysis(...)` of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information. All runtime data should be recorded in the
        main dict (i.e., self). This ensures in the case of standard analysis that
        the data is stored in the HDF5 file. Other data should be stored in separate
        variables that we may add to the object.

        When overwriting the function we should typically call super(...,self).runinfo_record_preexecute()
        last in the custom version to ensure that the start_time is properly recorded right before
        the execution of the analysis.

        """
        log_helper.debug(__name__,
                         'Recording pre-execution runtime data',
                         root=self.mpi_root,
                         comm=self.mpi_comm)
        # Record basic runtime environment information using the platform module
        try:
            self['architecture'] = unicode(platform.architecture())
            self['java_ver'] = unicode(platform.java_ver())
            self['libc_ver'] = unicode(platform.libc_ver())
            self['linux_distribution'] = unicode(platform.linux_distribution())
            self['mac_ver'] = unicode(platform.mac_ver())
            self['machine'] = unicode(platform.machine())
            self['node'] = unicode(platform.node())
            self['platform'] = unicode(platform.platform())
            self['processor'] = unicode(platform.processor())
            self['python_branch'] = unicode(platform.python_branch())
            self['python_build'] = unicode(platform.python_build())
            self['python_compiler'] = unicode(platform.python_compiler())
            self['python_implementation'] = unicode(
                platform.python_implementation())
            self['python_revision'] = unicode(platform.python_revision())
            self['python_version'] = unicode(platform.python_version())
            self['release'] = unicode(platform.release())
            self['system'] = unicode(platform.system())
            self['uname'] = unicode(platform.uname())
            self['version'] = unicode(platform.version())
            self['win32_ver'] = unicode(platform.win32_ver())
        except:
            warnings.warn(
                "WARNING: Recording of platform provenance failed: " +
                str(sys.exc_info()))

        # Attempt to record the svn version information
        try:
            import subprocess
            self['svn_ver'] = subprocess.check_output('svnversion').rstrip(
                '\n')
        except ImportError:
            log_helper.warning(
                __name__,
                'Recording of svn version not possible. subprocess not installed',
                root=self.mpi_root,
                comm=self.mpi_comm)
        except:
            warnings.warn("Recording of svn version information failed: " +
                          str(sys.exc_info()))

        # Attempt to record software library version
        try:
            import numpy as np
            self['numpy_version_full_version'] = unicode(
                np.version.full_version)
            self['numpy_version_release'] = unicode(np.version.release)
            self['numpy_version_git_revision'] = unicode(
                np.version.git_revision)
        except ImportError:
            log_helper.warning(__name__,
                               'Recording of numpy version not possible.',
                               root=self.mpi_root,
                               comm=self.mpi_comm)

        # Attempt to record psutil data
        try:
            import psutil
            self['logical_cpu_count'] = unicode(psutil.cpu_count())
            self['cpu_count'] = unicode(psutil.cpu_count(logical=False))
            process = psutil.Process()
            self['open_files'] = unicode(process.open_files())
            self['memory_info_before'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(
                __name__,
                'psutil not installed. Recording of part of runtime information not possible',
                root=self.mpi_root,
                comm=self.mpi_comm)
        except:
            warnings.warn(
                "Recording of psutil-based runtime information failed: " +
                str(sys.exc_info()))

        # Record the start time for the analysis
        self['start_time'] = unicode(datetime.datetime.now())

        # Enable time and usage profiling if requested
        if self.__profile_time_and_usage:
            self.__time_and_use_profiler = Profile()
            self.__time_and_use_profiler.enable()
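
record_preexecute above stringifies every platform.* value so that it can later be stored as text (for standard analyses this ends up in the HDF5 file alongside the results). A reduced, self-contained sketch of that capture using only the standard library:

# Collect a small platform-provenance dictionary with all values converted to strings.
import platform

provenance = {
    'machine': str(platform.machine()),
    'platform': str(platform.platform()),
    'python_version': str(platform.python_version()),
    'uname': str(platform.uname()),
}
for key, value in sorted(provenance.items()):
    print("%s: %s" % (key, value))
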
Example #32
0
    def __init__(self,
                 hdr_filename=None,
                 t2m_filename=None,
                 img_filename=None,
                 basename=None,
                 requires_slicing=True):
        """Open an img file for data reading.

            :param hdr_filename: The name of the hdr header file
            :type hdr_filename: string

            :param t2m_filename: The name of the t2m_filename
            :type t2m_filename: string

            :param img_filename: The name of the img data file
            :type img_filename: string

            :param basename: Instead of img_filename, t2m_filename, and hdr_filename one may also supply just
                             a single basename. The basename is completed with the .img, .t2m, .hdr extension
                             to load the data.
            :type basename: string

            :param requires_slicing: Unused here. Slicing is always supported by this reader.
            :type requires_slicing: Boolean

            :raises ValueError: Raised if both basename and the hdr_filename, t2m_filename, img_filename
                                parameters are specified.
        """
        super(img_file, self).__init__(basename, requires_slicing)
        self.data_type = 'uint16'
        self.shape = [
            0, 0, 0
        ]  # Number of pixels in x,y, and z. NOTE: Type changed to tuple later on.
        self.mz = 0  # A numpy vector with the m/z values of the instrument

        if basename and hdr_filename and t2m_filename and img_filename:
            raise ValueError(
                "Conflicting input. Provide either basename or the " +
                "hdr_filename,t2m_filename,img_filename parameters but not both."
            )
        if basename:
            basefile = basename
            if os.path.isdir(basename):
                filelist = self.get_files_from_dir(basename)
                log_helper.log_var(__name__, filelist=filelist)
                if len(filelist) > 0:
                    basefile = filelist[0]
                else:
                    raise ValueError(
                        "No valid img file found in the given directory.")
            elif basefile.endswith(".img") and os.path.exists(basefile):
                basefile = basefile.rstrip(".img")
            elif basefile.endswith(".hdr") and os.path.exists(basefile):
                basefile = basefile.rstrip(".hdr")
            elif basefile.endswith(".t2m") and os.path.exists(basefile):
                basefile = basefile.rstrip(".t2m")

            log_helper.log_var(__name__, basefile=basefile)
            if os.path.exists(basefile + ".hdr") and \
                    os.path.exists(basefile + ".t2m") and \
                    os.path.exists(basefile + ".img"):
                hdr_filename = basefile + ".hdr"
                t2m_filename = basefile + ".t2m"
                img_filename = basefile + ".img"
            else:
                raise ValueError(
                    "No valid img file found for the given basename.")
        elif hdr_filename and t2m_filename and img_filename:
            pass  # Nothing to be done
        else:
            raise ValueError(
                "Missing input parameter. Either provide: " +
                " i) basename or ii) hdr_filename, t2m_filename, img_filename")

        # Initialize the x and y length
        hdr = open(hdr_filename, 'rb')
        hdrdata = np.fromfile(file=hdr_filename, dtype='int16', count=-1)
        self.shape[0] = int(hdrdata[23])
        self.shape[1] = int(hdrdata[22])
        hdr.close()

        # Initialize the z length
        t2m = open(t2m_filename, 'rb')
        self.mz = np.fromfile(file=t2m, dtype='float32', count=-1)
        self.shape[2] = self.mz.shape[0]
        t2m.close()

        # Convert the shape variable to the expected tuple
        self.shape = tuple(self.shape)

        # Open the img file with the spectrum data
        self.img_filename = img_filename
        self.file_opened = False
        try:
            self.m_img_file = np.memmap(filename=self.img_filename,
                                        dtype=self.data_type,
                                        shape=self.shape,
                                        mode='r',
                                        order='C')
            self.file_opened = True
        except ValueError:
            # Check if the size of the file matches what we expect
            imgsize = os.stat(self.img_filename).st_size
            itemsize = np.dtype(self.data_type).itemsize
            expectednumvalues = int(self.shape[0]) * int(self.shape[1]) * int(
                self.shape[2])
            expectedsize = expectednumvalues * int(itemsize)
            sizedifference = expectedsize - imgsize
            log_helper.warning(__name__ , "IMG size: " + str(imgsize) + " Expected size: " + \
                                          str(expectedsize) + "  (difference="+str(sizedifference) + ")")
            if imgsize < expectedsize:
                # Check whether the missing data aligns with images or spectra
                slicesize = int(self.shape[0]) * int(self.shape[1]) * itemsize
                spectrumsize = int(self.shape[2]) * itemsize
                percentmissing = float(sizedifference) / float(expectedsize)
                valuesmissing = float(sizedifference) / itemsize
                warnings.warn("WARNING: Missing " + str(sizedifference) +
                              " bytes in img file (missing " +
                              str(valuesmissing) + " intensity values; " +
                              str(percentmissing) + "%)." +
                              " Expected shape: " + str(self.shape))
                # Define how we should deal with the error
                expandslice = (sizedifference % slicesize) == 0
                expandspectra = (sizedifference % spectrumsize) == 0
                if not expandslice:
                    expandspectra = True
                # Complete missing spectra
                if expandspectra:
                    warnings.warn(
                        "Dealing with missing data in img file by completing last spectra with 0's."
                    )
                    # TODO np.require create an in-memory copy of the full data. Allow usage of memmap'ed tempfile.
                    tempmap = np.require(np.memmap(filename=self.img_filename,
                                                   dtype=self.data_type,
                                                   mode='r',
                                                   order='C'),
                                         requirements=['O', 'C'])
                    # Extend the memmap to the expected size
                    tempmap.resize((expectednumvalues, ))
                    # Reshape the memmap to the expected shape
                    self.m_img_file = tempmap.reshape(self.shape, order='C')
                    self.file_opened = True
                # Complete missing slices
                elif expandslice:
                    slicesmissing = sizedifference / slicesize
                    self.mz = self.mz[:(-slicesmissing)]
                    warnings.warn(
                        "Dealing with missing data in img file by updating the m/z axis." +
                        " It looks like the m/z axis data may be inconsistent" +
                        " with the binary data. Removing " +
                        str(slicesmissing) + " bins from the m/z axis.")
                    self.shape = list(self.shape)
                    self.shape[2] = self.mz.shape[0]
                    self.shape = tuple(self.shape)
                    self.m_img_file = np.memmap(filename=self.img_filename,
                                                dtype=self.data_type,
                                                shape=self.shape,
                                                mode='r',
                                                order='C')
                    self.file_opened = True
                else:
                    raise
            else:
                raise
        except:
            log_helper.error(
                __name__, "Error while opening the img file: " + img_filename)
            raise
Example #33
0
    def __populate_analysis__(cls, analysis_group, analysis):
        """
        Populate the given h5py group with the analysis data.

        NOTE: This is a private helper function. Use the corresponding create_analysis function
        of omsi_file_experiment to create a completely new analysis.

        NOTE: At this point we assume that all in-memory dependencies have been resolved. If not,
        then the raw data associated with the given parameter will be saved instead.

        :param analysis_group: h5py group in which the analysis data should be stored.
        :param analysis: Instance of omsi.analysis.analysis_base defining the analysis
        :type analysis: omsi.analysis.analysis_base:

        :returns: The omsi_file_analysis object for the newly created analysis group. The analysis data is
                  automatically written to file by this function so no addition work is required.

        """
        from omsi.datastructures.analysis_data import analysis_data
        from omsi.dataformat.omsi_file.dependencies import omsi_file_dependencies
        from omsi.analysis.base import analysis_base

        # 1. Write the analysis name
        analysis_identifier_data = analysis_group.require_dataset(
            name=unicode(omsi_format_analysis.analysis_identifier),
            shape=(1, ),
            dtype=omsi_format_common.str_type)
        if omsi_format_common.str_type_unicode:
            analysis_identifier_data[0] = analysis.get_analysis_identifier()
        else:
            analysis_identifier_data[0] = str(
                analysis.get_analysis_identifier())

        # 2. Write the analysis type
        analysis_type_data = analysis_group.require_dataset(
            name=unicode(omsi_format_analysis.analysis_type),
            shape=(1, ),
            dtype=omsi_format_common.str_type)
        if omsi_format_common.str_type_unicode:
            analysis_type_data[0] = analysis.get_analysis_type()
        else:
            analysis_type_data[0] = str(analysis.get_analysis_type())

        # 3. Write the analysis data
        try:
            analysis.write_analysis_data(analysis_group=analysis_group)
        except NotImplementedError:
            for ana_data in analysis.get_all_analysis_data():
                cls.__write_omsi_analysis_data__(analysis_group, ana_data)

        # 4. Determine all dependencies and parameters that we need to write
        dependencies = []  # [dep['data'] for dep in analysis.get_all_dependency_data()]
        parameters = []
        # 4.1 Resolve in-memory dependencies if possible
        for dependent_parameter in analysis.get_all_dependency_data():
            # 4.1.1 We have an in-memory dependency
            if isinstance(dependent_parameter['data']['omsi_object'],
                          analysis_base):
                # 4.1.1.1 We can resolve the dependency to an object in an HDF5 file
                if dependent_parameter['data']['omsi_object'].has_omsi_analysis_storage():
                    # Create a new dependency that points to the appropriate file location
                    # NOTE: We do not modify the dependency in the analysis object that we save
                    #       but we only change it for the purpose of storage
                    new_dep = dependent_parameter['data'].copy()
                    new_dep_omsi_object = None
                    # Check if we can find an analysis data store within the same parent (or at least file)
                    parent_filename = os.path.abspath(
                        analysis_group.file.filename)
                    for analysis_store in dependent_parameter['data'][
                            'omsi_object'].get_omsi_analysis_storage():
                        analysis_store_filename = os.path.abspath(
                            analysis_store.managed_group.file.filename)
                        if analysis_store.name == analysis_group.parent.name and \
                                analysis_store_filename == parent_filename:
                            new_dep_omsi_object = analysis_store
                            break
                        elif analysis_store_filename == parent_filename:
                            new_dep_omsi_object = analysis_store

                    # We could not find a prior data store within the same file so use one from another file
                    if new_dep_omsi_object is None:
                        dep_object = dependent_parameter['data']['omsi_object']
                        new_dep['omsi_object'] = dep_object.get_omsi_analysis_storage()[0]
                    else:
                        new_dep['omsi_object'] = new_dep_omsi_object
                    # Append it to the list of dependencies
                    dependencies.append(new_dep)
                # 4.1.1.2 We cannot resolve the dependency and need to store it as a parameter instead
                else:
                    # Replace the dependency with the actual data and save it as a parameter instead
                    new_param = dependent_parameter.copy()
                    new_param['data'] = new_param['data'].get_data()
                    parameters.append(new_param)

            # 4.1.2 We have a file-based dependency, so keep it as is and add it to the list of dependencies
            else:
                dependencies.append(dependent_parameter['data'])

        # 4.2 Add all regular parameters to the list of parameters
        parameters += analysis.get_all_parameter_data(
            exclude_dependencies=True)

        # 5. Write all the parameters
        parameter_group = analysis_group.require_group(
            omsi_format_analysis.analysis_parameter_group)
        for param_data in parameters:
            if param_data['required'] or param_data.data_set() or \
                    param_data['default'] is not None:
                temp_data = param_data.get_data_or_default()
                if temp_data is not None:
                    anadata = analysis_data(
                        name=param_data['name'],
                        data=param_data.get_data_or_default(),
                        dtype=param_data['dtype'])
                    cls.__write_omsi_analysis_data__(parameter_group, anadata)
                    # Try to add the help string attribute
                    try:
                        help_attr = omsi_format_analysis.analysis_parameter_help_attr
                        parameter_group[param_data['name']].attrs[
                            help_attr] = param_data['help']
                    except KeyError:
                        pass

        # 6. Write all the runtime execution information
        runinfo_group = analysis_group.require_group(
            omsi_format_analysis.analysis_runinfo_group)
        for run_info_key, run_info_value in analysis.get_all_run_info().items():
            # Generate an analysis_data object in order to use the
            # __write_omsi_analysis_data__ function to write the data
            if isinstance(run_info_value, (unicode, str)):
                anadata = analysis_data(name=unicode(run_info_key),
                                        data=run_info_value,
                                        dtype=omsi_format_common.str_type)
            else:
                dat = np.asarray(run_info_value)
                if len(dat.shape) == 0:
                    dat = np.asarray([run_info_value])
                anadata = analysis_data(name=unicode(run_info_key),
                                        data=dat,
                                        dtype=dat.dtype)
            cls.__write_omsi_analysis_data__(runinfo_group, anadata)

        # 7. Write all dependencies
        omsi_file_dependencies.__create__(parent_group=analysis_group,
                                          dependencies_data_list=dependencies)

        # 8. Execute the custom data write for the analysis
        analysis.add_custom_data_to_omsi_file(analysis_group)

        # 9. Create the output object
        re = omsi_file_analysis(analysis_group)

        # 10. Save the output object in the list of omsi analysis data stores as part of the analysis object
        analysis.omsi_analysis_storage.append(re)

        # 11. Check if we need to pickle and save the analysis class in case this is a custom class that is not part of BASTet
        try:
            from omsi.analysis.analysis_views import analysis_views
            _ = analysis_views.analysis_name_to_class(
                analysis.get_analysis_type())
        except NameError:
            class_pickle = cloudpickle.dumps(analysis.__class__)
            # Convert the pickle string to an uint8 array to avoid problems
            # with storing string with NULL characters in HDF5
            class_pickle_arr = np.fromstring(
                class_pickle,
                dtype=omsi_format_analysis.analysis_class_pickle_np_dtype)
            analysis_group[unicode(omsi_format_analysis.analysis_class)] = class_pickle_arr
        except:
            log_helper.warning(__name__, "Could not save the analysis class.")

        # 12. Return the new omsi_file_analysis object
        return re
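
Step 11 above works around the fact that raw pickle bytes can contain NULL characters, which plain HDF5 string datasets do not tolerate, by storing the pickled class as a uint8 array instead. Below is a self-contained sketch of that round trip with h5py and cloudpickle; the class, file, and dataset names are illustrative placeholders, and np.frombuffer/tobytes are used here as the modern equivalents of the np.fromstring call in the snippet:

    import pickle

    import cloudpickle
    import h5py
    import numpy as np


    class MyCustomAnalysis(object):
        """Stand-in for a user-defined analysis class that is not part of BASTet."""

        def run(self):
            return 42


    # Serialize the class object itself; raw pickle bytes may contain NULL characters,
    # so view them as a uint8 array before storing them in HDF5.
    class_pickle = cloudpickle.dumps(MyCustomAnalysis)
    class_pickle_arr = np.frombuffer(class_pickle, dtype=np.uint8)

    with h5py.File('analysis_class_demo.h5', 'w') as hdf_file:
        hdf_file['analysis_class'] = class_pickle_arr

    # Restore the class later by converting the array back to bytes and unpickling.
    with h5py.File('analysis_class_demo.h5', 'r') as hdf_file:
        restored_class = pickle.loads(hdf_file['analysis_class'][()].tobytes())

    print(restored_class().run())   # -> 42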
Example #34
0
    @classmethod
    def __compute_file_info(cls, filename, resolution):
        ## TODO: Completely refactor this to smartly handle profile or centroid datasets
        ## TODO: Centroid datasets should take in a user parameter "resolution" and resample the data at that resolution
        ## TODO: Profile datasets should work as is
        ## TODO: Check for profile vs. centroid data based on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the pixel coordinates, the m/z axis, the data type of the
        intensities, the imzML file type, and the dataset, instrument, and method metadata.

        :param filename: Name of the imzML file to inspect
        :param resolution: m/z resolution (in ppm) used to resample processed-mode data

        :return: Numpy array of pixel coordinates
        :return: Numpy array with the m/z axis
        :return: String with the numpy data type of the intensity values
        :return: The imzML file type ('continuous' or 'processed')
        :return: metadata_dict with the dataset metadata
        :return: metadata_dict with the instrument metadata
        :return: metadata_dict with the method metadata
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)   # NOTE: mz_axes is a tuple
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)

        # #Start the data at [0,0,0]
        # coordinates[:,0] = coordinates[:,0] - np.amin(coordinates,axis=0)[0]
        # coordinates[:,1] = coordinates[:,1] - np.amin(coordinates,axis=0)[1]
        # coordinates[:,2] = coordinates[:,2] - np.amin(coordinates,axis=0)[2]

        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
        for ind in range(coordinates.shape[0]):      #for ind, loc in enumerate(reader.coordinates):
            mz, intens = reader.getspectrum(ind)
            if mz != mz_axes:
                file_type = cls.available_imzml_types['processed']
                if min_mz > np.amin(mz):
                    min_mz = np.amin(mz)
                if max_mz < np.amax(mz):
                    max_mz = np.amax(mz)
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
            # Number of log-spaced bins needed for a constant spacing of 'resolution' ppm
            num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)
            log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        for k, v in reader.imzmldict.iteritems():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser; the remaining metadata is read directly from the XML header below
        del reader

        # Parse the metadata for the file. We parse only the header and ignore the
        # <run> group in the XML file to avoid going through the whole file again
        # while still extracting the majority of the relevant metadata
        try:
            with open(filename, 'r') as ins:
                metdata_header = ''
                for line in ins:
                    if '<run' in line:
                        break
                    else:
                        metdata_header += line
                metdata_header += '</mzML>'
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.iteritems():
                    store_value = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=str(k) + " extracted from imzML XML header.",
                                                 ontology=None)
                    if k == 'instrumentConfigurationList':
                        instrument_metadata[k] = store_value
                    elif k == 'dataProcessingList':
                        method_metadata[k] = store_value
                    elif k == 'scanSettingsList':
                        dataset_metadata[k] = store_value
                    elif k == 'softwareList':
                        method_metadata[k] = store_value
                    elif k == 'sampleList':
                        method_metadata[k] = store_value
                    else:
                        dataset_metadata[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                               value=metdata_header,
                                                                               unit=None,
                                                                               description='XML imzML header',
                                                                               ontology=None)
        except:
            log_helper.warning(__name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
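
The processed-mode branch above rebuilds a common m/z axis with bins at a constant relative (ppm-style) spacing, which is why the bin count is computed as 1e6 * ln(max_mz / min_mz) / resolution. A small numpy sketch of that axis construction, using arbitrary example values:

    import numpy as np

    # Arbitrary example values for a 'processed' mode imzML file.
    min_mz, max_mz = 100.0, 1000.0
    resolution = 5.0   # desired relative bin spacing in ppm

    # With a constant relative spacing of 'resolution' ppm, covering [min_mz, max_mz]
    # requires about 1e6 * ln(max_mz / min_mz) / resolution bins.
    num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))

    # Log-spaced axis: the ratio between neighbouring bin centers is constant.
    mz_axis = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)

    relative_spacing_ppm = 1e6 * np.diff(mz_axis) / mz_axis[:-1]
    print(num_bins)                    # 460518 bins for these example values
    print(relative_spacing_ppm[:3])    # each close to the requested 5 ppm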