def calculate_metric_or_summary(annotated_variant_data: AnnotatedVariantData,
                                evaluation_data: EvaluationData,
                                report: Union[Type[PerformanceMetric], Type[PerformanceSummary]]) -> Dict[Plugin, Any]:
    """ Calculate a metric or a summary for all plugins in the annotated variant data.

    Parameters
    ----------
    annotated_variant_data : AnnotatedVariantData
        The annotated variant data
    evaluation_data : EvaluationData
        The evaluation data
    report : Union[Type[PerformanceMetric], Type[PerformanceSummary]]
        The performance summary or metric that should be calculated

    Returns
    -------
    Dict[Plugin, Any]
        A dictionary where the keys are the plugins and the values are the results of the calculations
    """
    log.debug(f"Calculate {report.name()}")
    rv = {}
    for score in annotated_variant_data.scores:
        rv[score.plugin] = report.calculate(score, evaluation_data.interpreted_classes)
    return rv
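
# Example (a minimal usage sketch, not part of the API): computing a single metric for
# every plugin. `Sensitivity` and its import path are assumptions of this example and
# may be named differently in your installation.
def example_sensitivity_per_plugin(annotated_variants: AnnotatedVariantData,
                                   evaluation_data: EvaluationData) -> Dict[Plugin, Any]:
    from vpmbench.metrics import Sensitivity  # assumed import path
    return calculate_metric_or_summary(annotated_variants, evaluation_data, Sensitivity)
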
def run(self, variant_information_table: DataFrame) -> DataFrame:
    """ Run the plugin on the `variant_information_table`.

    Before running the plugin, the :meth:`compatibility <vpmbench.plugin.Plugin.is_compatible_with_data>` of the
    data with the plugin is tested. Next, the :meth:`~vpmbench.plugin.EntryPoint.run` method of the entry_point is
    called with the `variant_information_table`. The result of the entry_point is
    :meth:`validated <vpmbench.plugin.Plugin._validate_score_table>` to ensure that each variant from the
    `variant_information_table` was assigned a valid score. Finally, the score column is renamed using
    :meth:`~vpmbench.plugin.Plugin.score_column_name`.

    The resulting DataFrame consists of two columns:

        * UID: The UID of the variants
        * :meth:`~vpmbench.plugin.Plugin.score_column_name`: The scores from the prioritization method

    Parameters
    ----------
    variant_information_table : DataFrame
        The variant information table

    Returns
    -------
    DataFrame
        The plugin result.
    """
    self.is_compatible_with_data(variant_information_table)
    log.debug(f"Invoke method: {self.name}")
    score_table = self.entry_point.run(variant_information_table)
    log.debug(f"Finish method: {self.name}")
    self._validate_score_table(variant_information_table, score_table)
    return score_table.rename(columns={"SCORE": self.score_column_name})
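
# Example (sketch): running a single plugin directly on the variant information table of
# previously extracted evaluation data. The import path is an assumption of this example,
# and at least one plugin must be available in the default plugin directory.
def example_run_first_plugin(evaluation_data: EvaluationData) -> DataFrame:
    from vpmbench.api import load_plugins, DEFAULT_PLUGIN_PATH  # assumed import path
    first_plugin = load_plugins(DEFAULT_PLUGIN_PATH)[0]
    return first_plugin.run(evaluation_data.variant_data)
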
def invoke_methods(plugins: List[Plugin], variant_data: DataFrame, cpu_count: int = -1) -> AnnotatedVariantData:
    """ Invoke multiple prioritization methods given as a list of `plugins` on the `variant_data` in parallel.

    Calls :func:`vpmbench.api.invoke_method` for each plugin in `plugins` on the `variant_data`. The compatibility
    of the `plugins` with the `variant_data` is checked via
    :meth:`Plugin.is_compatible_with_data <vpmbench.plugin.Plugin.is_compatible_with_data>`. If `cpu_count` is -1,
    (number of cpus - 1) processes are used to run the plugins in parallel; set it to 1 to disable parallel
    execution. The resulting annotated variant data is constructed by collecting the outputs of the plugins and
    using them as input for
    :meth:`AnnotatedVariantData.from_results <vpmbench.data.AnnotatedVariantData.from_results>`.

    Parameters
    ----------
    plugins : List[Plugin]
        A list of plugins that should be invoked
    variant_data : pandas.DataFrame
        The variant data which should be processed by the plugins
    cpu_count : int
        The number of cpus that should be used to invoke the plugins in parallel

    Returns
    -------
    AnnotatedVariantData
        The variant data annotated with the scores from the prioritization methods
    """
    # Check the compatibility of every plugin before spawning worker processes.
    for plugin in plugins:
        plugin.is_compatible_with_data(variant_data)
    if cpu_count == -1:
        cpu_count = mp.cpu_count() - 1
    log.info("Invoke methods")
    log.debug(f"#CPUs: {cpu_count}")
    with mp.Pool(cpu_count) as pool:
        jobs = [pool.apply_async(invoke_method, args=(plugin, variant_data)) for plugin in plugins]
        plugin_results = [job.get() for job in jobs]
    return AnnotatedVariantData.from_results(variant_data, plugin_results)
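
# Example (sketch): invoking all loaded plugins on previously extracted evaluation data
# with parallel execution disabled.
def example_invoke_all_plugins(evaluation_data: EvaluationData) -> AnnotatedVariantData:
    plugins = load_plugins(DEFAULT_PLUGIN_PATH)
    return invoke_methods(plugins, evaluation_data.variant_data, cpu_count=1)
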
def extract_evaluation_data(evaluation_data_path: Union[str, Path],
                            extractor: Type[Extractor] = ClinVarVCFExtractor) -> EvaluationData:
    """ Extract the EvaluationData from the evaluation input data.

    Parses the evaluation input data given by the `evaluation_data_path` using the `extractor`.

    Parameters
    ----------
    evaluation_data_path : Union[str, Path]
        The path to the evaluation input data
    extractor : Type[Extractor]
        The extractor that should be used to parse the evaluation input data

    Returns
    -------
    EvaluationData
        The evaluation data extracted from the file at `evaluation_data_path` using the `extractor`
    """
    log.info(f"Extract data from {evaluation_data_path}")
    log.debug(f"Used extractor: {extractor}")
    return extractor.extract(evaluation_data_path)
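
# Example (sketch): extracting evaluation data from a ClinVar VCF file with the default
# extractor. The file name is a placeholder.
def example_extract_clinvar() -> EvaluationData:
    return extract_evaluation_data("clinvar_variants.vcf", extractor=ClinVarVCFExtractor)
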
def extract(cls, file_path: Union[str, Path]) -> EvaluationData:
    """ Extract the :class:`~vpmbench.data.EvaluationData` from the file at `file_path`.

    This function calls :meth:`~vpmbench.extractor.Extractor._extract` and uses
    :meth:`vpmbench.data.EvaluationData.validate` to check if the evaluation data is valid.

    Parameters
    ----------
    file_path : Union[str, Path]
        The file path to the evaluation input data

    Returns
    -------
    EvaluationData
        The validated evaluation data

    Raises
    ------
    RuntimeError
        If the file can not be parsed
    SchemaErrors
        If the validation of the extracted data fails
    """
    try:
        table = cls._extract(file_path)
    except Exception as error:
        raise RuntimeError(
            f"Can't parse data at '{file_path}' with '{cls.__name__}'.\n"
            f"Maybe the data does not exist or is not compatible with the extractor.\n"
            f"If the data exists, use an absolute path.") from error
    log.debug("Extracted Data:")
    log.debug(table.variant_data.head(10))
    table.validate()
    return table
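
# Example (sketch): calling an extractor directly and handling a parsing failure.
# The file name is a placeholder.
def example_direct_extract():
    try:
        return ClinVarVCFExtractor.extract("clinvar_variants.vcf")
    except RuntimeError as error:
        log.error(f"Extraction failed: {error}")
        return None
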
def run_pipeline(with_data: Union[str, Path],
                 reporting: List[Union[Type[PerformanceMetric], Type[PerformanceSummary]]],
                 using: Optional[Callable[[Plugin], bool]] = None,
                 extractor: Type[Extractor] = ClinVarVCFExtractor,
                 plugin_path: Union[str, Path] = DEFAULT_PLUGIN_PATH,
                 cpu_count: int = -1) -> PerformanceReport:
    """ Run the complete benchmark pipeline and return the resulting performance report.

    Extracts the evaluation data from `with_data` using the `extractor`, loads the plugins from `plugin_path`
    filtered by `using`, invokes the plugins on the variant data with `cpu_count` processes, and calculates the
    metrics and summaries given in `reporting`.
    """
    log.info("Run pipeline")
    log.debug(f'Starting time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
    evaluation_data: EvaluationData = extract_evaluation_data(with_data, extractor)
    plugins: List[Plugin] = load_plugins(plugin_path, using)
    if len(plugins) == 0:
        raise RuntimeError(f"Can't find plugins in {plugin_path}")
    annotated_variants: AnnotatedVariantData = invoke_methods(plugins, evaluation_data.variant_data, cpu_count)
    reports = calculate_metrics_and_summaries(annotated_variants, evaluation_data, reporting)
    log.info("Stop pipeline")
    log.debug(f'Finishing time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
    return PerformanceReport(evaluation_data, annotated_variants, reports)
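
# Example (sketch): running the whole pipeline on a ClinVar VCF file with parallel
# execution disabled. The metric classes, their import path, and the file name are
# assumptions of this example and may differ in your installation.
def example_full_pipeline() -> PerformanceReport:
    from vpmbench.metrics import Sensitivity, Specificity  # assumed import path
    return run_pipeline("clinvar_variants.vcf",
                        reporting=[Sensitivity, Specificity],
                        cpu_count=1)
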
def load_plugins(plugin_path: Union[str, Path],
                 plugin_selection: Optional[Callable[[Plugin], bool]] = None) -> List[Plugin]:
    """ Load all plugins from the `plugin_path` and apply the `plugin_selection` to filter them.

    If `plugin_selection` is `None`, all plugins found in the `plugin_path` are returned.

    Parameters
    ----------
    plugin_path : Union[str, Path]
        The path to your plugin directory
    plugin_selection : Optional[Callable[[Plugin], bool]]
        The selection function that should be applied to filter the plugins

    Returns
    -------
    List[Plugin]
        The list of plugins loaded from the `plugin_path`
    """
    log.info(f"Load plugins from {plugin_path}")
    plugin_path = Path(plugin_path).resolve().absolute()
    log.debug(f"Absolute plugin path: {plugin_path}")
    found_plugins = [load_plugin(manifest) for manifest in plugin_path.glob("*/**/manifest.yaml")]
    log.debug(f"Found {len(found_plugins)} plugins: {[plugin.name for plugin in found_plugins]}")
    if plugin_selection is not None:
        filtered_plugins = list(filter(plugin_selection, found_plugins))
        log.debug(f"Returning {len(filtered_plugins)} filtered plugins: {[plugin.name for plugin in filtered_plugins]}")
        return filtered_plugins
    log.debug(f"Returning {len(found_plugins)} plugins: {[plugin.name for plugin in found_plugins]}")
    return found_plugins
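
# Example (sketch): loading only the plugins whose name contains "CADD". The plugin
# directory and the plugin name are placeholders.
def example_load_selected_plugins() -> List[Plugin]:
    return load_plugins("./plugins", plugin_selection=lambda plugin: "CADD" in plugin.name)
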