# Chain-based version of the GP composer (earlier FEDOT API; Chain was later renamed to Pipeline, see the variant below).
class GPComposer(Composer):
    """ Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of the found solution
    :param composer_requirements: requirements for the composition process
    :param initial_chain: defines the initial state of the population;
        if None, the initial population is generated randomly
    """

    def __init__(self, optimiser=None,
                 composer_requirements: Optional[GPComposerRequirements] = None,
                 metrics: Union[List[MetricsEnum], MetricsEnum] = None,
                 initial_chain: Optional[Chain] = None,
                 logger: Log = None):
        super().__init__(metrics=metrics, composer_requirements=composer_requirements,
                         initial_chain=initial_chain)

        self.cache = OperationsCache()
        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False
        self.log = logger if logger else default_log(__name__)

    def compose_chain(self, data: InputData,
                      is_visualise: bool = False,
                      is_tune: bool = False,
                      on_next_iteration_callback: Optional[Callable] = None) -> Union[Chain, List[Chain]]:
        """ Function for searching the optimal chain structure

        :param data: InputData for chain composing
        :param is_visualise: flag defining whether to visualise the composition process
        :param is_tune: flag defining whether to tune the chain after composing
            TODO integrate new tuner
        :param on_next_iteration_callback: TODO add description
        :return best_chain: result of composing: a single chain for single-objective
            optimisation; for the multi-objective case, a list of chains ordered by
            descending primary metric (the first is the best)
        """
        if self.composer_requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        if not self.optimiser:
            raise AttributeError('Optimiser for chain composition is not defined')

        train_data, test_data = train_test_data_setup(
            data, sample_split_ration_for_tasks[data.task.task_type])

        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache = OperationsCache(self.cache_path, clear_exiting=not self.use_existing_cache)

        metric_function_for_nodes = partial(self.composer_metric, self.metrics, train_data, test_data)

        best_chain = self.optimiser.optimise(metric_function_for_nodes,
                                             on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            self.tune_chain(best_chain, data, self.composer_requirements.max_lead_time)
        return best_chain

    def composer_metric(self, metrics,
                        train_data: InputData, test_data: InputData,
                        chain: Chain) -> Optional[Tuple[Any, ...]]:
        try:
            validate(chain)
            chain.log = self.log

            if type(metrics) is not list:
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                chain.fit_from_cache(self.cache)

            if not chain.is_fitted:
                self.log.debug(f'Chain {chain.root_node.descriptive_id} fit started')
                chain.fit(input_data=train_data,
                          time_constraint=self.composer_requirements.max_chain_fit_time)
                self.cache.save_chain(chain)

            evaluated_metrics = ()
            for metric in metrics:
                # A metric can be a MetricsEnum member or an arbitrary callable
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(chain, reference_data=test_data),)

            self.log.debug(f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}')
        except Exception as ex:
            self.log.info(f'Chain assessment warning: {ex}. Continue.')
            evaluated_metrics = None
        return evaluated_metrics

    @staticmethod
    def tune_chain(chain: Chain, data: InputData, time_limit):
        raise NotImplementedError()

    @property
    def history(self):
        return self.optimiser.history
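
# --- Usage sketch for the Chain-based composer above ---
# A minimal, illustrative example only: the GPComposerBuilder API, module
# paths, metric enum and operation names ('logit', 'xgboost', 'knn') are
# assumptions based on FEDOT examples of the Chain era and may differ
# between revisions.
from fedot.core.composer.gp_composer.gp_composer import GPComposerBuilder, GPComposerRequirements
from fedot.core.data.data import InputData
from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

task = Task(TaskTypesEnum.classification)
data_to_compose = InputData.from_csv('train.csv', task=task)  # hypothetical dataset path

requirements = GPComposerRequirements(
    primary=['logit', 'xgboost'],  # candidate operations for primary (input) nodes
    secondary=['logit', 'knn'],    # candidate operations for secondary nodes
    num_of_generations=20)

composer = GPComposerBuilder(task=task) \
    .with_requirements(requirements) \
    .with_metrics(ClassificationMetricsEnum.ROCAUC) \
    .build()

# Returns a single chain for single-objective optimisation,
# or a list of chains (best first) for the multi-objective case.
best_chain = composer.compose_chain(data=data_to_compose)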

# Pipeline-based version of the GP composer (later FEDOT API, superseding the Chain-based variant above).
class GPComposer(Composer):
    """ Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of the found solution
    :param composer_requirements: requirements for the composition process
    :param initial_pipeline: defines the initial state of the population;
        if None, the initial population is generated randomly
    """

    def __init__(self, optimiser=None,
                 composer_requirements: Optional[GPComposerRequirements] = None,
                 metrics: Union[List[MetricsEnum], MetricsEnum] = None,
                 initial_pipeline: Optional[Pipeline] = None,
                 logger: Log = None):
        super().__init__(metrics=metrics, composer_requirements=composer_requirements,
                         initial_pipeline=initial_pipeline)

        self.cache = OperationsCache()
        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False
        self.log = logger if logger else default_log(__name__)

    def compose_pipeline(self, data: Union[InputData, MultiModalData],
                         is_visualise: bool = False,
                         is_tune: bool = False,
                         on_next_iteration_callback: Optional[Callable] = None) -> Union[Pipeline, List[Pipeline]]:
        """ Function for searching the optimal pipeline structure

        :param data: InputData for pipeline composing
        :param is_visualise: flag defining whether to visualise the composition process
        :param is_tune: flag defining whether to tune the pipeline after composing
            TODO integrate new tuner
        :param on_next_iteration_callback: TODO add description
        :return best_pipeline: result of composing: a single pipeline for single-objective
            optimisation; for the multi-objective case, a list of pipelines ordered by
            descending primary metric (the first is the best)
        """
        # Check the optimiser before dereferencing it below
        if not self.optimiser:
            raise AttributeError('Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(data)
        else:
            self.log.info('Hold-out validation for graph composing was applied.')
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric, self.metrics, train_data, test_data)

        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(objective_function_for_pipeline,
                                                on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()

        if is_tune:
            self.tune_pipeline(best_pipeline, data, self.composer_requirements.timeout)
        return best_pipeline

    def _cv_validation_metric_build(self, data):
        """ Prepare the metric-evaluation function according to the task type """
        if isinstance(data, MultiModalData):
            raise NotImplementedError('Cross-validation is not supported for multi-modal data')

        task_type = data.task.task_type
        if task_type is TaskTypesEnum.ts_forecasting:
            # Perform time series cross-validation
            self.log.info('Time series cross validation for pipeline composing was applied.')
            if self.composer_requirements.validation_blocks is None:
                self.log.info('For ts cross validation, validation_blocks number was changed from None to 3 blocks')
                self.composer_requirements.validation_blocks = 3
            metric_function_for_nodes = partial(ts_metric_calculation, data,
                                                self.composer_requirements.cv_folds,
                                                self.composer_requirements.validation_blocks,
                                                self.metrics, log=self.log)
        else:
            self.log.info('KFolds cross validation for pipeline composing was applied.')
            metric_function_for_nodes = partial(table_metric_calculation, data,
                                                self.composer_requirements.cv_folds,
                                                self.metrics, log=self.log)

        return metric_function_for_nodes

    def composer_metric(self, metrics,
                        train_data: Union[InputData, MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any, ...]]:
        try:
            validate(pipeline)
            pipeline.log = self.log

            if type(metrics) is not list:
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(f'Pipeline {pipeline.root_node.descriptive_id} fit started')
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.max_pipeline_fit_time)
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                # A metric can be a MetricsEnum member or an arbitrary callable
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(pipeline, reference_data=test_data),)

            self.log.debug(f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}')

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None
        return evaluated_metrics

    @staticmethod
    def tune_pipeline(pipeline: Pipeline, data: InputData, time_limit):
        raise NotImplementedError()

    @property
    def history(self):
        return self.optimiser.history
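
# --- Usage sketch for the Pipeline-based composer above ---
# Illustrative only: composer_metric accepts any callable as a metric (see
# its `callable(metric)` branch), so a custom objective with the
# (pipeline, reference_data) signature can be passed instead of a
# MetricsEnum member. The builder API, module paths, operation names and
# the assumption that .predict holds binary class probabilities may differ
# between FEDOT revisions; FEDOT metrics are minimised, hence the negated
# F1 score.
from fedot.core.composer.gp_composer.gp_composer import GPComposerBuilder, GPComposerRequirements
from fedot.core.data.data import InputData
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.repository.tasks import Task, TaskTypesEnum
from sklearn.metrics import f1_score


def negative_f1(pipeline: Pipeline, reference_data: InputData) -> float:
    # Custom metric with the signature expected by composer_metric
    results = pipeline.predict(reference_data)
    predicted_labels = (results.predict > 0.5).astype(int)  # assumed probability output
    return -f1_score(reference_data.target, predicted_labels)


task = Task(TaskTypesEnum.classification)
data = InputData.from_csv('train.csv', task=task)  # hypothetical dataset path

requirements = GPComposerRequirements(
    primary=['logit', 'rf'], secondary=['logit', 'knn'],
    num_of_generations=20)
# Setting cv_folds in the requirements (e.g. cv_folds=5) would switch
# compose_pipeline from this hold-out objective to _cv_validation_metric_build.

composer = GPComposerBuilder(task=task) \
    .with_requirements(requirements) \
    .with_metrics(negative_f1) \
    .build()

best_pipeline = composer.compose_pipeline(data=data)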