示例#1
0
class GPComposer(Composer):
    """
    Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of found solution.
    :param composer_requirements: requirements for composition process
    :param initial_chain: defines the initial state of the population. If None then initial population is random.
    :param logger: Log object used for messages; when it is not passed (or is falsy)
        a default logger named after this module is created.
    """
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_chain: Optional[Chain] = None,
            logger: Log = None):

        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_chain=initial_chain)

        self.cache = OperationsCache()

        self.optimiser = optimiser
        # When cache_path stays None, compose_chain() clears and reuses the
        # cache created above; otherwise it builds a cache at that path.
        self.cache_path = None
        # Passed (negated) as ``clear_exiting`` when the cache is rebuilt
        # from cache_path in compose_chain().
        self.use_existing_cache = False

        self.log = logger if logger else default_log(__name__)

    def compose_chain(
        self,
        data: InputData,
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Chain, List[Chain]]:
        """ Function for optimal chain structure searching

        The data is split into train and test parts, the cache is reset and
        the optimiser is run with ``composer_metric`` as the objective.

        :param data: InputData for chain composing
        :param is_visualise: is it needed to visualise (not used inside this method)
        :param is_tune: is it needed to tune chain after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable forwarded unchanged to ``optimiser.optimise``

        :return best_chain: obtained result after composing: one chain for single-objective optimization;
            For the multi-objective case, the list of the chain is returned.
            In the list, the chains are ordered by the descending of primary metric (the first is the best)

        :raises AttributeError: if no optimiser has been set
        :raises NotImplementedError: if ``is_tune`` is True, because ``tune_chain`` is not implemented
        """

        # Fail fast: validate the configuration before any global state
        # (the multiprocessing start method) is touched.
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for chain composition is not defined')

        if self.composer_requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        split_ratio = sample_split_ration_for_tasks[data.task.task_type]
        train_data, test_data = train_test_data_setup(data, split_ratio)

        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        # Bind everything except the chain, so the optimiser can evaluate a
        # candidate by calling the objective with the chain only.
        metric_function_for_nodes = partial(self.composer_metric, self.metrics,
                                            train_data, test_data)

        best_chain = self.optimiser.optimise(
            metric_function_for_nodes,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            self.tune_chain(best_chain, data,
                            self.composer_requirements.max_lead_time)
        return best_chain

    def composer_metric(self, metrics, train_data: InputData,
                        test_data: InputData,
                        chain: Chain) -> Optional[Tuple[Any]]:
        """ Fit the chain (using the cache when possible) and evaluate it

        :param metrics: a single metric or a list of metrics; each item is either
            a callable or an identifier resolved through ``MetricsRepository``
        :param train_data: data the chain is fitted on
        :param test_data: data passed to every metric as ``reference_data``
        :param chain: chain to be assessed

        :return: tuple with one value per metric (in the order of ``metrics``),
            or None if validation, fitting or metric evaluation raised an exception
        """
        try:
            validate(chain)
            chain.log = self.log

            if not isinstance(metrics, list):
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                chain.fit_from_cache(self.cache)

            if not chain.is_fitted:
                self.log.debug(
                    f'Chain {chain.root_node.descriptive_id} fit started')
                chain.fit(input_data=train_data,
                          time_constraint=self.composer_requirements.
                          max_chain_fit_time)
                # Caching is best-effort: a missing cache or a failed save
                # must not discard the result of a successful fit.
                if self.cache is not None:
                    try:
                        self.cache.save_chain(chain)
                    except Exception as ex:
                        self.log.info(
                            f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                metric_func = (metric if callable(metric) else
                               MetricsRepository().metric_by_id(metric))
                evaluated_metrics += (metric_func(chain,
                                                  reference_data=test_data), )

            self.log.debug(
                f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

        except Exception as ex:
            # Deliberately broad: a candidate that cannot be assessed is
            # reported and returned as None instead of aborting the caller.
            self.log.info(f'Chain assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_chain(chain: Chain, data: InputData, time_limit):
        """ Placeholder for chain tuning; always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser """
        return self.optimiser.history
示例#2
0
class GPComposer(Composer):
    """
    Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of found solution.
    :param composer_requirements: requirements for composition process
    :param initial_pipeline: defines the initial state of the population. If None then initial population is random.
    :param logger: Log object used for messages; when it is not passed (or is falsy)
        a default logger named after this module is created.
    """
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_pipeline: Optional[Pipeline] = None,
            logger: Log = None):

        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_pipeline=initial_pipeline)

        self.cache = OperationsCache()

        self.optimiser = optimiser
        # When cache_path stays None, compose_pipeline() clears and reuses the
        # cache created above; otherwise it builds a cache at that path.
        self.cache_path = None
        # Passed (negated) as ``clear_exiting`` when the cache is rebuilt
        # from cache_path in compose_pipeline().
        self.use_existing_cache = False

        self.log = logger if logger else default_log(__name__)

    def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching

        Depending on ``composer_requirements.cv_folds`` the objective is either
        a cross-validation metric or a hold-out metric built on a train/test split.

        :param data: InputData for pipeline composing
        :param is_visualise: is it needed to visualise (not used inside this method)
        :param is_tune: is it needed to tune pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable forwarded unchanged to ``optimiser.optimise``
        :return best_pipeline: obtained result after composing: one pipeline for single-objective optimization;
            For the multi-objective case, the list of the graph is returned.
            In the list, the pipelines are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if no optimiser has been set
        :raises NotImplementedError: if ``is_tune`` is True (``tune_pipeline`` is not implemented),
            or if cross-validation is requested for multi-modal data
        """

        # The optimiser must be validated before its attributes are touched;
        # otherwise a missing optimiser surfaces as an opaque NoneType error.
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(
                data)
        else:
            self.log.info(
                "Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            # Bind everything except the pipeline, so the optimiser can
            # evaluate a candidate by passing the pipeline only.
            objective_function_for_pipeline = partial(self.composer_metric,
                                                      self.metrics, train_data,
                                                      test_data)

        if self.cache_path is None:
            self.cache.clear()
        else:
            # NOTE(review): tmp_only=True presumably removes only temporary
            # data of the cache being replaced - confirm in OperationsCache.
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()
        if is_tune:
            self.tune_pipeline(best_pipeline, data,
                               self.composer_requirements.timeout)
        return best_pipeline

    def _cv_validation_metric_build(self, data):
        """ Prepare function for metric evaluation based on task

        For time series forecasting ``ts_metric_calculation`` is used; if
        ``composer_requirements.validation_blocks`` is None it is set to 3
        (the requirements object is modified in place). For any other task
        ``table_metric_calculation`` is used.

        :param data: data for cross-validation; MultiModalData is rejected
        :return: partial of the chosen metric function with data, fold
            settings, metrics and logger already bound
        :raises NotImplementedError: if ``data`` is MultiModalData
        """
        if isinstance(data, MultiModalData):
            raise NotImplementedError(
                'Cross-validation is not supported for multi-modal data')

        requirements = self.composer_requirements
        if data.task.task_type is TaskTypesEnum.ts_forecasting:
            # Perform time series cross validation
            self.log.info(
                "Time series cross validation for pipeline composing was applied."
            )
            if requirements.validation_blocks is None:
                self.log.info(
                    'For ts cross validation validation_blocks number was changed from None to 3 blocks'
                )
                requirements.validation_blocks = 3
            return partial(ts_metric_calculation,
                           data,
                           requirements.cv_folds,
                           requirements.validation_blocks,
                           self.metrics,
                           log=self.log)

        self.log.info(
            "KFolds cross validation for pipeline composing was applied.")
        return partial(table_metric_calculation,
                       data,
                       requirements.cv_folds,
                       self.metrics,
                       log=self.log)

    def composer_metric(self, metrics, train_data: Union[InputData,
                                                         MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any]]:
        """ Fit the pipeline (using the cache when possible) and evaluate it

        After a successful evaluation the pipeline is unfitted and garbage
        collection is triggered to release memory.

        :param metrics: a single metric or a list of metrics; each item is either
            a callable or an identifier resolved through ``MetricsRepository``
        :param train_data: data the pipeline is fitted on
        :param test_data: data passed to every metric as ``reference_data``
        :param pipeline: pipeline to be assessed

        :return: tuple with one value per metric (in the order of ``metrics``),
            or None if any step except cache saving raised an exception
        """
        try:
            validate(pipeline)
            pipeline.log = self.log

            if not isinstance(metrics, list):
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(
                    f'Pipeline {pipeline.root_node.descriptive_id} fit started'
                )
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.
                             max_pipeline_fit_time)
                # Caching is best-effort: a failed save is only logged so the
                # fitted pipeline can still be evaluated.
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                metric_func = (metric if callable(metric) else
                               MetricsRepository().metric_by_id(metric))
                evaluated_metrics += (metric_func(pipeline,
                                                  reference_data=test_data), )

            self.log.debug(
                f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            # Deliberately broad: a candidate that cannot be assessed is
            # reported and returned as None instead of aborting the caller.
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_pipeline(pipeline: Pipeline, data: InputData, time_limit):
        """ Placeholder for pipeline tuning; always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser """
        return self.optimiser.history