示例#1
0
def test_cache_historical_state_using(data_setup):
    """Check that the cache still serves a pipeline state that was saved earlier.

    A child of the root is swapped for a new node, the pipeline is refitted and saved,
    and then the original child is put back; the test asserts what ``cache.get`` returns
    for the root node after each of these steps.
    """
    train_data, _ = data_setup
    cache = OperationsCache()
    pipeline = pipeline_first()

    # initial fit: the fitted state is stored in the cache
    pipeline.fit(input_data=train_data)
    cache.save_pipeline(pipeline)

    replacement_child = SecondaryNode(operation_type='logit')
    original_child = pipeline.root_node.nodes_from[0]

    # swap the first child of the root for the freshly created node
    pipeline.update_node(old_node=original_child, new_node=replacement_child)
    # nothing has been saved for the modified structure yet
    root_entry = cache.get(pipeline.root_node)
    assert not root_entry

    # refit the modified pipeline and store it as well
    pipeline.fit(input_data=train_data)
    cache.save_pipeline(pipeline)
    # now the modified structure is present in the cache
    root_entry = cache.get(pipeline.root_node)
    assert root_entry

    # restore the original child
    current_child = pipeline.root_node.nodes_from[0]
    pipeline.update_node(old_node=current_child, new_node=original_child)
    # no refit is needed here: the state saved after the very first fit
    # is expected to be found in the cache again
    root_entry = cache.get(pipeline.root_node)
    assert root_entry
示例#2
0
def test_cache_actuality_after_subtree_change_to_identical(data_setup):
    """Nodes outside the change keep their cache after a subtree is replaced by a pre-fitted subtree."""
    cache = OperationsCache()
    train_data, _ = data_setup
    base_pipeline = pipeline_first()
    donor_pipeline = pipeline_second()

    # fit and save the pipeline under test
    base_pipeline.fit(input_data=train_data)
    cache.save_pipeline(base_pipeline)

    # fit the donor pipeline, but save only the subtree that will be transplanted
    donor_pipeline.fit(input_data=train_data)
    donor_subtree = donor_pipeline.root_node.nodes_from[0]
    cache.save_pipeline(Pipeline(donor_subtree))

    # replace the first child subtree of the root with the donor subtree
    base_pipeline.update_subtree(base_pipeline.root_node.nodes_from[0], donor_subtree)

    affected_nodes = [base_pipeline.root_node]
    unaffected_nodes = [node for node in base_pipeline.nodes if node not in affected_nodes]

    # untouched nodes of the initial pipeline and the fitted nodes of the new subtree are found in the cache
    assert all(cache.get(node) is not None for node in unaffected_nodes)
    # the root sits above the replaced subtree, so there is no cache entry for it
    assert cache.get(base_pipeline.root_node) is None
示例#3
0
def test_cache_actuality_after_primary_node_changed_to_subtree(data_setup):
    """Nodes outside the change keep their cache after a primary node is replaced by a pre-fitted subtree."""
    cache = OperationsCache()
    train_data, _ = data_setup
    base_pipeline = pipeline_first()
    donor_pipeline = pipeline_second()

    # fit and save the pipeline under test
    base_pipeline.fit(input_data=train_data)
    cache.save_pipeline(base_pipeline)

    # fit the donor pipeline and transplant its subtree in place of a second-level node
    donor_pipeline.fit(input_data=train_data)
    donor_subtree = donor_pipeline.root_node.nodes_from[0]
    replaced_node = base_pipeline.root_node.nodes_from[0].nodes_from[0]
    base_pipeline.update_subtree(replaced_node, donor_subtree)
    # the donor subtree is saved only after it has been inserted
    cache.save_pipeline(Pipeline(donor_subtree))

    # the root and its first child are located above the replaced node
    first_child_of_root = base_pipeline.root_node.nodes_from[0]
    stale_nodes = [base_pipeline.root_node, first_child_of_root]
    fresh_nodes = [node for node in base_pipeline.nodes if node not in stale_nodes]

    # untouched nodes of the initial pipeline and the fitted nodes of the new subtree are found in the cache
    assert all(cache.get(node) for node in fresh_nodes)
    # nodes located above the replaced one have no cache entry
    assert not any(cache.get(node) for node in stale_nodes)
示例#4
0
def test_cache_actuality_after_model_change(data_setup):
    """Nodes outside the change keep their cache after the model of one node is replaced."""
    train_data, _ = data_setup
    cache = OperationsCache()
    pipeline = pipeline_first()

    # fit and save the initial pipeline
    pipeline.fit(input_data=train_data)
    cache.save_pipeline(pipeline)

    # replace the first child of the root with a new 'logit' node
    replacement = SecondaryNode(operation_type='logit')
    pipeline.update_node(old_node=pipeline.root_node.nodes_from[0],
                         new_node=replacement)

    # after the update this is the newly inserted node
    first_child_of_root = pipeline.root_node.nodes_from[0]
    stale_nodes = [pipeline.root_node, first_child_of_root]
    fresh_nodes = [node for node in pipeline.nodes if node not in stale_nodes]

    # nodes that were not touched are still found in the cache
    assert all(cache.get(node) is not None for node in fresh_nodes)
    # the replaced node and the root above it have no cache entry
    assert all(cache.get(node) is None for node in stale_nodes)
示例#5
0
def test_multi_pipeline_caching_with_cache(data_setup):
    """Nodes of an unfitted pipeline are found in the cache when other saved pipelines contain the same parts.

    Two scenarios are checked, each one with its own fresh cache: one previously fitted
    pipeline, and then two previously fitted pipelines.
    """
    train_data, _ = data_setup
    cache = OperationsCache()

    main_pipeline = pipeline_second()
    donor_pipeline = pipeline_first()

    # only the donor pipeline, which shares some parts with main_pipeline, is fitted and saved
    donor_pipeline.fit(input_data=train_data)
    cache.save_pipeline(donor_pipeline)

    first_parent = main_pipeline.root_node.nodes_from[0]
    nodes_without_cache = [main_pipeline.root_node, first_parent, *first_parent.nodes_from]
    nodes_with_cache = [node for node in main_pipeline.nodes if node not in nodes_without_cache]

    # the parts shared with donor_pipeline are found in the cache
    # even though main_pipeline.fit() was never called
    assert all(cache.get(node) for node in nodes_with_cache)
    # the parts that differ are still absent from the cache
    assert not any(cache.get(node) for node in nodes_without_cache)

    # the same check for another set of pipelines, starting from a new cache
    cache = OperationsCache()

    main_pipeline = pipeline_fourth()

    first_donor = pipeline_third()
    second_donor = pipeline_fifth()

    first_donor.fit(input_data=train_data)
    cache.save_pipeline(first_donor)
    second_donor.fit(input_data=train_data)
    cache.save_pipeline(second_donor)

    nodes_without_cache = [main_pipeline.root_node, main_pipeline.root_node.nodes_from[1]]
    nodes_with_cache = list(main_pipeline.root_node.nodes_from[0].nodes_from)

    assert not any(cache.get(node) for node in nodes_without_cache)
    assert all(cache.get(node) for node in nodes_with_cache)
示例#6
0
class GPComposer(Composer):
    """
    Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param composer_requirements: requirements for composition process
    :param metrics: metrics used to define the quality of found solution.
    :param initial_pipeline: defines the initial state of the population. If None then initial population is random.
    :param logger: log object used by the composer; when it is not passed, default_log(__name__) is used
    """
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_pipeline: Optional[Pipeline] = None,
            logger: Log = None):

        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_pipeline=initial_pipeline)

        # cache of fitted operations shared by all pipelines evaluated during composition
        self.cache = OperationsCache()

        self.optimiser = optimiser
        # when cache_path is set, compose_pipeline re-creates the cache with this path;
        # use_existing_cache then controls the clear_exiting flag passed to OperationsCache
        self.cache_path = None
        self.use_existing_cache = False

        if not logger:
            self.log = default_log(__name__)
        else:
            self.log = logger

    def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching
        :param data: InputData for pipeline composing
        :param is_visualise: is it needed to visualise (not used inside this method)
        :param is_tune: is it needed to tune pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable passed as is to the optimiser's ``optimise`` call
        :return best_pipeline: obtained result after composing: one pipeline for single-objective optimization;
            For the multi-objective case, the list of the graph is returned.
            In the list, the pipelines are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if the optimiser is not defined
        """

        # The optimiser is checked before anything else: the statements below dereference it,
        # so without this ordering a missing optimiser would surface as an unrelated
        # "'NoneType' object has no attribute ..." error instead of the explicit message.
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        # choose the objective: cross-validation if folds are specified, hold-out split otherwise
        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(
                data)
        else:
            self.log.info(
                "Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric,
                                                      self.metrics, train_data,
                                                      test_data)

        # prepare the cache before optimisation starts
        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()
        if is_tune:
            # NOTE: tune_pipeline currently raises NotImplementedError
            self.tune_pipeline(best_pipeline, data,
                               self.composer_requirements.timeout)
        return best_pipeline

    def _cv_validation_metric_build(self, data):
        """ Prepare function for metric evaluation based on task

        :param data: data for composing; its task type selects the validation scheme
        :return: partial of ts_metric_calculation for time series forecasting,
            partial of table_metric_calculation for any other task type
        :raises NotImplementedError: if data is MultiModalData
        """
        if isinstance(data, MultiModalData):
            raise NotImplementedError(
                'Cross-validation is not supported for multi-modal data')
        task_type = data.task.task_type
        if task_type is TaskTypesEnum.ts_forecasting:
            # Perform time series cross validation
            self.log.info(
                "Time series cross validation for pipeline composing was applied."
            )
            if self.composer_requirements.validation_blocks is None:
                self.log.info(
                    'For ts cross validation validation_blocks number was changed from None to 3 blocks'
                )
                # the default is written back to the shared requirements object
                self.composer_requirements.validation_blocks = 3
            metric_function_for_nodes = partial(
                ts_metric_calculation,
                data,
                self.composer_requirements.cv_folds,
                self.composer_requirements.validation_blocks,
                self.metrics,
                log=self.log)
        else:
            self.log.info(
                "KFolds cross validation for pipeline composing was applied.")
            metric_function_for_nodes = partial(
                table_metric_calculation,
                data,
                self.composer_requirements.cv_folds,
                self.metrics,
                log=self.log)

        return metric_function_for_nodes

    def composer_metric(self, metrics, train_data: Union[InputData,
                                                         MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any, ...]]:
        """ Fit the pipeline (using the cache when possible) and evaluate the metrics on hold-out data

        :param metrics: single metric or list of metrics; each one is either a callable
            or an identifier resolved through MetricsRepository().metric_by_id
        :param train_data: data used for fitting the pipeline
        :param test_data: data passed to every metric as reference_data
        :param pipeline: pipeline to be assessed; it is unfitted again after a successful evaluation
        :return: tuple with one value per metric, or None if any step raised an exception
        """
        try:
            validate(pipeline)
            pipeline.log = self.log

            # a single metric is wrapped so that the loop below always iterates over a list
            if not isinstance(metrics, list):
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(
                    f'Pipeline {pipeline.root_node.descriptive_id} fit started'
                )
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.
                             max_pipeline_fit_time)
                # saving to the cache is best-effort: a failure is logged and evaluation goes on
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(
                    pipeline, reference_data=test_data), )

            self.log.debug(
                f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            # any failure during assessment is reported and turned into a None result
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_pipeline(pipeline: Pipeline, data: InputData, time_limit):
        """ Placeholder for pipeline tuning after composing; always raises NotImplementedError """
        raise NotImplementedError()

    @property
    def history(self):
        """ History object exposed by the optimiser (``self.optimiser.history``) """
        return self.optimiser.history