Example #1
0
def test_cache_historical_state_using(data_setup):
    """A cache entry saved for a node stays usable after the node is removed and re-added."""
    train, _ = data_setup
    cache = OperationsCache()
    chain = chain_first()

    # Fit the chain and persist its fitted operations in the cache.
    chain.fit(input_data=train)
    cache.save_chain(chain)
    replacement = SecondaryNode(operation_type='logit')
    original_child = chain.root_node.nodes_from[0]

    # Swap one child of the root for a brand-new node.
    chain.update_node(old_node=original_child, new_node=replacement)
    # The structure changed, so the root's cache entry is stale.
    assert not cache.get(chain.root_node)
    # Refit the modified chain and save it again.
    chain.fit(input_data=train)
    cache.save_chain(chain)
    # Now the cache is valid once more.
    assert cache.get(chain.root_node)

    # Put the original child back in place.
    chain.update_node(old_node=chain.root_node.nodes_from[0],
                      new_node=original_child)
    # No refit is needed: the entry stored after the first fit still applies.
    assert cache.get(chain.root_node)
Example #2
0
def test_cache_actuality_after_model_change(data_setup):
    """Nodes untouched by a model replacement keep an up-to-date cache."""
    cache = OperationsCache()
    chain = chain_first()
    train, _ = data_setup

    chain.fit(input_data=train)
    cache.save_chain(chain)

    # Replace the root's first parent with a freshly created node.
    chain.update_node(old_node=chain.root_node.nodes_from[0],
                      new_node=SecondaryNode(operation_type='logit'))

    root_parent_first = chain.root_node.nodes_from[0]
    stale_nodes = [chain.root_node, root_parent_first]
    fresh_nodes = [node for node in chain.nodes if node not in stale_nodes]

    # Unaffected nodes still resolve from the cache...
    assert all(cache.get(node) is not None for node in fresh_nodes)
    # ...while the replaced node and its dependants do not.
    assert all(cache.get(node) is None for node in stale_nodes)
Example #3
0
def test_multi_chain_caching_with_cache(data_setup):
    """Fitting one chain pre-warms the cache for identical parts of another."""
    train, _ = data_setup
    cache = OperationsCache()

    main_chain = chain_second()
    other_chain = chain_first()

    # other_chain shares subtrees with main_chain: fit it and cache the result.
    other_chain.fit(input_data=train)
    cache.save_chain(other_chain)

    root = main_chain.root_node
    stale_nodes = ([root, root.nodes_from[0]]
                   + list(root.nodes_from[0].nodes_from))
    warm_nodes = [node for node in main_chain.nodes
                  if node not in stale_nodes]

    # The shared parts of main_chain are served from other_chain's cache,
    # even though main_chain.fit() was never called...
    assert all(cache.get(node) for node in warm_nodes)
    # ...and the non-identical parts stay unfitted.
    assert not any(cache.get(node) for node in stale_nodes)

    # Repeat the same scenario with a different family of chains.
    cache = OperationsCache()

    main_chain = chain_fourth()

    prev_chain_first = chain_third()
    prev_chain_second = chain_fifth()

    prev_chain_first.fit(input_data=train)
    cache.save_chain(prev_chain_first)
    prev_chain_second.fit(input_data=train)
    cache.save_chain(prev_chain_second)

    stale_nodes = [main_chain.root_node, main_chain.root_node.nodes_from[1]]
    warm_nodes = list(main_chain.root_node.nodes_from[0].nodes_from)

    assert not any(cache.get(node) for node in stale_nodes)
    assert all(cache.get(node) for node in warm_nodes)
Example #4
0
    def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching

        :param data: InputData for pipeline composing
        :param is_visualise: is it needed to visualise
        :param is_tune: is it needed to tune pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: TODO add description
        :return best_pipeline: obtained result after composing: one pipeline for single-objective optimization;
            For the multi-objective case, the list of the graph is returned.
            In the list, the pipelines are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if no optimiser was configured for this composer
        """
        # BUGFIX: this guard must run before any dereference of self.optimiser.
        # Previously graph_generation_params was accessed first, so a missing
        # optimiser raised an opaque AttributeError instead of this message,
        # and the explicit check was effectively dead code.
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        # With cross-validation requested, evaluate on folds; otherwise use a
        # task-specific hold-out split for the objective function.
        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(
                data)
        else:
            self.log.info(
                "Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric,
                                                      self.metrics, train_data,
                                                      test_data)

        # Without a cache path the cache works purely in memory; otherwise
        # re-open the on-disk cache, keeping existing entries only when reuse
        # was explicitly requested.
        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()
        if is_tune:
            self.tune_pipeline(best_pipeline, data,
                               self.composer_requirements.timeout)
        return best_pipeline
Example #5
0
    def compose_chain(
        self,
        data: InputData,
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Chain, List[Chain]]:
        """ Function for optimal chain structure searching

        :param data: InputData for chain composing
        :param is_visualise: is it needed to visualise
        :param is_tune: is it needed to tune chain after composing TODO integrate new tuner
        :param on_next_iteration_callback: TODO add description

        :return best_chain: obtained result after composing: one chain for single-objective optimization;
            For the multi-objective case, the list of the chain is returned.
            In the list, the chains are ordered by the descending of primary metric (the first is the best)
        """
        # Composition cannot proceed without an optimiser instance.
        if not self.optimiser:
            raise AttributeError(
                f'Optimiser for chain composition is not defined')

        if self.composer_requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        # Task-specific hold-out split used for fitness evaluation.
        split_ratio = sample_split_ration_for_tasks[data.task.task_type]
        train_data, test_data = train_test_data_setup(data, split_ratio)

        # In-memory cache unless a persistent cache path was configured.
        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        metric_function_for_nodes = partial(self.composer_metric, self.metrics,
                                            train_data, test_data)

        best_chain = self.optimiser.optimise(
            metric_function_for_nodes,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            self.tune_chain(best_chain, data,
                            self.composer_requirements.max_lead_time)
        return best_chain
Example #6
0
def test_cache_actuality_after_subtree_change_to_identical(data_setup):
    """Nodes outside a replaced subtree stay cached when the new subtree was pre-fitted."""
    cache = OperationsCache()
    train, _ = data_setup
    pipeline = pipeline_first()
    donor = pipeline_second()

    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    donor.fit(input_data=train)
    cache.save_pipeline(Pipeline(donor.root_node.nodes_from[0]))

    # Graft the donor's pre-fitted subtree in place of the root's first parent.
    pipeline.update_subtree(pipeline.root_node.nodes_from[0],
                            donor.root_node.nodes_from[0])

    cached_nodes = [node for node in pipeline.nodes
                    if node not in (pipeline.root_node,)]

    # Untouched nodes of the original pipeline plus the grafted (already
    # fitted) subtree are all served from the cache...
    assert all(cache.get(node) is not None for node in cached_nodes)
    # ...while the root, whose inputs changed, has no valid entry.
    assert cache.get(pipeline.root_node) is None
Example #7
0
def test_cache_actuality_after_primary_node_changed_to_subtree(data_setup):
    """Replacing a primary node with a pre-fitted subtree invalidates only its ancestors."""
    cache = OperationsCache()
    train, _ = data_setup
    pipeline = pipeline_first()
    donor = pipeline_second()

    pipeline.fit(input_data=train)
    cache.save_pipeline(pipeline)
    donor.fit(input_data=train)

    # Swap a primary (leaf) node for the donor's fitted subtree, then cache
    # that subtree so its nodes have entries too.
    pipeline.update_subtree(pipeline.root_node.nodes_from[0].nodes_from[0],
                            donor.root_node.nodes_from[0])
    cache.save_pipeline(Pipeline(donor.root_node.nodes_from[0]))

    root_parent_first = pipeline.root_node.nodes_from[0]
    stale_nodes = [pipeline.root_node, root_parent_first]
    cached_nodes = [node for node in pipeline.nodes
                    if node not in stale_nodes]

    # Untouched nodes and the grafted subtree are still served from the cache...
    assert all(cache.get(node) for node in cached_nodes)
    # ...while the ancestors of the replacement point have no valid entries.
    assert not any(cache.get(node) for node in stale_nodes)
Example #8
0
    def __init__(
            self,
            optimiser=None,
            composer_requirements: Optional[GPComposerRequirements] = None,
            metrics: Union[List[MetricsEnum], MetricsEnum] = None,
            initial_chain: Optional[Chain] = None,
            logger: Log = None):
        """Set up the composer with an optimiser, requirements, metrics and a fresh cache."""
        super().__init__(metrics=metrics,
                         composer_requirements=composer_requirements,
                         initial_chain=initial_chain)

        # In-memory cache of fitted operations; composition may later replace
        # it with an on-disk cache when a cache path is configured.
        self.cache = OperationsCache()

        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False

        # Fall back to the module-level default logger when none is supplied.
        self.log = logger if logger else default_log(__name__)