def test_cache_historical_state_using(data_setup):
    cache = OperationsCache()
    train, _ = data_setup
    chain = chain_first()

    # chain fitted, model goes to cache
    chain.fit(input_data=train)
    cache.save_chain(chain)
    new_node = SecondaryNode(operation_type='logit')
    old_node = chain.root_node.nodes_from[0]

    # change child node to new one
    chain.update_node(old_node=old_node, new_node=new_node)
    # cache is not actual
    assert not cache.get(chain.root_node)
    # fit modified chain
    chain.fit(input_data=train)
    cache.save_chain(chain)
    # cache is actual now
    assert cache.get(chain.root_node)

    # change node back
    chain.update_node(old_node=chain.root_node.nodes_from[0],
                      new_node=old_node)
    # cache is actual without new fitting,
    # because the cached model was saved after the first fit
    assert cache.get(chain.root_node)
def test_cache_actuality_after_model_change(data_setup):
    """The non-affected nodes have an actual cache after changing the model"""
    cache = OperationsCache()
    chain = chain_first()
    train, _ = data_setup
    chain.fit(input_data=train)
    cache.save_chain(chain)

    new_node = SecondaryNode(operation_type='logit')
    chain.update_node(old_node=chain.root_node.nodes_from[0],
                      new_node=new_node)

    root_parent_first = chain.root_node.nodes_from[0]

    nodes_with_non_actual_cache = [chain.root_node, root_parent_first]
    nodes_with_actual_cache = [node for node in chain.nodes
                               if node not in nodes_with_non_actual_cache]

    # non-affected nodes are still actual
    assert all([cache.get(node) is not None for node in nodes_with_actual_cache])
    # affected nodes and their children have no actual cache
    assert all([cache.get(node) is None for node in nodes_with_non_actual_cache])
def test_cache_actuality_after_primary_node_changed_to_subtree(data_setup):
    """The non-affected nodes have an actual cache after changing the primary node
    to a pre-fitted subtree"""
    cache = OperationsCache()
    train, _ = data_setup
    chain = chain_first()
    other_chain = chain_second()
    chain.fit(input_data=train)
    cache.save_chain(chain)
    other_chain.fit(input_data=train)
    chain.update_subtree(chain.root_node.nodes_from[0].nodes_from[0],
                         other_chain.root_node.nodes_from[0])
    cache.save_chain(Chain(other_chain.root_node.nodes_from[0]))

    root_parent_first = chain.root_node.nodes_from[0]

    nodes_with_non_actual_cache = [chain.root_node, root_parent_first]
    nodes_with_actual_cache = [node for node in chain.nodes
                               if node not in nodes_with_non_actual_cache]

    # non-affected nodes of the initial chain and fitted nodes of the new subtree are actual
    assert all([cache.get(node) for node in nodes_with_actual_cache])
    # affected root nodes and their children have no actual cache
    assert not any([cache.get(node) for node in nodes_with_non_actual_cache])
def test_cache_actuality_after_subtree_change_to_identical(data_setup):
    """The non-affected nodes have an actual cache after changing the subtree
    to another pre-fitted subtree"""
    cache = OperationsCache()
    train, _ = data_setup
    chain = chain_first()
    other_chain = chain_second()
    chain.fit(input_data=train)
    cache.save_chain(chain)
    other_chain.fit(input_data=train)
    cache.save_chain(Chain(other_chain.root_node.nodes_from[0]))
    chain.update_subtree(chain.root_node.nodes_from[0],
                         other_chain.root_node.nodes_from[0])

    nodes_with_actual_cache = [node for node in chain.nodes
                               if node not in [chain.root_node]]

    # non-affected nodes of the initial chain and fitted nodes of the new subtree are actual
    assert all([cache.get(node) is not None for node in nodes_with_actual_cache])
    # the affected root node has no actual cache
    assert cache.get(chain.root_node) is None
def test_multi_chain_caching_with_cache(data_setup):
    train, _ = data_setup
    cache = OperationsCache()

    main_chain = chain_second()
    other_chain = chain_first()

    # fit other_chain that contains parts identical to main_chain
    other_chain.fit(input_data=train)
    cache.save_chain(other_chain)

    nodes_with_non_actual_cache = [main_chain.root_node, main_chain.root_node.nodes_from[0]] + \
                                  list(main_chain.root_node.nodes_from[0].nodes_from)
    nodes_with_actual_cache = [node for node in main_chain.nodes
                               if node not in nodes_with_non_actual_cache]

    # check that the parts of main_chain identical to other_chain are already fitted,
    # even though main_chain.fit() was not called
    assert all([cache.get(node) for node in nodes_with_actual_cache])
    # the non-identical parts are still not fitted
    assert not any([cache.get(node) for node in nodes_with_non_actual_cache])

    # check the same case with other chains
    cache = OperationsCache()
    main_chain = chain_fourth()
    prev_chain_first = chain_third()
    prev_chain_second = chain_fifth()

    prev_chain_first.fit(input_data=train)
    cache.save_chain(prev_chain_first)
    prev_chain_second.fit(input_data=train)
    cache.save_chain(prev_chain_second)

    nodes_with_non_actual_cache = [main_chain.root_node, main_chain.root_node.nodes_from[1]]
    nodes_with_actual_cache = [child for child in main_chain.root_node.nodes_from[0].nodes_from]

    assert not any([cache.get(node) for node in nodes_with_non_actual_cache])
    assert all([cache.get(node) for node in nodes_with_actual_cache])
class GPComposer(Composer):
    """ Genetic programming based composer

    :param optimiser: optimiser generated in GPComposerBuilder
    :param metrics: metrics used to define the quality of the found solution
    :param composer_requirements: requirements for the composition process
    :param initial_chain: defines the initial state of the population.
    If None, the initial population is random.
    """

    def __init__(self, optimiser=None,
                 composer_requirements: Optional[GPComposerRequirements] = None,
                 metrics: Union[List[MetricsEnum], MetricsEnum] = None,
                 initial_chain: Optional[Chain] = None,
                 logger: Log = None):
        super().__init__(metrics=metrics, composer_requirements=composer_requirements,
                         initial_chain=initial_chain)

        self.cache = OperationsCache()

        self.optimiser = optimiser
        self.cache_path = None
        self.use_existing_cache = False

        if not logger:
            self.log = default_log(__name__)
        else:
            self.log = logger

    def compose_chain(self, data: InputData, is_visualise: bool = False, is_tune: bool = False,
                      on_next_iteration_callback: Optional[Callable] = None) -> Union[Chain, List[Chain]]:
        """ Function for searching the optimal chain structure

        :param data: InputData for chain composing
        :param is_visualise: whether visualisation is needed
        :param is_tune: whether the chain should be tuned after composing TODO integrate new tuner
        :param on_next_iteration_callback: TODO add description
        :return best_chain: result of composing: a single chain for single-objective optimization;
        for the multi-objective case, a list of chains is returned,
        ordered by descending primary metric (the first is the best)
        """

        if self.composer_requirements.max_chain_fit_time:
            set_multiprocess_start_method()

        if not self.optimiser:
            raise AttributeError('Optimiser for chain composition is not defined')

        train_data, test_data = train_test_data_setup(
            data, sample_split_ration_for_tasks[data.task.task_type])

        if self.cache_path is None:
            self.cache.clear()
        else:
            self.cache = OperationsCache(self.cache_path,
                                         clear_exiting=not self.use_existing_cache)

        metric_function_for_nodes = partial(self.composer_metric, self.metrics,
                                            train_data, test_data)

        best_chain = self.optimiser.optimise(metric_function_for_nodes,
                                             on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')

        if is_tune:
            self.tune_chain(best_chain, data, self.composer_requirements.max_lead_time)
        return best_chain

    def composer_metric(self, metrics,
                        train_data: InputData, test_data: InputData,
                        chain: Chain) -> Optional[Tuple[Any]]:
        try:
            validate(chain)
            chain.log = self.log

            if type(metrics) is not list:
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                chain.fit_from_cache(self.cache)

            if not chain.is_fitted:
                self.log.debug(f'Chain {chain.root_node.descriptive_id} fit started')
                chain.fit(input_data=train_data,
                          time_constraint=self.composer_requirements.max_chain_fit_time)
                self.cache.save_chain(chain)

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(chain, reference_data=test_data),)

            self.log.debug(f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}')

        except Exception as ex:
            self.log.info(f'Chain assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics

    @staticmethod
    def tune_chain(chain: Chain, data: InputData, time_limit):
        raise NotImplementedError()

    @property
    def history(self):
        return self.optimiser.history
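# Usage sketch (illustrative only, not part of the module): the composer is
# normally assembled via GPComposerBuilder, but given an already prepared
# `optimiser`, `requirements` (GPComposerRequirements), `metric` (MetricsEnum)
# and `train_data` (InputData) it can be driven directly. The variable names
# below are placeholders, not confirmed API objects:
#
#   composer = GPComposer(optimiser=optimiser,
#                         composer_requirements=requirements,
#                         metrics=metric)
#   best_chain = composer.compose_chain(data=train_data)
#   best_chain.fit(input_data=train_data)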