class Pipeline(Graph):
    """
    Base class used for composite model structure definition

    :param nodes: Node object(s)
    :param log: Log object to record messages

    .. note:: fitted_on_data stores the data which were used in the last
        pipeline fitting (empty if the pipeline has not been fitted yet)
    """

    def __init__(self, nodes: Optional[Union[Node, List[Node]]] = None, log: Log = None):
        self.computation_time = None
        self.template = None
        self.fitted_on_data = {}
        self.log = log or default_log(__name__)
        super().__init__(nodes)

    def fit_from_scratch(self, input_data: Union[InputData, MultiModalData] = None):
        """
        Method used for training the pipeline without using saved information

        :param input_data: data used for operation training
        """
        # Clean all saved states and fit all operations
        self.log.info('Fit pipeline from scratch')
        self.unfit()
        self.fit(input_data, use_fitted=False)

    def update_fitted_on_data(self, data: InputData):
        characteristics = input_data_characteristics(data=data, log=self.log)
        self.fitted_on_data['data_type'] = characteristics[0]
        self.fitted_on_data['features_hash'] = characteristics[1]
        self.fitted_on_data['target_hash'] = characteristics[2]

    def _fitted_status_if_new_data(self, new_input_data: InputData, fitted_status: bool):
        new_data_params = input_data_characteristics(new_input_data, log=self.log)
        if fitted_status and self.fitted_on_data:
            params_names = ('data_type', 'features_hash', 'target_hash')
            are_data_params_different = any(
                new_data_param != self.fitted_on_data[param_name]
                for new_data_param, param_name in zip(new_data_params, params_names))
            if are_data_params_different:
                info = 'Fitted operations are no longer valid because a new dataset is used ' \
                       'for training. Parameter use_fitted value changed to False'
                self.log.info(info)
                fitted_status = False
        return fitted_status

    def _fit_with_time_limit(self, input_data: Optional[InputData] = None, use_fitted_operations=False,
                             time: timedelta = timedelta(minutes=3)) -> Manager:
        """
        Run the training process with a time limit in a separate process.

        :param input_data: data used for operation training
        :param use_fitted_operations: flag defining whether to use saved information
            about previous executions or not, default False
        :param time: time constraint for the operation fitting process
        """
        time = int(time.total_seconds())
        manager = Manager()
        process_state_dict = manager.dict()
        fitted_operations = manager.list()
        p = Process(target=self._fit,
                    args=(input_data, use_fitted_operations, process_state_dict, fitted_operations),
                    kwargs={})
        p.start()
        p.join(time)
        if p.is_alive():
            p.terminate()
            raise TimeoutError('Pipeline fitness evaluation time limit has expired')

        self.fitted_on_data = process_state_dict['fitted_on_data']
        self.computation_time = process_state_dict['computation_time']
        for node_num, node in enumerate(self.nodes):
            self.nodes[node_num].fitted_operation = fitted_operations[node_num]
        return process_state_dict['train_predicted']
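    # A minimal usage sketch for time-limited fitting (illustrative names:
    # 'linear' as the operation type, `train_data` as an InputData instance
    # prepared by the caller). When the limit is hit, the child process is
    # terminated and TimeoutError is raised:
    #
    #     pipeline = Pipeline(PrimaryNode('linear'))
    #     try:
    #         pipeline.fit(train_data, time_constraint=timedelta(minutes=1))
    #     except TimeoutError:
    #         pipeline.unfit()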
    def _fit(self, input_data: InputData, use_fitted_operations=False,
             process_state_dict: Manager = None, fitted_operations: Manager = None):
        """
        Run the training process in all nodes of the pipeline starting with the root.

        :param input_data: data used for operation training
        :param use_fitted_operations: flag defining whether to use saved information
            about previous executions or not, default False
        :param process_state_dict: this dictionary is used for saving required pipeline
            parameters (which were changed inside the process) in a case of operation
            fit time control (when the process is created)
        :param fitted_operations: this list is used for saving fitted operations of pipeline nodes
        """
        # InputData was set directly to the primary nodes
        if input_data is None:
            use_fitted_operations = False
        else:
            use_fitted_operations = self._fitted_status_if_new_data(
                new_input_data=input_data, fitted_status=use_fitted_operations)

        if not use_fitted_operations or not self.fitted_on_data:
            # Don't use previous information
            self.unfit()
            self.update_fitted_on_data(input_data)

        with Timer(log=self.log) as t:
            computation_time_update = not use_fitted_operations or not self.root_node.fitted_operation or \
                                      self.computation_time is None
            train_predicted = self.root_node.fit(input_data=input_data)
            if computation_time_update:
                self.computation_time = round(t.minutes_from_start, 3)

        if process_state_dict is None:
            return train_predicted
        else:
            process_state_dict['train_predicted'] = train_predicted
            process_state_dict['computation_time'] = self.computation_time
            process_state_dict['fitted_on_data'] = self.fitted_on_data
            for node in self.nodes:
                fitted_operations.append(node.fitted_operation)

    def fit(self, input_data: Union[InputData, MultiModalData], use_fitted=True,
            time_constraint: Optional[timedelta] = None):
        """
        Run the training process in all nodes of the pipeline starting with the root.

        :param input_data: data used for operation training
        :param use_fitted: flag defining whether to use saved information about
            previous executions or not, default True
        :param time_constraint: time constraint for operation fitting
        """
        if not use_fitted:
            self.unfit()

        # Make a copy of the input data to avoid performing inplace operations
        copied_input_data = copy(input_data)
        copied_input_data = self._assign_data_to_nodes(copied_input_data)

        if time_constraint is None:
            train_predicted = self._fit(input_data=copied_input_data,
                                        use_fitted_operations=use_fitted)
        else:
            train_predicted = self._fit_with_time_limit(input_data=copied_input_data,
                                                        use_fitted_operations=use_fitted,
                                                        time=time_constraint)
        return train_predicted

    @property
    def is_fitted(self):
        return all(node.fitted_operation is not None for node in self.nodes)

    def unfit(self):
        """ Remove fitted operations for all nodes """
        for node in self.nodes:
            node.unfit()

    def fit_from_cache(self, cache: OperationsCache):
        for node in self.nodes:
            cached_state = cache.get(node)
            if cached_state:
                node.fitted_operation = cached_state.operation
            else:
                node.fitted_operation = None
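    # A sketch of warm-starting from a cache, assuming `cache` is an OperationsCache
    # populated by a previous run over the same data and `test_data` is an InputData
    # instance; nodes missing from the cache stay unfitted, so is_fitted guards predict:
    #
    #     pipeline.fit_from_cache(cache)
    #     if pipeline.is_fitted:
    #         prediction = pipeline.predict(test_data)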
    def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str = 'default'):
        """
        Run the predict process in all nodes of the pipeline starting with the root.

        :param input_data: data for prediction
        :param output_mode: desired form of output for operations. Available options are:
            'default' (as is),
            'labels' (numbers of classes - for classification),
            'probs' (probabilities - for classification == 'default'),
            'full_probs' (return all probabilities - for binary classification).
        :return: OutputData with prediction
        """
        if not self.is_fitted:
            ex = 'Pipeline is not fitted yet'
            self.log.error(ex)
            raise ValueError(ex)

        # Make a copy of the input data to avoid performing inplace operations
        copied_input_data = copy(input_data)
        copied_input_data = self._assign_data_to_nodes(copied_input_data)

        result = self.root_node.predict(input_data=copied_input_data, output_mode=output_mode)
        return result

    def fine_tune_all_nodes(self, loss_function: Callable,
                            loss_params: Callable = None,
                            input_data: Union[InputData, MultiModalData] = None,
                            iterations=50, timeout: int = 5,
                            cv_folds: int = None,
                            validation_blocks: int = 3) -> 'Pipeline':
        """
        Tune all hyperparameters of nodes simultaneously via black-box optimization
        using PipelineTuner. For details, see
        :meth:`~fedot.core.pipelines.tuning.unified.PipelineTuner.tune_pipeline`
        """
        # Make a copy of the input data to avoid performing inplace operations
        copied_input_data = copy(input_data)

        timeout = timedelta(minutes=timeout)
        pipeline_tuner = PipelineTuner(pipeline=self,
                                       task=copied_input_data.task,
                                       iterations=iterations,
                                       timeout=timeout)
        self.log.info('Start tuning of primary nodes')
        tuned_pipeline = pipeline_tuner.tune_pipeline(input_data=copied_input_data,
                                                      loss_function=loss_function,
                                                      loss_params=loss_params,
                                                      cv_folds=cv_folds,
                                                      validation_blocks=validation_blocks)
        self.log.info('Tuning was finished')
        return tuned_pipeline

    def save(self, path: str):
        """
        Save the pipeline to a json representation with pickled fitted operations.

        :param path: path to the json file with the operation
        :return: json containing a composite operation description
        """
        if not self.template:
            self.template = PipelineTemplate(self, self.log)
        json_object = self.template.export_pipeline(path)
        return json_object

    def load(self, path: str):
        """
        Load the pipeline from a json representation with pickled fitted operations.

        :param path: path to the json file with the operation
        """
        self.nodes = []
        self.template = PipelineTemplate(self, self.log)
        self.template.import_pipeline(path)

    def __eq__(self, other) -> bool:
        return self.root_node.descriptive_id == other.root_node.descriptive_id

    def __str__(self):
        description = {
            'depth': self.depth,
            'length': self.length,
            'nodes': self.nodes,
        }
        return f'{description}'

    @property
    def root_node(self) -> Optional[Node]:
        if len(self.nodes) == 0:
            return None
        root = [node for node in self.nodes
                if not any(self.operator.node_children(node))]
        if len(root) > 1:
            raise ValueError(f'{ERROR_PREFIX} More than 1 root_nodes in pipeline')
        return root[0]

    def _assign_data_to_nodes(self, input_data) -> Optional[InputData]:
        if isinstance(input_data, MultiModalData):
            for node in (n for n in self.nodes if isinstance(n, PrimaryNode)):
                if node.operation.operation_type in input_data.keys():
                    node.node_data = input_data[node.operation.operation_type]
                    node.direct_set = True
                else:
                    raise ValueError(f'No data for primary node {node}')
            return None
        return input_data

    def print_structure(self):
        """ Print information about the pipeline structure """
        print('Pipeline structure:')
        print(self.__str__())
        for node in self.nodes:
            print(f'{node.operation.operation_type} - {node.custom_params}')
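# A hedged round-trip sketch for pipeline serialization (the 'pipeline.json' path
# is illustrative): save() exports the structure with pickled fitted operations,
# and load() restores it into a fresh Pipeline; equality compares the root node's
# descriptive_id:
#
#     pipeline.save(path='pipeline.json')
#     restored = Pipeline()
#     restored.load(path='pipeline.json')
#     assert restored == pipeline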
class AtomizedModelTemplate(OperationTemplateAbstract):
    def __init__(self, node: Node = None, operation_id: int = None,
                 nodes_from: list = None, path: str = None):
        # The imports are placed inside the method to avoid circular imports
        from fedot.core.pipelines.pipeline import Pipeline
        from fedot.core.pipelines.template import PipelineTemplate
        from fedot.core.operations.atomized_model import AtomizedModel

        super().__init__()
        self.atomized_model_json_path = None
        self.next_pipeline_template = None
        self.pipeline_template = None

        if path:
            pipeline = Pipeline()
            pipeline.load(path)
            self.next_pipeline_template = AtomizedModel(pipeline)
            self.pipeline_template = PipelineTemplate(pipeline)

        if node:
            self._operation_to_template(node, operation_id, nodes_from)

    def _operation_to_template(self, node: Node, operation_id: int, nodes_from: list):
        from fedot.core.pipelines.template import PipelineTemplate

        self.operation_id = operation_id
        self.operation_type = node.operation.operation_type
        self.nodes_from = nodes_from
        self.pipeline_template = PipelineTemplate(node.operation.pipeline)
        self.atomized_model_json_path = 'nested_' + str(self.operation_id)

    def convert_to_dict(self) -> dict:
        operation_object = {
            'operation_id': self.operation_id,
            'operation_type': self.operation_type,
            'nodes_from': self.nodes_from,
            'atomized_model_json_path': self.atomized_model_json_path
        }
        return operation_object

    def _create_nested_path(self, path: str) -> Tuple[str, str]:
        """
        Create a folder for the nested JSON operation and prepare paths for saving the JSONs.

        :param path: path where to save the parent JSON operation
        :return: absolute and relative paths to save the nested JSON operation
        """
        relative_path = os.path.join('fitted_operations', 'nested_' + str(self.operation_id))
        absolute_path = os.path.join(path, relative_path)

        if not os.path.exists(absolute_path):
            os.makedirs(absolute_path)

        return absolute_path, relative_path

    def export_operation(self, path: str):
        absolute_path = os.path.join(path, self.atomized_model_json_path)
        _check_existing_path(absolute_path)
        self.pipeline_template.export_pipeline(absolute_path)

    def import_json(self, operation_object: dict):
        required_fields = ['operation_id', 'operation_type', 'nodes_from',
                           'atomized_model_json_path']
        self._validate_json_operation_template(operation_object, required_fields)

        self.operation_id = operation_object['operation_id']
        self.operation_type = operation_object['operation_type']
        self.nodes_from = operation_object['nodes_from']
        self.atomized_model_json_path = operation_object['atomized_model_json_path']
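# A minimal sketch of the template JSON round-trip (assuming `template` was built
# from a node wrapping an AtomizedModel): convert_to_dict() produces the
# serializable description that import_json() validates and reads back:
#
#     as_dict = template.convert_to_dict()
#     restored = AtomizedModelTemplate()
#     restored.import_json(as_dict)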