def generate_pipeline() -> Pipeline:
    """ Build the pipeline scaling -> two parallel 'kmeans' nodes -> 'logit' root

    :return: Pipeline whose root is the 'logit' node
    """
    scaler = PrimaryNode('scaling')
    # Both 'kmeans' branches read from the same scaling node
    kmeans_branches = [SecondaryNode('kmeans', nodes_from=[scaler])
                       for _ in range(2)]
    root = SecondaryNode('logit', nodes_from=kmeans_branches)
    return Pipeline(root)
def generate_pipeline() -> Pipeline:
    """ Build the pipeline scaling -> ('lasso', 'ridge') -> 'linear' root

    :return: Pipeline whose root is the 'linear' node
    """
    scaler = PrimaryNode('scaling')
    # 'lasso' is created first, then 'ridge'; both read from the scaling node
    branches = [SecondaryNode(operation_name, nodes_from=[scaler])
                for operation_name in ('lasso', 'ridge')]
    root = SecondaryNode('linear', nodes_from=branches)
    return Pipeline(root)
def get_knn_class_pipeline(k_neighbors):
    """ Return a two-node pipeline: scaling followed by a 'knn' model

    :param k_neighbors: value stored as 'n_neighbors' in the custom
        parameters of the 'knn' node
    :return: Pipeline whose root is the 'knn' node
    """
    scaler = PrimaryNode('scaling')
    knn_node = SecondaryNode('knn', nodes_from=[scaler])
    knn_node.custom_params = dict(n_neighbors=k_neighbors)
    return Pipeline(knn_node)
def run_pipeline_from_automl(train_file_path: str, test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    The pipeline is (scaling -> lda) and tpot, both feeding an 'rf' root.
    The ROC AUC on the test data is printed and returned.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model
    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')
    # total_seconds() covers the whole duration; the `.seconds` attribute is
    # only the seconds component and silently drops whole days
    # (e.g. timedelta(days=1).seconds == 0).
    node_tpot.operation.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])

    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def get_pipeline():
    """ Return the linear pipeline scaling -> ransac_lin_reg -> lasso

    :return: Pipeline whose root is the 'lasso' node
    """
    scaler = PrimaryNode('scaling')
    ransac = SecondaryNode('ransac_lin_reg', nodes_from=[scaler])
    lasso_root = SecondaryNode('lasso', nodes_from=[ransac])
    return Pipeline(lasso_root)
def boosting_mutation(pipeline: Pipeline, requirements, params, **kwargs) -> Any:
    """ Mutation that attaches an additional 'boosting' cascade to the pipeline

    Three nodes are appended to ``pipeline.nodes``: a decompose node, a
    'linear' node fed by it, and a final node that combines the 'linear'
    node with the previous root.

    :param pipeline: pipeline to mutate; it is modified in place
    :param requirements: object whose ``secondary`` collection supplies the
        randomly chosen operation of the new final node
    :param params: object providing ``advisor.task.task_type``
    :param kwargs: accepted but not used here
    :return: the same pipeline object that was passed in
    """
    task_type = params.advisor.task.task_type
    suitable_operations, _ = OperationTypesRepository(
        'data_operation').suitable_operation(task_type=task_type,
                                             tags=['decompose'])
    # The first operation tagged 'decompose' is the one that gets used
    first_decompose = suitable_operations[0]

    if len(pipeline.nodes) != 1:
        data_source = PrimaryNode('scaling')
    else:
        # to deal with single-node pipeline
        data_source = pipeline.nodes[0]

    decompose_node = SecondaryNode(
        first_decompose, nodes_from=[pipeline.root_node, data_source])
    boost_node = SecondaryNode('linear', nodes_from=[decompose_node])

    final_operation = choice(requirements.secondary)
    final_node = SecondaryNode(
        final_operation, nodes_from=[boost_node, pipeline.root_node])

    pipeline.nodes.extend([decompose_node, final_node, boost_node])
    return pipeline
def generate_straight_pipeline():
    """ Simple linear pipeline: scaling -> ridge -> linear

    :return: Pipeline whose root is the 'linear' node
    """
    scaler = PrimaryNode('scaling')
    ridge = SecondaryNode('ridge', nodes_from=[scaler])
    linear_root = SecondaryNode('linear', nodes_from=[ridge])
    return Pipeline(linear_root)
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """ Fit an exported TPOT pipeline and a FEDOT pipeline on the same data

    Both ROC AUC values are printed; only the FEDOT one is returned.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :return: ROC AUC of the FEDOT pipeline on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    y_test = test_data.target

    # --- scikit-learn pipeline (BernoulliNB stacking -> random forest) ---
    # Average CV score on the training set was: 0.93755
    sklearn_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(sklearn_pipeline.steps, 'random_state', 1)

    sklearn_pipeline.fit(train_data.features, train_data.target)
    # Second column of predict_proba is used as the score
    tpot_scores = sklearn_pipeline.predict_proba(test_data.features)[:, 1]

    roc_auc_value = roc_auc(y_true=y_test, y_score=tpot_scores)
    print(f'ROC AUC for TPOT: {roc_auc_value}')

    # --- FEDOT pipeline: scaling -> bernb, then (bernb, scaling) -> rf ---
    scaler = PrimaryNode('scaling')
    bernb = SecondaryNode('bernb', nodes_from=[scaler])
    rf_root = SecondaryNode('rf', nodes_from=[bernb, scaler])
    fedot_pipeline = Pipeline(rf_root)

    fedot_pipeline.fit(train_data)
    fedot_output = fedot_pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=y_test, y_score=fedot_output.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
def test_pipeline_hierarchy_fit_correct(data_setup):
    """ Fit a diamond-shaped pipeline of 'logit' nodes and check its structure

    Checks the root descriptive id, the pipeline length and depth, the
    number of predicted rows and that the final node has been fitted.
    """
    train, _ = train_test_data_setup(data_setup)

    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[first])
    final = SecondaryNode(operation_type='logit', nodes_from=[second, third])

    pipeline = Pipeline()
    for current_node in (first, second, third, final):
        pipeline.add_node(current_node)

    pipeline.unfit()
    train_predicted = pipeline.fit(input_data=train)

    expected_root_id = ('((/n_logit_default_params;)/'
                        'n_logit_default_params;;(/'
                        'n_logit_default_params;)/'
                        'n_logit_default_params;)/'
                        'n_logit_default_params')
    assert pipeline.root_node.descriptive_id == expected_root_id

    assert pipeline.length == 4
    assert pipeline.depth == 3
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
def ts_pipeline_with_incorrect_data_flow():
    r""" Build a time series pipeline with an incorrect data flow

    Connection lagged -> lagged is incorrect
    Connection ridge -> ar is incorrect also

       lagged - lagged - ridge \
                                ar -> final forecast
                lagged - ridge /

    The docstring is a raw string so that the backslash in the diagram is
    not parsed as an (invalid) escape sequence.

    :return: Pipeline whose root is the 'ar' node
    """
    # First level
    node_lagged = PrimaryNode('lagged')

    # Second level
    node_lagged_1 = SecondaryNode('lagged', nodes_from=[node_lagged])
    node_lagged_2 = PrimaryNode('lagged')

    # Third level
    node_ridge_1 = SecondaryNode('ridge', nodes_from=[node_lagged_1])
    node_ridge_2 = SecondaryNode('ridge', nodes_from=[node_lagged_2])

    # Fourth level - root node
    node_final = SecondaryNode('ar', nodes_from=[node_ridge_1, node_ridge_2])
    pipeline = Pipeline(node_final)
    return pipeline
def get_composite_pipeline(composite_flag: bool = True) -> Pipeline:
    """ Build an 'rf' pipeline on top of one or two 'cnn' nodes

    :param composite_flag: if True the 'rf' root combines both 'cnn' nodes,
        otherwise only the first ('deep') one is used
    :return: Pipeline whose root is the 'rf' node
    """
    node_first = PrimaryNode('cnn')
    # The key is 'architecture_type', the same one the second node uses.
    # The previous 'architecture' key did not match it, so the 'deep' variant
    # was presumably never applied - TODO confirm against the cnn operation.
    node_first.custom_params = {
        'image_shape': (28, 28, 1),
        'architecture_type': 'deep',
        'num_classes': 10,
        'epochs': 15,
        'batch_size': 128
    }

    node_second = PrimaryNode('cnn')
    node_second.custom_params = {
        'image_shape': (28, 28, 1),
        'architecture_type': 'simplified',
        'num_classes': 10,
        'epochs': 10,
        'batch_size': 128
    }

    # Choose the parents first so the root node is constructed only once
    parents = [node_first, node_second] if composite_flag else [node_first]
    node_final = SecondaryNode('rf', nodes_from=parents)

    pipeline = Pipeline(node_final)
    return pipeline
def get_complex_pipeline():
    r""" Pipeline looking like this

    smoothing - lagged - ridge \
                                \
                                 ridge -> final forecast
                                /
                lagged - ridge /

    The docstring is a raw string so that the backslashes in the diagram are
    not parsed as (invalid) escape sequences.

    :return: Pipeline whose root is the final 'ridge' node
    """
    # First level
    node_smoothing = PrimaryNode('smoothing')

    # Second level
    node_lagged_1 = SecondaryNode('lagged', nodes_from=[node_smoothing])
    node_lagged_2 = PrimaryNode('lagged')

    # Third level
    node_ridge_1 = SecondaryNode('ridge', nodes_from=[node_lagged_1])
    node_ridge_2 = SecondaryNode('ridge', nodes_from=[node_lagged_2])

    # Fourth level - root node
    node_final = SecondaryNode('ridge', nodes_from=[node_ridge_1, node_ridge_2])
    pipeline = Pipeline(node_final)
    return pipeline
def get_nodes():
    """ Create four linked nodes: two 'knn' -> 'lda' -> 'logit'

    :return: list ordered as [root, 'lda' node, first 'knn', second 'knn']
    """
    knn_first = PrimaryNode('knn')
    knn_second = PrimaryNode('knn')
    lda_node = SecondaryNode('lda', nodes_from=[knn_first, knn_second])
    logit_root = SecondaryNode('logit', nodes_from=[lda_node])

    return [logit_root, lda_node, knn_first, knn_second]
def pipeline_with_secondary_nodes_only():
    """ Return a pipeline made of two 'logit' SecondaryNode objects only

    The first node has an empty ``nodes_from`` list; the second one reads
    from the first.
    """
    parentless = SecondaryNode(operation_type='logit', nodes_from=[])
    child = SecondaryNode(operation_type='logit', nodes_from=[parentless])

    result = Pipeline()
    for current_node in (parentless, child):
        result.add_node(current_node)
    return result
def pipeline_with_only_data_operations():
    """ Return the chain one_hot_encoding -> scaling -> ransac_lin_reg

    Judging by the function name, none of the three operations is expected
    to be a model.
    """
    encoder = PrimaryNode(operation_type='one_hot_encoding')
    scaler = SecondaryNode(operation_type='scaling', nodes_from=[encoder])
    ransac_root = SecondaryNode(operation_type='ransac_lin_reg',
                                nodes_from=[scaler])
    return Pipeline(ransac_root)
def get_complex_regr_pipeline():
    """ Return the pipeline scaling -> ('ridge', 'linear') -> 'xgbreg' root """
    scaler = PrimaryNode(operation_type='scaling')
    # 'ridge' is created before 'linear'; both read from the scaling node
    ridge, linear = (SecondaryNode(operation_name, nodes_from=[scaler])
                     for operation_name in ('ridge', 'linear'))
    xgbreg_root = SecondaryNode('xgbreg', nodes_from=[ridge, linear])
    return Pipeline(xgbreg_root)
def get_non_refinement_pipeline():
    """ Create 3-level pipeline without class_decompose node

    Structure: scaling -> ('rf', 'logit') -> 'xgboost' root, where the root
    receives the 'logit' node first and the 'rf' node second.
    """
    scaler = PrimaryNode('scaling')
    # Creation order is kept: 'rf' first, then 'logit'
    rf, logit = (SecondaryNode(operation_name, nodes_from=[scaler])
                 for operation_name in ('rf', 'logit'))
    xgboost_root = SecondaryNode('xgboost', nodes_from=[logit, rf])
    return Pipeline(xgboost_root)
def test_distance_to_primary_level():
    """ The root of 'knn' x2 -> 'lda' -> 'logit' is two levels above primary nodes """
    knn_pair = [PrimaryNode('knn'), PrimaryNode('knn')]
    lda_node = SecondaryNode('lda', nodes_from=knn_pair)
    logit_root = SecondaryNode('logit', nodes_from=[lda_node])

    assert logit_root.distance_to_primary_level == 2
def pipeline_with_pca() -> Pipeline:
    """ Return the pipeline scaling -> ('pca', 'lda') -> 'rf' root """
    scaler = PrimaryNode('scaling')
    # 'pca' is created before 'lda'; both read from the scaling node
    pca, lda = (SecondaryNode(operation_name, nodes_from=[scaler])
                for operation_name in ('pca', 'lda'))
    rf_root = SecondaryNode('rf', nodes_from=[pca, lda])
    return Pipeline(rf_root)
def pipeline_simple() -> Pipeline:
    """ Return the pipeline scaling -> ('svc', 'lda') -> 'rf' root """
    scaler = PrimaryNode('scaling')
    # 'svc' is created before 'lda'; both read from the scaling node
    svc, lda = (SecondaryNode(operation_name, nodes_from=[scaler])
                for operation_name in ('svc', 'lda'))
    rf_root = SecondaryNode('rf', nodes_from=[svc, lda])
    return Pipeline(rf_root)
def default_valid_pipeline():
    """ Return a diamond-shaped pipeline built from four 'logit' nodes

    One primary node feeds two secondary nodes, which are merged by the root.
    """
    source = PrimaryNode(operation_type='logit')
    left = SecondaryNode(operation_type='logit', nodes_from=[source])
    right = SecondaryNode(operation_type='logit', nodes_from=[source])
    root = SecondaryNode(operation_type='logit', nodes_from=[left, right])
    return Pipeline(root)
def test_ordered_subnodes_hierarchy():
    """ ordered_subnodes_hierarchy() on the root must list all four nodes """
    knn_pair = [PrimaryNode('knn'), PrimaryNode('knn')]
    lda_node = SecondaryNode('lda', nodes_from=knn_pair)
    logit_root = SecondaryNode('logit', nodes_from=[lda_node])

    hierarchy = logit_root.ordered_subnodes_hierarchy()

    assert len(hierarchy) == 4
def pipeline_with_cycle():
    """ Return a pipeline of 'logit' nodes containing a cycle

    The second node is a parent of the third, and the third is then appended
    to the parents of the second.
    """
    source = PrimaryNode(operation_type='logit')
    middle = SecondaryNode(operation_type='logit', nodes_from=[source])
    last = SecondaryNode(operation_type='logit', nodes_from=[middle, source])
    # Close the loop: middle -> last -> middle
    middle.nodes_from.append(last)

    result = Pipeline()
    for current_node in (source, middle, last):
        result.add_node(current_node)
    return result
def pipeline_with_multiple_roots():
    """ Return a pipeline in which two 'logit' nodes share one parent

    Neither of the two secondary nodes is a parent of any other node.
    """
    source = PrimaryNode(operation_type='logit')
    roots = [SecondaryNode(operation_type='logit', nodes_from=[source])
             for _ in range(2)]

    result = Pipeline()
    for current_node in (source, *roots):
        result.add_node(current_node)
    return result
def valid_pipeline():
    """ Return a straight chain of four 'logit' nodes added one by one """
    chain = [PrimaryNode(operation_type='logit')]
    # Each new node takes the previously created one as its only parent
    for _ in range(3):
        chain.append(
            SecondaryNode(operation_type='logit', nodes_from=[chain[-1]]))

    result = Pipeline()
    for current_node in chain:
        result.add_node(current_node)
    return result
def pipeline_with_isolated_components():
    """ Return a pipeline consisting of two unconnected pairs of 'logit' nodes

    The first pair is primary -> secondary; the second pair starts from a
    secondary node with an empty ``nodes_from`` list.
    """
    # Component 1
    head_a = PrimaryNode(operation_type='logit')
    tail_a = SecondaryNode(operation_type='logit', nodes_from=[head_a])
    # Component 2, not linked to component 1
    head_b = SecondaryNode(operation_type='logit', nodes_from=[])
    tail_b = SecondaryNode(operation_type='logit', nodes_from=[head_b])

    result = Pipeline()
    for current_node in (head_a, tail_a, head_b, tail_b):
        result.add_node(current_node)
    return result
def get_pipeline():
    """ Return a two-branch pipeline with different 'lagged' windows

    Branch one: lagged (window_size 120) -> ridge
    Branch two: lagged (window_size 10) -> dtreg
    Both branches are merged by a 'ridge' root.
    """
    wide_lagged = PrimaryNode('lagged')
    wide_lagged.custom_params = {'window_size': 120}

    narrow_lagged = PrimaryNode('lagged')
    narrow_lagged.custom_params = {'window_size': 10}

    ridge_branch = SecondaryNode('ridge', nodes_from=[wide_lagged])
    dtreg_branch = SecondaryNode('dtreg', nodes_from=[narrow_lagged])

    ridge_root = SecondaryNode('ridge',
                               nodes_from=[ridge_branch, dtreg_branch])
    return Pipeline(ridge_root)
def create_classification_pipeline_with_preprocessing():
    """ Return a two-branch pipeline merged by a 'knn' root

    Branch one: scaling -> xgboost
    Branch two: rfe_lin_class -> logit
    """
    scaler = PrimaryNode('scaling')
    feature_selector = PrimaryNode('rfe_lin_class')

    xgboost_branch = SecondaryNode('xgboost', nodes_from=[scaler])
    logit_branch = SecondaryNode('logit', nodes_from=[feature_selector])

    knn_root = SecondaryNode('knn', nodes_from=[xgboost_branch, logit_branch])
    return Pipeline(knn_root)
def get_non_refinement_pipeline(lagged):
    """ Create pipeline without decompose operation

    Structure: lagged -> ('lasso', 'dtreg' with max_depth 3) -> 'ridge' root

    :param lagged: value stored as 'window_size' of the 'lagged' node
    :return: Pipeline whose root is the 'ridge' node
    """
    lagged_node = PrimaryNode('lagged')
    lagged_node.custom_params = dict(window_size=lagged)

    lasso_branch = SecondaryNode('lasso', nodes_from=[lagged_node])

    dtreg_branch = SecondaryNode('dtreg', nodes_from=[lagged_node])
    dtreg_branch.custom_params = dict(max_depth=3)

    ridge_root = SecondaryNode('ridge',
                               nodes_from=[lasso_branch, dtreg_branch])
    return Pipeline(ridge_root)
def roll_pipeline_structure(
        self, operation_object: ['OperationTemplate', 'AtomizedModelTemplate'],
        visited_nodes: dict, path: str = None):
    """
    Recursively convert an operation template and all templates it depends
    on into linked pipeline nodes.

    :param operation_object: OperationTemplate or AtomizedModelTemplate to
        convert into a node
    :param visited_nodes: dict mapping operation_id to the node already built
        for it; it is read and updated in place so each template is converted
        only once
    :param path: directory that fitted operation files are resolved against;
        when None no fitted operations are loaded
    :return: node built for operation_object (PrimaryNode or SecondaryNode)
    :raises FileNotFoundError: if a fitted operation file is expected under
        path but does not exist
    """
    # Reuse the node if this template has been converted already
    if operation_object.operation_id in visited_nodes:
        return visited_nodes[operation_object.operation_id]
    # A template without parents becomes a PrimaryNode, otherwise a
    # SecondaryNode; its parents are attached further below
    if operation_object.operation_type == atomized_model_type():
        atomized_model = operation_object.next_pipeline_template
        if operation_object.nodes_from:
            node = SecondaryNode(operation_type=atomized_model)
        else:
            node = PrimaryNode(operation_type=atomized_model)
    else:
        if operation_object.nodes_from:
            node = SecondaryNode(operation_object.operation_type)
        else:
            node = PrimaryNode(operation_object.operation_type)
        node.operation.params = operation_object.params
        node.rating = operation_object.rating
    # Fitted state is restored only when the template names a file and a
    # base path was given
    if hasattr(
            operation_object, 'fitted_operation_path'
    ) and operation_object.fitted_operation_path and path is not None:
        path_to_operation = os.path.join(
            path, operation_object.fitted_operation_path)
        if not os.path.isfile(path_to_operation):
            message = f"Fitted operation on the path: {path_to_operation} does not exist."
            self.log.error(message)
            raise FileNotFoundError(message)
        # NOTE(review): joblib.load unpickles the file, so it should only be
        # used on files from a trusted source
        fitted_operation = joblib.load(path_to_operation)
        # The loaded object is stored on both the template and the node
        operation_object.fitted_operation = fitted_operation
        node.fitted_operation = fitted_operation
    # Templates whose ids are listed as parents of this operation
    nodes_from = [
        operation_template for operation_template in self.operation_templates
        if operation_template.operation_id in operation_object.nodes_from
    ]
    node.nodes_from = [
        self.roll_pipeline_structure(node_from, visited_nodes, path)
        for node_from in nodes_from
    ]
    # NOTE(review): the node is registered only after its parents have been
    # resolved, so a cyclic template graph would recurse without terminating
    visited_nodes[operation_object.operation_id] = node
    return node