def test_pipeline_hierarchy_fit_correct(data_setup):
    """Fit a four-node 'logit' pipeline and verify its hierarchy.

    One primary node feeds two secondary nodes, and both of those feed the
    final node. After fitting, the test checks the root's descriptive id,
    the pipeline length and depth, the number of predictions, and that the
    final node holds a fitted operation.
    """
    train, _ = train_test_data_setup(data_setup)

    source = PrimaryNode(operation_type='logit')
    left = SecondaryNode(operation_type='logit', nodes_from=[source])
    right = SecondaryNode(operation_type='logit', nodes_from=[source])
    final = SecondaryNode(operation_type='logit', nodes_from=[left, right])

    pipeline = Pipeline()
    pipeline.add_node(source)
    pipeline.add_node(left)
    pipeline.add_node(right)
    pipeline.add_node(final)

    pipeline.unfit()
    train_predicted = pipeline.fit(input_data=train)

    expected_descriptive_id = (
        '((/n_logit_default_params;)/'
        'n_logit_default_params;;(/'
        'n_logit_default_params;)/'
        'n_logit_default_params;)/'
        'n_logit_default_params')
    assert pipeline.root_node.descriptive_id == expected_descriptive_id

    assert pipeline.length == 4
    assert pipeline.depth == 3
    # One prediction row is expected per training target row.
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
def create_pipeline_with_several_nested_atomized_model() -> Pipeline:
    """Build a pipeline mixing atomized-model nodes with two 'knn' nodes.

    Layout (parents listed per node):
        primary atomized   -> no parents
        secondary atomized -> [primary atomized]
        knn (9 neighbours) -> [primary atomized]
        knn (5 neighbours) -> [primary atomized, secondary atomized, knn (9)]
        root atomized      -> [knn (5)]

    Returns:
        The pipeline after the root atomized node has been added to it.
    """
    pipeline = Pipeline()

    primary_atomized = PrimaryNode(
        operation_type=create_atomized_model_with_several_atomized_models())

    secondary_atomized = SecondaryNode(operation_type=create_atomized_model())
    secondary_atomized.nodes_from = [primary_atomized]

    knn_nine = SecondaryNode('knn')
    knn_nine.custom_params = {'n_neighbors': 9}
    knn_nine.nodes_from = [primary_atomized]

    knn_five = SecondaryNode('knn')
    knn_five.custom_params = {'n_neighbors': 5}
    knn_five.nodes_from = [primary_atomized, secondary_atomized, knn_nine]

    root_atomized = SecondaryNode(
        operation_type=create_atomized_model_with_several_atomized_models())
    root_atomized.nodes_from = [knn_five]

    # Only the root is handed to add_node; presumably its ancestors are
    # picked up through nodes_from -- confirm against Pipeline.add_node.
    pipeline.add_node(root_atomized)
    return pipeline
def pipeline_with_secondary_nodes_only():
    """Return a pipeline made of two 'logit' SecondaryNodes and no PrimaryNode.

    The first secondary node has an empty parent list; the second one uses
    the first as its only parent.
    """
    parentless = SecondaryNode(operation_type='logit', nodes_from=[])
    child = SecondaryNode(operation_type='logit', nodes_from=[parentless])

    pipeline = Pipeline()
    for node in (parentless, child):
        pipeline.add_node(node)
    return pipeline
def pipeline_with_self_cycle():
    """Return a two-node 'logit' pipeline whose secondary node is its own parent.

    The secondary node's parent list holds the primary node and then the
    secondary node itself.
    """
    primary = PrimaryNode(operation_type='logit')
    looping = SecondaryNode(operation_type='logit', nodes_from=[primary])
    # Make the node reference itself as a parent.
    looping.nodes_from.append(looping)

    pipeline = Pipeline()
    for node in (primary, looping):
        pipeline.add_node(node)
    return pipeline
def pipeline_with_multiple_roots():
    """Return a pipeline where two secondary 'logit' nodes share one primary parent.

    Neither secondary node is a parent of the other, so both sit at the top
    of the graph.
    """
    shared_parent = PrimaryNode(operation_type='logit')
    root_a = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])
    root_b = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])

    pipeline = Pipeline()
    pipeline.add_node(shared_parent)
    pipeline.add_node(root_a)
    pipeline.add_node(root_b)
    return pipeline
def pipeline_with_cycle():
    """Return a three-node 'logit' pipeline containing a parent cycle.

    The third node has the second and first nodes as parents, and the third
    node is then appended to the second node's parents, closing the loop
    second -> third -> second.
    """
    primary = PrimaryNode(operation_type='logit')
    middle = SecondaryNode(operation_type='logit', nodes_from=[primary])
    top = SecondaryNode(operation_type='logit', nodes_from=[middle, primary])
    # Close the cycle: `middle` now depends on `top`, which depends on `middle`.
    middle.nodes_from.append(top)

    pipeline = Pipeline()
    pipeline.add_node(primary)
    pipeline.add_node(middle)
    pipeline.add_node(top)
    return pipeline
def baseline_pipeline():
    """Return a pipeline with 'knn' and 'logit' primary nodes feeding an 'xgboost' node.

    Each primary node is added to the pipeline as soon as it is created;
    the 'xgboost' secondary node is added last.
    """
    pipeline = Pipeline()
    root = SecondaryNode(operation_type='xgboost', nodes_from=[])

    for model_name in ('knn', 'logit'):
        parent = PrimaryNode(model_name)
        root.nodes_from.append(parent)
        pipeline.add_node(parent)

    pipeline.add_node(root)
    return pipeline
def pipeline_with_isolated_components():
    """Return a 'logit' pipeline made of two node pairs with no link between them.

    Pair one: a primary node and a secondary node that depends on it.
    Pair two: a secondary node with an empty parent list and a secondary
    node that depends on it.
    """
    # First component.
    primary = PrimaryNode(operation_type='logit')
    primary_child = SecondaryNode(operation_type='logit', nodes_from=[primary])

    # Second component, not connected to the first one.
    detached = SecondaryNode(operation_type='logit', nodes_from=[])
    detached_child = SecondaryNode(operation_type='logit', nodes_from=[detached])

    pipeline = Pipeline()
    pipeline.add_node(primary)
    pipeline.add_node(primary_child)
    pipeline.add_node(detached)
    pipeline.add_node(detached_child)
    return pipeline
def valid_pipeline():
    """Return a linear chain of four 'logit' nodes.

    The chain starts with a primary node; each of the three following
    secondary nodes has the previous node as its only parent.
    """
    chain = [PrimaryNode(operation_type='logit')]
    for _ in range(3):
        previous = chain[-1]
        chain.append(SecondaryNode(operation_type='logit', nodes_from=[previous]))

    pipeline = Pipeline()
    for node in chain:
        pipeline.add_node(node)
    return pipeline
def pipeline_third():
    """Return a pipeline with a 'qda' node fed by two 'rf' primary nodes.

    The 'qda' node is added to the pipeline first, then each of its parents
    is passed to ``add_node`` as well, in the order they were appended.
    """
    #    QDA
    #    |  \
    #   RF   RF
    pipeline = Pipeline()
    new_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        new_node.nodes_from.append(PrimaryNode(model_type))
    pipeline.add_node(new_node)
    # Plain loop instead of a list comprehension: add_node is called for its
    # side effect only, so there is no result list worth building.
    for node_from in new_node.nodes_from:
        pipeline.add_node(node_from)
    return pipeline
def test_pipeline_repr():
    """Check ``repr`` of a pipeline with an 'xgboost' node over three primary nodes.

    Only the final node is passed to ``add_node``; the expected
    representation nevertheless reports four nodes and a depth of two.
    """
    parents = [PrimaryNode(operation_type=name) for name in ('logit', 'lda', 'knn')]
    final = SecondaryNode(operation_type='xgboost', nodes_from=parents)

    pipeline = Pipeline()
    pipeline.add_node(final)

    actual_description = repr(pipeline)
    assert actual_description == "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"
def test_delete_node_with_redirection():
    """Delete an intermediate node and check its parent is re-linked to the root.

    Structure before deletion: 'logit' and 'lda' primaries -> 'knn' ->
    'xgboost'. After deleting the 'knn' node the pipeline must hold three
    nodes and the 'logit' node must be among the root's parents.
    """
    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    intermediate = SecondaryNode(operation_type='knn', nodes_from=[logit_node, lda_node])
    final = SecondaryNode(operation_type='xgboost', nodes_from=[intermediate])

    pipeline = Pipeline()
    pipeline.add_node(final)

    pipeline.delete_node(intermediate)

    remaining_count = len(pipeline.nodes)
    assert remaining_count == 3
    root_parents = pipeline.root_node.nodes_from
    assert logit_node in root_parents
def create_pipeline() -> Pipeline:
    """Build a pipeline with 'logit' and 'lda' primaries feeding an 'xgboost' node.

    The 'lda' and 'xgboost' nodes both receive ``{'n_components': 1}`` as
    custom parameters. Only the 'xgboost' node is passed to ``add_node``.

    Returns:
        The pipeline after the 'xgboost' node has been added.
    """
    pipeline = Pipeline()

    logit_parent = PrimaryNode('logit')

    lda_parent = PrimaryNode('lda')
    lda_parent.custom_params = {'n_components': 1}

    xgboost_root = SecondaryNode('xgboost')
    xgboost_root.custom_params = {'n_components': 1}
    xgboost_root.nodes_from = [logit_parent, lda_parent]

    pipeline.add_node(xgboost_root)
    return pipeline
def pipeline_third():
    """Return a pipeline with an 'xgboost' node over 'knn', 'lda' and 'knn' primaries.

    The three primary nodes are added to the pipeline first, in parent
    order, and the 'xgboost' node is added last.
    """
    #      XG
    #   /  |  \
    # KNN LDA KNN
    xgboost_root = SecondaryNode('xgboost')
    for parent_model in ('knn', 'lda', 'knn'):
        parent = PrimaryNode(parent_model)
        xgboost_root.nodes_from.append(parent)

    pipeline = Pipeline()
    for parent in xgboost_root.nodes_from:
        pipeline.add_node(parent)
    pipeline.add_node(xgboost_root)
    return pipeline
def test_update_node_in_pipeline_correct():
    """Replace a primary node with a secondary node that brings its own parent.

    The 'logit' primary under the 'xgboost' root is swapped for a 'logit'
    secondary node whose parent is a new 'svc' primary. Afterwards both new
    nodes must be in the pipeline and the old node must be gone.
    """
    old_node = PrimaryNode(operation_type='logit')
    final = SecondaryNode(operation_type='xgboost', nodes_from=[old_node])

    pipeline = Pipeline()
    pipeline.add_node(final)

    svc_parent = PrimaryNode('svc')
    replacement = SecondaryNode('logit', nodes_from=[svc_parent])

    pipeline.update_node(old_node=old_node, new_node=replacement)

    assert replacement in pipeline.nodes
    assert svc_parent in pipeline.nodes
    assert old_node not in pipeline.nodes
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that the parent order of a secondary node does not change results.

    Two pipelines with the same 'xgboost' root over 'logit', 'lda' and 'knn'
    primaries are built, the second from deep copies listed in a different
    order. Their descriptive ids, train predictions and test predictions
    must match. The first pipeline's root parents are then reordered, the
    root is unfitted, the pipeline is refitted, and the test predictions
    must still match the original ones.
    """
    train, test = train_test_data_setup(data_setup)

    # Reference pipeline: parents in the order logit, lda, knn.
    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_node = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[logit_node, lda_node, knn_node])

    pipeline = Pipeline()
    pipeline.add_node(logit_node)
    pipeline.add_node(lda_node)
    pipeline.add_node(knn_node)
    pipeline.add_node(final)

    # Shuffled pipeline: independent copies, parents in the order knn, logit, lda.
    logit_copy, lda_copy, knn_copy = (
        deepcopy(node) for node in (logit_node, lda_node, knn_node))
    final_shuffled = SecondaryNode(operation_type='xgboost',
                                   nodes_from=[knn_copy, logit_copy, lda_copy])

    pipeline_shuffled = Pipeline()
    # The node list itself is also filled in a different order.
    for node in (final_shuffled, knn_copy, logit_copy, lda_copy):
        pipeline_shuffled.add_node(node)

    train_predicted = pipeline.fit(input_data=train)
    train_predicted_shuffled = pipeline_shuffled.fit(input_data=train)

    # Train results should be invariant.
    reference_id = pipeline.root_node.descriptive_id
    shuffled_id = pipeline_shuffled.root_node.descriptive_id
    assert reference_id == shuffled_id
    assert np.equal(train_predicted.predict, train_predicted_shuffled.predict).all()

    test_predicted = pipeline.predict(input_data=test)
    test_predicted_shuffled = pipeline_shuffled.predict(input_data=test)

    # Predict results should be invariant.
    assert np.equal(test_predicted.predict, test_predicted_shuffled.predict).all()

    # Reorder the parents of the node at index 3 of the already fitted pipeline.
    reordered_node = pipeline.nodes[3]
    parents = reordered_node.nodes_from
    reordered_node.nodes_from = [parents[position] for position in (2, 0, 1)]
    reordered_node.unfit()
    pipeline.fit(train)

    test_predicted_re_shuffled = pipeline.predict(input_data=test)

    # Predict results should be invariant.
    assert np.equal(test_predicted.predict, test_predicted_re_shuffled.predict).all()
def create_atomized_model_with_several_atomized_models() -> AtomizedModel:
    """Wrap a four-node pipeline of atomized models into one AtomizedModel.

    Every node's operation comes from its own ``create_atomized_model()``
    call. One primary node feeds two secondary nodes, and those two feed the
    final secondary node, which is the only node passed to ``add_node``.

    Returns:
        ``AtomizedModel`` constructed from the assembled pipeline.
    """
    pipeline = Pipeline()

    # Operations are created in the same order as the nodes:
    # primary, left branch, right branch, root.
    primary = PrimaryNode(operation_type=create_atomized_model())
    left_branch = SecondaryNode(operation_type=create_atomized_model())
    right_branch = SecondaryNode(operation_type=create_atomized_model())
    root = SecondaryNode(operation_type=create_atomized_model())

    left_branch.nodes_from = [primary]
    right_branch.nodes_from = [primary]
    root.nodes_from = [left_branch, right_branch]

    pipeline.add_node(root)
    return AtomizedModel(pipeline)
def test_pipeline_with_custom_params_for_model(data_setup):
    """Check that custom params on the root node change the predictions.

    A 'knn' root over 'logit' and 'lda' primaries is copied before custom
    parameters are assigned to the original's root. Both pipelines are
    fitted and used for prediction on the same data; the two prediction
    arrays must differ.
    """
    knn_params = {'n_neighbors': 1, 'weights': 'uniform', 'p': 1}

    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_root = SecondaryNode(operation_type='knn', nodes_from=[logit_node, lda_node])

    pipeline = Pipeline()
    pipeline.add_node(knn_root)

    # Take the copy first so it keeps the root node without custom params.
    pipeline_default_params = deepcopy(pipeline)
    pipeline.root_node.custom_params = knn_params

    pipeline_default_params.fit(data_setup)
    pipeline.fit(data_setup)

    custom_output = pipeline.predict(data_setup)
    default_output = pipeline_default_params.predict(data_setup)

    assert not np.array_equal(custom_output.predict, default_output.predict)
def compose_pipeline(self, data: InputData, initial_pipeline: Optional[Pipeline],
                     composer_requirements: ComposerRequirements,
                     metrics: Optional[Callable]) -> Pipeline:
    """Compose a pipeline from the node set returned by a random search.

    Args:
        data: Input data, passed to ``metric_for_nodes`` as both
            ``train_data`` and ``test_data``.
        initial_pipeline: Not used by this implementation.
        composer_requirements: Its ``primary`` and ``secondary`` attributes
            are forwarded to the optimiser.
        metrics: Forwarded to ``metric_for_nodes`` as ``metric_function``.

    Returns:
        A new ``Pipeline`` to which every node of the best node set was added.
    """
    # TODO: the same data object serves as train and test data here;
    # use a separate hold-out set for the metric evaluation.
    metric_function_for_nodes = partial(metric_for_nodes,
                                        metric_function=metrics,
                                        train_data=data,
                                        test_data=data)

    optimiser = RandomSearchOptimiser(self.__iter_num, PrimaryNode, SecondaryNode)
    # The second returned value is not needed here, so it is discarded.
    best_nodes_set, _ = optimiser.optimise(metric_function_for_nodes,
                                           composer_requirements.primary,
                                           composer_requirements.secondary)

    best_pipeline = Pipeline()
    # Plain loop: add_node is called for its side effect only.
    for node in best_nodes_set:
        best_pipeline.add_node(node)
    return best_pipeline
def test_pipeline_with_datamodel_fit_correct(data_setup):
    """Fit an 'rf' node over 'bernb' and 'logit' primaries and check the label shape.

    The pipeline is fitted on the train part; predictions for the test part
    are converted with ``probs_to_labels`` and must have the same shape as
    the test target.
    """
    train_data, test_data = train_test_data_setup(data_setup)

    pipeline = Pipeline()

    logit_node = PrimaryNode('logit')
    bernb_node = PrimaryNode('bernb')
    rf_node = SecondaryNode('rf')
    rf_node.nodes_from = [bernb_node, logit_node]

    for node in (logit_node, bernb_node, rf_node):
        pipeline.add_node(node)

    pipeline.fit(train_data)

    predicted = pipeline.predict(test_data)
    labels = probs_to_labels(predicted.predict)
    results = np.asarray(labels)

    assert results.shape == test_data.target.shape
def pipeline_first():
    """Return a three-level pipeline: an 'xgboost' root over 'xgboost' and 'knn' nodes.

    Each of the two middle nodes gets its own pair of 'logit' and 'lda'
    primary parents. Nodes are added to the pipeline bottom-up: the two
    primaries of a middle node, then the middle node, and finally the root.
    """
    #        XG
    #      |    \
    #     XG     KNN
    #    |  \    |  \
    #   LR  LDA  LR  LDA
    pipeline = Pipeline()

    root = SecondaryNode('xgboost')
    xgboost_child = SecondaryNode('xgboost')
    knn_child = SecondaryNode('knn')

    for child in (xgboost_child, knn_child):
        for primary_model in ('logit', 'lda'):
            leaf = PrimaryNode(primary_model)
            pipeline.add_node(leaf)
            child.nodes_from.append(leaf)
        root.nodes_from.append(child)
        pipeline.add_node(child)

    pipeline.add_node(root)
    return pipeline
def nodes_to_pipeline(nodes: List[Node]) -> Pipeline:
    """Create a new pipeline and add every node from ``nodes`` to it.

    Args:
        nodes: Nodes to add; ``add_node`` is called once per element, in
            list order.

    Returns:
        The newly created ``Pipeline``.
    """
    pipeline = Pipeline()
    # Plain loop with a distinct variable name: the original comprehension
    # reused ``nodes`` for its loop variable and built a throwaway list.
    for node in nodes:
        pipeline.add_node(node)
    return pipeline