def test_RandomModelGenerator_default():
    dfX, y, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    # verif iterator
    for model in random_model_generator.iterator_default_models():

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
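# test_create_graphical_representation: checks that create_graphical_representation turns an
# OrderedDict of (step, variable types) pairs into the expected networkx DiGraph (one branch per
# variable type, merged before the final model), and that the resulting graph converts to a
# GraphPipeline json code.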
def test_create_graphical_representation():
    steps = OrderedDict(
        [
            (("TextPreprocessing", ("TextPreprocessing", "CountVectorizerWrapper")), TypeOfVariables.TEXT),
            (("DimensionReduction", ("DimensionReduction", "TruncatedSVDWrapper")), TypeOfVariables.TEXT),
            (("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), TypeOfVariables.CAT),
            (("CategoryImputer", ("CategoryImputer", "CatImputer")), TypeOfVariables.CAT),
            (("MissingValueImputer", ("MissingValueImputer", "NumImputer")), TypeOfVariables.NUM),
            (("FeatureExtraction", ("FeatureExtraction", "PolynomialExtractor")), TypeOfVariables.NUM),
            (("Scaling", ("Scaling", "StandardScaler")), (TypeOfVariables.CAT, TypeOfVariables.NUM)),
            (
                ("FeatureSelection", ("FeatureSelection", "FeaturesSelectorClassifier")),
                (TypeOfVariables.CAT, TypeOfVariables.NUM, TypeOfVariables.TEXT),
            ),
            (
                ("Model", ("Model", "LightGBMClassifier")),
                (TypeOfVariables.CAT, TypeOfVariables.NUM, TypeOfVariables.TEXT),
            ),
        ]
    )

    # columns = {"TEXT": ["txt1", "txt2"],
    #            "CAT": ["cat1", "cat2", "cat3"],
    #            "NUM": ["num1", "num2"]}
    # params = {n: ("param_%s" % n) for n, t in steps.items()}

    G, new_steps = create_graphical_representation(steps)

    assert isinstance(G, nx.DiGraph)
    assert len(new_steps) == 0

    # graphviz_modelgraph(G)

    expected_edges = [
        (
            ("TextPreprocessing", ("TextPreprocessing", "CountVectorizerWrapper")),
            ("DimensionReduction", ("DimensionReduction", "TruncatedSVDWrapper")),
        ),
        (
            ("DimensionReduction", ("DimensionReduction", "TruncatedSVDWrapper")),
            ("FeatureSelection", ("FeatureSelection", "FeaturesSelectorClassifier")),
        ),
        (
            ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")),
            ("CategoryImputer", ("CategoryImputer", "CatImputer")),
        ),
        (("CategoryImputer", ("CategoryImputer", "CatImputer")), ("Scaling", ("Scaling", "StandardScaler"))),
        (
            ("MissingValueImputer", ("MissingValueImputer", "NumImputer")),
            ("FeatureExtraction", ("FeatureExtraction", "PolynomialExtractor")),
        ),
        (
            ("FeatureExtraction", ("FeatureExtraction", "PolynomialExtractor")),
            ("Scaling", ("Scaling", "StandardScaler")),
        ),
        (
            ("Scaling", ("Scaling", "StandardScaler")),
            ("FeatureSelection", ("FeatureSelection", "FeaturesSelectorClassifier")),
        ),
        (
            ("FeatureSelection", ("FeatureSelection", "FeaturesSelectorClassifier")),
            ("Model", ("Model", "LightGBMClassifier")),
        ),
    ]

    expected_nodes = [
        ("TextPreprocessing", ("TextPreprocessing", "CountVectorizerWrapper")),
        ("DimensionReduction", ("DimensionReduction", "TruncatedSVDWrapper")),
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")),
        ("CategoryImputer", ("CategoryImputer", "CatImputer")),
        ("MissingValueImputer", ("MissingValueImputer", "NumImputer")),
        ("FeatureExtraction", ("FeatureExtraction", "PolynomialExtractor")),
        ("Scaling", ("Scaling", "StandardScaler")),
        ("FeatureSelection", ("FeatureSelection", "FeaturesSelectorClassifier")),
        ("Model", ("Model", "LightGBMClassifier")),
    ]

    assert set(expected_edges) == set(G.edges)
    assert set(expected_nodes) == set(G.nodes)

    params = {}
    for n, _ in steps.items():
        params[n] = {"__%s_%s__" % n[1]: "param"}

    res1 = convert_graph_to_code(G, params)
    assert isinstance(res1, tuple)
    assert res1[0] == "GraphPipeline"
    assert isinstance(res1[1], dict)
    assert "edges" in res1[1]
    assert "models" in res1[1]
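# test_convert_graph_to_code: walks through graphs of increasing complexity (single model, linear
# pipeline, composition nodes, stacking with a blender, multi-output, impossible structure) and
# checks the json code produced by convert_graph_to_code in each case.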
def test_convert_graph_to_code():

    ###################################
    ### ** Only one Simple Model ** ###
    ###################################
    Graph = nx.DiGraph()
    Graph.add_node(("Model", ("Model", "LogisticRegression")))

    assert _find_first_composition_node(Graph) is None

    ## a) no params
    all_models_params = {("Model", ("Model", "LogisticRegression")): {}}
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == ("LogisticRegression", {})

    ## b) params
    all_models_params = {("Model", ("Model", "LogisticRegression")): {"C": 10}}
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == ("LogisticRegression", {"C": 10})

    #####################
    ### ** 2 steps ** ###
    #####################
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) is None

    ## a) no params
    all_models_params = {
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {},
        ("Model", ("Model", "RandomForestClassifier")): {},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == (
        "GraphPipeline",
        {
            "edges": [("KMeansTransformer", "RandomForestClassifier")],
            "models": {
                "KMeansTransformer": ("KMeansTransformer", {}),
                "RandomForestClassifier": ("RandomForestClassifier", {}),
            },
        },
    )

    ## b) params
    all_models_params = {
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {"n_clusters": 5},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == (
        "GraphPipeline",
        {
            "edges": [("KMeansTransformer", "RandomForestClassifier")],
            "models": {
                "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 5}),
                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 100}),
            },
        },
    )

    ################################
    ### ** 1 composition step ** ###
    ################################

    ## a) no params
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {},
        ("Model", ("Model", "RandomForestClassifier")): {},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}), {})
    assert model_json_code == expected_json_code

    ## b) params
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {"ll": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 10},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {"n_estimators": 10}), {"ll": 10})
    assert model_json_code == expected_json_code

    ##########################################
    ## ** 1 composition above a pipeline ** ##
    ##########################################

    ## a) no params
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
    )
    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {},
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {},
        ("Model", ("Model", "RandomForestClassifier")): {},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "BoxCoxTargetTransformer",
        (
            "GraphPipeline",
            {
                "edges": [("KMeansTransformer", "RandomForestClassifier")],
                "models": {
                    "KMeansTransformer": ("KMeansTransformer", {}),
                    "RandomForestClassifier": ("RandomForestClassifier", {}),
                },
            },
        ),
        {},
    )
    assert model_json_code == expected_json_code

    ## b) params
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
    )
    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {"ll": 10},
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {"n_clusters": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 10},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "BoxCoxTargetTransformer",
        (
            "GraphPipeline",
            {
                "edges": [("KMeansTransformer", "RandomForestClassifier")],
                "models": {
                    "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                    "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
                },
            },
        ),
        {"ll": 10},
    )
    assert model_json_code == expected_json_code

    #########################################################
    ## ** 1 composition node in the middle of the Graph ** ##
    #########################################################
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")),
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
    )
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
    )
    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {"ll": 10},
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {"n_clusters": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 10},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "BoxCoxTargetTransformer")],
            "models": {
                "BoxCoxTargetTransformer": (
                    "BoxCoxTargetTransformer",
                    (
                        "GraphPipeline",
                        {
                            "edges": [("KMeansTransformer", "RandomForestClassifier")],
                            "models": {
                                "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
                            },
                        },
                    ),
                    {"ll": 10},
                ),
                "NumericalEncoder": ("NumericalEncoder", {}),
            },
        },
    )
    assert model_json_code == expected_json_code

    ##################################################
    ## ** 1 composition with several nodes below ** ##
    ##################################################
    Graph = nx.DiGraph()
    # TODO: try to do a stacking with several things below
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "RandomForestClassifier"))
    )
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "LogisticRegression"))
    )

    assert _find_first_composition_node(Graph) == ("Stacking", ("Stacking", "StackingClassifierRegressor"))

    # Remark: the blending specification is missing, BUT it is enough to test the function
    all_models_params = {
        ("Stacking", ("Stacking", "StackingClassifierRegressor")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
    }

    with pytest.raises(ValueError):
        model_json_code = convert_graph_to_code(Graph, all_models_params)
        # Unsupported for now: more than one terminal node

    model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)

    expected_json_code1 = (
        "StackingClassifierRegressor",
        [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
        {"cv": 10},
    )
    # expected_json_code2 = (
    #     "StackingClassifierRegressor",
    #     [("LogisticRegression", {"C": 10}), ("RandomForestClassifier", {"n_estimators": 100})],
    #     {"cv": 10},
    # )

    assert expected_json_code1 == model_json_code  # or (expected_json_code2 == model_json_code)

    #######################################
    ## ** 2 nested compositions steps ** ##
    #######################################
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler")),
    )
    Graph.add_edge(
        ("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler")), ("Model", ("Model", "RandomForestClassifier"))
    )

    all_models_params = {}
    all_models_params[("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer"))] = {"ll": 10}
    all_models_params[("Model", ("Model", "RandomForestClassifier"))] = {"n_estimators": 100}
    all_models_params[("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler"))] = {"target_ratio": "balanced"}

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )
    assert _find_first_composition_node(
        Graph, composition_already_done={("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer"))}
    ) == ("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler"))

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "BoxCoxTargetTransformer",
        ("TargetUnderSampler", ("RandomForestClassifier", {"n_estimators": 100}), {"target_ratio": "balanced"}),
        {"ll": 10},
    )
    assert model_json_code == expected_json_code

    ##################################################
    ## ** 1 composition with several nodes below ** ##
    ##################################################
    Graph = nx.DiGraph()
    # TODO: try to do a stacking with several things below

    ## 1) with one node above
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")),
        ("Stacking", ("Stacking", "StackingClassifierRegressor")),
    )
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "RandomForestClassifier"))
    )
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "LogisticRegression"))
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "StackingClassifierRegressor")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
    }
    model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)

    expected_json_code = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "StackingClassifierRegressor")],
            "models": {
                "NumericalEncoder": ("NumericalEncoder", {}),
                "StackingClassifierRegressor": (
                    "StackingClassifierRegressor",
                    [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
                    {"cv": 10},
                ),
            },
        },
    )
    # Remark: the Stacker is missing the blender, which I can't enter into the graph...
    assert expected_json_code == model_json_code

    ### With a node above, and a blender below
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "OutSampler", "Blender_LogisticRegression")],
            "models": {
                "Blender_LogisticRegression": ("LogisticRegression", {"C": 100}),
                "NumericalEncoder": ("NumericalEncoder", {}),
                "OutSampler": (
                    "OutSampler",
                    [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
                    {"cv": 10},
                ),
            },
        },
    )
    assert expected_json == model_json_code

    ### With encoder features going back into the Blender
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Blender", ("Blender", "LogisticRegression"))
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [
                ("NumericalEncoder", "Blender_LogisticRegression"),
                ("NumericalEncoder", "OutSampler", "Blender_LogisticRegression"),
            ],
            "models": {
                "Blender_LogisticRegression": ("LogisticRegression", {"C": 100}),
                "NumericalEncoder": ("NumericalEncoder", {}),
                "OutSampler": (
                    "OutSampler",
                    [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
                    {"cv": 10},
                ),
            },
        },
    )
    assert expected_json == model_json_code

    # Same thing but with 2 OutSamplers (one per model)
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking1", "OutSampler"))
    )
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking2", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking1", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking2", "OutSampler")), ("Model", ("Model", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Blender", ("Blender", "LogisticRegression"))
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking1", "OutSampler")): {"cv": 10},
        ("Stacking", ("Stacking2", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [
                ("NumericalEncoder", "Blender_LogisticRegression"),
                ("NumericalEncoder", "Stacking1_OutSampler", "Blender_LogisticRegression"),
                ("NumericalEncoder", "Stacking2_OutSampler", "Blender_LogisticRegression"),
            ],
            "models": {
                "Blender_LogisticRegression": ("LogisticRegression", {"C": 100}),
                "NumericalEncoder": ("NumericalEncoder", {}),
                "Stacking1_OutSampler": ("OutSampler", ("RandomForestClassifier", {"n_estimators": 100}), {"cv": 10}),
                "Stacking2_OutSampler": ("OutSampler", ("LogisticRegression", {"C": 10}), {"cv": 10}),
            },
        },
    )
    assert expected_json == model_json_code

    ### Multi output ###
    Graph = nx.DiGraph()
    Graph.add_node(("Model", ("Model", "LogisticRegression")))
    Graph.add_node(("Model", ("Model", "RandomForestClassifier")))

    all_models_params = {
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
    }

    assert _find_first_composition_node(Graph) is None

    model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [("LogisticRegression",), ("RandomForestClassifier",)],
            "models": {
                "LogisticRegression": ("LogisticRegression", {"C": 10}),
                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 100}),
            },
        },
    )
    assert expected_json == model_json_code

    ### Impossible graph ###
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "LogisticRegression")))
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "ExtraTreesClassifier")))
    # This edge makes it impossible: it comes from the composition node ...
    # but doesn't have the same child as the others
    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))
    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))

    # graphviz_modelgraph(Graph)

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "ExtraTreesClassifier")): {"n_estimators": 200},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    with pytest.raises(ValueError):
        model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)
# test_RandomModelGenerator_random: draws random pipelines from the generator, checks their
# structure (graph, parameters, blocks), fits the RandomForest-based ones on a small subsample,
# and verifies that draws are reproducible for a given random_state.
@pytest.mark.parametrize("num_only", [True, False])
@pytest.mark.parametrize("specific_hyper", [True, False])
@pytest.mark.parametrize("only_random_forest", [True, False])
def test_RandomModelGenerator_random(num_only, specific_hyper, only_random_forest):
    # num_only, specific_hyper, only_random_forest = False, True, True
    dfX, y, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {("Model", "RandomForestClassifier"): {"n_estimators": [10, 20]}}

    if only_random_forest:
        auto_ml_config.filter_models(Model="RandomForestClassifier")

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        rf_key = ("Model", ("Model", "RandomForestClassifier"))
        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper:
            if rf_key in all_models_params:
                assert all_models_params[rf_key]["n_estimators"] in (10, 20)

        if rf_key in Graph.nodes:
            # in that case I'll actually do the fitting here
            # I'll simplify the model to have 2 estimators (faster)
            all_models_params_copy = deepcopy(all_models_params)
            all_models_params_copy[rf_key]["n_estimators"] = 2

            result = convert_graph_to_code(Graph, all_models_params_copy, also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            sub_index = np.concatenate((np.where(y == 0)[0][0:100], np.where(y == 1)[0][0:100]), axis=0)
            # Needs at least 20 observations to make sure all transformers work

            if hasattr(sk_model, "verbose"):
                sk_model.verbose = True

            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2,)

    if not only_random_forest:
        assert any([rf_key not in m[1] for m in all_gen])  # Check that RandomForest wasn't drawn every time

    ### re-draw the same thing with the same seed ###
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        assert not _all_same(all_blocks1)  # with num_only there is only one block

    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]
    # I need to test equality of nodes and edges ... directly == on networkx graphs doesn't work

    # separate tests to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting the generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate tests to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### re-draw by passing a RandomState ###
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate tests to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
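# test_RandomModelGenerator_iterator: checks the default and block-search iterators, making sure
# each yielded graph has a single terminal Model node, converts to json code, and (for the default
# iterator) can actually be fitted on a small subsample.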
@pytest.mark.parametrize("num_only", [True, False])
@pytest.mark.parametrize("type_of_iterator", ["default", "block_search", "block_search_random"])
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    dfX, y, auto_ml_config = get_automl_config(num_only)

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = random_model_generator.iterator_default_models()
    elif type_of_iterator == "block_search":
        iterator = random_model_generator.iterate_block_search(random_order=False)
    elif type_of_iterator == "block_search_random":
        iterator = random_model_generator.iterate_block_search(random_order=True)

    assert hasattr(iterator, "__iter__")

    # verif iterator
    for model in iterator:

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        terminal_nodes = get_terminal_nodes(Graph)
        assert len(terminal_nodes) == 1
        assert terminal_nodes[0][0] == StepCategories.Model

        # graphviz_graph(Graph)

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if type_of_iterator == "default" and ("Model", ("Model", "RandomForestClassifier")) in Graph.nodes:
            # in that case I'll actually do the fitting here
            # I'll simplify the model to have 2 estimators (faster)
            all_models_params[("Model", ("Model", "RandomForestClassifier"))]["n_estimators"] = 2

            result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            sub_index = np.concatenate((np.where(y == 0)[0][0:10], np.where(y == 1)[0][0:10]), axis=0)
            # Needs at least 20 observations to make sure all transformers work

            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2,)