def test_invalid_init():
    """Check that ComponentGraph raises on malformed component dictionaries."""
    instance_error = 'may only contain str or ComponentBase subclasses'

    # Component info for 'OHE' is a bare class instead of a list.
    not_a_list = {'Imputer': [Imputer], 'OHE': OneHotEncoder}
    with pytest.raises(ValueError,
                       match='All component information should be passed in as a list'):
        ComponentGraph(not_a_list)

    # An instantiated component mixed with a component class.
    mixed_graph = {
        'Imputer': [Imputer(numeric_impute_strategy="most_frequent")],
        'OneHot': [OneHotEncoder],
    }
    with pytest.raises(ValueError, match=instance_error):
        ComponentGraph(mixed_graph)

    # A graph holding only an instantiated component.
    instance_only_graph = {
        'Imputer': [Imputer(numeric_impute_strategy='constant', numeric_fill_value=0)],
    }
    with pytest.raises(ValueError, match=instance_error):
        ComponentGraph(instance_only_graph)

    # A graph whose 'Fake' node is declared with the string 'Fake Component'.
    unknown_component_graph = {
        'Imputer': ['Imputer', 'Fake'],
        'Fake': ['Fake Component', 'Estimator'],
        'Estimator': [ElasticNetClassifier],
    }
    with pytest.raises(MissingComponentError):
        ComponentGraph(unknown_component_graph)
def test_component_graph_sampler():
    """Check get_parents for a graph whose estimators consume a sampler's x and y."""
    sampler_graph = {
        'Imputer': [Imputer],
        'OneHot': [OneHotEncoder, 'Imputer.x'],
        'Undersampler': [Undersampler, 'OneHot.x'],
        'Random Forest': [RandomForestClassifier, 'Undersampler.x', 'Undersampler.y'],
        'Elastic Net': [ElasticNetClassifier, 'Undersampler.x', 'Undersampler.y'],
        'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net'],
    }
    component_graph = ComponentGraph(sampler_graph)
    component_graph.instantiate({})

    # Expected parent edges, listed in the order they are checked.
    expected_parents = [
        ('Imputer', []),
        ('OneHot', ['Imputer.x']),
        ('Undersampler', ['OneHot.x']),
        ('Random Forest', ['Undersampler.x', 'Undersampler.y']),
        ('Elastic Net', ['Undersampler.x', 'Undersampler.y']),
        ('Logistic Regression', ['Random Forest', 'Elastic Net']),
    ]
    for node_name, parents in expected_parents:
        assert component_graph.get_parents(node_name) == parents
def test_init_bad_graphs():
    """Check the ValueError messages raised for three structurally invalid graphs."""
    # 'OHE' and 'Estimator' list each other as parents.
    cyclic_graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x', 'Estimator'],
        'Estimator': [RandomForestClassifier, 'OHE.x'],
    }
    with pytest.raises(ValueError, match='given graph contains a cycle'):
        ComponentGraph(cyclic_graph)

    # Neither inner estimator declares any parent.
    disconnected_graph = {
        'Imputer': [Imputer],
        'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'],
        'OneHot_ElasticNet': [OneHotEncoder, 'Imputer.x'],
        'Random Forest': [RandomForestClassifier],
        'Elastic Net': [ElasticNetClassifier],
        'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net'],
    }
    with pytest.raises(ValueError, match='graph is not completely connected'):
        ComponentGraph(disconnected_graph)

    # 'OneHot_ElasticNet' is never used as a parent by any other node.
    multiple_final_graph = {
        'Imputer': ['Imputer'],
        'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'],
        'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'],
        'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'],
        'Elastic Net': ['Elastic Net Classifier'],
        'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net'],
    }
    with pytest.raises(ValueError, match='graph has more than one final'):
        ComponentGraph(multiple_final_graph)
def __init__(self, parameters, random_state=0):
    """Machine learning pipeline made out of transformers and an estimator.

    Required Class Variables:
        component_graph (list or dict): Either a list of components in order (strings or
            ComponentBase subclasses), or a dictionary that is handed to ComponentGraph as
            its component_dict.

    Arguments:
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
             An empty dictionary {} implies using all default values for component parameters.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
    """
    self.random_state = get_random_state(random_state)
    if isinstance(self.component_graph, list):  # Backwards compatibility
        # Call the from_list constructor on the class directly; no throwaway
        # ComponentGraph instance is needed to reach it.
        self._component_graph = ComponentGraph.from_list(self.component_graph,
                                                         random_state=self.random_state)
    else:
        self._component_graph = ComponentGraph(component_dict=self.component_graph,
                                               random_state=self.random_state)
    self._component_graph.instantiate(parameters)

    self.input_feature_names = {}
    self.input_target_name = None

    # Only expose the final component as the estimator when it actually is one.
    final_component = self._component_graph.get_last_component()
    self.estimator = final_component if isinstance(final_component, Estimator) else None
    self._estimator_name = self._component_graph.compute_order[-1] if self.estimator is not None else None

    self._validate_estimator_problem_type()
    self._is_fitted = False
    # Pipeline-level settings live under the "pipeline" key of the parameters dict.
    self._pipeline_params = parameters.get("pipeline", {})
def test_predict_repeat_estimator(mock_predict, mock_fit, X_y_binary):
    """Check a graph that uses LogisticRegressionClassifier for two different nodes."""
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))

    repeated_estimator_graph = {
        'Imputer': [Imputer],
        'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'],
        'OneHot_Logistic': [OneHotEncoder, 'Imputer.x'],
        'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'],
        'Logistic Regression': [LogisticRegressionClassifier, 'OneHot_Logistic.x'],
        'Final Estimator': [LogisticRegressionClassifier, 'Random Forest', 'Logistic Regression'],
    }
    component_graph = ComponentGraph(repeated_estimator_graph)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    # The two logistic regression nodes must not compare equal on their wrapped objects.
    inner_model = component_graph.get_component('Logistic Regression')._component_obj
    final_model = component_graph.get_component('Final Estimator')._component_obj
    assert not inner_model == final_model

    component_graph.predict(X)
    assert mock_predict.call_count == 5
    assert mock_fit.call_count == 3
def test_no_instantiate_before_fit(X_y_binary):
    """Check that fit raises ValueError when instantiate was never called."""
    X, y = X_y_binary
    linear_graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x'],
        'Estimator': [RandomForestClassifier, 'OHE.x'],
    }
    uninstantiated = ComponentGraph(linear_graph)
    expected_message = 'All components must be instantiated before fitting or predicting'
    with pytest.raises(ValueError, match=expected_message):
        uninstantiated.fit(X, y)
def test_component_graph_order(example_graph):
    """Check compute_order for the example graph and for a single-node graph."""
    multi_node = ComponentGraph(example_graph)
    assert [
        'Imputer',
        'OneHot_ElasticNet',
        'Elastic Net',
        'OneHot_RandomForest',
        'Random Forest',
        'Logistic Regression',
    ] == multi_node.compute_order

    single_node = ComponentGraph({'Imputer': [Imputer]})
    assert ['Imputer'] == single_node.compute_order
def test_compute_final_component_features_single_component(mock_transform, X_y_binary):
    """Check compute_final_component_features on a graph with one transformer node."""
    X, y = X_y_binary
    X = pd.DataFrame(X)
    # The mocked transform hands the input features back wrapped in a DataTable.
    mock_transform.return_value = ww.DataTable(X)

    single_node_graph = ComponentGraph({'Dummy Component': [DummyTransformer]})
    component_graph = single_node_graph.instantiate({})
    component_graph.fit(X, y)

    final_features = component_graph.compute_final_component_features(X)
    assert_frame_equal(X, final_features.to_dataframe())
def test_fit_y_parent(mock_fit_transform, X_y_binary):
    """Check fit_transform is called exactly once when a node takes 'Imputer.y' as a parent."""
    X, y = X_y_binary
    y_parent_graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x', 'Imputer.y'],
        'Random Forest': [RandomForestClassifier, 'OHE.x'],
    }
    component_graph = ComponentGraph(y_parent_graph).instantiate({})

    # The mocked fit_transform returns an (X, y) pair.
    mock_fit_transform.return_value = (pd.DataFrame(X), pd.Series(y))

    component_graph.fit(X, y)
    mock_fit_transform.assert_called_once()
def test_fit(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary):
    """Check how often each mocked method is called while fitting the example graph."""
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(y)
    mock_fit_transform.return_value = ww.DataTable(X)

    fitted_graph = ComponentGraph(example_graph).instantiate({})
    fitted_graph.fit(X, y)

    observed_calls = (mock_fit_transform.call_count, mock_fit.call_count, mock_predict.call_count)
    assert observed_calls[0] == 3
    assert observed_calls[1] == 3
    assert observed_calls[2] == 2
def test_init(example_graph):
    """Check construction of an empty graph and of the six-node example graph."""
    empty_graph = ComponentGraph()
    assert len(empty_graph.component_dict) == 0

    comp_graph = ComponentGraph(example_graph)
    assert len(comp_graph.component_dict) == 6
    assert comp_graph.compute_order == [
        'Imputer',
        'OneHot_ElasticNet',
        'Elastic Net',
        'OneHot_RandomForest',
        'Random Forest',
        'Logistic Regression',
    ]
def test_predict_transformer_end(mock_fit_transform, mock_transform, X_y_binary):
    """Check predict on a graph whose last node is a OneHotEncoder."""
    X, y = X_y_binary
    transformer_only_graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x'],
    }
    component_graph = ComponentGraph(transformer_only_graph).instantiate({})

    # Both mocks return an (X, y) pair built from the raw data.
    mock_fit_transform.return_value = (pd.DataFrame(X), pd.Series(y))
    mock_transform.return_value = (pd.DataFrame(X), pd.Series(y))

    component_graph.fit(X, y)
    predicted = component_graph.predict(X)
    assert_frame_equal(pd.DataFrame(X), predicted.to_dataframe())
def test_iteration(example_graph):
    """Check what iterating a ComponentGraph yields before and after instantiate."""
    component_graph = ComponentGraph(example_graph)

    # Before instantiation the expected items are the component classes.
    visited = [node for node in component_graph]
    assert visited == [
        Imputer,
        OneHotEncoder,
        ElasticNetClassifier,
        OneHotEncoder,
        RandomForestClassifier,
        LogisticRegressionClassifier,
    ]

    # After instantiation the items must compare equal to component instances,
    # with the supplied top_n applied to 'OneHot_RandomForest'.
    component_graph.instantiate({'OneHot_RandomForest': {'top_n': 32}})
    instantiated = [
        Imputer(),
        OneHotEncoder(),
        ElasticNetClassifier(),
        OneHotEncoder(top_n=32),
        RandomForestClassifier(),
        LogisticRegressionClassifier(),
    ]
    visited = [node for node in component_graph]
    assert visited == instantiated
def __init__(self, component_graph, parameters=None, custom_name=None, custom_hyperparameters=None, random_seed=0):
    """Machine learning pipeline made out of transformers and an estimator.

    Arguments:
        component_graph (list or dict): Either a list of components in order (strings or
            ComponentBase subclasses), or a dictionary that is handed to ComponentGraph as
            its component_dict.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
             An empty dictionary or None implies using all default values for component parameters. Defaults to None.
        custom_name (str): Custom name for the pipeline. Defaults to None.
        custom_hyperparameters (dict): Custom hyperparameter range for the pipeline. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """
    self._custom_hyperparameters = custom_hyperparameters
    self.random_seed = random_seed

    self.component_graph = component_graph
    if isinstance(component_graph, list):  # Backwards compatibility
        # Call the from_list constructor on the class directly; no throwaway
        # ComponentGraph instance is needed to reach it.
        self._component_graph = ComponentGraph.from_list(
            component_graph, random_seed=self.random_seed)
    else:
        self._component_graph = ComponentGraph(
            component_dict=component_graph, random_seed=self.random_seed)
    self._component_graph.instantiate(parameters)

    self.input_feature_names = {}
    self.input_target_name = None

    # An empty graph has no last component, so the estimator stays None.
    self.estimator = None
    if len(self._component_graph.compute_order) > 0:
        final_component = self._component_graph.get_last_component()
        self.estimator = final_component if isinstance(
            final_component, Estimator) else None
    # compute_order[-1] is only evaluated when an estimator was found above.
    self._estimator_name = self._component_graph.compute_order[
        -1] if self.estimator is not None else None

    self._validate_estimator_problem_type()
    self._is_fitted = False

    # Pipeline-level settings live under the "pipeline" key; None parameters leave them unset.
    self._pipeline_params = None
    if parameters is not None:
        self._pipeline_params = parameters.get("pipeline", {})

    self._custom_name = custom_name
def test_fit_correct_inputs(mock_ohe_fit_transform, mock_imputer_fit_transform, X_y_binary):
    """Check that the encoder's fit_transform receives the imputer's mocked x and y outputs."""
    X, y = X_y_binary
    X = pd.DataFrame(X)
    y = pd.Series(y)

    # Placeholder outputs for the mocked imputer: an all-ones table and an all-zeros column.
    ones_frame = pd.DataFrame(index=X.index, columns=X.index).fillna(1)
    zeros_series = pd.Series(index=y.index).fillna(0)
    expected_x = ww.DataTable(ones_frame)
    expected_y = ww.DataColumn(zeros_series)
    mock_imputer_fit_transform.return_value = (expected_x, expected_y)
    mock_ohe_fit_transform.return_value = expected_x

    xy_graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x', 'Imputer.y'],
    }
    component_graph = ComponentGraph(xy_graph).instantiate({})
    component_graph.fit(X, y)

    # Compare against the positional arguments of the encoder's fit_transform call.
    expected_x_df = expected_x.to_dataframe().astype("Int64")
    assert_frame_equal(expected_x_df, mock_ohe_fit_transform.call_args[0][0].to_dataframe())
    assert_series_equal(expected_y.to_series(), mock_ohe_fit_transform.call_args[0][1].to_series())
def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe, mock_imputer, example_graph, X_y_binary):
    """Check the final component's features are the two mocked estimator predictions."""
    X, y = X_y_binary
    n_rows = X.shape[0]

    mock_imputer.return_value = ww.DataTable(pd.DataFrame(X))
    mock_ohe.return_value = ww.DataTable(pd.DataFrame(X))
    # Elastic Net is mocked to predict ones, Random Forest to predict zeros.
    mock_en_predict.return_value = ww.DataColumn(pd.Series(np.ones(n_rows)))
    mock_rf_predict.return_value = ww.DataColumn(pd.Series(np.zeros(n_rows)))

    X_expected = pd.DataFrame({
        'Random Forest': np.zeros(n_rows),
        'Elastic Net': np.ones(n_rows),
    })

    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)
    final_features = component_graph.compute_final_component_features(X)

    assert_frame_equal(X_expected, final_features.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 4
def test_component_graph_sampler_list():
    """Check the graph that from_list builds for a component list containing an Undersampler."""
    component_list = [
        'Imputer',
        'One Hot Encoder',
        'Undersampler',
        'Random Forest Classifier',
    ]
    component_graph = ComponentGraph.from_list(component_list)

    assert len(component_graph.component_dict) == 4

    # Each name resolves to its component class.
    expected_classes = [
        ('Imputer', Imputer),
        ('One Hot Encoder', OneHotEncoder),
        ('Undersampler', Undersampler),
        ('Random Forest Classifier', RandomForestClassifier),
    ]
    for node_name, component_class in expected_classes:
        assert component_graph.get_component(node_name) == component_class

    assert component_graph.compute_order == component_list
    assert component_graph.component_dict == {
        'Imputer': [Imputer],
        'One Hot Encoder': [OneHotEncoder, 'Imputer.x'],
        'Undersampler': [Undersampler, 'One Hot Encoder.x'],
        'Random Forest Classifier': [RandomForestClassifier, 'Undersampler.x', 'Undersampler.y'],
    }

    # The estimator after the sampler is expected to take both its x and y outputs.
    expected_parents = [
        ('Imputer', []),
        ('One Hot Encoder', ['Imputer.x']),
        ('Undersampler', ['One Hot Encoder.x']),
        ('Random Forest Classifier', ['Undersampler.x', 'Undersampler.y']),
    ]
    for node_name, parents in expected_parents:
        assert component_graph.get_parents(node_name) == parents
def test_order_x_and_y():
    """Check compute_order when a node lists both 'Imputer.x' and 'Imputer.y' as parents."""
    xy_graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x', 'Imputer.y'],
        'Random Forest': [RandomForestClassifier, 'OHE.x'],
    }
    component_graph = ComponentGraph(xy_graph).instantiate({})
    expected_order = ['Imputer', 'OHE', 'Random Forest']
    assert component_graph.compute_order == expected_order
def test_instantiate_without_parameters(example_graph):
    """Check instantiate({}) yields separate encoders with top_n == 10 and keeps the order."""
    component_graph = ComponentGraph(example_graph)
    component_graph.instantiate({})

    rf_encoder = component_graph.get_component('OneHot_RandomForest')
    assert rf_encoder.parameters['top_n'] == 10
    en_encoder = component_graph.get_component('OneHot_ElasticNet')
    assert en_encoder.parameters['top_n'] == 10

    # The two encoder nodes must be distinct objects.
    assert component_graph.get_component('OneHot_RandomForest') is not component_graph.get_component('OneHot_ElasticNet')

    assert component_graph.compute_order == [
        'Imputer',
        'OneHot_ElasticNet',
        'Elastic Net',
        'OneHot_RandomForest',
        'Random Forest',
        'Logistic Regression',
    ]
def test_bad_instantiate_can_reinstantiate(example_graph):
    """Check that a failed instantiate call can be followed by a successful one."""
    component_graph = ComponentGraph(example_graph)

    # These parameters include a 'fake_param' entry and are expected to be rejected.
    rejected_parameters = {'Elastic Net': {'max_iter': 100, 'fake_param': None}}
    with pytest.raises(ValueError, match='Error received when instantiating component'):
        component_graph.instantiate(parameters=rejected_parameters)

    component_graph.instantiate({'Elastic Net': {'max_iter': 22}})
    elastic_net = component_graph.get_component('Elastic Net')
    assert elastic_net.parameters['max_iter'] == 22
def model_family(cls):
    """Returns model family of this pipeline template.

    The family is read from the final component of ``cls.component_graph``: the last
    entry when the graph is a list, otherwise the component class stored first under
    the last name in ``ComponentGraph.generate_order``.
    """
    # Work on a shallow copy rather than the class attribute itself.
    component_graph = copy.copy(cls.component_graph)
    if isinstance(component_graph, list):
        return handle_component_class(component_graph[-1]).model_family

    order = ComponentGraph.generate_order(component_graph)
    final_component = order[-1]
    # Each graph entry is a list whose first element is the component itself.
    return handle_component_class(component_graph[final_component][0]).model_family
def test_component_graph():
    """Build and return a six-node ComponentGraph whose components are declared by string name."""
    string_graph = {
        'Imputer': ['Imputer'],
        'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'],
        'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'],
        'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'],
        'Elastic Net': ['Elastic Net Classifier', 'OneHot_ElasticNet.x'],
        'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net'],
    }
    return ComponentGraph(string_graph)
def test_predict_empty_graph(X_y_binary):
    """Check that predict on an empty graph returns data equal to the input features."""
    X, y = X_y_binary
    X = pd.DataFrame(X)

    empty_graph = ComponentGraph()
    empty_graph.instantiate({})
    empty_graph.fit(X, y)

    predicted = empty_graph.predict(X)
    assert_frame_equal(X, predicted.to_dataframe())
def test_instantiate_from_list():
    """Check that a graph built by from_list can be instantiated with per-component parameters."""
    component_graph = ComponentGraph().from_list(
        ['Imputer', 'One Hot Encoder', 'Random Forest Classifier'])
    component_graph.instantiate({'One Hot Encoder': {'top_n': 7}})

    imputer = component_graph.get_component('Imputer')
    assert isinstance(imputer, Imputer)
    estimator = component_graph.get_component('Random Forest Classifier')
    assert isinstance(estimator, RandomForestClassifier)
    encoder = component_graph.get_component('One Hot Encoder')
    assert encoder.parameters['top_n'] == 7
def test_from_list_repeat_component():
    """Check that from_list gives a repeated component a '_2' name that can be parameterized."""
    repeated_list = ['Imputer', 'One Hot Encoder', 'One Hot Encoder', RandomForestClassifier]
    component_graph = ComponentGraph.from_list(repeated_list)

    assert component_graph.compute_order == [
        'Imputer',
        'One Hot Encoder',
        'One Hot Encoder_2',
        'Random Forest Classifier',
    ]

    # Each encoder node takes its own top_n.
    top_n_by_node = {'One Hot Encoder': 2, 'One Hot Encoder_2': 11}
    component_graph.instantiate({
        'One Hot Encoder': {'top_n': 2},
        'One Hot Encoder_2': {'top_n': 11},
    })
    for node_name, top_n in top_n_by_node.items():
        assert component_graph.get_component(node_name).parameters['top_n'] == top_n
def test_init_str_components():
    """Check construction of a graph whose components are all declared by string name."""
    string_graph = {
        'Imputer': ['Imputer'],
        'OneHot_RandomForest': ['One Hot Encoder', 'Imputer.x'],
        'OneHot_ElasticNet': ['One Hot Encoder', 'Imputer.x'],
        'Random Forest': ['Random Forest Classifier', 'OneHot_RandomForest.x'],
        'Elastic Net': ['Elastic Net Classifier', 'OneHot_ElasticNet.x'],
        'Logistic Regression': ['Logistic Regression Classifier', 'Random Forest', 'Elastic Net'],
    }
    comp_graph = ComponentGraph(string_graph)

    assert len(comp_graph.component_dict) == 6
    assert comp_graph.compute_order == [
        'Imputer',
        'OneHot_ElasticNet',
        'Elastic Net',
        'OneHot_RandomForest',
        'Random Forest',
        'Logistic Regression',
    ]
def test_from_list():
    """Check the linear graph built by from_list and its error for an unknown component name."""
    component_graph = ComponentGraph.from_list(
        ['Imputer', 'One Hot Encoder', RandomForestClassifier])

    assert len(component_graph.component_dict) == 3
    expected_classes = [
        ('Imputer', Imputer),
        ('One Hot Encoder', OneHotEncoder),
        ('Random Forest Classifier', RandomForestClassifier),
    ]
    for node_name, component_class in expected_classes:
        assert component_graph.get_component(node_name) == component_class

    assert component_graph.compute_order == [
        'Imputer',
        'One Hot Encoder',
        'Random Forest Classifier',
    ]
    # Each component after the first is expected to take the previous one's x output.
    assert component_graph.component_dict == {
        'Imputer': [Imputer],
        'One Hot Encoder': [OneHotEncoder, 'Imputer.x'],
        'Random Forest Classifier': [RandomForestClassifier, 'One Hot Encoder.x'],
    }

    unknown_list = ['Imputer', 'Fake Estimator']
    with pytest.raises(MissingComponentError, match='was not found'):
        ComponentGraph.from_list(unknown_list)
def test_input_feature_names(example_graph):
    """Check input_feature_names per node, empty before fit and populated after."""
    raw_columns = ['column_1', 'column_2']
    X = pd.DataFrame({
        'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'],
        'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3],
    })
    y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])

    component_graph = ComponentGraph(example_graph)
    # The two encoders get different top_n values; their expected output columns differ below.
    component_graph.instantiate({
        'OneHot_RandomForest': {'top_n': 2},
        'OneHot_ElasticNet': {'top_n': 3},
    })
    assert component_graph.input_feature_names == {}

    component_graph.fit(X, y)
    names_by_node = component_graph.input_feature_names

    assert names_by_node['Imputer'] == raw_columns
    assert names_by_node['OneHot_RandomForest'] == raw_columns
    assert names_by_node['OneHot_ElasticNet'] == raw_columns
    assert names_by_node['Random Forest'] == ['column_2', 'column_1_a', 'column_1_b']
    assert names_by_node['Elastic Net'] == ['column_2', 'column_1_a', 'column_1_b', 'column_1_c']
    assert names_by_node['Logistic Regression'] == ['Random Forest', 'Elastic Net']
def test_component_graph_dataset_with_different_types():
    """Check input_feature_names on a dataset mixing string, categorical-int and boolean columns.

    Checks that types are converted correctly by Woodwork. Specifically, the standard scaler
    should convert column_3 to float, so our code to try to convert back to the original
    boolean type will catch the TypeError thrown and not convert the column.
    """
    typed_graph = {
        'Imputer': [Imputer],
        'OneHot': [OneHotEncoder, 'Imputer.x'],
        'DateTime': [DateTimeFeaturizer, 'OneHot.x'],
        'Scaler': [StandardScaler, 'DateTime.x'],
        'Random Forest': [RandomForestClassifier, 'Scaler.x'],
        'Elastic Net': [ElasticNetClassifier, 'Scaler.x'],
        'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net'],
    }

    X = pd.DataFrame({
        'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'],
        'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3],
        'column_3': [True, False, True, False, True, False, True, False, False],
    })
    y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
    X = infer_feature_types(X, {"column_2": "categorical"})

    component_graph = ComponentGraph(typed_graph)
    component_graph.instantiate({})
    assert component_graph.input_feature_names == {}

    component_graph.fit(X, y)
    names_by_node = component_graph.input_feature_names

    # Column names expected before and after the one-hot encoder.
    raw_columns = ['column_1', 'column_2', 'column_3']
    encoded_columns = [
        'column_3',
        'column_1_a', 'column_1_b', 'column_1_c', 'column_1_d',
        'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4', 'column_2_5', 'column_2_6',
    ]

    assert names_by_node['Imputer'] == raw_columns
    assert names_by_node['OneHot'] == raw_columns
    assert names_by_node['DateTime'] == encoded_columns
    assert names_by_node['Scaler'] == encoded_columns
    assert names_by_node['Random Forest'] == encoded_columns
    assert names_by_node['Elastic Net'] == encoded_columns
    assert names_by_node['Logistic Regression'] == ['Random Forest', 'Elastic Net']
def test_predict(mock_predict, mock_fit, example_graph, X_y_binary):
    """Check mocked fit/predict call counts after fitting and predicting with the example graph."""
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))

    fitted_graph = ComponentGraph(example_graph).instantiate({})
    fitted_graph.fit(X, y)
    fitted_graph.predict(X)

    # predict: called twice when fitting the pipeline, thrice when predicting.
    assert mock_predict.call_count == 5
    # fit: only called during fit, not predict.
    assert mock_fit.call_count == 3